fix: Allow disabling the scraping of private IPs.

pull/62/head
Raphaël Vinot 2019-07-05 16:27:23 +02:00
parent 07d1636fdf
commit a6693535de
2 changed files with 25 additions and 2 deletions

View File

@ -13,6 +13,10 @@ import tempfile
import pathlib
import time
import ipaddress
import socket
from urllib.parse import urlsplit
from io import BytesIO
import base64
from uuid import uuid4
@ -29,11 +33,12 @@ import logging
class Lookyloo():
def __init__(self, splash_url: str='http://127.0.0.1:8050', loglevel: int=logging.DEBUG):
    def __init__(self, splash_url: str='http://127.0.0.1:8050', loglevel: int=logging.DEBUG, only_global_lookups: bool=False):
        """Initialize the Lookyloo scraper front-end.

        :param splash_url: Base URL of the Splash rendering service used to scrape pages.
        :param loglevel: Log level passed to this instance's logger.
        :param only_global_lookups: When True, URLs whose hostname resolves to a
            non-global (private/reserved) IP address are refused by the scraper,
            so a publicly exposed instance cannot be used to probe the internal network.
        """
        self.__init_logger(loglevel)
        # Cache lives in Redis reached over a unix socket; decode_responses makes
        # all returned values str instead of bytes.
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir = get_homedir() / 'scraped'
        self.splash_url = splash_url
        self.only_global_lookups = only_global_lookups
        # Ensure the capture destination exists before the first scrape.
        if not self.scrape_dir.exists():
            self.scrape_dir.mkdir(parents=True, exist_ok=True)
@ -159,6 +164,16 @@ class Lookyloo():
os: str=None, browser: str=None):
if not url.startswith('http'):
url = f'http://{url}'
if self.only_global_lookups:
splitted_url = urlsplit(url)
if splitted_url.netloc:
if ':' in splitted_url.netloc:
ip = socket.gethostbyname(splitted_url.netloc.split(':')[0])
else:
ip = socket.gethostbyname(splitted_url.netloc)
if not ipaddress.ip_address(ip).is_global:
return False
items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
if not items:
# broken

View File

@ -6,6 +6,7 @@ import pickle
from zipfile import ZipFile, ZIP_DEFLATED
from io import BytesIO
import os
import logging
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
from flask_bootstrap import Bootstrap
@ -31,7 +32,14 @@ app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
lookyloo = Lookyloo()
# Splash rendering service API endpoint used for scraping
splash_url = 'http://127.0.0.1:8050'
# Log level for the Lookyloo instance
loglevel = logging.DEBUG
# Set to True if your instance is publicly available so users aren't able to scan your internal network
only_global_lookups = False
lookyloo = Lookyloo(splash_url=splash_url, loglevel=loglevel, only_global_lookups=only_global_lookups)
# keep