fix: Allow disabling the scraping of private IPs.

pull/62/head
Raphaël Vinot 2019-07-05 16:27:23 +02:00
parent 07d1636fdf
commit a6693535de
2 changed files with 25 additions and 2 deletions

View File

@@ -13,6 +13,10 @@ import tempfile
 import pathlib
 import time
+import ipaddress
+import socket
+from urllib.parse import urlsplit
 from io import BytesIO
 import base64
 from uuid import uuid4
@@ -29,11 +33,12 @@ import logging
 class Lookyloo():
-    def __init__(self, splash_url: str='http://127.0.0.1:8050', loglevel: int=logging.DEBUG):
+    def __init__(self, splash_url: str='http://127.0.0.1:8050', loglevel: int=logging.DEBUG, only_global_lookups=False):
         self.__init_logger(loglevel)
         self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
         self.scrape_dir = get_homedir() / 'scraped'
         self.splash_url = splash_url
+        self.only_global_lookups = only_global_lookups
         if not self.scrape_dir.exists():
             self.scrape_dir.mkdir(parents=True, exist_ok=True)
@@ -159,6 +164,16 @@ class Lookyloo():
               os: str=None, browser: str=None):
         if not url.startswith('http'):
             url = f'http://{url}'
+        if self.only_global_lookups:
+            splitted_url = urlsplit(url)
+            if splitted_url.netloc:
+                if ':' in splitted_url.netloc:
+                    ip = socket.gethostbyname(splitted_url.netloc.split(':')[0])
+                else:
+                    ip = socket.gethostbyname(splitted_url.netloc)
+                if not ipaddress.ip_address(ip).is_global:
+                    return False
         items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
         if not items:
             # broken

View File

@@ -6,6 +6,7 @@ import pickle
 from zipfile import ZipFile, ZIP_DEFLATED
 from io import BytesIO
 import os
+import logging

 from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
 from flask_bootstrap import Bootstrap
@@ -31,7 +32,14 @@ app.config['BOOTSTRAP_SERVE_LOCAL'] = True
 app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
 app.debug = False

-lookyloo = Lookyloo()
+# API entry point for splash
+splash_url = 'http://127.0.0.1:8050'
+# Splash log level
+loglevel = logging.DEBUG
+# Set it to True if your instance is publicly available so users aren't able to scan your internal network
+only_global_lookups = False
+lookyloo = Lookyloo(splash_url=splash_url, loglevel=loglevel, only_global_lookups=only_global_lookups)

 # keep