#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import pickle
from datetime import datetime
import tempfile
import pathlib
import time
import ipaddress
import socket
from urllib.parse import urlsplit
from io import BufferedIOBase, BytesIO
import base64
from uuid import uuid4
from pathlib import Path
from .helpers import get_homedir, get_socket_path, load_cookies
from .exceptions import NoValidHarFile
from redis import Redis
from typing import Union, Dict, List, Tuple, Optional
import logging
from pysanejs import SaneJS
from scrapysplashwrapper import crawl
from har2tree import CrawledTree, Har2TreeError, HarFile


class Lookyloo():

    def __init__(self, splash_url: str='http://127.0.0.1:8050', loglevel: int=logging.DEBUG, only_global_lookups: bool=False) -> None:
        self.__init_logger(loglevel)
        self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir: Path = get_homedir() / 'scraped'
        self.splash_url: str = splash_url
        self.only_global_lookups: bool = only_global_lookups
        if not self.scrape_dir.exists():
            self.scrape_dir.mkdir(parents=True, exist_ok=True)

        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()

        # Try to reach sanejs; if it is down, queries fall back to empty responses.
        self.sanejs = SaneJS()
        self.use_sane_js = bool(self.sanejs.is_up)

    def __init_logger(self, loglevel: int) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)

    def _set_report_cache(self, report_dir: Path) -> None:
        if self.redis.exists(str(report_dir)):
            return
        if (report_dir / 'error.txt').exists():
            # Something went wrong
            return
        har_files = sorted(report_dir.glob('*.har'))
        if not har_files:
            self.logger.warning(f'No har files in {report_dir}')
            # if (report_dir / 'uuid').exists():
            #     (report_dir / 'uuid').unlink()
            # if (report_dir / 'no_index').exists():
            #     (report_dir / 'no_index').unlink()
            # report_dir.rmdir()
            return
        with (report_dir / 'uuid').open() as f:
            uuid = f.read().strip()
        har = HarFile(har_files[0])
        cache: Dict[str, Union[str, int]] = {'uuid': uuid,
                                             'title': har.initial_title,
                                             'timestamp': har.initial_start_time,
                                             'url': har.first_url,
                                             'redirects': json.dumps(har.initial_redirects)}
        if (report_dir / 'no_index').exists():  # If the folder claims anonymity
            cache['no_index'] = 1
        if uuid and not self.redis.exists(str(report_dir)):
            self.redis.hmset(str(report_dir), cache)
            self.redis.hset('lookup_dirs', uuid, str(report_dir))
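
    # For reference, the Redis cache layout written above:
    #   <report_dir path> -> hash with uuid, title, timestamp, url,
    #                        redirects (JSON-encoded list) and, optionally, no_index
    #   'lookup_dirs'     -> hash mapping a capture UUID to its report_dir path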

    def report_cache(self, report_dir: Union[str, Path]) -> Optional[Dict[str, Union[str, int]]]:
        if isinstance(report_dir, Path):
            report_dir = str(report_dir)
        if (Path(report_dir) / 'error.txt').exists():
            with (Path(report_dir) / 'error.txt').open() as _error:
                self.logger.warning(f'Capture in ({report_dir}) has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go')
        cached = self.redis.hgetall(report_dir)
        if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
            cached['redirects'] = json.loads(cached['redirects'])
            return cached
        self.logger.warning(f'Cache ({report_dir}) is invalid: {json.dumps(cached, indent=2)}')
        return None

    def _init_existing_dumps(self) -> None:
        for report_dir in self.report_dirs:
            if report_dir.exists():
                self._set_report_cache(report_dir)
        self.redis.set('cache_loaded', 1)

    @property
    def report_dirs(self) -> List[Path]:
        for report_dir in self.scrape_dir.iterdir():
            if report_dir.is_dir() and not any(report_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs.
                report_dir.rmdir()
                continue
            if not (report_dir / 'uuid').exists():
                # Create uuid if missing
                with (report_dir / 'uuid').open('w') as f:
                    f.write(str(uuid4()))
        return sorted(self.scrape_dir.iterdir(), reverse=True)

    def lookup_report_dir(self, uuid) -> Union[Path, None]:
        report_dir = self.redis.hget('lookup_dirs', uuid)
        if report_dir:
            return Path(report_dir)
        return None

    def enqueue_scrape(self, query: dict) -> str:
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        p.hmset(perma_uuid, query)
        p.sadd('to_scrape', perma_uuid)
        p.execute()
        return perma_uuid

    def process_scrape_queue(self) -> Union[bool, None]:
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return None
        to_scrape = self.redis.hgetall(uuid)
        self.redis.delete(uuid)
        to_scrape['perma_uuid'] = uuid
        if self.scrape(**to_scrape):
            self.logger.info(f'Processed {to_scrape["url"]}')
            return True
        return False
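
    # Minimal usage sketch for the scrape queue (assumes a running Redis cache
    # socket and Splash instance; 'http://example.com' is a placeholder):
    #
    #   lookyloo = Lookyloo()
    #   uuid = lookyloo.enqueue_scrape({'url': 'http://example.com'})
    #   # ... later, typically from a worker process:
    #   while lookyloo.process_scrape_queue():
    #       pass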

    def load_tree(self, report_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
        har_files = sorted(report_dir.glob('*.har'))
        try:
            meta = {}
            if (report_dir / 'meta').exists():
                with open((report_dir / 'meta'), 'r') as f:
                    meta = json.load(f)
            ct = CrawledTree(har_files)
            temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
            pickle.dump(ct, temp)
            temp.close()
            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
        except Har2TreeError as e:
            raise NoValidHarFile(e.message)
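
    # load_tree pickles the CrawledTree to a temporary file and returns its
    # path as the first element; a caller can restore the tree with:
    #
    #   with open(tree_file, 'rb') as f:
    #       ct = pickle.load(f)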

    def cleanup_old_tmpfiles(self):
        # Remove pickled trees that have not been accessed for more than 10 hours.
        for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
            if time.time() - tmpfile.stat().st_atime > 36000:
                tmpfile.unlink()

    def load_image(self, report_dir: Path) -> BytesIO:
        with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
            return BytesIO(f.read())

    def sane_js_query(self, sha512: str) -> Dict:
        if self.use_sane_js:
            return self.sanejs.sha512(sha512)
        return {'response': []}
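
    # Example (sketch): 'deadbeef...' stands in for a real SHA512 of a script.
    # When SaneJS is unreachable, the method returns {'response': []}.
    #
    #   result = lookyloo.sane_js_query('deadbeef...')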

    def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None, depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
               perma_uuid: Optional[str]=None, os: Optional[str]=None, browser: Optional[str]=None) -> Union[bool, str]:
        if not url.startswith('http'):
            url = f'http://{url}'
        if self.only_global_lookups:
            # Only capture URLs that resolve to a global IP address.
            splitted_url = urlsplit(url)
            if splitted_url.netloc:
                if splitted_url.hostname:
                    ip = socket.gethostbyname(splitted_url.hostname)
                    if not ipaddress.ip_address(ip).is_global:
                        return False
            else:
                return False

        cookies = load_cookies(cookies_pseudofile)
        items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
        if not items:
            # broken
            return False
        if not perma_uuid:
            perma_uuid = str(uuid4())
        width = len(str(len(items)))
        dirpath = self.scrape_dir / datetime.now().isoformat()
        dirpath.mkdir()
        for i, item in enumerate(items):
            if not listing:  # Write no_index marker
                (dirpath / 'no_index').touch()
            with (dirpath / 'uuid').open('w') as _uuid:
                _uuid.write(perma_uuid)
            if os or browser:
                meta = {}
                if os:
                    meta['os'] = os
                if browser:
                    meta['browser'] = browser
                with (dirpath / 'meta').open('w') as _meta:
                    json.dump(meta, _meta)
            if 'error' in item:
                with (dirpath / 'error.txt').open('w') as _error:
                    _error.write(item['error'])
                continue
            # The capture went fine
            harfile = item['har']
            png = base64.b64decode(item['png'])
            html = item['html']
            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
                json.dump(harfile, _har)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
                _img.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
                _html.write(html)
            if 'childFrames' in item:
                child_frames = item['childFrames']
                with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
                    json.dump(child_frames, _iframes)
            if 'cookies' in item:
                cookies = item['cookies']
                with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                    json.dump(cookies, _cookies)
        self._set_report_cache(dirpath)
        return perma_uuid
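

# Minimal end-to-end sketch (assumes Redis and Splash are reachable at the
# defaults above; 'example.com' is a placeholder):
#
#   lookyloo = Lookyloo(splash_url='http://127.0.0.1:8050')
#   perma_uuid = lookyloo.scrape(url='example.com')
#   if perma_uuid:
#       report_dir = lookyloo.lookup_report_dir(perma_uuid)
#       if report_dir:
#           tree_file, tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(report_dir)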