#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
from scrapysplashwrapper import crawl
from har2tree import CrawledTree, Har2TreeError
import pickle
from datetime import datetime
import tempfile
import pathlib
import time
from io import BytesIO
import base64
from uuid import uuid4
from pysanejs import SaneJS
from pathlib import Path
from .helpers import get_homedir, get_socket_path
from .exceptions import NoValidHarFile
from redis import Redis
import logging


class Lookyloo():
    """Coordinates the Lookyloo capture pipeline.

    Drives the splash crawler, stores scrape results on disk under
    ``<homedir>/scraped``, keeps a redis cache of known reports, and
    optionally queries a SaneJS instance for script-hash lookups.
    """

    def __init__(self, splash_url: str='http://127.0.0.1:8050', loglevel: int=logging.DEBUG):
        self.__init_logger(loglevel)
        # decode_responses=True: all values read back from redis are str, not bytes.
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
        self.scrape_dir = get_homedir() / 'scraped'
        self.splash_url = splash_url
        if not self.scrape_dir.exists():
            self.scrape_dir.mkdir(parents=True, exist_ok=True)

        # Re-index the on-disk dumps only once per cache database lifetime.
        if not self.redis.exists('cache_loaded'):
            self._init_existing_dumps()

        # Try to reach sanejs; None means "unavailable" and sane_js_query
        # degrades gracefully to an empty response.
        self.sanejs = SaneJS()
        if not self.sanejs.is_up:
            self.sanejs = None

    def __init_logger(self, loglevel: int) -> None:
        # One logger named after the class; level is caller-controlled.
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)

    def _set_report_cache(self, report_dir: Path) -> None:
        """Index one report directory in redis, or remove it if it is broken.

        A directory without any HAR file is a failed scrape: its marker files
        are unlinked and the directory itself is removed.

        NOTE(review): the cleanup path assumes a failed run leaves at most the
        'uuid' and 'no_index' files behind; rmdir() would fail on any other
        leftover file — confirm against the scraper's failure modes.
        """
        har_files = sorted(report_dir.glob('*.har'))
        if not har_files:
            self.logger.warning(f'No har files in {report_dir}')
            if (report_dir / 'uuid').exists():
                (report_dir / 'uuid').unlink()
            if (report_dir / 'no_index').exists():
                (report_dir / 'no_index').unlink()
            report_dir.rmdir()
            return

        with (report_dir / 'uuid').open() as f:
            uuid = f.read().strip()
        # The title of the first page of the first HAR is used as display name.
        with har_files[0].open() as f:
            j = json.load(f)
        title = j['log']['pages'][0]['title']
        if not title:
            title = '!! No title found !! '
        cache = {'uuid': uuid, 'title': title}
        if (report_dir / 'no_index').exists():  # If the folders claims anonymity
            cache['no_index'] = 1
        # Only index once: skip directories already known to redis.
        if uuid and not self.redis.exists(str(report_dir)):
            self.redis.hmset(str(report_dir), cache)
            self.redis.hset('lookup_dirs', uuid, str(report_dir))

    def report_cache(self, report_dir) -> dict:
        """Return the cached metadata (uuid, title, no_index) for a report dir.

        Accepts either a Path or its string form; redis keys are strings.
        """
        if isinstance(report_dir, Path):
            report_dir = str(report_dir)
        return self.redis.hgetall(report_dir)

    def _init_existing_dumps(self) -> None:
        # Rebuild the redis cache from whatever reports already exist on disk.
        for report_dir in self.report_dirs:
            if report_dir.exists():
                self._set_report_cache(report_dir)
        self.redis.set('cache_loaded', 1)

    @property
    def report_dirs(self):
        """All report directories, newest first, after housekeeping.

        Housekeeping: empty directories (failed runs) are removed, and any
        directory missing a ``uuid`` file gets one generated.
        """
        for report_dir in self.scrape_dir.iterdir():
            # BUGFIX: Path.iterdir() returns a generator, which is always
            # truthy, so the original test `not report_dir.iterdir()` never
            # fired and empty directories were kept forever.
            if report_dir.is_dir() and not any(report_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs.
                report_dir.rmdir()
                continue  # directory is gone; nothing else to fix in it
            if not (report_dir / 'uuid').exists():
                # Create uuid if missing
                with (report_dir / 'uuid').open('w') as f:
                    f.write(str(uuid4()))
        return sorted(self.scrape_dir.iterdir(), reverse=True)

    def lookup_report_dir(self, uuid) -> Path:
        """Map a permanent uuid back to its report directory; None if unknown."""
        report_dir = self.redis.hget('lookup_dirs', uuid)
        if report_dir:
            return Path(report_dir)
        return None

    def enqueue_scrape(self, query: dict) -> str:
        """Queue a scrape request and return the permanent uuid assigned to it."""
        perma_uuid = str(uuid4())
        p = self.redis.pipeline()
        p.hmset(perma_uuid, query)
        p.sadd('to_scrape', perma_uuid)
        p.execute()
        return perma_uuid

    def process_scrape_queue(self):
        """Pop one queued request and scrape it.

        Returns None when the queue is empty, True on success, False when the
        crawl produced nothing.
        """
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return None
        to_scrape = self.redis.hgetall(uuid)
        self.redis.delete(uuid)
        to_scrape['perma_uuid'] = uuid
        if self.scrape(**to_scrape):
            self.logger.info(f'Processed {to_scrape["url"]}')
            return True
        return False

    def load_tree(self, report_dir: Path):
        """Build the CrawledTree for a report and pickle it to a temp file.

        Returns a 6-tuple: (pickle file name, tree as JSON, start time in ISO
        format, user agent, root URL, meta dict).
        Raises NoValidHarFile when har2tree cannot build a tree.
        """
        har_files = sorted(report_dir.glob('*.har'))
        try:
            meta = {}
            if (report_dir / 'meta').exists():
                # Optional user-provided metadata (os/browser) stored at scrape time.
                with open((report_dir / 'meta'), 'r') as f:
                    meta = json.load(f)
            ct = CrawledTree(har_files)
            ct.find_parents()
            ct.join_trees()
            # delete=False: the pickle outlives this call; stale files are
            # reaped by cleanup_old_tmpfiles().
            temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
            pickle.dump(ct, temp)
            temp.close()
            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
        except Har2TreeError as e:
            raise NoValidHarFile(e.message)

    def cleanup_old_tmpfiles(self) -> None:
        """Remove pickled trees not accessed in the last 36000s (10 hours)."""
        for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
            if time.time() - tmpfile.stat().st_atime > 36000:
                tmpfile.unlink()

    def load_image(self, report_dir) -> BytesIO:
        """Return the first screenshot (*.png) of a report as an in-memory buffer."""
        with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
            return BytesIO(f.read())

    def sane_js_query(self, sha512: str):
        """Look up a script hash in SaneJS; empty response when SaneJS is down."""
        if self.sanejs:
            return self.sanejs.sha512(sha512)
        return {'response': []}

    def scrape(self, url, depth: int=1, listing: bool=True, user_agent: str=None, perma_uuid: str=None,
               os: str=None, browser: str=None):
        """Crawl *url* via splash and write the full report to a new directory.

        Writes, per crawled page: the HAR, the PNG screenshot, the rendered
        HTML and the child-frames JSON. Also writes the 'uuid' marker, an
        optional 'no_index' marker (when listing is False) and an optional
        'meta' file with the requested os/browser.
        ('os' shadows the stdlib module name, but that module is not used
        here; kept for caller compatibility.)

        Returns the permanent uuid of the report, or False when the crawl
        produced no items.
        """
        if not url.startswith('http'):
            url = f'http://{url}'
        items = crawl(self.splash_url, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
        if not items:
            # broken
            return False
        if not perma_uuid:
            perma_uuid = str(uuid4())
        # Zero-pad file names so lexicographic order matches crawl order.
        width = len(str(len(items)))
        dirpath = self.scrape_dir / datetime.now().isoformat()
        dirpath.mkdir()
        for i, item in enumerate(items):
            harfile = item['har']
            png = base64.b64decode(item['png'])
            child_frames = item['childFrames']
            html = item['html']
            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
                json.dump(harfile, f)
            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
                f.write(png)
            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
                f.write(html)
            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
                json.dump(child_frames, f)
        with (dirpath / 'uuid').open('w') as f:
            f.write(perma_uuid)
        if not listing:  # Write no_index marker
            (dirpath / 'no_index').touch()
        if os or browser:
            meta = {}
            if os:
                meta['os'] = os
            if browser:
                meta['browser'] = browser
            with (dirpath / 'meta').open('w') as f:
                json.dump(meta, f)
        self._set_report_cache(dirpath)
        return perma_uuid