#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import pickle
from datetime import datetime
import tempfile
import pathlib
import time
import ipaddress
import socket
from urllib.parse import urlsplit
from io import BufferedIOBase, BytesIO
import base64
from uuid import uuid4
from pathlib import Path
from .helpers import get_homedir, get_socket_path, load_cookies, load_configs, safe_create_dir
from .exceptions import NoValidHarFile
from redis import Redis
from typing import Union, Dict, List, Tuple, Optional, Any
import logging
from pysanejs import SaneJS
from scrapysplashwrapper import crawl
from har2tree import CrawledTree, Har2TreeError, HarFile
from defang import refang # type: ignore
from .modules import VirusTotal


class Lookyloo():
def __init__(self) -> None:
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.configs: Dict[str, Dict[str, Any]] = load_configs()
self.logger.setLevel(self.get_config('loglevel'))
self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
self.scrape_dir: Path = get_homedir() / 'scraped'
self.splash_url: str = self.get_config('splash_url')
self.only_global_lookups: bool = self.get_config('only_global_lookups')
safe_create_dir(self.scrape_dir)

        # Initialize 3rd party components
if 'modules' not in self.configs:
self.logger.info('No third party components available in the config directory')
else:
if 'VirusTotal' in self.configs['modules']:
self.vt = VirusTotal(self.configs['modules']['VirusTotal'])
if not self.vt.available:
self.logger.warning('Unable to setup the VirusTotal module')
if not self.redis.exists('cache_loaded'):
self._init_existing_dumps()

        # Try to reach sanejs
self.sanejs = SaneJS()
        self.use_sane_js = self.sanejs.is_up

    def rebuild_cache(self):
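        """Flush the Redis cache and rebuild it from the capture directories on disk."""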
self.redis.flushdb()
self._init_existing_dumps()

    def remove_pickle(self, capture_dir: Path):
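        """Remove the pickled tree of a capture so it gets rebuilt on the next load."""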
if (capture_dir / 'tree.pickle').exists():
(capture_dir / 'tree.pickle').unlink()

    def rebuild_all(self):
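        """Remove every pickled tree, then flush and rebuild the Redis cache."""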
for capture_dir in self.capture_dirs:
self.remove_pickle(capture_dir)
self.rebuild_cache()

    def get_config(self, entry: str) -> Any:
"""Get an entry from the generic config file. Automatic fallback to the sample file"""
if 'generic' in self.configs:
if entry in self.configs['generic']:
return self.configs['generic'][entry]
else:
                self.logger.warning(f'Unable to find {entry} in config file.')
else:
self.logger.warning('No generic config file available.')
self.logger.warning('Falling back on sample config, please initialize the generic config file.')
with (get_homedir() / 'config' / 'generic.json.sample').open() as _c:
sample_config = json.load(_c)
return sample_config[entry]

    def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:
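        """Cache the metadata of a capture (or its error) in Redis and map its UUID to the capture directory."""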
        if not force and self.redis.exists(str(capture_dir)):
            # Cache is already built, nothing to do
            return
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
har_files = sorted(capture_dir.glob('*.har'))
error_cache: Dict[str, str] = {}
if (capture_dir / 'error.txt').exists():
# Something went wrong
            with (capture_dir / 'error.txt').open() as _error:
error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
elif not har_files:
error_cache['error'] = f'No har files in {capture_dir}'
if error_cache:
self.logger.warning(error_cache['error'])
self.redis.hmset(str(capture_dir), error_cache)
self.redis.hset('lookup_dirs', uuid, str(capture_dir))
return
har = HarFile(har_files[0])
redirects = har.initial_redirects
incomplete_redirects = False
if redirects and har.need_tree_redirects:
# load tree from disk, get redirects
ct = self._load_pickle(capture_dir / 'tree.pickle')
if ct:
redirects = ct.redirects
else:
# Pickle not available
incomplete_redirects = True
cache: Dict[str, Union[str, int]] = {'uuid': uuid,
'title': har.initial_title,
'timestamp': har.initial_start_time,
'url': har.first_url,
'redirects': json.dumps(redirects),
'incomplete_redirects': 1 if incomplete_redirects else 0}
        if (capture_dir / 'no_index').exists():  # If the folder claims anonymity
            cache['no_index'] = 1
self.redis.hmset(str(capture_dir), cache)
self.redis.hset('lookup_dirs', uuid, str(capture_dir))

    def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:
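        """Return the cached metadata of a capture, refreshing it first if the redirects
        are incomplete. Returns None if the cache entry is invalid."""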
if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
# try to rebuild the cache
self._set_capture_cache(capture_dir, force=True)
cached = self.redis.hgetall(str(capture_dir))
        if all(key in cached for key in ('uuid', 'title', 'timestamp', 'url', 'redirects')):
cached['redirects'] = json.loads(cached['redirects'])
return cached
elif 'error' in cached:
return cached
else:
self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
return None

    def _init_existing_dumps(self) -> None:
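        """Populate the cache from every capture directory already present on disk."""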
for capture_dir in self.capture_dirs:
if capture_dir.exists():
self._set_capture_cache(capture_dir)
self.redis.set('cache_loaded', 1)

    @property
def capture_dirs(self) -> List[Path]:
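        """All capture directories, most recent first. Removes empty directories and
        creates missing 'uuid' files along the way."""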
for capture_dir in self.scrape_dir.iterdir():
            if capture_dir.is_dir() and not any(capture_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs.
                capture_dir.rmdir()
if not (capture_dir / 'uuid').exists():
# Create uuid if missing
with (capture_dir / 'uuid').open('w') as f:
f.write(str(uuid4()))
return sorted(self.scrape_dir.iterdir(), reverse=True)

    def lookup_capture_dir(self, uuid: str) -> Optional[Path]:
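        """Return the capture directory matching a capture UUID, or None if it is unknown."""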
capture_dir = self.redis.hget('lookup_dirs', uuid)
if capture_dir:
return Path(capture_dir)
return None

    def enqueue_scrape(self, query: dict) -> str:
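        """Queue a scrape request in Redis and return its permanent UUID."""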
perma_uuid = str(uuid4())
p = self.redis.pipeline()
p.hmset(perma_uuid, query)
p.sadd('to_scrape', perma_uuid)
p.execute()
return perma_uuid

    def process_scrape_queue(self) -> Optional[bool]:
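        """Pop one request from the scrape queue and process it. Returns None if the
        queue is empty, True on success, False otherwise."""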
uuid = self.redis.spop('to_scrape')
if not uuid:
return None
to_scrape = self.redis.hgetall(uuid)
self.redis.delete(uuid)
to_scrape['perma_uuid'] = uuid
if self.scrape(**to_scrape):
self.logger.info(f'Processed {to_scrape["url"]}')
return True
return False

    def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
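        """Load a pickled CrawledTree from disk, if it exists."""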
if pickle_file.exists():
with pickle_file.open('rb') as _p:
return pickle.load(_p)
return None

    def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
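        """Load the tree of a capture, building it from the HAR files and pickling it
        if it is not on disk yet."""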
har_files = sorted(capture_dir.glob('*.har'))
pickle_file = capture_dir / 'tree.pickle'
try:
meta = {}
if (capture_dir / 'meta').exists():
# NOTE: Legacy, the meta file should be present
                with (capture_dir / 'meta').open() as f:
meta = json.load(f)
ct = self._load_pickle(pickle_file)
if not ct:
ct = CrawledTree(har_files)
with pickle_file.open('wb') as _p:
pickle.dump(ct, _p)
return str(pickle_file), ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
except Har2TreeError as e:
raise NoValidHarFile(e.message)

    def cleanup_old_tmpfiles(self):
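        """Remove lookyloo temporary files that have not been accessed for more than 10 hours."""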
for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
if time.time() - tmpfile.stat().st_atime > 36000:
tmpfile.unlink()

    def load_image(self, capture_dir: Path) -> BytesIO:
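        """Return the PNG screenshot of a capture as a BytesIO."""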
with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:
return BytesIO(f.read())

    def sane_js_query(self, sha512: str) -> Dict:
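        """Lookup a SHA512 hash in SaneJS if the service is up, otherwise return an empty response."""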
if self.use_sane_js:
return self.sanejs.sha512(sha512)
return {'response': []}

    def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None, depth: int=1, listing: bool=True, user_agent: Optional[str]=None, perma_uuid: Optional[str]=None,
               os: Optional[str]=None, browser: Optional[str]=None) -> Union[bool, str]:
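        """Scrape a URL with Splash and store the capture (HAR, screenshot, HTML, redirects,
        cookies) on disk. Returns the permanent UUID of the capture, or False on failure."""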
url = url.strip()
url = refang(url)
if not url.startswith('http'):
url = f'http://{url}'
if self.only_global_lookups:
splitted_url = urlsplit(url)
if splitted_url.netloc:
if splitted_url.hostname:
try:
ip = socket.gethostbyname(splitted_url.hostname)
except socket.gaierror:
                        self.logger.info('Name or service not known')
return False
if not ipaddress.ip_address(ip).is_global:
return False
else:
return False
cookies = load_cookies(cookies_pseudofile)
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
if not items:
# broken
return False
if not perma_uuid:
perma_uuid = str(uuid4())
width = len(str(len(items)))
dirpath = self.scrape_dir / datetime.now().isoformat()
safe_create_dir(dirpath)
for i, item in enumerate(items):
if not listing: # Write no_index marker
(dirpath / 'no_index').touch()
with (dirpath / 'uuid').open('w') as _uuid:
_uuid.write(perma_uuid)
if os or browser:
meta = {}
if os:
meta['os'] = os
if browser:
meta['browser'] = browser
with (dirpath / 'meta').open('w') as _meta:
json.dump(meta, _meta)
if 'error' in item:
with (dirpath / 'error.txt').open('w') as _error:
_error.write(item['error'])
continue
# The capture went fine
harfile = item['har']
png = base64.b64decode(item['png'])
html = item['html']
last_redirect = item['last_redirected_url']
with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
json.dump(harfile, _har)
with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
_img.write(png)
with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
_html.write(html)
with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
_redir.write(last_redirect)
if 'childFrames' in item:
child_frames = item['childFrames']
with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
json.dump(child_frames, _iframes)
if 'cookies' in item:
cookies = item['cookies']
with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
json.dump(cookies, _cookies)
self._set_capture_cache(dirpath)
return perma_uuid