#!/usr/bin/env python3
# -*- coding: utf-8 -*-
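'''Core of Lookyloo: capture scheduling and scraping, Redis caching, indexing, and crawled tree handling.'''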
import os
import base64
from collections import defaultdict, Counter
from datetime import datetime, date, timedelta
from email.message import EmailMessage
from io import BufferedIOBase, BytesIO
import ipaddress
import json
import logging
from pathlib import Path
import pickle
import smtplib
import socket
from typing import Union, Dict, List, Tuple, Optional, Any, MutableMapping, Set, Iterable
from urllib.parse import urlsplit
from uuid import uuid4
from zipfile import ZipFile
import publicsuffix2 # type: ignore
from defang import refang # type: ignore
from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
from redis import Redis
from scrapysplashwrapper import crawl
from werkzeug.useragents import UserAgent
from .exceptions import NoValidHarFile, MissingUUID
from .helpers import get_homedir, get_socket_path, load_cookies, load_configs, safe_create_dir, get_email_template, load_pickle_tree, remove_pickle_tree
from .modules import VirusTotal, SaneJavaScript, PhishingInitiative
class Indexing():
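    '''Reverse indexes (cookie names, body hashes) over all captures, stored in a dedicated Redis database.

    Key layout used by the methods below (non-exhaustive):
      * cookies_names                   -> ZSet of cookie names, scored by number of occurrences
      * cn|{name}                       -> ZSet of domains setting that cookie
      * cn|{name}|{domain}              -> ZSet of values seen for that cookie on that domain
      * cn|{name}|captures              -> Set of '{capture_uuid}|{urlnode_uuid}'
      * bh|{body_hash}                  -> ZSet of hostnames serving a body with that hash
      * bh|{body_hash}|captures         -> Set of capture UUIDs
      * bh|{body_hash}|captures|{uuid}  -> ZSet of '{urlnode_uuid}|{hostnode_uuid}|{url}'
      * bh|{body_hash}|legitimate       -> Set of hostnames where that body is considered legitimate
      * bh|malicious                    -> Set of body hashes flagged as malicious
    '''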
def __init__(self) -> None:
self.redis: Redis = Redis(unix_socket_path=get_socket_path('indexing'), decode_responses=True)
def clear_indexes(self):
self.redis.flushdb()
# ###### Cookies ######
@property
def cookies_names(self) -> List[Tuple[str, float]]:
return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)
def cookies_names_number_domains(self, cookie_name: str) -> int:
return self.redis.zcard(f'cn|{cookie_name}')
def cookies_names_domains_values(self, cookie_name: str, domain: str) -> List[Tuple[str, float]]:
return self.redis.zrevrange(f'cn|{cookie_name}|{domain}', 0, -1, withscores=True)
def get_cookie_domains(self, cookie_name: str) -> List[Tuple[str, float]]:
return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True)
def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
        return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
# Do not reindex
return
self.redis.sadd('indexed_cookies', crawled_tree.uuid)
pipeline = self.redis.pipeline()
already_loaded: Set[Tuple[str, str]] = set()
for urlnode in crawled_tree.root_hartree.url_tree.traverse():
if hasattr(urlnode, 'cookies_received'):
for domain, cookie, _ in urlnode.cookies_received:
name, value = cookie.split('=', 1)
if (name, domain) in already_loaded:
# Only add cookie name once / capture
continue
already_loaded.add((name, domain))
pipeline.zincrby('cookies_names', 1, name)
pipeline.zincrby(f'cn|{name}', 1, domain)
pipeline.sadd(f'cn|{name}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
pipeline.zincrby(f'cn|{name}|{domain}', 1, value)
pipeline.sadd('lookyloo_domains', domain)
pipeline.sadd(domain, name)
pipeline.execute()
def aggregate_domain_cookies(self):
psl = publicsuffix2.PublicSuffixList()
pipeline = self.redis.pipeline()
for cn, cn_freq in self.cookies_names:
for domain, d_freq in self.get_cookie_domains(cn):
tld = psl.get_tld(domain)
                # Note: str.strip() strips characters, not a suffix; drop the TLD explicitly instead.
                main_domain_part = domain[:-len(f'.{tld}')].split('.')[-1] if domain.endswith(f'.{tld}') else domain.split('.')[-1]
pipeline.zincrby('aggregate_domains_cn', cn_freq, f'{main_domain_part}|{cn}')
pipeline.zincrby('aggregate_cn_domains', d_freq, f'{cn}|{main_domain_part}')
pipeline.execute()
aggregate_domains_cn = self.redis.zrevrange('aggregate_domains_cn', 0, -1, withscores=True)
aggregate_cn_domains = self.redis.zrevrange('aggregate_cn_domains', 0, -1, withscores=True)
self.redis.delete('aggregate_domains_cn')
self.redis.delete('aggregate_cn_domains')
return {'domains': aggregate_domains_cn, 'cookies': aggregate_cn_domains}
# ###### Body hashes ######
def body_hash_fequency(self, body_hash: str) -> Dict[str, float]:
return {'hash_freq': self.redis.zscore('body_hashes', body_hash),
'hash_domains_freq': self.redis.zcard(f'bh|{body_hash}')}
def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
# Do not reindex
return
self.redis.sadd('indexed_body_hashes', crawled_tree.uuid)
pipeline = self.redis.pipeline()
for urlnode in crawled_tree.root_hartree.url_tree.traverse():
if urlnode.empty_response:
continue
pipeline.zincrby('body_hashes', 1, urlnode.body_hash)
pipeline.zincrby(f'bh|{urlnode.body_hash}', 1, urlnode.hostname)
# set of all captures with this hash
pipeline.sadd(f'bh|{urlnode.body_hash}|captures', crawled_tree.uuid)
# ZSet of all urlnode_UUIDs|full_url
pipeline.zincrby(f'bh|{urlnode.body_hash}|captures|{crawled_tree.uuid}', 1, f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
if hasattr(urlnode, 'embedded_ressources') and urlnode.embedded_ressources:
for mimetype, blobs in urlnode.embedded_ressources.items():
for h, body in blobs:
pipeline.zincrby('body_hashes', 1, h)
pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname)
pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid)
pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
pipeline.execute()
def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None) -> List[Tuple[str, str, str, bool]]:
to_return: List[Tuple[str, str, str, bool]] = []
for capture_uuid in self.redis.smembers(f'bh|{body_hash}|captures'):
for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
url_uuid, hostnode_uuid, url = entry.split('|', 2)
if filter_url:
to_return.append((capture_uuid, hostnode_uuid, urlsplit(url).hostname, url == filter_url))
else:
to_return.append((capture_uuid, hostnode_uuid, urlsplit(url).hostname, False))
return to_return
def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]:
return self.redis.zrevrange(f'bh|{body_hash}', 0, -1, withscores=True)
def legitimate_capture(self, crawled_tree: CrawledTree) -> None:
pipeline = self.redis.pipeline()
for urlnode in crawled_tree.root_hartree.url_tree.traverse():
if urlnode.empty_response:
continue
pipeline.sadd(f'bh|{urlnode.body_hash}|legitimate', urlnode.hostname)
pipeline.execute()
def legitimate_hostnode(self, hostnode: HostNode) -> None:
pipeline = self.redis.pipeline()
for urlnode in hostnode.urls:
if urlnode.empty_response:
continue
pipeline.sadd(f'bh|{urlnode.body_hash}|legitimate', urlnode.hostname)
pipeline.execute()
def legitimate_urlnode(self, urlnode: URLNode) -> None:
if urlnode.empty_response:
return
self.redis.sadd(f'bh|{urlnode.body_hash}|legitimate', urlnode.hostname)
def malicious_node(self, urlnode: URLNode) -> None:
if urlnode.empty_response:
return
self.redis.sadd('bh|malicious', urlnode.body_hash)
# Query DB
def is_legitimate(self, urlnode: URLNode) -> Optional[bool]:
if urlnode.empty_response:
return None
hostnames = self.redis.smembers(f'bh|{urlnode.body_hash}|legitimate')
if hostnames:
if urlnode.hostname in hostnames:
return True # Legitimate
return False # Malicious
elif self.redis.sismember('bh|malicious', urlnode.body_hash):
return False
return None # Unknown
def is_malicious(self, urlnode: URLNode) -> Optional[bool]:
if urlnode.empty_response:
return None
if self.redis.sismember('bh|malicious', urlnode.body_hash):
return True
legitimate = self.is_legitimate(urlnode)
if legitimate is True:
return False
if legitimate is False:
return True
return None
def legitimacy_details(self, urlnode: URLNode) -> Optional[Tuple[bool, Optional[List[str]]]]:
if urlnode.empty_response:
return None
hostnames = self.redis.smembers(f'bh|{urlnode.body_hash}|legitimate')
if hostnames:
if urlnode.hostname in hostnames:
return (True, hostnames)
else:
return (False, hostnames)
elif self.redis.sismember('bh|malicious', urlnode.body_hash):
            return (False, None)
return None
class Lookyloo():
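    '''Main class of the project: enqueues and runs captures (via Splash), caches their metadata in Redis,
    builds the crawled trees and queries the configured third party modules.'''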
def __init__(self) -> None:
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.configs: Dict[str, Dict[str, Any]] = load_configs()
self.logger.setLevel(self.get_config('loglevel'))
self.indexing = Indexing()
self.is_public_instance = self.get_config('public_instance')
self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
self.scrape_dir: Path = get_homedir() / 'scraped'
if os.environ.get('SPLASH_URL_DOCKER'):
# In order to have a working default for the docker image, it is easier to use an environment variable
self.splash_url: str = os.environ['SPLASH_URL_DOCKER']
else:
self.splash_url = self.get_config('splash_url')
self.only_global_lookups: bool = self.get_config('only_global_lookups')
safe_create_dir(self.scrape_dir)
# Initialize 3rd party components
if 'modules' not in self.configs:
self.logger.info('No third party components available in the config directory')
else:
if 'PhishingInitiative' in self.configs['modules']:
self.pi = PhishingInitiative(self.configs['modules']['PhishingInitiative'])
if not self.pi.available:
self.logger.warning('Unable to setup the PhishingInitiative module')
if 'VirusTotal' in self.configs['modules']:
self.vt = VirusTotal(self.configs['modules']['VirusTotal'])
if not self.vt.available:
self.logger.warning('Unable to setup the VirusTotal module')
if 'SaneJS' in self.configs['modules']:
self.sanejs = SaneJavaScript(self.configs['modules']['SaneJS'])
if not self.sanejs.available:
self.logger.warning('Unable to setup the SaneJS module')
if not self.redis.exists('cache_loaded'):
self._init_existing_dumps()
def cache_user_agents(self, user_agent: str, remote_ip: str) -> None:
today = date.today().isoformat()
self.redis.zincrby(f'user_agents|{today}', 1, f'{remote_ip}|{user_agent}')
def build_ua_file(self) -> None:
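        '''Build the JSON file of user agents seen the day before, grouped by platform and browser.'''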
yesterday = (date.today() - timedelta(days=1))
self_generated_ua_file_path = get_homedir() / 'own_user_agents' / str(yesterday.year) / f'{yesterday.month:02}'
safe_create_dir(self_generated_ua_file_path)
self_generated_ua_file = self_generated_ua_file_path / f'{yesterday.isoformat()}.json'
if self_generated_ua_file.exists():
return
entries = self.redis.zrevrange(f'user_agents|{yesterday.isoformat()}', 0, -1)
if not entries:
return
to_store: Dict[str, Any] = {'by_frequency': []}
uas = Counter([entry.split('|', 1)[1] for entry in entries])
for ua, count in uas.most_common():
parsed_ua = UserAgent(ua)
if not parsed_ua.platform or not parsed_ua.browser: # type: ignore
continue
if parsed_ua.platform not in to_store: # type: ignore
to_store[parsed_ua.platform] = {} # type: ignore
if f'{parsed_ua.browser} {parsed_ua.version}' not in to_store[parsed_ua.platform]: # type: ignore
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'] = [] # type: ignore
to_store[parsed_ua.platform][f'{parsed_ua.browser} {parsed_ua.version}'].append(parsed_ua.string) # type: ignore
to_store['by_frequency'].append({'os': parsed_ua.platform, # type: ignore
'browser': f'{parsed_ua.browser} {parsed_ua.version}', # type: ignore
'useragent': parsed_ua.string}) # type: ignore
with self_generated_ua_file.open('w') as f:
json.dump(to_store, f, indent=2)
def cache_tree(self, capture_uuid: str) -> None:
'''Generate the pickle, add capture in the indexes'''
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
with open((capture_dir / 'uuid'), 'r') as f:
uuid = f.read()
har_files = sorted(capture_dir.glob('*.har'))
# NOTE: We only index the public captures
index = True
try:
ct = CrawledTree(har_files, uuid)
if self.is_public_instance:
cache = self.capture_cache(capture_uuid)
if cache.get('no_index') is not None:
index = False
if index:
self.indexing.index_cookies_capture(ct)
self.indexing.index_body_hashes_capture(ct)
except Har2TreeError as e:
raise NoValidHarFile(e.message)
with (capture_dir / 'tree.pickle').open('wb') as _p:
pickle.dump(ct, _p)
def get_crawled_tree(self, capture_uuid: str) -> CrawledTree:
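        '''Return the CrawledTree of a capture, (re)building the pickle if needed.'''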
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
ct = load_pickle_tree(capture_dir)
if not ct:
self.cache_tree(capture_uuid)
ct = load_pickle_tree(capture_dir)
if not ct:
raise NoValidHarFile(f'Unable to get tree from {capture_dir}')
return ct
def add_to_legitimate(self, capture_uuid: str, hostnode_uuid: Optional[str]=None, urlnode_uuid: Optional[str]=None):
ct = self.get_crawled_tree(capture_uuid)
if not hostnode_uuid and not urlnode_uuid:
self.indexing.legitimate_capture(ct)
return
if hostnode_uuid:
hostnode = ct.root_hartree.get_host_node_by_uuid(hostnode_uuid)
self.indexing.legitimate_hostnode(hostnode)
if urlnode_uuid:
urlnode = ct.root_hartree.get_url_node_by_uuid(urlnode_uuid)
self.indexing.legitimate_urlnode(urlnode)
def bodies_legitimacy_check(self, tree: CrawledTree) -> CrawledTree:
hostnodes_with_malicious_content = set()
for urlnode in tree.root_hartree.url_tree.traverse():
malicious = self.indexing.is_malicious(urlnode)
            if malicious is not None:
                urlnode.add_feature('malicious', malicious)
                if malicious:
                    hostnodes_with_malicious_content.add(urlnode.hostnode_uuid)
for hostnode_with_malicious_content in hostnodes_with_malicious_content:
hostnode = tree.root_hartree.get_host_node_by_uuid(hostnode_with_malicious_content)
            hostnode.add_feature('malicious', True)
return tree
def load_tree(self, capture_uuid: str) -> Tuple[str, str, str, str, Dict[str, str]]:
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
meta = {}
if (capture_dir / 'meta').exists():
with open((capture_dir / 'meta'), 'r') as f:
meta = json.load(f)
ct = self.get_crawled_tree(capture_uuid)
ct = self.bodies_legitimacy_check(ct)
return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
def remove_pickle(self, capture_uuid: str) -> None:
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
remove_pickle_tree(capture_dir)
def rebuild_cache(self) -> None:
self.redis.flushdb()
self._init_existing_dumps()
def rebuild_all(self) -> None:
for capture_dir in self.capture_dirs:
remove_pickle_tree(capture_dir)
self.rebuild_cache()
def get_config(self, entry: str) -> Any:
"""Get an entry from the generic config file. Automatic fallback to the sample file"""
if 'generic' in self.configs:
if entry in self.configs['generic']:
return self.configs['generic'][entry]
else:
self.logger.warning(f'Unable to find {entry} in config file.')
else:
self.logger.warning('No generic config file available.')
self.logger.warning('Falling back on sample config, please initialize the generic config file.')
with (get_homedir() / 'config' / 'generic.json.sample').open() as _c:
sample_config = json.load(_c)
return sample_config[entry]
def get_urlnode_from_tree(self, capture_uuid: str, node_uuid: str) -> URLNode:
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
ct = load_pickle_tree(capture_dir)
if not ct:
raise MissingUUID(f'Unable to find UUID {node_uuid} in {capture_dir}')
return ct.root_hartree.get_url_node_by_uuid(node_uuid)
def get_hostnode_from_tree(self, capture_uuid: str, node_uuid: str) -> HostNode:
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
ct = load_pickle_tree(capture_dir)
if not ct:
raise MissingUUID(f'Unable to find UUID {node_uuid} in {capture_dir}')
return ct.root_hartree.get_host_node_by_uuid(node_uuid)
def get_statistics(self, capture_uuid: str) -> Dict[str, Any]:
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
ct = load_pickle_tree(capture_dir)
if not ct:
            self.logger.warning(f'Unable to get the statistics unless the tree ({capture_dir}) is cached.')
return {}
return ct.root_hartree.stats
def trigger_modules(self, capture_uuid: str, force: bool=False) -> None:
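        '''Submit the capture's redirects (or its root URL) to the enabled third party modules.'''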
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
ct = load_pickle_tree(capture_dir)
if not ct:
self.logger.warning(f'Unable to trigger the modules unless the tree ({capture_dir}) is cached.')
return
if hasattr(self, 'pi') and self.pi.available:
if ct.redirects:
for redirect in ct.redirects:
self.pi.url_lookup(redirect, force)
else:
self.pi.url_lookup(ct.root_hartree.har.root_url, force)
if hasattr(self, 'vt') and self.vt.available:
if ct.redirects:
for redirect in ct.redirects:
self.vt.url_lookup(redirect, force)
else:
self.vt.url_lookup(ct.root_hartree.har.root_url, force)
def get_modules_responses(self, capture_uuid: str) -> Optional[Dict[str, Any]]:
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
ct = load_pickle_tree(capture_dir)
if not ct:
self.logger.warning(f'Unable to get the modules responses unless the tree ({capture_dir}) is cached.')
return None
to_return: Dict[str, Any] = {}
if hasattr(self, 'vt') and self.vt.available:
to_return['vt'] = {}
if ct.redirects:
for redirect in ct.redirects:
to_return['vt'][redirect] = self.vt.get_url_lookup(redirect)
else:
to_return['vt'][ct.root_hartree.har.root_url] = self.vt.get_url_lookup(ct.root_hartree.har.root_url)
if hasattr(self, 'pi') and self.pi.available:
to_return['pi'] = {}
if ct.redirects:
for redirect in ct.redirects:
to_return['pi'][redirect] = self.pi.get_url_lookup(redirect)
else:
to_return['pi'][ct.root_hartree.har.root_url] = self.pi.get_url_lookup(ct.root_hartree.har.root_url)
return to_return
def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:
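        '''Populate the Redis cache entry of a capture directory (title, timestamp, URL, redirects, errors).'''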
if force or not self.redis.exists(str(capture_dir)):
# (re)build cache
pass
else:
return
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
har_files = sorted(capture_dir.glob('*.har'))
error_cache: Dict[str, str] = {}
if (capture_dir / 'error.txt').exists():
# Something went wrong
with (Path(capture_dir) / 'error.txt').open() as _error:
content = _error.read()
try:
error_to_cache = json.loads(content)
if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
error_to_cache = error_to_cache.get('details')
except json.decoder.JSONDecodeError:
# old format
error_to_cache = content
error_cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
fatal_error = False
if har_files:
try:
har = HarFile(har_files[0], uuid)
except Har2TreeError as e:
error_cache['error'] = e.message
fatal_error = True
else:
error_cache['error'] = f'No har files in {capture_dir.name}'
fatal_error = True
if error_cache:
self.logger.warning(error_cache['error'])
self.redis.hmset(str(capture_dir), error_cache)
self.redis.hset('lookup_dirs', uuid, str(capture_dir))
if fatal_error:
return
redirects = har.initial_redirects
incomplete_redirects = False
if redirects and har.need_tree_redirects:
# load tree from disk, get redirects
ct = load_pickle_tree(capture_dir)
if ct:
redirects = ct.redirects
else:
# Pickle not available
incomplete_redirects = True
cache: Dict[str, Union[str, int]] = {'uuid': uuid,
'title': har.initial_title,
'timestamp': har.initial_start_time,
'url': har.root_url,
'redirects': json.dumps(redirects),
'capture_dir': str(capture_dir),
'incomplete_redirects': 1 if incomplete_redirects else 0}
        if (capture_dir / 'no_index').exists():  # If the folder claims anonymity
cache['no_index'] = 1
self.redis.hmset(str(capture_dir), cache)
self.redis.hset('lookup_dirs', uuid, str(capture_dir))
def hide_capture(self, capture_uuid: str) -> None:
"""Add the capture in the hidden pool (not shown on the front page)
NOTE: it won't remove the correlations until they are rebuilt.
"""
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
self.redis.hset(str(capture_dir), 'no_index', 1)
(capture_dir / 'no_index').touch()
@property
def capture_uuids(self):
return self.redis.hkeys('lookup_dirs')
def capture_cache(self, capture_uuid: str) -> Dict[str, Any]:
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
# try to rebuild the cache
self._set_capture_cache(capture_dir, force=True)
cached = self.redis.hgetall(str(capture_dir))
if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir']):
cached['redirects'] = json.loads(cached['redirects'])
cached['capture_dir'] = Path(cached['capture_dir'])
return cached
elif 'error' in cached:
return cached
else:
self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
return {}
def _init_existing_dumps(self) -> None:
for capture_dir in self.capture_dirs:
if capture_dir.exists():
self._set_capture_cache(capture_dir)
self.redis.set('cache_loaded', 1)
@property
def capture_dirs(self) -> List[Path]:
for capture_dir in self.scrape_dir.iterdir():
            if capture_dir.is_dir() and not any(capture_dir.iterdir()):
                # Cleanup self.scrape_dir of failed runs (iterdir() alone is always truthy, any() checks emptiness).
                capture_dir.rmdir()
                continue
if not (capture_dir / 'uuid').exists():
# Create uuid if missing
with (capture_dir / 'uuid').open('w') as f:
f.write(str(uuid4()))
return sorted(self.scrape_dir.iterdir(), reverse=True)
def lookup_capture_dir(self, capture_uuid: str) -> Union[Path, None]:
capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
if capture_dir:
return Path(capture_dir)
return None
def enqueue_scrape(self, query: MutableMapping[str, Any]) -> str:
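        '''Store the capture query in Redis, add it to the scrape queue, and return the permanent UUID of the capture.'''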
perma_uuid = str(uuid4())
p = self.redis.pipeline()
for key, value in query.items():
if isinstance(value, bool):
# Yes, empty string because that's False.
query[key] = 1 if value else ''
p.hmset(perma_uuid, query)
p.sadd('to_scrape', perma_uuid)
p.execute()
return perma_uuid
def process_scrape_queue(self) -> Union[bool, None]:
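        '''Pop one query from the scrape queue and run the capture; return None if the queue is empty.'''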
uuid = self.redis.spop('to_scrape')
if not uuid:
return None
to_scrape = self.redis.hgetall(uuid)
self.redis.delete(uuid)
to_scrape['perma_uuid'] = uuid
if self.scrape(**to_scrape):
self.logger.info(f'Processed {to_scrape["url"]}')
return True
return False
def send_mail(self, capture_uuid: str, email: str='', comment: str='') -> None:
if not self.get_config('enable_mail_notification'):
return
redirects = ''
initial_url = ''
cache = self.capture_cache(capture_uuid)
if cache:
initial_url = cache['url']
if 'redirects' in cache and cache['redirects']:
redirects = "Redirects:\n"
redirects += '\n'.join(cache['redirects'])
else:
redirects = "No redirects."
email_config = self.get_config('email')
msg = EmailMessage()
msg['From'] = email_config['from']
if email:
msg['Reply-To'] = email
msg['To'] = email_config['to']
msg['Subject'] = email_config['subject']
body = get_email_template()
body = body.format(
recipient=msg['To'].addresses[0].display_name,
domain=email_config['domain'],
uuid=capture_uuid,
initial_url=initial_url,
redirects=redirects,
comment=comment,
sender=msg['From'].addresses[0].display_name,
)
msg.set_content(body)
try:
s = smtplib.SMTP(email_config['smtp_host'], email_config['smtp_port'])
s.send_message(msg)
s.quit()
except Exception as e:
self.logger.exception(e)
self.logger.warning(msg.as_string())
def _ensure_meta(self, capture_dir: Path, tree: CrawledTree) -> None:
metafile = capture_dir / 'meta'
if metafile.exists():
return
ua = UserAgent(tree.root_hartree.user_agent)
to_dump = {}
if ua.platform: # type: ignore
to_dump['os'] = ua.platform # type: ignore
if ua.browser: # type: ignore
if ua.version: # type: ignore
to_dump['browser'] = f'{ua.browser} {ua.version}' # type: ignore
else:
to_dump['browser'] = ua.browser # type: ignore
if ua.language: # type: ignore
to_dump['language'] = ua.language # type: ignore
if not to_dump:
# UA not recognized
self.logger.info(f'Unable to recognize the User agent: {ua}')
to_dump['user_agent'] = ua.string # type: ignore
with metafile.open('w') as f:
json.dump(to_dump, f)
def _get_raw(self, capture_uuid: str, extension: str='*', all_files: bool=True) -> BytesIO:
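        '''Return the raw file(s) of a capture: only the first match, or a zip archive of all matching files.'''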
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
all_paths = sorted(list(capture_dir.glob(f'*.{extension}')))
if not all_files:
# Only get the first one in the list
with open(all_paths[0], 'rb') as f:
return BytesIO(f.read())
to_return = BytesIO()
with ZipFile(to_return, 'w') as myzip:
for path in all_paths:
if path.name.endswith('pickle'):
continue
myzip.write(path, arcname=f'{capture_dir.name}/{path.name}')
to_return.seek(0)
return to_return
def get_html(self, capture_uuid: str, all_html: bool=False) -> BytesIO:
return self._get_raw(capture_uuid, 'html', all_html)
def get_cookies(self, capture_uuid: str, all_cookies: bool=False) -> BytesIO:
return self._get_raw(capture_uuid, 'cookies.json', all_cookies)
def get_screenshot(self, capture_uuid: str, all_images: bool=False) -> BytesIO:
return self._get_raw(capture_uuid, 'png', all_images)
def get_capture(self, capture_uuid: str) -> BytesIO:
return self._get_raw(capture_uuid)
def scrape(self, url: str, cookies_pseudofile: Optional[BufferedIOBase]=None,
depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
referer: str='', perma_uuid: str=None, os: str=None,
browser: str=None) -> Union[bool, str]:
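        '''Crawl the URL with Splash and store the resulting HAR, screenshot, HTML and cookies on disk.'''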
url = url.strip()
url = refang(url)
if not url.startswith('http'):
url = f'http://{url}'
if self.only_global_lookups:
splitted_url = urlsplit(url)
if splitted_url.netloc:
if splitted_url.hostname:
try:
ip = socket.gethostbyname(splitted_url.hostname)
except socket.gaierror:
self.logger.info('Name or service not known')
return False
if not ipaddress.ip_address(ip).is_global:
return False
else:
return False
cookies = load_cookies(cookies_pseudofile)
if not user_agent:
            # Catch the cases where no user agent is provided: broken UI submission or async submission.
ua: str = self.get_config('default_user_agent') # type: ignore
else:
ua = user_agent
if int(depth) > int(self.get_config('max_depth')): # type: ignore
self.logger.warning(f'Not allowed to scrape on a depth higher than {self.get_config("max_depth")}: {depth}')
depth = int(self.get_config('max_depth')) # type: ignore
items = crawl(self.splash_url, url, cookies=cookies, depth=depth, user_agent=ua,
referer=referer, log_enabled=True, log_level=self.get_config('splash_loglevel'))
if not items:
# broken
return False
if not perma_uuid:
perma_uuid = str(uuid4())
width = len(str(len(items)))
dirpath = self.scrape_dir / datetime.now().isoformat()
safe_create_dir(dirpath)
for i, item in enumerate(items):
if not listing: # Write no_index marker
(dirpath / 'no_index').touch()
with (dirpath / 'uuid').open('w') as _uuid:
_uuid.write(perma_uuid)
if os or browser:
meta = {}
if os:
meta['os'] = os
if browser:
meta['browser'] = browser
with (dirpath / 'meta').open('w') as _meta:
json.dump(meta, _meta)
if 'error' in item:
with (dirpath / 'error.txt').open('w') as _error:
json.dump(item['error'], _error)
# The capture went fine
harfile = item['har']
png = base64.b64decode(item['png'])
html = item['html']
last_redirect = item['last_redirected_url']
with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as _har:
json.dump(harfile, _har)
with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as _img:
_img.write(png)
with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as _html:
_html.write(html)
with (dirpath / '{0:0{width}}.last_redirect.txt'.format(i, width=width)).open('w') as _redir:
_redir.write(last_redirect)
if 'childFrames' in item:
child_frames = item['childFrames']
with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as _iframes:
json.dump(child_frames, _iframes)
if 'cookies' in item:
cookies = item['cookies']
with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
json.dump(cookies, _cookies)
self._set_capture_cache(dirpath)
return perma_uuid
def get_body_hash_investigator(self, body_hash: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]:
captures = []
for capture_uuid, url_uuid, url_hostname, _ in self.indexing.get_body_hash_captures(body_hash):
cache = self.capture_cache(capture_uuid)
if cache:
captures.append((capture_uuid, cache['title']))
domains = self.indexing.get_body_hash_domains(body_hash)
return captures, domains
def get_cookie_name_investigator(self, cookie_name: str):
captures = []
for capture_uuid, url_uuid in self.indexing.get_cookies_names_captures(cookie_name):
cache = self.capture_cache(capture_uuid)
if cache:
captures.append((capture_uuid, cache['title']))
domains = [(domain, freq, self.indexing.cookies_names_domains_values(cookie_name, domain))
for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
return captures, domains
def hash_lookup(self, blob_hash: str, url: str, capture_uuid: str) -> Dict[str, List[Tuple[str, str, str, str, str]]]:
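        '''Return the other captures where the same body hash was seen, split by same URL vs. different URL.'''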
captures_list: Dict[str, List[Tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
for h_capture_uuid, url_uuid, url_hostname, same_url in self.indexing.get_body_hash_captures(blob_hash, url):
if h_capture_uuid == capture_uuid:
# Skip self.
continue
cache = self.capture_cache(h_capture_uuid)
if cache:
if same_url:
captures_list['same_url'].append((h_capture_uuid, url_uuid, cache['title'], cache['timestamp'], url_hostname))
else:
captures_list['different_url'].append((h_capture_uuid, url_uuid, cache['title'], cache['timestamp'], url_hostname))
return captures_list
def _format_sane_js_response(self, lookup_table: Dict, h: str) -> Optional[Union[str, Tuple]]:
if lookup_table.get(h):
if isinstance(lookup_table[h], list):
libname, version, path = lookup_table[h][0].split("|")
other_files = len(lookup_table[h])
return libname, version, path, other_files
else:
# Predefined generic file
return lookup_table[h]
return None
def get_hostnode_investigator(self, capture_uuid: str, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
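        '''Gather everything needed to render a host node popup: body hash frequencies, SaneJS matches,
        embedded resources, and the cookies sent/received by each URL of the node.'''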
capture_dir = self.lookup_capture_dir(capture_uuid)
if not capture_dir:
raise MissingUUID(f'Unable to find {capture_uuid}')
ct = load_pickle_tree(capture_dir)
if not ct:
raise MissingUUID(f'Unable to find {capture_dir}')
hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
if not hostnode:
raise MissingUUID(f'Unable to find UUID {node_uuid} in {capture_dir}')
sanejs_lookups: Dict[str, List[str]] = {}
if hasattr(self, 'sanejs') and self.sanejs.available:
to_lookup = [url.body_hash for url in hostnode.urls if hasattr(url, 'body_hash')]
sanejs_lookups = self.sanejs.hashes_lookup(to_lookup)
urls: List[Dict[str, Any]] = []
for url in hostnode.urls:
# For the popup, we need:
# * https vs http
# * everything after the domain
# * the full URL
to_append: Dict[str, Any] = {
'encrypted': url.name.startswith('https'),
'url_path': url.name.split('/', 3)[-1],
'url_object': url,
'legitimacy': self.indexing.legitimacy_details(url)
}
if not url.empty_response:
# Index lookup
# %%% Full body %%%
freq = self.indexing.body_hash_fequency(url.body_hash)
to_append['body_hash_details'] = freq
if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1:
to_append['body_hash_details']['other_captures'] = self.hash_lookup(url.body_hash, url.name, capture_uuid)
# %%% Embedded ressources %%%
if hasattr(url, 'embedded_ressources') and url.embedded_ressources:
to_append['embedded_ressources'] = {}
for mimetype, blobs in url.embedded_ressources.items():
for h, blob in blobs:
if h in to_append['embedded_ressources']:
# Skip duplicates
continue
freq_embedded = self.indexing.body_hash_fequency(h)
to_append['embedded_ressources'][h] = freq_embedded
to_append['embedded_ressources'][h]['type'] = mimetype
if freq_embedded['hash_freq'] > 1:
to_append['embedded_ressources'][h]['other_captures'] = self.hash_lookup(h, url.name, capture_uuid)
if hasattr(self, 'sanejs') and self.sanejs.available:
to_lookup = list(to_append['embedded_ressources'].keys())
sanejs_lookups_embedded = self.sanejs.hashes_lookup(to_lookup)
for h in to_append['embedded_ressources'].keys():
sane_js_match = self._format_sane_js_response(sanejs_lookups_embedded, h)
if sane_js_match:
to_append['embedded_ressources'][h]['sane_js'] = sane_js_match
# Optional: SaneJS information
sane_js_match = self._format_sane_js_response(sanejs_lookups, url.body_hash)
if sane_js_match:
to_append['sane_js'] = sane_js_match
# Optional: Cookies sent to server in request -> map to nodes who set the cookie in response
if hasattr(url, 'cookies_sent'):
to_display_sent: Dict[str, Set[Iterable[Optional[str]]]] = defaultdict(set)
for cookie, contexts in url.cookies_sent.items():
if not contexts:
# Locally created?
to_display_sent[cookie].add(('Unknown origin', ))
continue
for context in contexts:
to_display_sent[cookie].add((context['setter'].hostname, context['setter'].hostnode_uuid))
to_append['cookies_sent'] = to_display_sent
# Optional: Cookies received from server in response -> map to nodes who send the cookie in request
if hasattr(url, 'cookies_received'):
to_display_received: Dict[str, Dict[str, Set[Iterable[Optional[str]]]]] = {'3rd_party': defaultdict(set), 'sent': defaultdict(set), 'not_sent': defaultdict(set)}
for domain, c_received, is_3rd_party in url.cookies_received:
if c_received not in ct.root_hartree.cookies_sent:
# This cookie is never sent.
if is_3rd_party:
to_display_received['3rd_party'][c_received].add((domain, ))
else:
to_display_received['not_sent'][c_received].add((domain, ))
continue
for url_node in ct.root_hartree.cookies_sent[c_received]:
if is_3rd_party:
to_display_received['3rd_party'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
else:
to_display_received['sent'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
to_append['cookies_received'] = to_display_received
urls.append(to_append)
return hostnode, urls
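# Minimal usage sketch (illustrative only, not part of the module). It assumes a running Splash
# instance and the Redis databases used by this install; the URL below is only an example.
#
#   from lookyloo.lookyloo import Lookyloo
#
#   lookyloo = Lookyloo()
#   capture_uuid = lookyloo.enqueue_scrape({'url': 'https://www.example.com', 'listing': True})
#   lookyloo.process_scrape_queue()    # pops the queued query and runs the capture via scrape()
#   lookyloo.cache_tree(capture_uuid)  # builds the CrawledTree and pickles it in the capture directory
#   tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(capture_uuid)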