2019-01-30 14:30:01 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2020-05-11 19:01:02 +02:00
|
|
|
import base64
|
2022-04-29 21:55:59 +02:00
|
|
|
import hashlib
|
2020-05-11 19:01:02 +02:00
|
|
|
import json
|
|
|
|
import logging
|
2021-09-07 12:59:31 +02:00
|
|
|
import operator
|
2020-05-11 19:01:02 +02:00
|
|
|
import smtplib
|
2022-04-29 21:55:59 +02:00
|
|
|
|
2021-09-07 12:59:31 +02:00
|
|
|
from collections import defaultdict
|
|
|
|
from datetime import date, datetime
|
|
|
|
from email.message import EmailMessage
|
|
|
|
from io import BytesIO
|
|
|
|
from pathlib import Path
|
|
|
|
from typing import (Any, Dict, Iterable, List, MutableMapping, Optional, Set,
|
|
|
|
Tuple, Union)
|
2019-01-30 14:30:01 +01:00
|
|
|
from uuid import uuid4
|
2020-05-12 16:53:10 +02:00
|
|
|
from zipfile import ZipFile
|
2019-01-30 14:30:01 +01:00
|
|
|
|
2021-11-26 18:47:48 +01:00
|
|
|
from defang import defang # type: ignore
|
2021-09-27 16:04:00 +02:00
|
|
|
from har2tree import CrawledTree, HostNode, URLNode
|
2022-05-23 00:15:52 +02:00
|
|
|
from PIL import Image, UnidentifiedImageError
|
2021-09-07 12:59:31 +02:00
|
|
|
from pymisp import MISPAttribute, MISPEvent, MISPObject
|
|
|
|
from redis import ConnectionPool, Redis
|
2021-08-18 16:07:39 +02:00
|
|
|
from redis.connection import UnixDomainSocketConnection
|
2020-06-25 16:43:36 +02:00
|
|
|
|
2021-09-22 17:09:04 +02:00
|
|
|
from .capturecache import CaptureCache, CapturesIndex
|
2020-10-09 18:05:04 +02:00
|
|
|
from .context import Context
|
2021-10-18 13:06:43 +02:00
|
|
|
from .default import LookylooException, get_homedir, get_config, get_socket_path
|
|
|
|
from .exceptions import (MissingCaptureDirectory,
|
2021-09-27 11:36:27 +02:00
|
|
|
MissingUUID, TreeNeedsRebuild, NoValidHarFile)
|
2021-10-18 13:06:43 +02:00
|
|
|
from .helpers import (CaptureStatus, get_captures_dir, get_email_template,
|
2022-04-25 14:43:02 +02:00
|
|
|
get_resources_hashes, get_taxonomies,
|
2022-03-29 21:13:02 +02:00
|
|
|
uniq_domains, ParsedUserAgent)
|
2020-10-09 18:05:04 +02:00
|
|
|
from .indexing import Indexing
|
2021-09-21 18:01:32 +02:00
|
|
|
from .modules import (MISP, PhishingInitiative, UniversalWhois,
|
2021-11-30 14:59:48 +01:00
|
|
|
UrlScan, VirusTotal, Phishtank, Hashlookup)
|
2020-08-20 19:39:03 +02:00
|
|
|
|
2020-06-15 16:12:23 +02:00
|
|
|
|
2019-01-30 14:30:01 +01:00
|
|
|
class Lookyloo():
|
|
|
|
|
2020-03-31 14:12:49 +02:00
|
|
|
def __init__(self) -> None:
|
|
|
|
self.logger = logging.getLogger(f'{self.__class__.__name__}')
|
2020-09-21 16:41:30 +02:00
|
|
|
self.logger.setLevel(get_config('generic', 'loglevel'))
|
2020-06-22 19:17:25 +02:00
|
|
|
self.indexing = Indexing()
|
2020-09-21 16:41:30 +02:00
|
|
|
self.is_public_instance = get_config('generic', 'public_instance')
|
2020-12-07 20:50:46 +01:00
|
|
|
self.public_domain = get_config('generic', 'public_domain')
|
2020-10-28 18:49:15 +01:00
|
|
|
self.taxonomies = get_taxonomies()
|
2020-03-31 14:12:49 +02:00
|
|
|
|
2021-08-18 16:07:39 +02:00
|
|
|
self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
|
|
|
|
path=get_socket_path('cache'), decode_responses=True)
|
2021-08-24 17:10:14 +02:00
|
|
|
self.capture_dir: Path = get_captures_dir()
|
2020-04-01 14:33:35 +02:00
|
|
|
|
2021-05-18 23:58:56 +02:00
|
|
|
self._priority = get_config('generic', 'priority')
|
|
|
|
|
2020-03-31 14:12:49 +02:00
|
|
|
# Initialize 3rd party components
|
2020-09-21 16:41:30 +02:00
|
|
|
self.pi = PhishingInitiative(get_config('modules', 'PhishingInitiative'))
|
|
|
|
if not self.pi.available:
|
|
|
|
self.logger.warning('Unable to setup the PhishingInitiative module')
|
|
|
|
|
|
|
|
self.vt = VirusTotal(get_config('modules', 'VirusTotal'))
|
|
|
|
if not self.vt.available:
|
|
|
|
self.logger.warning('Unable to setup the VirusTotal module')
|
|
|
|
|
2021-01-28 18:37:44 +01:00
|
|
|
self.misp = MISP(get_config('modules', 'MISP'))
|
|
|
|
if not self.misp.available:
|
|
|
|
self.logger.warning('Unable to setup the MISP module')
|
|
|
|
|
2021-04-26 00:52:08 +02:00
|
|
|
self.uwhois = UniversalWhois(get_config('modules', 'UniversalWhois'))
|
|
|
|
if not self.uwhois.available:
|
|
|
|
self.logger.warning('Unable to setup the UniversalWhois module')
|
|
|
|
|
2021-08-10 17:38:47 +02:00
|
|
|
self.urlscan = UrlScan(get_config('modules', 'UrlScan'))
|
|
|
|
if not self.urlscan.available:
|
|
|
|
self.logger.warning('Unable to setup the UrlScan module')
|
|
|
|
|
2021-09-16 16:33:44 +02:00
|
|
|
self.phishtank = Phishtank(get_config('modules', 'Phishtank'))
|
|
|
|
if not self.phishtank.available:
|
|
|
|
self.logger.warning('Unable to setup the Phishtank module')
|
|
|
|
|
2021-11-30 14:59:48 +01:00
|
|
|
self.hashlookup = Hashlookup(get_config('modules', 'Hashlookup'))
|
|
|
|
if not self.hashlookup.available:
|
|
|
|
self.logger.warning('Unable to setup the Hashlookup module')
|
|
|
|
|
2021-10-07 18:33:40 +02:00
|
|
|
self.logger.info('Initializing context...')
|
2021-09-21 18:01:32 +02:00
|
|
|
self.context = Context()
|
2021-10-07 18:33:40 +02:00
|
|
|
self.logger.info('Context initialized.')
|
|
|
|
self.logger.info('Initializing index...')
|
2021-09-22 17:09:04 +02:00
|
|
|
self._captures_index = CapturesIndex(self.redis, self.context)
|
2021-10-07 18:33:40 +02:00
|
|
|
self.logger.info('Index initialized.')
|
2020-08-27 12:57:10 +02:00
|
|
|
|
2021-08-18 16:07:39 +02:00
|
|
|
    @property
    def redis(self):
        """A fresh Redis client bound to the shared unix-socket connection pool."""
        return Redis(connection_pool=self.redis_pool)
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
|
|
|
|
legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Adds context information to a capture or a URL node'''
|
2020-08-28 18:03:52 +02:00
|
|
|
if malicious:
|
|
|
|
self.context.add_malicious(ressource_hash, details['malicious'])
|
|
|
|
if legitimate:
|
|
|
|
self.context.add_legitimate(ressource_hash, details['legitimate'])
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def add_to_legitimate(self, capture_uuid: str, /, hostnode_uuid: Optional[str]=None, urlnode_uuid: Optional[str]=None):
|
2021-10-07 18:33:40 +02:00
|
|
|
'''Mark a full capture as legitimate.
|
2021-01-12 17:22:51 +01:00
|
|
|
Iterates over all the nodes and mark them all as legitimate too.'''
|
2020-08-20 19:39:03 +02:00
|
|
|
ct = self.get_crawled_tree(capture_uuid)
|
2020-08-27 12:57:10 +02:00
|
|
|
self.context.mark_as_legitimate(ct, hostnode_uuid, urlnode_uuid)
|
2020-08-20 19:39:03 +02:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
    def remove_pickle(self, capture_uuid: str, /) -> None:
        '''Remove the pickled tree of a specific capture; it will be rebuilt on next access.'''
        self._captures_index.remove_pickle(capture_uuid)
|
2020-06-22 19:17:25 +02:00
|
|
|
|
2020-05-18 18:32:59 +02:00
|
|
|
    def rebuild_cache(self) -> None:
        '''Flush and rebuild the redis cache. Doesn't remove the pickles.
        The cached captures will be rebuilt when loading the index.'''
        self.redis.flushdb()
|
|
|
|
|
2020-05-18 18:32:59 +02:00
|
|
|
    def rebuild_all(self) -> None:
        '''Flush and rebuild the redis cache, and delete all the pickles.
        The captures will be rebuilt by the background indexer.'''
        self._captures_index.rebuild_all()
|
2020-04-01 17:44:06 +02:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_urlnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> URLNode:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Get a URL node from a tree, by UUID'''
|
|
|
|
ct = self.get_crawled_tree(capture_uuid)
|
2020-05-20 19:11:15 +02:00
|
|
|
return ct.root_hartree.get_url_node_by_uuid(node_uuid)
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_hostnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> HostNode:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Get a host node from a tree, by UUID'''
|
|
|
|
ct = self.get_crawled_tree(capture_uuid)
|
2020-05-20 19:11:15 +02:00
|
|
|
return ct.root_hartree.get_host_node_by_uuid(node_uuid)
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_statistics(self, capture_uuid: str, /) -> Dict[str, Any]:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Get the statistics of a capture.'''
|
|
|
|
ct = self.get_crawled_tree(capture_uuid)
|
2021-04-27 17:09:58 +02:00
|
|
|
return ct.root_hartree.stats
|
2020-05-13 17:31:27 +02:00
|
|
|
|
2021-06-10 02:59:24 +02:00
|
|
|
def get_info(self, capture_uuid: str, /) -> Dict[str, Any]:
|
|
|
|
'''Get basic information about the capture.'''
|
|
|
|
ct = self.get_crawled_tree(capture_uuid)
|
|
|
|
to_return = {'url': ct.root_url, 'title': ct.root_hartree.har.initial_title,
|
2021-06-23 01:55:41 +02:00
|
|
|
'capture_time': ct.start_time.isoformat(), 'user_agent': ct.user_agent,
|
|
|
|
'referer': ct.referer}
|
2021-06-10 02:59:24 +02:00
|
|
|
return to_return
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_meta(self, capture_uuid: str, /) -> Dict[str, str]:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Get the meta informations from a capture (mostly, details about the User Agent used.)'''
|
2021-09-27 11:36:27 +02:00
|
|
|
metafile = self._captures_index[capture_uuid].capture_dir / 'meta'
|
2021-09-21 18:01:32 +02:00
|
|
|
if metafile.exists():
|
|
|
|
with metafile.open('r') as f:
|
|
|
|
return json.load(f)
|
|
|
|
|
2021-01-12 17:22:51 +01:00
|
|
|
meta = {}
|
2021-09-21 18:01:32 +02:00
|
|
|
ct = self.get_crawled_tree(capture_uuid)
|
2022-03-29 21:13:02 +02:00
|
|
|
ua = ParsedUserAgent(ct.root_hartree.user_agent)
|
2021-09-21 18:01:32 +02:00
|
|
|
meta['user_agent'] = ua.string
|
|
|
|
if ua.platform:
|
|
|
|
meta['os'] = ua.platform
|
|
|
|
if ua.browser:
|
|
|
|
if ua.version:
|
|
|
|
meta['browser'] = f'{ua.browser} {ua.version}'
|
|
|
|
else:
|
|
|
|
meta['browser'] = ua.browser
|
|
|
|
|
|
|
|
if not meta:
|
|
|
|
# UA not recognized
|
|
|
|
self.logger.info(f'Unable to recognize the User agent: {ua}')
|
|
|
|
with metafile.open('w') as f:
|
|
|
|
json.dump(meta, f)
|
2021-01-12 17:22:51 +01:00
|
|
|
return meta
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def categories_capture(self, capture_uuid: str, /) -> Dict[str, Any]:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Get all the categories related to a capture, in MISP Taxonomies format'''
|
2021-09-27 11:36:27 +02:00
|
|
|
categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
|
2020-10-28 18:49:15 +01:00
|
|
|
# get existing categories if possible
|
2021-09-27 11:36:27 +02:00
|
|
|
if categ_file.exists():
|
|
|
|
with categ_file.open() as f:
|
2020-10-28 18:49:15 +01:00
|
|
|
current_categories = [line.strip() for line in f.readlines()]
|
2020-11-09 16:02:54 +01:00
|
|
|
return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
|
|
|
|
return {}
|
2020-10-28 18:49:15 +01:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def categorize_capture(self, capture_uuid: str, /, category: str) -> None:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Add a category (MISP Taxonomy tag) to a capture.'''
|
2020-10-28 18:49:15 +01:00
|
|
|
if not get_config('generic', 'enable_categorization'):
|
|
|
|
return
|
|
|
|
# Make sure the category is mappable to a taxonomy.
|
|
|
|
self.taxonomies.revert_machinetag(category)
|
|
|
|
|
2021-09-27 11:36:27 +02:00
|
|
|
categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
|
2020-10-28 18:49:15 +01:00
|
|
|
# get existing categories if possible
|
2021-09-27 11:36:27 +02:00
|
|
|
if categ_file.exists():
|
|
|
|
with categ_file.open() as f:
|
2022-03-29 21:13:02 +02:00
|
|
|
current_categories = {line.strip() for line in f.readlines()}
|
2020-10-28 18:49:15 +01:00
|
|
|
else:
|
|
|
|
current_categories = set()
|
|
|
|
current_categories.add(category)
|
2021-09-27 11:36:27 +02:00
|
|
|
with categ_file.open('w') as f:
|
2020-10-28 18:49:15 +01:00
|
|
|
f.writelines(f'{t}\n' for t in current_categories)
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def uncategorize_capture(self, capture_uuid: str, /, category: str) -> None:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Remove a category (MISP Taxonomy tag) from a capture.'''
|
2020-10-28 18:49:15 +01:00
|
|
|
if not get_config('generic', 'enable_categorization'):
|
|
|
|
return
|
2021-09-27 11:36:27 +02:00
|
|
|
categ_file = self._captures_index[capture_uuid].capture_dir / 'categories'
|
2020-10-28 18:49:15 +01:00
|
|
|
# get existing categories if possible
|
2021-09-27 11:36:27 +02:00
|
|
|
if categ_file.exists():
|
|
|
|
with categ_file.open() as f:
|
2022-03-29 21:13:02 +02:00
|
|
|
current_categories = {line.strip() for line in f.readlines()}
|
2020-10-28 18:49:15 +01:00
|
|
|
else:
|
|
|
|
current_categories = set()
|
|
|
|
current_categories.remove(category)
|
2021-09-27 11:36:27 +02:00
|
|
|
with categ_file.open('w') as f:
|
2020-10-28 18:49:15 +01:00
|
|
|
f.writelines(f'{t}\n' for t in current_categories)
|
|
|
|
|
2021-08-13 13:50:26 +02:00
|
|
|
    def trigger_modules(self, capture_uuid: str, /, force: bool=False, auto_trigger: bool=False) -> Dict:
        '''Launch the 3rd party modules on a capture.
        It uses the cached result *if* the module was triggered the same day.
        The `force` flag re-triggers the module regardless of the cache.

        :param auto_trigger: passed through to each module; presumably lets a
            module decline automatic (non user-initiated) triggers — confirm in modules.
        :return: per-module responses, or an 'error' entry if the tree is not cached.
        '''
        try:
            ct = self.get_crawled_tree(capture_uuid)
        except LookylooException:
            self.logger.warning(f'Unable to trigger the modules unless the tree ({capture_uuid}) is cached.')
            return {'error': f'UUID {capture_uuid} is either unknown or the tree is not ready yet.'}

        # These two modules are fire-and-forget: their responses are not returned.
        self.uwhois.capture_default_trigger(ct, force=force, auto_trigger=auto_trigger)
        self.hashlookup.capture_default_trigger(ct, auto_trigger=auto_trigger)

        to_return: Dict[str, Dict] = {'PhishingInitiative': {}, 'VirusTotal': {}, 'UrlScan': {}}
        capture_cache = self.capture_cache(capture_uuid)

        to_return['PhishingInitiative'] = self.pi.capture_default_trigger(ct, force=force, auto_trigger=auto_trigger)
        to_return['VirusTotal'] = self.vt.capture_default_trigger(ct, force=force, auto_trigger=auto_trigger)
        # UrlScan gets the capture info (not the tree) and an unlisted visibility
        # when the capture itself is hidden (no_index).
        to_return['UrlScan'] = self.urlscan.capture_default_trigger(
            self.get_info(capture_uuid),
            visibility='unlisted' if (capture_cache and capture_cache.no_index) else 'public',
            force=force, auto_trigger=auto_trigger)
        to_return['Phishtank'] = self.phishtank.capture_default_trigger(ct, auto_trigger=auto_trigger)
        return to_return
|
2020-04-20 16:41:42 +02:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
    def get_modules_responses(self, capture_uuid: str, /) -> Optional[Dict[str, Any]]:
        '''Get the responses of the modules from the cached responses on the disk.

        Returns None when the tree is not cached yet. Each available module
        contributes one key; lookups cover every redirect, or just the root
        URL when the capture had no redirects.
        '''
        try:
            ct = self.get_crawled_tree(capture_uuid)
        except LookylooException:
            self.logger.warning(f'Unable to get the modules responses unless the tree ({capture_uuid}) is cached.')
            return None
        to_return: Dict[str, Any] = {}
        if self.vt.available:
            to_return['vt'] = {}
            if ct.redirects:
                for redirect in ct.redirects:
                    to_return['vt'][redirect] = self.vt.get_url_lookup(redirect)
            else:
                to_return['vt'][ct.root_hartree.har.root_url] = self.vt.get_url_lookup(ct.root_hartree.har.root_url)
        if self.pi.available:
            to_return['pi'] = {}
            if ct.redirects:
                for redirect in ct.redirects:
                    to_return['pi'][redirect] = self.pi.get_url_lookup(redirect)
            else:
                to_return['pi'][ct.root_hartree.har.root_url] = self.pi.get_url_lookup(ct.root_hartree.har.root_url)
        if self.phishtank.available:
            # Phishtank also matches on the IPs seen in the capture.
            to_return['phishtank'] = {'urls': {}, 'ips_hits': {}}
            if ct.redirects:
                for redirect in ct.redirects:
                    to_return['phishtank']['urls'][redirect] = self.phishtank.get_url_lookup(redirect)
            else:
                to_return['phishtank']['urls'][ct.root_hartree.har.root_url] = self.phishtank.get_url_lookup(ct.root_hartree.har.root_url)
            ips_hits = self.phishtank.lookup_ips_capture(ct)
            if ips_hits:
                to_return['phishtank']['ips_hits'] = ips_hits
        if self.urlscan.available:
            info = self.get_info(capture_uuid)
            to_return['urlscan'] = {'submission': {}, 'result': {}}
            to_return['urlscan']['submission'] = self.urlscan.get_url_submission(info)
            if to_return['urlscan']['submission'] and 'uuid' in to_return['urlscan']['submission']:
                # The submission was done, try to get the results
                result = self.urlscan.url_result(info)
                if 'error' not in result:
                    to_return['urlscan']['result'] = result
        return to_return
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def hide_capture(self, capture_uuid: str, /) -> None:
|
2020-08-10 12:35:16 +02:00
|
|
|
"""Add the capture in the hidden pool (not shown on the front page)
|
|
|
|
NOTE: it won't remove the correlations until they are rebuilt.
|
|
|
|
"""
|
2021-09-27 11:36:27 +02:00
|
|
|
capture_dir = self._captures_index[capture_uuid].capture_dir
|
|
|
|
self.redis.hset(str(capture_dir), 'no_index', 1)
|
|
|
|
(capture_dir / 'no_index').touch()
|
2021-09-22 17:09:04 +02:00
|
|
|
self._captures_index.reload_cache(capture_uuid)
|
2020-08-10 12:35:16 +02:00
|
|
|
|
2021-09-24 12:02:28 +02:00
|
|
|
    def update_tree_cache_info(self, process_id: int, classname: str) -> None:
        '''Store the LRU cache status of the captures index in redis, keyed per process, for monitoring.'''
        self.redis.hset('tree_cache', f'{process_id}|{classname}', str(self._captures_index.lru_cache_status()))
|
|
|
|
|
2021-06-16 01:03:33 +02:00
|
|
|
def sorted_capture_cache(self, capture_uuids: Optional[Iterable[str]]=None) -> List[CaptureCache]:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Get all the captures in the cache, sorted by timestamp (new -> old).'''
|
2021-06-29 22:48:09 +02:00
|
|
|
if capture_uuids is None:
|
2021-09-27 11:36:27 +02:00
|
|
|
# Sort all recent captures
|
|
|
|
capture_uuids = self.redis.hkeys('lookup_dirs')
|
2021-03-18 15:39:55 +01:00
|
|
|
if not capture_uuids:
|
|
|
|
# No captures at all on the instance
|
2021-04-03 02:06:32 +02:00
|
|
|
return []
|
|
|
|
|
2021-09-22 17:09:04 +02:00
|
|
|
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid)]
|
2021-03-18 15:39:55 +01:00
|
|
|
all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
|
|
|
|
return all_cache
|
2020-10-29 13:29:13 +01:00
|
|
|
|
2021-09-01 14:08:25 +02:00
|
|
|
def get_capture_status(self, capture_uuid: str, /) -> CaptureStatus:
|
2021-09-27 11:36:27 +02:00
|
|
|
'''Returns the status (queued, ongoing, done, or UUID unknown)'''
|
2021-09-01 14:08:25 +02:00
|
|
|
if self.redis.zrank('to_capture', capture_uuid) is not None:
|
|
|
|
return CaptureStatus.QUEUED
|
|
|
|
elif self.redis.hexists('lookup_dirs', capture_uuid):
|
|
|
|
return CaptureStatus.DONE
|
|
|
|
elif self.redis.sismember('ongoing', capture_uuid):
|
|
|
|
return CaptureStatus.ONGOING
|
|
|
|
return CaptureStatus.UNKNOWN
|
|
|
|
|
2021-09-07 18:15:56 +02:00
|
|
|
    def try_error_status(self, capture_uuid: str, /) -> Optional[str]:
        '''If it is not possible to do the capture, we store the error for a short amount of time.
        Returns None when no error is recorded (or it already expired).'''
        return self.redis.get(f'error_{capture_uuid}')
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
    def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
        """Get the cache from redis.

        Never raises: every failure mode is logged (where relevant) and
        mapped to a None return so callers can treat a missing cache uniformly.
        """
        try:
            return self._captures_index[capture_uuid]
        except MissingCaptureDirectory as e:
            # The UUID is in the captures but the directory is not on the disk.
            self.logger.warning(e)
            return None
        except MissingUUID:
            # Only warn when the capture isn't simply pending: a queued or
            # ongoing capture legitimately has no cache entry yet.
            if self.get_capture_status(capture_uuid) not in [CaptureStatus.QUEUED, CaptureStatus.ONGOING]:
                self.logger.warning(f'Unable to find {capture_uuid} (not in the cache and/or missing capture directory).')
            return None
        except LookylooException as e:
            self.logger.warning(e)
            return None
        except Exception as e:
            # Unexpected failure: keep the full traceback in the logs.
            self.logger.exception(e)
            return None
|
2019-02-01 16:11:16 +01:00
|
|
|
|
2021-08-24 18:32:54 +02:00
|
|
|
def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree:
|
|
|
|
'''Get the generated tree in ETE Toolkit format.
|
|
|
|
Loads the pickle if it exists, creates it otherwise.'''
|
2021-09-22 22:23:20 +02:00
|
|
|
try:
|
|
|
|
return self._captures_index[capture_uuid].tree
|
|
|
|
except TreeNeedsRebuild:
|
|
|
|
self._captures_index.reload_cache(capture_uuid)
|
|
|
|
return self._captures_index[capture_uuid].tree
|
2019-01-30 14:30:01 +01:00
|
|
|
|
2021-05-18 23:58:56 +02:00
|
|
|
    def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
        '''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing).

        :param query: capture settings; flattened in place to redis-storable scalars.
        :param source: submission channel (e.g. ui/api), used for prioritization.
        :return: the permanent UUID of the capture (possibly an existing one if
            the exact same query was submitted in the last 5 minutes).
        '''

        def get_priority(source: str, user: str, authenticated: bool) -> int:
            # Source priority from the config; unknown sources are penalized.
            src_prio: int = self._priority['sources'][source] if source in self._priority['sources'] else -1
            if not authenticated:
                usr_prio = self._priority['users']['_default_anon']
                # reduce priority for anonymous users making lots of captures
                queue_size = self.redis.zscore('queues', f'{source}|{authenticated}|{user}')
                if queue_size is None:
                    queue_size = 0
                usr_prio -= int(queue_size / 10)
            else:
                usr_prio = self._priority['users'][user] if self._priority['users'].get(user) else self._priority['users']['_default_auth']
            return src_prio + usr_prio

        # Redis hashes only store flat scalars: convert bools/containers/bytes in place.
        for key, value in query.items():
            if isinstance(value, bool):
                query[key] = 1 if value else 0
            elif isinstance(value, (list, dict)):
                query[key] = json.dumps(value)
            elif isinstance(value, bytes):
                query[key] = value.decode()

        # dirty deduplicate
        hash_query = hashlib.sha512(json.dumps(query).encode()).hexdigest()
        # FIXME The line below should work, but it doesn't
        # if (existing_uuid := self.redis.set(f'query_hash:{hash_query}', temp_uuid, get=True, nx=True, ex=300)):
        if (existing_uuid := self.redis.get(f'query_hash:{hash_query}')):
            return existing_uuid

        perma_uuid = str(uuid4())
        # Remember this query hash for 5 minutes so identical submissions dedup.
        self.redis.set(f'query_hash:{hash_query}', perma_uuid, nx=True, ex=300)

        priority = get_priority(source, user, authenticated)
        p = self.redis.pipeline()
        if priority < -10:
            # Someone is probably abusing the system with useless URLs, remove them from the index
            query['listing'] = 0
        p.hset(perma_uuid, mapping=query)
        p.zadd('to_capture', {perma_uuid: priority})
        p.zincrby('queues', 1, f'{source}|{authenticated}|{user}')
        p.set(f'{perma_uuid}_mgmt', f'{source}|{authenticated}|{user}')
        p.execute()
        return perma_uuid
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def send_mail(self, capture_uuid: str, /, email: str='', comment: str='') -> None:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Send an email notification regarding a specific capture'''
|
2020-09-21 16:41:30 +02:00
|
|
|
if not get_config('generic', 'enable_mail_notification'):
|
2020-05-11 19:01:02 +02:00
|
|
|
return
|
2020-05-27 15:15:37 +02:00
|
|
|
|
|
|
|
redirects = ''
|
2020-05-27 15:39:06 +02:00
|
|
|
initial_url = ''
|
2020-06-29 11:59:01 +02:00
|
|
|
cache = self.capture_cache(capture_uuid)
|
|
|
|
if cache:
|
2021-11-26 18:42:40 +01:00
|
|
|
initial_url = defang(cache.url, colon=True, all_dots=True)
|
2021-01-14 17:12:16 +01:00
|
|
|
if cache.redirects:
|
2020-06-29 11:59:01 +02:00
|
|
|
redirects = "Redirects:\n"
|
2021-11-26 18:42:40 +01:00
|
|
|
redirects += defang('\n'.join(cache.redirects), colon=True, all_dots=True)
|
2020-06-29 11:59:01 +02:00
|
|
|
else:
|
|
|
|
redirects = "No redirects."
|
2020-05-27 15:15:37 +02:00
|
|
|
|
2020-09-21 16:41:30 +02:00
|
|
|
email_config = get_config('generic', 'email')
|
2020-05-11 19:01:02 +02:00
|
|
|
msg = EmailMessage()
|
|
|
|
msg['From'] = email_config['from']
|
2020-05-27 15:15:37 +02:00
|
|
|
if email:
|
|
|
|
msg['Reply-To'] = email
|
2020-05-11 19:01:02 +02:00
|
|
|
msg['To'] = email_config['to']
|
|
|
|
msg['Subject'] = email_config['subject']
|
|
|
|
body = get_email_template()
|
|
|
|
body = body.format(
|
|
|
|
recipient=msg['To'].addresses[0].display_name,
|
2020-12-07 20:50:46 +01:00
|
|
|
domain=self.public_domain,
|
2020-05-11 19:01:02 +02:00
|
|
|
uuid=capture_uuid,
|
2020-05-27 15:39:06 +02:00
|
|
|
initial_url=initial_url,
|
2020-05-27 15:15:37 +02:00
|
|
|
redirects=redirects,
|
2020-05-11 19:01:02 +02:00
|
|
|
comment=comment,
|
|
|
|
sender=msg['From'].addresses[0].display_name,
|
|
|
|
)
|
|
|
|
msg.set_content(body)
|
|
|
|
try:
|
|
|
|
s = smtplib.SMTP(email_config['smtp_host'], email_config['smtp_port'])
|
|
|
|
s.send_message(msg)
|
|
|
|
s.quit()
|
|
|
|
except Exception as e:
|
2020-06-29 18:00:53 +02:00
|
|
|
self.logger.exception(e)
|
2020-08-20 15:05:27 +02:00
|
|
|
self.logger.warning(msg.as_string())
|
2020-05-11 19:01:02 +02:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def _get_raw(self, capture_uuid: str, /, extension: str='*', all_files: bool=True) -> BytesIO:
|
2021-01-12 17:22:51 +01:00
|
|
|
'''Get file(s) from the capture directory'''
|
2021-03-07 23:57:46 +01:00
|
|
|
try:
|
2021-09-27 11:36:27 +02:00
|
|
|
capture_dir = self._captures_index[capture_uuid].capture_dir
|
2021-03-07 23:57:46 +01:00
|
|
|
except MissingUUID:
|
|
|
|
return BytesIO(f'Capture {capture_uuid} not unavailable, try again later.'.encode())
|
2021-08-31 13:19:09 +02:00
|
|
|
except MissingCaptureDirectory:
|
|
|
|
return BytesIO(f'No capture {capture_uuid} on the system (directory missing).'.encode())
|
2020-05-12 16:53:10 +02:00
|
|
|
all_paths = sorted(list(capture_dir.glob(f'*.{extension}')))
|
|
|
|
if not all_files:
|
|
|
|
# Only get the first one in the list
|
2022-05-03 12:23:16 +02:00
|
|
|
if not all_paths:
|
|
|
|
return BytesIO()
|
2020-05-12 16:53:10 +02:00
|
|
|
with open(all_paths[0], 'rb') as f:
|
|
|
|
return BytesIO(f.read())
|
|
|
|
to_return = BytesIO()
|
2021-09-06 16:12:41 +02:00
|
|
|
# Add uuid file to the export, allows to keep the same UUID across platforms.
|
|
|
|
all_paths.append(capture_dir / 'uuid')
|
2020-05-12 16:53:10 +02:00
|
|
|
with ZipFile(to_return, 'w') as myzip:
|
|
|
|
for path in all_paths:
|
|
|
|
if path.name.endswith('pickle'):
|
|
|
|
continue
|
|
|
|
myzip.write(path, arcname=f'{capture_dir.name}/{path.name}')
|
|
|
|
to_return.seek(0)
|
|
|
|
return to_return
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
    def get_html(self, capture_uuid: str, /, all_html: bool=False) -> BytesIO:
        '''Get the rendered HTML (the first one, or all of them zipped if all_html is True).'''
        return self._get_raw(capture_uuid, 'html', all_html)
|
2020-05-12 16:53:10 +02:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
    def get_cookies(self, capture_uuid: str, /, all_cookies: bool=False) -> BytesIO:
        '''Get the cookie file(s) (the first one, or all of them zipped if all_cookies is True).'''
        return self._get_raw(capture_uuid, 'cookies.json', all_cookies)
|
2020-05-26 17:45:04 +02:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
    def get_screenshot(self, capture_uuid: str, /) -> BytesIO:
        '''Get the first screenshot (png) of the rendered page.'''
        return self._get_raw(capture_uuid, 'png', all_files=False)
|
2020-05-12 16:53:10 +02:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_screenshot_thumbnail(self, capture_uuid: str, /, for_datauri: bool=False, width: int=64) -> Union[str, BytesIO]:
|
2021-01-18 12:30:07 +01:00
|
|
|
'''Get the thumbnail of the rendered page. Always crop to a square.'''
|
2020-12-09 19:11:19 +01:00
|
|
|
to_return = BytesIO()
|
2021-01-18 12:30:07 +01:00
|
|
|
size = width, width
|
2021-01-13 14:33:20 +01:00
|
|
|
try:
|
2021-01-13 15:16:36 +01:00
|
|
|
s = self.get_screenshot(capture_uuid)
|
2021-01-18 12:30:07 +01:00
|
|
|
orig_screenshot = Image.open(s)
|
|
|
|
to_thumbnail = orig_screenshot.crop((0, 0, orig_screenshot.width, orig_screenshot.width))
|
2021-01-13 14:33:20 +01:00
|
|
|
except Image.DecompressionBombError as e:
|
|
|
|
# The image is most probably too big: https://pillow.readthedocs.io/en/stable/reference/Image.html
|
|
|
|
self.logger.warning(f'Unable to generate the screenshot thumbnail of {capture_uuid}: image too big ({e}).')
|
2021-01-18 12:30:07 +01:00
|
|
|
error_img: Path = get_homedir() / 'website' / 'web' / 'static' / 'error_screenshot.png'
|
|
|
|
to_thumbnail = Image.open(error_img)
|
2022-05-03 12:23:16 +02:00
|
|
|
except UnidentifiedImageError as e:
|
|
|
|
# The image is most probably too big: https://pillow.readthedocs.io/en/stable/reference/Image.html
|
|
|
|
self.logger.warning(f'Unable to generate the screenshot thumbnail of {capture_uuid}: image too big ({e}).')
|
|
|
|
error_img = get_homedir() / 'website' / 'web' / 'static' / 'error_screenshot.png'
|
|
|
|
to_thumbnail = Image.open(error_img)
|
2021-01-13 14:33:20 +01:00
|
|
|
|
2021-01-18 12:30:07 +01:00
|
|
|
to_thumbnail.thumbnail(size)
|
|
|
|
to_thumbnail.save(to_return, 'png')
|
|
|
|
|
|
|
|
to_return.seek(0)
|
2020-12-09 19:11:19 +01:00
|
|
|
if for_datauri:
|
|
|
|
return base64.b64encode(to_return.getvalue()).decode()
|
|
|
|
else:
|
|
|
|
return to_return
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
    def get_capture(self, capture_uuid: str, /) -> BytesIO:
        '''Get all the files related to this capture, zipped (except the pickle).'''
        return self._get_raw(capture_uuid)
|
2019-01-30 14:30:01 +01:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
    def get_urls_rendered_page(self, capture_uuid: str, /) -> List[str]:
        '''URLs present in the rendered page that were never requested during the capture.'''
        ct = self.get_crawled_tree(capture_uuid)
        return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
                      - set(ct.root_hartree.all_url_requests.keys()))
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_body_hash_investigator(self, body_hash: str, /) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]:
    '''Returns all the captures related to a hash (sha512), used in the web interface.'''
    # The total count returned by the index is not needed here.
    _, details = self.indexing.get_body_hash_captures(body_hash, limit=-1)
    caches = self.sorted_capture_cache([entry[0] for entry in details])
    captures = [(c.uuid, c.title) for c in caches]
    domains = self.indexing.get_body_hash_domains(body_hash)
    return captures, domains
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_body_hash_full(self, body_hash: str, /) -> Tuple[Dict[str, List[Dict[str, str]]], BytesIO]:
    '''Returns a lot of information about the hash (sha512) and the hits in the instance.
    Also contains the data (base64 encoded)'''
    details = self.indexing.get_body_hash_urls(body_hash)
    body_content = BytesIO()
    # get the body from the first entry in the details list
    for _, entries in details.items():
        ct = self.get_crawled_tree(entries[0]['capture'])
        urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode'])
        if urlnode.body_hash == body_hash:
            # the hash we're looking for is the whole file
            body_content = urlnode.body
        else:
            # The hash is an embedded resource.
            # FIX: iterate the URL node's embedded resources. The original code
            # iterated `urlnode.body_hash.embedded_ressources`, but `body_hash`
            # is a str (it is compared to the `body_hash: str` parameter above)
            # and has no such attribute, so this branch always raised.
            for _, blobs in urlnode.embedded_ressources.items():
                for h, b in blobs:
                    if h == body_hash:
                        body_content = b
                        break
        # Only the first entry is needed to recover the body.
        break
    return details, body_content
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_latest_url_capture(self, url: str, /) -> Optional[CaptureCache]:
    '''Get the most recent capture with this URL'''
    caches = self.sorted_capture_cache(self.indexing.get_captures_url(url))
    # The cache list is sorted most-recent first; fall back to None when empty.
    return caches[0] if caches else None
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_url_occurrences(self, url: str, /, limit: int=20) -> List[Dict]:
    '''Get the most recent captures and URL nodes where the URL has been seen.'''
    captures = self.sorted_capture_cache(self.indexing.get_captures_url(url))

    to_return: List[Dict] = []
    for capture in captures[:limit]:
        ct = self.get_crawled_tree(capture.uuid)
        entry: Dict[str, Union[str, Dict]] = {'capture_uuid': capture.uuid,
                                              'start_timestamp': capture.timestamp.isoformat(),
                                              'title': capture.title}
        nodes_details: Dict[str, Dict[str, str]] = {}
        for node in ct.root_hartree.url_tree.search_nodes(name=url):
            details = {'start_time': node.start_time.isoformat(),
                       'hostnode_uuid': node.hostnode_uuid}
            # Not every node has a response body (e.g. empty responses).
            if hasattr(node, 'body_hash'):
                details['hash'] = node.body_hash
            nodes_details[node.uuid] = details
        entry['urlnodes'] = nodes_details
        to_return.append(entry)
    return to_return
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_hostname_occurrences(self, hostname: str, /, with_urls_occurrences: bool=False, limit: int=20) -> List[Dict]:
    '''Get the most recent captures and URL nodes where the hostname has been seen.'''
    captures = self.sorted_capture_cache(self.indexing.get_captures_hostname(hostname))

    to_return: List[Dict] = []
    for capture in captures[:limit]:
        ct = self.get_crawled_tree(capture.uuid)
        entry: Dict[str, Union[str, List, Dict]] = {'capture_uuid': capture.uuid,
                                                    'start_timestamp': capture.timestamp.isoformat(),
                                                    'title': capture.title}
        hostnode_uuids: List[str] = []
        url_details: Dict[str, Dict[str, str]] = {}
        for hostnode in ct.root_hartree.hostname_tree.search_nodes(name=hostname):
            hostnode_uuids.append(hostnode.uuid)
            if not with_urls_occurrences:
                continue
            # Optionally gather per-URL details for every URL under this host node.
            for urlnode in hostnode.urls:
                details = {'start_time': urlnode.start_time.isoformat(),
                           'url': urlnode.name,
                           'hostnode_uuid': urlnode.hostnode_uuid}
                if hasattr(urlnode, 'body_hash'):
                    details['hash'] = urlnode.body_hash
                url_details[urlnode.uuid] = details
        entry['hostnodes'] = hostnode_uuids
        if with_urls_occurrences:
            entry['urlnodes'] = url_details
        to_return.append(entry)
    return to_return
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_cookie_name_investigator(self, cookie_name: str, /) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float, List[Tuple[str, float]]]]]:
    '''Returns all the captures related to a cookie name entry, used in the web interface.'''
    capture_uuids = [entry[0] for entry in self.indexing.get_cookies_names_captures(cookie_name)]
    caches = self.sorted_capture_cache(capture_uuids)
    captures = [(c.uuid, c.title) for c in caches]
    # For each domain, also fetch the values the cookie took on that domain.
    domains = []
    for domain, freq in self.indexing.get_cookie_domains(cookie_name):
        domains.append((domain, freq, self.indexing.cookies_names_domains_values(cookie_name, domain)))
    return captures, domains
|
|
|
|
|
2020-09-01 17:54:54 +02:00
|
|
|
def hash_lookup(self, blob_hash: str, url: str, capture_uuid: str) -> Tuple[int, Dict[str, List[Tuple[str, str, str, str, str]]]]:
    '''Search all the captures a specific hash was seen.
    If a URL is given, it splits the results if the hash is seen on the same URL or an other one.
    Capture UUID avoids duplicates on the same capture'''
    captures_list: Dict[str, List[Tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
    total_captures, details = self.indexing.get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid)
    for h_capture_uuid, url_uuid, url_hostname, same_url in details:
        cache = self.capture_cache(h_capture_uuid)
        if not cache:
            # Capture not (yet) cached, skip it.
            continue
        # Bucket by whether the hash was seen on the same URL as the query.
        bucket = 'same_url' if same_url else 'different_url'
        captures_list[bucket].append((h_capture_uuid, url_uuid, cache.title,
                                      cache.timestamp.isoformat(), url_hostname))
    return total_captures, captures_list
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_ressource(self, tree_uuid: str, /, urlnode_uuid: str, h: Optional[str]) -> Optional[Tuple[str, BytesIO, str]]:
    '''Get a specific resource from a URL node. If a hash is also given, we want an embedded resource.

    Returns a (filename, content, mimetype) tuple, or None when the resource
    cannot be found or the node has no response body.
    '''
    try:
        url = self.get_urlnode_from_tree(tree_uuid, urlnode_uuid)
    except IndexError:
        # unable to find the uuid, the cache is probably in a weird state.
        return None
    except NoValidHarFile as e:
        # something went poorly when rebuilding the tree (probably a recursive error)
        self.logger.warning(e)
        return None
    if url.empty_response:
        # No body at all on this node, nothing to return.
        return None
    if not h or h == url.body_hash:
        # No hash given, or it matches the full body: return the body itself.
        # Copy the bytes into a fresh BytesIO so the caller cannot disturb
        # the node's own buffer position.
        return url.filename if url.filename else 'file.bin', BytesIO(url.body.getvalue()), url.mimetype

    # We want an embedded ressource
    if h not in url.resources_hashes:
        # Hash is unknown for this node.
        return None
    for mimetype, blobs in url.embedded_ressources.items():
        for ressource_h, blob in blobs:
            if ressource_h == h:
                # Embedded resources have no filename of their own.
                return 'embedded_ressource.bin', BytesIO(blob.getvalue()), mimetype
    return None
|
2020-09-03 14:39:38 +02:00
|
|
|
|
2021-02-02 15:23:25 +01:00
|
|
|
def __misp_add_vt_to_URLObject(self, obj: MISPObject) -> Optional[MISPObject]:
    '''Build a virustotal-report MISP object for the URL carried by *obj*,
    and reference it from *obj*. Returns None when VT has no report.'''
    url_attribute = obj.get_attributes_by_relation('url')[0]
    # Trigger the lookup (cached by the module), then fetch the stored report.
    self.vt.url_lookup(url_attribute.value)
    report = self.vt.get_url_lookup(url_attribute.value)
    if not report:
        return None
    vt_obj = MISPObject('virustotal-report', standalone=False)
    report_attributes = report['attributes']
    vt_obj.add_attribute('first-submission',
                         value=datetime.fromtimestamp(report_attributes['first_submission_date']),
                         disable_correlation=True)
    vt_obj.add_attribute('last-submission',
                         value=datetime.fromtimestamp(report_attributes['last_submission_date']),
                         disable_correlation=True)
    vt_obj.add_attribute('permalink',
                         value=f"https://www.virustotal.com/gui/url/{report['id']}/detection",
                         disable_correlation=True)
    obj.add_reference(vt_obj, 'analysed-with')
    return vt_obj
|
|
|
|
|
2021-09-15 15:42:02 +02:00
|
|
|
def __misp_add_urlscan_to_event(self, capture_uuid: str, visibility: str) -> Optional[MISPAttribute]:
    '''Submit the capture to urlscan.io and wrap the result permalink in a
    MISP link attribute. Returns None when the submission yields no result.'''
    response = self.urlscan.url_submit(self.get_info(capture_uuid), visibility)
    if 'result' not in response:
        return None
    attribute = MISPAttribute()
    attribute.value = response['result']
    attribute.type = 'link'
    return attribute
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def misp_export(self, capture_uuid: str, /, with_parent: bool=False) -> Union[List[MISPEvent], Dict[str, str]]:
    '''Export a capture in MISP format. You can POST the return of this method
    directly to a MISP instance and it will create an event.

    Returns a list of events (the parent chain first when with_parent is set),
    or a {'error': ...} dict when the capture is not cached yet.
    '''
    cache = self.capture_cache(capture_uuid)
    if not cache:
        return {'error': 'UUID missing in cache, try again later.'}

    event = self.misp.export(cache, self.is_public_instance)
    # Attach the landing-page screenshot to the event.
    screenshot: MISPAttribute = event.add_attribute('attachment', 'screenshot_landing_page.png',
                                                    data=self.get_screenshot(cache.uuid),
                                                    disable_correlation=True)  # type: ignore
    # If the last object attached to the event is a file, it is the rendered page
    if event.objects and event.objects[-1].name == 'file':
        event.objects[-1].add_reference(screenshot, 'rendered-as', 'Screenshot of the page')

    # Enrich every URL object with a VirusTotal report, when the module is enabled.
    if self.vt.available:
        for e_obj in event.objects:
            if e_obj.name != 'url':
                continue
            vt_obj = self.__misp_add_vt_to_URLObject(e_obj)
            if vt_obj:
                event.add_object(vt_obj)

    # Add Phishtank permalinks for URLs known to Phishtank, when enabled.
    if self.phishtank.available:
        for e_obj in event.objects:
            if e_obj.name != 'url':
                continue
            urls = e_obj.get_attributes_by_relation('url')
            if not urls:
                continue
            pt_entry = self.phishtank.get_url_lookup(urls[0].value)
            if not pt_entry or not pt_entry.get('phish_detail_url'):
                continue
            pt_attribute: MISPAttribute = event.add_attribute('link', value=pt_entry['phish_detail_url'], comment='Phishtank permalink')  # type: ignore
            e_obj.add_reference(pt_attribute, 'known-as', 'Permalink on Phishtank')

    # Submit to urlscan.io and link the result, when enabled.
    # Captures flagged no_index are submitted as 'unlisted'.
    if self.urlscan.available:
        urlscan_attribute = self.__misp_add_urlscan_to_event(
            capture_uuid,
            visibility='unlisted' if (cache and cache.no_index) else 'public')
        if urlscan_attribute:
            event.add_attribute(**urlscan_attribute)

    if with_parent and cache.parent:
        # Recursively export the parent chain; the current event extends the
        # most recent parent event and is appended at the end of the list.
        parent = self.misp_export(cache.parent, with_parent)
        if isinstance(parent, dict):
            # Something bad happened
            return parent

        event.extends_uuid = parent[-1].uuid
        parent.append(event)
        return parent

    return [event]
|
2020-12-07 20:50:46 +01:00
|
|
|
|
2021-08-25 13:36:48 +02:00
|
|
|
def get_misp_occurrences(self, capture_uuid: str, /) -> Optional[Dict[str, Set[str]]]:
    '''Look up the redirect chain of a capture in MISP and return, per event ID,
    the set of matching values. None when MISP is disabled or the tree is not cached.'''
    if not self.misp.available:
        return None
    try:
        ct = self.get_crawled_tree(capture_uuid)
    except LookylooException:
        self.logger.warning(f'Unable to get the modules responses unless the tree ({capture_uuid}) is cached.')
        return None
    # Look up the rendered node and every node on the path leading to it.
    nodes_to_lookup = ct.root_hartree.rendered_node.get_ancestors() + [ct.root_hartree.rendered_node]
    to_return: Dict[str, Set[str]] = defaultdict(set)
    for node in nodes_to_lookup:
        hits = self.misp.lookup(node, ct.root_hartree.get_host_node_by_uuid(node.hostnode_uuid))
        for event_id, values in hits.items():
            # Non-set entries in the response are metadata, not matches.
            if isinstance(values, set):
                to_return[event_id].update(values)
    return to_return
|
|
|
|
|
2021-12-02 17:55:02 +01:00
|
|
|
def get_hashes_with_context(self, tree_uuid: str, /, algorithm: str, *, urls_only: bool=False) -> Union[Dict[str, Set[str]], Dict[str, List[URLNode]]]:
    """Build (on demand) hashes for all the resources of the tree, using the algorithm
    provided by the user.

    If you just want the hashes in SHA512, use the get_hashes method: it gives you a
    list of hashes and they're built with the tree. This method computes the hashes
    when you query it, so it is slower."""
    ct = self.get_crawled_tree(tree_uuid)
    hashes = ct.root_hartree.build_all_hashes(algorithm)
    if not urls_only:
        return hashes
    # Collapse the URL nodes down to their URLs.
    return {h: {node.name for node in nodes} for h, nodes in hashes.items()}
|
|
|
|
|
2021-11-30 15:51:48 +01:00
|
|
|
def merge_hashlookup_tree(self, tree_uuid: str, /) -> Tuple[Dict[str, Dict[str, Any]], int]:
    '''Merge the hashlookup hits of a capture with the sha1 hashes of its tree.
    Returns the merged entries and the total number of distinct hashes in the tree.'''
    if not self.hashlookup.available:
        raise LookylooException('Hashlookup module not enabled.')
    hashes_tree = self.get_hashes_with_context(tree_uuid, algorithm='sha1')

    hashlookup_file = self._captures_index[tree_uuid].capture_dir / 'hashlookup.json'
    if not hashlookup_file.exists():
        # The module was never triggered for this capture: run it once now.
        ct = self.get_crawled_tree(tree_uuid)
        self.hashlookup.capture_default_trigger(ct, auto_trigger=False)

    if not hashlookup_file.exists():
        # no hits on hashlookup
        return {}, len(hashes_tree)

    with hashlookup_file.open() as f:
        hashlookup_entries = json.load(f)

    to_return: Dict[str, Dict[str, Any]] = defaultdict(dict)
    for sha1, entry in hashlookup_entries.items():
        to_return[sha1]['nodes'] = hashes_tree[sha1]
        to_return[sha1]['hashlookup'] = entry
    return to_return, len(hashes_tree)
|
2021-11-30 14:59:48 +01:00
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_hashes(self, tree_uuid: str, /, hostnode_uuid: Optional[str]=None, urlnode_uuid: Optional[str]=None) -> Set[str]:
    """Return hashes (sha512) of resources.
    Only tree_uuid: All the hashes
    tree_uuid and hostnode_uuid: hashes of all the resources in that hostnode (including embedded ressources)
    tree_uuid, hostnode_uuid, and urlnode_uuid: hash of the URL node body, and embedded resources
    """
    # Narrowest scope wins: URL node, then host node, then the whole tree.
    if urlnode_uuid:
        return get_resources_hashes(self.get_urlnode_from_tree(tree_uuid, urlnode_uuid))
    if hostnode_uuid:
        return get_resources_hashes(self.get_hostnode_from_tree(tree_uuid, hostnode_uuid))
    return get_resources_hashes(self.get_crawled_tree(tree_uuid))
|
|
|
|
|
2021-06-16 23:57:14 +02:00
|
|
|
def get_hostnames(self, tree_uuid: str, /, hostnode_uuid: Optional[str]=None, urlnode_uuid: Optional[str]=None) -> Set[str]:
    """Return all the unique hostnames:
    * of a complete tree if no hostnode_uuid and urlnode_uuid are given
    * of a HostNode if hostnode_uuid is given
    * of a URLNode if urlnode_uuid is given
    """
    # A URL node carries exactly one hostname.
    if urlnode_uuid:
        return {self.get_urlnode_from_tree(tree_uuid, urlnode_uuid).hostname}
    # A host node's name is its hostname.
    if hostnode_uuid:
        return {self.get_hostnode_from_tree(tree_uuid, hostnode_uuid).name}
    # Whole tree: collect the name of every node in the hostname tree.
    ct = self.get_crawled_tree(tree_uuid)
    return {n.name for n in ct.root_hartree.hostname_tree.traverse()}
|
|
|
|
|
|
|
|
def get_urls(self, tree_uuid: str, /, hostnode_uuid: Optional[str]=None, urlnode_uuid: Optional[str]=None) -> Set[str]:
    """Return all the unique URLs:
    * of a complete tree if no hostnode_uuid and urlnode_uuid are given
    * of a HostNode if hostnode_uuid is given
    * of a URLNode if urlnode_uuid is given
    """
    # A URL node's name is its URL.
    if urlnode_uuid:
        return {self.get_urlnode_from_tree(tree_uuid, urlnode_uuid).name}
    # All the URLs requested under a given host node.
    if hostnode_uuid:
        host = self.get_hostnode_from_tree(tree_uuid, hostnode_uuid)
        return {u.name for u in host.urls}
    # Whole tree: every node of the URL tree.
    ct = self.get_crawled_tree(tree_uuid)
    return {n.name for n in ct.root_hartree.url_tree.traverse()}
|
|
|
|
|
2021-05-18 02:08:43 +02:00
|
|
|
def get_hostnode_investigator(self, capture_uuid: str, /, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
    '''Gather all the informations needed to display the Hostnode investigator popup.

    Returns the host node itself and, for each URL under it, a dict with display
    details (encryption, path, indexing frequencies, known/legitimate content,
    cookies sent/received).
    '''

    def normalize_known_content(h: str, /, known_content: Dict[str, Any], url: URLNode) -> Tuple[Optional[Union[str, List[Any]]], Optional[Tuple[bool, Any]]]:
        ''' There are a few different sources to figure out known vs. legitimate content,
        this method normalize it for the web interface.'''
        known: Optional[Union[str, List[Any]]] = None
        legitimate: Optional[Tuple[bool, Any]] = None
        if h not in known_content:
            return known, legitimate

        if known_content[h]['type'] in ['generic', 'sanejs']:
            # Generic/sanejs entries only carry descriptive details.
            known = known_content[h]['details']
        elif known_content[h]['type'] == 'legitimate_on_domain':
            # Legitimate only when served from one of the expected domains.
            legit = False
            if url.hostname in known_content[h]['details']:
                legit = True
            legitimate = (legit, known_content[h]['details'])
        elif known_content[h]['type'] == 'malicious':
            legitimate = (False, known_content[h]['details'])

        return known, legitimate

    ct = self.get_crawled_tree(capture_uuid)
    hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)

    known_content = self.context.find_known_content(hostnode)
    # Side effect: populate the whois info for this host node.
    self.uwhois.query_whois_hostnode(hostnode)

    urls: List[Dict[str, Any]] = []
    for url in hostnode.urls:
        # For the popup, we need:
        # * https vs http
        # * everything after the domain
        # * the full URL
        to_append: Dict[str, Any] = {
            'encrypted': url.name.startswith('https'),
            'url_path': url.name.split('/', 3)[-1],
            'url_object': url,
        }

        if not url.empty_response:
            # Index lookup
            # %%% Full body %%%
            freq = self.indexing.body_hash_fequency(url.body_hash)
            to_append['body_hash_details'] = freq
            if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1:
                # Seen more than once: find the other captures with this body.
                to_append['body_hash_details']['other_captures'] = self.hash_lookup(url.body_hash, url.name, capture_uuid)

            # %%% Embedded ressources %%%
            if hasattr(url, 'embedded_ressources') and url.embedded_ressources:
                to_append['embedded_ressources'] = {}
                for mimetype, blobs in url.embedded_ressources.items():
                    for h, blob in blobs:
                        if h in to_append['embedded_ressources']:
                            # Skip duplicates
                            continue
                        freq_embedded = self.indexing.body_hash_fequency(h)
                        to_append['embedded_ressources'][h] = freq_embedded
                        to_append['embedded_ressources'][h]['body_size'] = blob.getbuffer().nbytes
                        to_append['embedded_ressources'][h]['type'] = mimetype
                        if freq_embedded['hash_freq'] > 1:
                            to_append['embedded_ressources'][h]['other_captures'] = self.hash_lookup(h, url.name, capture_uuid)
                # Tag each embedded resource as known and/or (il)legitimate.
                for h in to_append['embedded_ressources'].keys():
                    known, legitimate = normalize_known_content(h, known_content, url)
                    if known:
                        to_append['embedded_ressources'][h]['known_content'] = known
                    elif legitimate:
                        to_append['embedded_ressources'][h]['legitimacy'] = legitimate

            # Same tagging for the full body.
            known, legitimate = normalize_known_content(url.body_hash, known_content, url)
            if known:
                to_append['known_content'] = known
            elif legitimate:
                to_append['legitimacy'] = legitimate

        # Optional: Cookies sent to server in request -> map to nodes who set the cookie in response
        if hasattr(url, 'cookies_sent'):
            to_display_sent: Dict[str, Set[Iterable[Optional[str]]]] = defaultdict(set)
            for cookie, contexts in url.cookies_sent.items():
                if not contexts:
                    # Locally created?
                    to_display_sent[cookie].add(('Unknown origin', ))
                    continue
                for context in contexts:
                    to_display_sent[cookie].add((context['setter'].hostname, context['setter'].hostnode_uuid))
            to_append['cookies_sent'] = to_display_sent

        # Optional: Cookies received from server in response -> map to nodes who send the cookie in request
        if hasattr(url, 'cookies_received'):
            to_display_received: Dict[str, Dict[str, Set[Iterable[Optional[str]]]]] = {'3rd_party': defaultdict(set), 'sent': defaultdict(set), 'not_sent': defaultdict(set)}
            for domain, c_received, is_3rd_party in url.cookies_received:
                if c_received not in ct.root_hartree.cookies_sent:
                    # This cookie is never sent.
                    if is_3rd_party:
                        to_display_received['3rd_party'][c_received].add((domain, ))
                    else:
                        to_display_received['not_sent'][c_received].add((domain, ))
                    continue

                # The cookie is sent somewhere: map it to every sender node.
                for url_node in ct.root_hartree.cookies_sent[c_received]:
                    if is_3rd_party:
                        to_display_received['3rd_party'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
                    else:
                        to_display_received['sent'][c_received].add((url_node.hostname, url_node.hostnode_uuid))
            to_append['cookies_received'] = to_display_received

        urls.append(to_append)
    return hostnode, urls
|
2020-11-24 16:46:01 +01:00
|
|
|
|
2020-11-27 16:27:29 +01:00
|
|
|
def get_stats(self) -> Dict[str, List]:
    '''Gather statistics about the lookyloo instance

    Returns {'weeks': [...], 'years': [...]}: per-week stats for the current
    and previous ISO week, and per-year/per-month submission stats.
    '''
    today = date.today()
    calendar_week = today.isocalendar()[1]

    # Template of the counters tracked per month and per week.
    stats_dict = {'submissions': 0, 'submissions_with_redirects': 0, 'redirects': 0}
    stats: Dict[int, Dict[int, Dict[str, Any]]] = {}
    weeks_stats: Dict[int, Dict] = {}

    for cache in self.sorted_capture_cache():
        date_submission: datetime = cache.timestamp

        # --- Monthly buckets (year -> month) ---
        if date_submission.year not in stats:
            stats[date_submission.year] = {}
        if date_submission.month not in stats[date_submission.year]:
            stats[date_submission.year][date_submission.month] = defaultdict(dict, **stats_dict)
            stats[date_submission.year][date_submission.month]['uniq_urls'] = set()
        stats[date_submission.year][date_submission.month]['submissions'] += 1
        stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url)
        if len(cache.redirects) > 0:
            stats[date_submission.year][date_submission.month]['submissions_with_redirects'] += 1
            stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects)
            stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects)

        # --- Weekly buckets: current and previous ISO week only.
        # The second clause handles the year boundary: when today is in ISO
        # week 1, the previous week (52 or 53) belongs to the previous year.
        if ((date_submission.year == today.year and calendar_week - 1 <= date_submission.isocalendar()[1] <= calendar_week)
                or (calendar_week == 1 and date_submission.year == today.year - 1 and date_submission.isocalendar()[1] in [52, 53])):
            if date_submission.isocalendar()[1] not in weeks_stats:
                weeks_stats[date_submission.isocalendar()[1]] = defaultdict(dict, **stats_dict)
                weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'] = set()
            weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1
            weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url)
            if len(cache.redirects) > 0:
                weeks_stats[date_submission.isocalendar()[1]]['submissions_with_redirects'] += 1
                weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects)
                weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)

    # Flatten the buckets into the serializable structure the UI expects.
    statistics: Dict[str, List] = {'weeks': [], 'years': []}
    for week_number in sorted(weeks_stats.keys()):
        week_stat = weeks_stats[week_number]
        # Replace the URL set by its cardinality (and the domain count).
        urls = week_stat.pop('uniq_urls')
        week_stat['week_number'] = week_number
        week_stat['uniq_urls'] = len(urls)
        week_stat['uniq_domains'] = len(uniq_domains(urls))
        statistics['weeks'].append(week_stat)

    for year in sorted(stats.keys()):
        year_stats: Dict[str, Union[int, List]] = {'year': year, 'months': [], 'yearly_submissions': 0, 'yearly_redirects': 0}
        for month in sorted(stats[year].keys()):
            month_stats = stats[year][month]
            urls = month_stats.pop('uniq_urls')
            month_stats['month_number'] = month
            month_stats['uniq_urls'] = len(urls)
            month_stats['uniq_domains'] = len(uniq_domains(urls))
            year_stats['months'].append(month_stats)  # type: ignore

            # Yearly totals are accumulated from the monthly counters.
            year_stats['yearly_submissions'] += month_stats['submissions']
            year_stats['yearly_redirects'] += month_stats['redirects']
        statistics['years'].append(year_stats)
    return statistics
|