chg: Simplify tracing the exceptions related to a specific capture.

pull/640/head
Raphaël Vinot 2023-03-16 13:49:22 +01:00
parent afd383cfc3
commit d970c924f8
2 changed files with 43 additions and 26 deletions

View File

@@ -14,9 +14,9 @@ import time
 from collections.abc import Mapping
 from datetime import datetime
 from functools import lru_cache
-from logging import Logger
+from logging import Logger, LoggerAdapter
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, Set
+from typing import Any, Dict, List, Optional, Tuple, Union, Set, MutableMapping
 
 import dns.rdatatype
 import dns.resolver
@@ -32,19 +32,31 @@ from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, Tr
 from .modules import Cloudflare
 
 
+class LookylooCacheLogAdapter(LoggerAdapter):
+    """
+    Prepend log entry with the UUID of the capture
+    """
+
+    def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> Tuple[str, MutableMapping[str, Any]]:
+        if self.extra:
+            return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
+        return msg, kwargs
+
+
 class CaptureCache():
     __slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir',
                  'error', 'incomplete_redirects', 'no_index', 'categories', 'parent',
                  'user_agent', 'referer', 'logger')
 
     def __init__(self, cache_entry: Dict[str, Any]):
-        self.logger = logging.getLogger(f'{self.__class__.__name__}')
-        self.logger.setLevel(get_config('generic', 'loglevel'))
+        logger = logging.getLogger(f'{self.__class__.__name__}')
+        logger.setLevel(get_config('generic', 'loglevel'))
         __default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp',
                                                                      'url', 'redirects', 'capture_dir')
         if 'uuid' not in cache_entry or 'capture_dir' not in cache_entry:
             raise LookylooException(f'The capture is deeply broken: {cache_entry}')
         self.uuid: str = cache_entry['uuid']
+        self.logger = LookylooCacheLogAdapter(logger, {'uuid': self.uuid})
         self.capture_dir: Path = Path(cache_entry['capture_dir'])
 
         if all(key in cache_entry.keys() for key in __default_cache_keys):
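
For readers unfamiliar with logging.LoggerAdapter: the adapter wraps a regular Logger and runs every message through process() before it is emitted, which is how the UUID prefix gets added. A minimal, standalone sketch of that behaviour (UUIDLogAdapter and the demo UUID are illustrative names, not part of Lookyloo):

import logging
from logging import LoggerAdapter
from typing import Any, MutableMapping, Tuple


class UUIDLogAdapter(LoggerAdapter):
    # Hypothetical stand-in for LookylooCacheLogAdapter above.
    def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> Tuple[str, MutableMapping[str, Any]]:
        if self.extra:
            return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
        return msg, kwargs


logging.basicConfig(level=logging.DEBUG)
adapter = UUIDLogAdapter(logging.getLogger('capturecache'), {'uuid': 'demo-uuid-0001'})
adapter.warning('Unable to resolve DNS: timeout')
# Expected output: WARNING:capturecache:[demo-uuid-0001] Unable to resolve DNS: timeout
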
@@ -58,7 +70,7 @@ class CaptureCache():
             if cache_entry.get('redirects'):
                 self.redirects: List[str] = json.loads(cache_entry['redirects'])
             else:
-                self.logger.info(f'No redirects in cache for {self.uuid}')
+                self.logger.debug('No redirects in cache')
                 self.redirects = []
         if not self.capture_dir.exists():
             raise MissingCaptureDirectory(f'The capture {self.uuid} does not exists in {self.capture_dir}.')
@@ -104,7 +116,7 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> C
     except EOFError:
         remove_pickle_tree(capture_dir)
     except Exception:
-        logger.exception('Unexpected exception when unpickling')
+        logger.exception('Unexpected exception when unpickling.')
         remove_pickle_tree(capture_dir)
 
     if tree:
@@ -241,7 +253,7 @@ class CapturesIndex(Mapping):
             raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
         raise MissingUUID(f'Unable to find UUID {uuid}.')
 
-    def _create_pickle(self, capture_dir: Path) -> CrawledTree:
+    def _create_pickle(self, capture_dir: Path, logger: LookylooCacheLogAdapter) -> CrawledTree:
         with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
@@ -254,14 +266,14 @@ class CapturesIndex(Mapping):
             # The pickle is being created somewhere else, wait until it's done.
             while is_locked(capture_dir):
                 time.sleep(5)
-            return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, self.logger)
+            return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger)
 
         if not (har_files := sorted(capture_dir.glob('*.har'))):
             har_files = sorted(capture_dir.glob('*.har.gz'))
         try:
             with self._timeout_context():
                 tree = CrawledTree(har_files, uuid)
-                self.__resolve_dns(tree)
+                self.__resolve_dns(tree, logger)
                 if self.contextualizer:
                     self.contextualizer.contextualize_tree(tree)
         except Har2TreeError as e:
@@ -270,7 +282,7 @@ class CapturesIndex(Mapping):
                 har_file.rename(har_file.with_suffix('.broken'))
             raise NoValidHarFile(f'We got har files, but they are broken: {e}')
         except TimeoutError:
-            self.logger.warning(f'Unable to rebuild the tree for {capture_dir}, the tree took too long.')
+            logger.warning(f'Unable to rebuild the tree for {capture_dir}, the tree took too long.')
             for har_file in har_files:
                 har_file.rename(har_file.with_suffix('.broken'))
             raise NoValidHarFile(f'We got har files, but creating a tree took more than {self.timeout}s.')
@@ -319,16 +331,17 @@ class CapturesIndex(Mapping):
         with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
 
+        logger = LookylooCacheLogAdapter(self.logger, {'uuid': uuid})
         try:
-            tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, self.logger)
+            tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger)
         except NoValidHarFile:
-            self.logger.debug('Unable to rebuild the tree, the HAR files are broken.')
+            logger.debug('Unable to rebuild the tree, the HAR files are broken.')
         except TreeNeedsRebuild:
             try:
-                tree = self._create_pickle(capture_dir)
+                tree = self._create_pickle(capture_dir, logger)
                 self.indexing.new_internal_uuids(tree)
             except NoValidHarFile:
-                self.logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.')
+                logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.')
                 tree = None
 
         cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str}
@@ -368,7 +381,7 @@ class CapturesIndex(Mapping):
                 and isinstance(cache['error'], str)
                 and 'HTTP Error' not in cache['error']
                 and "No har files in" not in cache['error']):
-            self.logger.info(cache['error'])
+            logger.info(cache['error'])
 
         if (capture_dir / 'categories').exists():
             with (capture_dir / 'categories').open() as _categories:
@@ -395,7 +408,7 @@ class CapturesIndex(Mapping):
             p.execute()
         return CaptureCache(cache)
 
-    def __resolve_dns(self, ct: CrawledTree):
+    def __resolve_dns(self, ct: CrawledTree, logger: LookylooCacheLogAdapter):
         '''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
         and store them in ips.json and cnames.json, in the capture directory.
         Updates the nodes of the tree accordingly so the information is available.
@@ -458,7 +471,7 @@ class CapturesIndex(Mapping):
                 try:
                     response = dns.resolver.resolve(node.name, query_type, search=True, raise_on_no_answer=False)
                 except Exception as e:
-                    self.logger.warning(f'Unable to resolve DNS: {e}')
+                    logger.warning(f'Unable to resolve DNS: {e}')
                     continue
                 for answer in response.response.answer:
                     name_to_cache = str(answer.name).rstrip('.')
@@ -507,7 +520,7 @@ class CapturesIndex(Mapping):
             try:
                 self.ipasnhistory.mass_cache(ips)
             except Exception as e:
-                self.logger.warning(f'Unable to submit IPs to IPASNHistory: {e}')
+                logger.warning(f'Unable to submit IPs to IPASNHistory: {e}')
             else:
                 time.sleep(2)
                 ipasn_responses = self.ipasnhistory.mass_query(ips)
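
The remaining hunks in this file are mechanical: the cache-building code creates one LookylooCacheLogAdapter where the capture UUID is read from disk, then hands it to load_pickle_tree, _create_pickle and __resolve_dns in place of the class-wide self.logger, so every warning or exception raised while rebuilding a capture's tree carries that UUID. A rough, self-contained sketch of this hand-off pattern (all names below are hypothetical, not Lookyloo APIs):

import logging
from logging import Logger, LoggerAdapter
from typing import Any, MutableMapping, Tuple


class PerCaptureAdapter(LoggerAdapter):
    # Same idea as LookylooCacheLogAdapter: prefix every record with the capture UUID.
    def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> Tuple[str, MutableMapping[str, Any]]:
        if self.extra:
            return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
        return msg, kwargs


def resolve_dns(logger: LoggerAdapter) -> None:
    # Helpers receive the adapter; they never need to know which capture they serve.
    logger.warning('Unable to resolve DNS: timeout')


def build_cache(uuid: str, parent_logger: Logger) -> None:
    # One adapter per capture, created where the UUID is known...
    logger = PerCaptureAdapter(parent_logger, {'uuid': uuid})
    # ...and handed down to every helper involved in building this capture.
    resolve_dns(logger)


logging.basicConfig(level=logging.INFO)
build_cache('demo-uuid-0002', logging.getLogger('captures'))
# Expected output: WARNING:captures:[demo-uuid-0002] Unable to resolve DNS: timeout
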

View File

@@ -323,24 +323,28 @@ class Lookyloo():
         if not cache:
             self.logger.warning(f'Unable to get the modules responses unless the capture {capture_uuid} is cached')
             return None
+        if not hasattr(cache, 'url'):
+            self.logger.warning(f'The capture {capture_uuid} does not have a URL in the cache, it is broken.')
+            return None
+
         to_return: Dict[str, Any] = {}
         if self.vt.available:
             to_return['vt'] = {}
-            if cache.redirects:
+            if hasattr(cache, 'redirects') and cache.redirects:
                 for redirect in cache.redirects:
                     to_return['vt'][redirect] = self.vt.get_url_lookup(redirect)
             else:
                 to_return['vt'][cache.url] = self.vt.get_url_lookup(cache.url)
         if self.pi.available:
             to_return['pi'] = {}
-            if cache.redirects:
+            if hasattr(cache, 'redirects') and cache.redirects:
                 for redirect in cache.redirects:
                     to_return['pi'][redirect] = self.pi.get_url_lookup(redirect)
             else:
                 to_return['pi'][cache.url] = self.pi.get_url_lookup(cache.url)
         if self.phishtank.available:
             to_return['phishtank'] = {'urls': {}, 'ips_hits': {}}
-            if cache.redirects:
+            if hasattr(cache, 'redirects') and cache.redirects:
                 for redirect in cache.redirects:
                     to_return['phishtank']['urls'][redirect] = self.phishtank.get_url_lookup(redirect)
             else:
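
On why the new hasattr() guards are needed (my reading of the change, not stated in the commit): CaptureCache declares __slots__ and only assigns url and redirects when the cache entry is complete, so on a broken capture cache.redirects raises AttributeError rather than returning an empty value. A tiny standalone illustration:

class SlottedCache:
    # Stand-in for CaptureCache: slots are declared, but not every one is assigned.
    __slots__ = ('uuid', 'url', 'redirects')

    def __init__(self, uuid: str) -> None:
        self.uuid = uuid  # 'url' and 'redirects' deliberately left unset


cache = SlottedCache('demo-uuid-0003')
print(hasattr(cache, 'redirects'))  # False: the slot exists but was never assigned
if hasattr(cache, 'redirects') and cache.redirects:
    print('capture has redirects')
else:
    print('no redirects recorded for this capture')
# Without the hasattr() guard, `cache.redirects` would raise AttributeError here.
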
@@ -350,7 +354,7 @@ class Lookyloo():
             to_return['phishtank']['ips_hits'] = ips_hits
         if self.urlhaus.available:
             to_return['urlhaus'] = {'urls': {}}
-            if cache.redirects:
+            if hasattr(cache, 'redirects') and cache.redirects:
                 for redirect in cache.redirects:
                     to_return['urlhaus']['urls'][redirect] = self.urlhaus.get_url_lookup(redirect)
             else:
@@ -379,7 +383,7 @@ class Lookyloo():
         if self.riskiq.available:
             try:
                 self.riskiq.capture_default_trigger(cache)
-                if cache.redirects:
+                if hasattr(cache, 'redirects') and cache.redirects:
                     hostname = urlparse(cache.redirects[-1]).hostname
                 else:
                     hostname = urlparse(cache.url).hostname
@@ -675,7 +679,7 @@ class Lookyloo():
             initial_url = defang(cache.url, colon=True, all_dots=True)
         else:
             initial_url = cache.url
-        if cache.redirects:
+        if hasattr(cache, 'redirects') and cache.redirects:
             redirects = "Redirects:\n"
             if email_config['defang_urls']:
                 redirects += defang('\n'.join(cache.redirects), colon=True, all_dots=True)
@@ -1270,7 +1274,7 @@ class Lookyloo():
                 stats[date_submission.year][date_submission.month]['uniq_urls'] = set()
             stats[date_submission.year][date_submission.month]['submissions'] += 1
             stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url)
-            if len(cache.redirects) > 0:
+            if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
                 stats[date_submission.year][date_submission.month]['submissions_with_redirects'] += 1
                 stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects)
                 stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects)
@@ -1282,7 +1286,7 @@ class Lookyloo():
                 weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'] = set()
             weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1
             weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url)
-            if len(cache.redirects) > 0:
+            if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
                 weeks_stats[date_submission.isocalendar()[1]]['submissions_with_redirects'] += 1
                 weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects)
                 weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)