chg: Simplify tracing of exceptions related to a specific capture.

pull/640/head
Raphaël Vinot 2023-03-16 13:49:22 +01:00
parent afd383cfc3
commit d970c924f8
2 changed files with 43 additions and 26 deletions


@@ -14,9 +14,9 @@ import time
from collections.abc import Mapping
from datetime import datetime
from functools import lru_cache
from logging import Logger
from logging import Logger, LoggerAdapter
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, Set
from typing import Any, Dict, List, Optional, Tuple, Union, Set, MutableMapping
import dns.rdatatype
import dns.resolver
@@ -32,19 +32,31 @@ from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, Tr
from .modules import Cloudflare
class LookylooCacheLogAdapter(LoggerAdapter):
"""
Prepend log entry with the UUID of the capture
"""
def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> Tuple[str, MutableMapping[str, Any]]:
if self.extra:
return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
return msg, kwargs
class CaptureCache():
__slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir',
'error', 'incomplete_redirects', 'no_index', 'categories', 'parent',
'user_agent', 'referer', 'logger')
def __init__(self, cache_entry: Dict[str, Any]):
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(get_config('generic', 'loglevel'))
logger = logging.getLogger(f'{self.__class__.__name__}')
logger.setLevel(get_config('generic', 'loglevel'))
__default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp',
'url', 'redirects', 'capture_dir')
if 'uuid' not in cache_entry or 'capture_dir' not in cache_entry:
raise LookylooException(f'The capture is deeply broken: {cache_entry}')
self.uuid: str = cache_entry['uuid']
self.logger = LookylooCacheLogAdapter(logger, {'uuid': self.uuid})
self.capture_dir: Path = Path(cache_entry['capture_dir'])
if all(key in cache_entry.keys() for key in __default_cache_keys):
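For illustration, a minimal runnable sketch (not part of this commit) of the LoggerAdapter pattern introduced above: every message logged through the adapter is prefixed with the capture UUID, which is what makes exceptions traceable to a specific capture. The adapter name and UUID below are hypothetical; only the standard logging module is assumed.

import logging
from typing import Any, MutableMapping, Tuple

class UUIDLogAdapter(logging.LoggerAdapter):
    # Same idea as LookylooCacheLogAdapter: prepend the capture UUID to every record.
    def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> Tuple[str, MutableMapping[str, Any]]:
        if self.extra:
            return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
        return msg, kwargs

logging.basicConfig(level=logging.INFO)
logger = UUIDLogAdapter(logging.getLogger('CaptureCache'), {'uuid': 'example-uuid'})
logger.warning('Unable to resolve DNS: timeout')
# Prints: WARNING:CaptureCache:[example-uuid] Unable to resolve DNS: timeout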
@@ -58,7 +70,7 @@ class CaptureCache():
if cache_entry.get('redirects'):
self.redirects: List[str] = json.loads(cache_entry['redirects'])
else:
self.logger.info(f'No redirects in cache for {self.uuid}')
self.logger.debug('No redirects in cache')
self.redirects = []
if not self.capture_dir.exists():
raise MissingCaptureDirectory(f'The capture {self.uuid} does not exist in {self.capture_dir}.')
@@ -104,7 +116,7 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> C
except EOFError:
remove_pickle_tree(capture_dir)
except Exception:
logger.exception('Unexpected exception when unpickling')
logger.exception('Unexpected exception when unpickling.')
remove_pickle_tree(capture_dir)
if tree:
@@ -241,7 +253,7 @@ class CapturesIndex(Mapping):
raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
raise MissingUUID(f'Unable to find UUID {uuid}.')
def _create_pickle(self, capture_dir: Path) -> CrawledTree:
def _create_pickle(self, capture_dir: Path, logger: LookylooCacheLogAdapter) -> CrawledTree:
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
@@ -254,14 +266,14 @@ class CapturesIndex(Mapping):
# The pickle is being created somewhere else, wait until it's done.
while is_locked(capture_dir):
time.sleep(5)
return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, self.logger)
return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger)
if not (har_files := sorted(capture_dir.glob('*.har'))):
har_files = sorted(capture_dir.glob('*.har.gz'))
try:
with self._timeout_context():
tree = CrawledTree(har_files, uuid)
self.__resolve_dns(tree)
self.__resolve_dns(tree, logger)
if self.contextualizer:
self.contextualizer.contextualize_tree(tree)
except Har2TreeError as e:
@@ -270,7 +282,7 @@ class CapturesIndex(Mapping):
har_file.rename(har_file.with_suffix('.broken'))
raise NoValidHarFile(f'We got har files, but they are broken: {e}')
except TimeoutError:
self.logger.warning(f'Unable to rebuild the tree for {capture_dir}, the tree took too long.')
logger.warning(f'Unable to rebuild the tree for {capture_dir}, the tree took too long.')
for har_file in har_files:
har_file.rename(har_file.with_suffix('.broken'))
raise NoValidHarFile(f'We got har files, but creating a tree took more than {self.timeout}s.')
@@ -319,16 +331,17 @@ class CapturesIndex(Mapping):
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
logger = LookylooCacheLogAdapter(self.logger, {'uuid': uuid})
try:
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, self.logger)
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger)
except NoValidHarFile:
self.logger.debug('Unable to rebuild the tree, the HAR files are broken.')
logger.debug('Unable to rebuild the tree, the HAR files are broken.')
except TreeNeedsRebuild:
try:
tree = self._create_pickle(capture_dir)
tree = self._create_pickle(capture_dir, logger)
self.indexing.new_internal_uuids(tree)
except NoValidHarFile:
self.logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.')
logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.')
tree = None
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str}
@@ -368,7 +381,7 @@ class CapturesIndex(Mapping):
and isinstance(cache['error'], str)
and 'HTTP Error' not in cache['error']
and "No har files in" not in cache['error']):
self.logger.info(cache['error'])
logger.info(cache['error'])
if (capture_dir / 'categories').exists():
with (capture_dir / 'categories').open() as _categories:
@@ -395,7 +408,7 @@ class CapturesIndex(Mapping):
p.execute()
return CaptureCache(cache)
def __resolve_dns(self, ct: CrawledTree):
def __resolve_dns(self, ct: CrawledTree, logger: LookylooCacheLogAdapter):
'''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
and stores them in ips.json and cnames.json in the capture directory.
Updates the nodes of the tree accordingly so the information is available.
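As context for the DNS hunks, a minimal sketch of resolving one hostname with dnspython (which this file already imports) and keeping only A, AAAA, and CNAME answers. The hostname and the bookkeeping dicts are illustrative assumptions, not the exact code of __resolve_dns:

import json
import dns.rdatatype
import dns.resolver

hostname = 'www.example.com'  # hypothetical node name
ips = {}
cnames = {}
for query_type in ('A', 'AAAA'):
    try:
        response = dns.resolver.resolve(hostname, query_type, search=True, raise_on_no_answer=False)
    except Exception as e:
        print(f'Unable to resolve DNS: {e}')
        continue
    for answer in response.response.answer:
        name = str(answer.name).rstrip('.')
        if answer.rdtype == dns.rdatatype.CNAME:
            cnames[name] = str(answer[0].target).rstrip('.')
        else:
            ips.setdefault(name, []).extend(str(rdata) for rdata in answer)

# The capture directory would persist mappings like these as ips.json and cnames.json.
print(json.dumps({'ips': ips, 'cnames': cnames}, indent=2))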
@@ -458,7 +471,7 @@ class CapturesIndex(Mapping):
try:
response = dns.resolver.resolve(node.name, query_type, search=True, raise_on_no_answer=False)
except Exception as e:
self.logger.warning(f'Unable to resolve DNS: {e}')
logger.warning(f'Unable to resolve DNS: {e}')
continue
for answer in response.response.answer:
name_to_cache = str(answer.name).rstrip('.')
@@ -507,7 +520,7 @@ class CapturesIndex(Mapping):
try:
self.ipasnhistory.mass_cache(ips)
except Exception as e:
self.logger.warning(f'Unable to submit IPs to IPASNHistory: {e}')
logger.warning(f'Unable to submit IPs to IPASNHistory: {e}')
else:
time.sleep(2)
ipasn_responses = self.ipasnhistory.mass_query(ips)


@@ -323,24 +323,28 @@ class Lookyloo():
if not cache:
self.logger.warning(f'Unable to get the modules responses unless the capture {capture_uuid} is cached')
return None
if not hasattr(cache, 'url'):
self.logger.warning(f'The capture {capture_uuid} does not have a URL in the cache, it is broken.')
return None
to_return: Dict[str, Any] = {}
if self.vt.available:
to_return['vt'] = {}
if cache.redirects:
if hasattr(cache, 'redirects') and cache.redirects:
for redirect in cache.redirects:
to_return['vt'][redirect] = self.vt.get_url_lookup(redirect)
else:
to_return['vt'][cache.url] = self.vt.get_url_lookup(cache.url)
if self.pi.available:
to_return['pi'] = {}
if cache.redirects:
if hasattr(cache, 'redirects') and cache.redirects:
for redirect in cache.redirects:
to_return['pi'][redirect] = self.pi.get_url_lookup(redirect)
else:
to_return['pi'][cache.url] = self.pi.get_url_lookup(cache.url)
if self.phishtank.available:
to_return['phishtank'] = {'urls': {}, 'ips_hits': {}}
if cache.redirects:
if hasattr(cache, 'redirects') and cache.redirects:
for redirect in cache.redirects:
to_return['phishtank']['urls'][redirect] = self.phishtank.get_url_lookup(redirect)
else:
@@ -350,7 +354,7 @@ class Lookyloo():
to_return['phishtank']['ips_hits'] = ips_hits
if self.urlhaus.available:
to_return['urlhaus'] = {'urls': {}}
if cache.redirects:
if hasattr(cache, 'redirects') and cache.redirects:
for redirect in cache.redirects:
to_return['urlhaus']['urls'][redirect] = self.urlhaus.get_url_lookup(redirect)
else:
@@ -379,7 +383,7 @@ class Lookyloo():
if self.riskiq.available:
try:
self.riskiq.capture_default_trigger(cache)
if cache.redirects:
if hasattr(cache, 'redirects') and cache.redirects:
hostname = urlparse(cache.redirects[-1]).hostname
else:
hostname = urlparse(cache.url).hostname
@@ -675,7 +679,7 @@ class Lookyloo():
initial_url = defang(cache.url, colon=True, all_dots=True)
else:
initial_url = cache.url
if cache.redirects:
if hasattr(cache, 'redirects') and cache.redirects:
redirects = "Redirects:\n"
if email_config['defang_urls']:
redirects += defang('\n'.join(cache.redirects), colon=True, all_dots=True)
@@ -1270,7 +1274,7 @@ class Lookyloo():
stats[date_submission.year][date_submission.month]['uniq_urls'] = set()
stats[date_submission.year][date_submission.month]['submissions'] += 1
stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url)
if len(cache.redirects) > 0:
if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
stats[date_submission.year][date_submission.month]['submissions_with_redirects'] += 1
stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects)
stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects)
@@ -1282,7 +1286,7 @@ class Lookyloo():
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'] = set()
weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url)
if len(cache.redirects) > 0:
if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
weeks_stats[date_submission.isocalendar()[1]]['submissions_with_redirects'] += 1
weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects)
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)
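
A closing note on the repeated hasattr(cache, 'redirects') guards added in this file: CaptureCache declares __slots__ and only assigns attributes such as url and redirects when the cache entry contains the default keys, so reading an unassigned slot raises AttributeError instead of returning None. A minimal sketch of that behaviour, using a hypothetical Example class:

from typing import List, Optional

class Example:
    __slots__ = ('uuid', 'redirects')

    def __init__(self, uuid: str, redirects: Optional[List[str]] = None):
        self.uuid = uuid
        if redirects is not None:
            self.redirects = redirects  # left unset when the entry is incomplete

broken = Example('example-uuid')
# broken.redirects would raise AttributeError: 'Example' object has no attribute 'redirects'
if hasattr(broken, 'redirects') and broken.redirects:
    print(broken.redirects)
else:
    print('no redirects available')  # this branch runs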