From d970c924f8037c064fa000a34b6317aeee22ba87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Thu, 16 Mar 2023 13:49:22 +0100 Subject: [PATCH] chg: Simplify tracing the exceptions related to a specific capture. --- lookyloo/capturecache.py | 49 +++++++++++++++++++++++++--------------- lookyloo/lookyloo.py | 20 +++++++++------- 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py index 2ede828..12416dc 100644 --- a/lookyloo/capturecache.py +++ b/lookyloo/capturecache.py @@ -14,9 +14,9 @@ import time from collections.abc import Mapping from datetime import datetime from functools import lru_cache -from logging import Logger +from logging import Logger, LoggerAdapter from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, Set +from typing import Any, Dict, List, Optional, Tuple, Union, Set, MutableMapping import dns.rdatatype import dns.resolver @@ -32,19 +32,31 @@ from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, Tr from .modules import Cloudflare +class LookylooCacheLogAdapter(LoggerAdapter): + """ + Prepend log entry with the UUID of the capture + """ + def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> Tuple[str, MutableMapping[str, Any]]: + if self.extra: + return '[{}] {}'.format(self.extra['uuid'], msg), kwargs + return msg, kwargs + + class CaptureCache(): __slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir', 'error', 'incomplete_redirects', 'no_index', 'categories', 'parent', 'user_agent', 'referer', 'logger') def __init__(self, cache_entry: Dict[str, Any]): - self.logger = logging.getLogger(f'{self.__class__.__name__}') - self.logger.setLevel(get_config('generic', 'loglevel')) + logger = logging.getLogger(f'{self.__class__.__name__}') + logger.setLevel(get_config('generic', 'loglevel')) __default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir') if 'uuid' not in cache_entry or 'capture_dir' not in cache_entry: raise LookylooException(f'The capture is deeply broken: {cache_entry}') self.uuid: str = cache_entry['uuid'] + self.logger = LookylooCacheLogAdapter(logger, {'uuid': self.uuid}) + self.capture_dir: Path = Path(cache_entry['capture_dir']) if all(key in cache_entry.keys() for key in __default_cache_keys): @@ -58,7 +70,7 @@ class CaptureCache(): if cache_entry.get('redirects'): self.redirects: List[str] = json.loads(cache_entry['redirects']) else: - self.logger.info(f'No redirects in cache for {self.uuid}') + self.logger.debug('No redirects in cache') self.redirects = [] if not self.capture_dir.exists(): raise MissingCaptureDirectory(f'The capture {self.uuid} does not exists in {self.capture_dir}.') @@ -104,7 +116,7 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> C except EOFError: remove_pickle_tree(capture_dir) except Exception: - logger.exception('Unexpected exception when unpickling') + logger.exception('Unexpected exception when unpickling.') remove_pickle_tree(capture_dir) if tree: @@ -241,7 +253,7 @@ class CapturesIndex(Mapping): raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).') raise MissingUUID(f'Unable to find UUID {uuid}.') - def _create_pickle(self, capture_dir: Path) -> CrawledTree: + def _create_pickle(self, capture_dir: Path, logger: LookylooCacheLogAdapter) -> CrawledTree: with (capture_dir / 'uuid').open() as f: uuid = f.read().strip() @@ 
-254,14 +266,14 @@ class CapturesIndex(Mapping): # The pickle is being created somewhere else, wait until it's done. while is_locked(capture_dir): time.sleep(5) - return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, self.logger) + return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger) if not (har_files := sorted(capture_dir.glob('*.har'))): har_files = sorted(capture_dir.glob('*.har.gz')) try: with self._timeout_context(): tree = CrawledTree(har_files, uuid) - self.__resolve_dns(tree) + self.__resolve_dns(tree, logger) if self.contextualizer: self.contextualizer.contextualize_tree(tree) except Har2TreeError as e: @@ -270,7 +282,7 @@ class CapturesIndex(Mapping): har_file.rename(har_file.with_suffix('.broken')) raise NoValidHarFile(f'We got har files, but they are broken: {e}') except TimeoutError: - self.logger.warning(f'Unable to rebuild the tree for {capture_dir}, the tree took too long.') + logger.warning(f'Unable to rebuild the tree for {capture_dir}, the tree took too long.') for har_file in har_files: har_file.rename(har_file.with_suffix('.broken')) raise NoValidHarFile(f'We got har files, but creating a tree took more than {self.timeout}s.') @@ -319,16 +331,17 @@ class CapturesIndex(Mapping): with (capture_dir / 'uuid').open() as f: uuid = f.read().strip() + logger = LookylooCacheLogAdapter(self.logger, {'uuid': uuid}) try: - tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, self.logger) + tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger) except NoValidHarFile: - self.logger.debug('Unable to rebuild the tree, the HAR files are broken.') + logger.debug('Unable to rebuild the tree, the HAR files are broken.') except TreeNeedsRebuild: try: - tree = self._create_pickle(capture_dir) + tree = self._create_pickle(capture_dir, logger) self.indexing.new_internal_uuids(tree) except NoValidHarFile: - self.logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.') + logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.') tree = None cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str} @@ -368,7 +381,7 @@ class CapturesIndex(Mapping): and isinstance(cache['error'], str) and 'HTTP Error' not in cache['error'] and "No har files in" not in cache['error']): - self.logger.info(cache['error']) + logger.info(cache['error']) if (capture_dir / 'categories').exists(): with (capture_dir / 'categories').open() as _categories: @@ -395,7 +408,7 @@ class CapturesIndex(Mapping): p.execute() return CaptureCache(cache) - def __resolve_dns(self, ct: CrawledTree): + def __resolve_dns(self, ct: CrawledTree, logger: LookylooCacheLogAdapter): '''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries and store them in ips.json and cnames.json, in the capture directory. Updates the nodes of the tree accordingly so the information is available. 
@@ -458,7 +471,7 @@ class CapturesIndex(Mapping): try: response = dns.resolver.resolve(node.name, query_type, search=True, raise_on_no_answer=False) except Exception as e: - self.logger.warning(f'Unable to resolve DNS: {e}') + logger.warning(f'Unable to resolve DNS: {e}') continue for answer in response.response.answer: name_to_cache = str(answer.name).rstrip('.') @@ -507,7 +520,7 @@ class CapturesIndex(Mapping): try: self.ipasnhistory.mass_cache(ips) except Exception as e: - self.logger.warning(f'Unable to submit IPs to IPASNHistory: {e}') + logger.warning(f'Unable to submit IPs to IPASNHistory: {e}') else: time.sleep(2) ipasn_responses = self.ipasnhistory.mass_query(ips) diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index a88fdcf..80d6085 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -323,24 +323,28 @@ class Lookyloo(): if not cache: self.logger.warning(f'Unable to get the modules responses unless the capture {capture_uuid} is cached') return None + if not hasattr(cache, 'url'): + self.logger.warning(f'The capture {capture_uuid} does not have a URL in the cache, it is broken.') + return None + to_return: Dict[str, Any] = {} if self.vt.available: to_return['vt'] = {} - if cache.redirects: + if hasattr(cache, 'redirects') and cache.redirects: for redirect in cache.redirects: to_return['vt'][redirect] = self.vt.get_url_lookup(redirect) else: to_return['vt'][cache.url] = self.vt.get_url_lookup(cache.url) if self.pi.available: to_return['pi'] = {} - if cache.redirects: + if hasattr(cache, 'redirects') and cache.redirects: for redirect in cache.redirects: to_return['pi'][redirect] = self.pi.get_url_lookup(redirect) else: to_return['pi'][cache.url] = self.pi.get_url_lookup(cache.url) if self.phishtank.available: to_return['phishtank'] = {'urls': {}, 'ips_hits': {}} - if cache.redirects: + if hasattr(cache, 'redirects') and cache.redirects: for redirect in cache.redirects: to_return['phishtank']['urls'][redirect] = self.phishtank.get_url_lookup(redirect) else: @@ -350,7 +354,7 @@ class Lookyloo(): to_return['phishtank']['ips_hits'] = ips_hits if self.urlhaus.available: to_return['urlhaus'] = {'urls': {}} - if cache.redirects: + if hasattr(cache, 'redirects') and cache.redirects: for redirect in cache.redirects: to_return['urlhaus']['urls'][redirect] = self.urlhaus.get_url_lookup(redirect) else: @@ -379,7 +383,7 @@ class Lookyloo(): if self.riskiq.available: try: self.riskiq.capture_default_trigger(cache) - if cache.redirects: + if hasattr(cache, 'redirects') and cache.redirects: hostname = urlparse(cache.redirects[-1]).hostname else: hostname = urlparse(cache.url).hostname @@ -675,7 +679,7 @@ class Lookyloo(): initial_url = defang(cache.url, colon=True, all_dots=True) else: initial_url = cache.url - if cache.redirects: + if hasattr(cache, 'redirects') and cache.redirects: redirects = "Redirects:\n" if email_config['defang_urls']: redirects += defang('\n'.join(cache.redirects), colon=True, all_dots=True) @@ -1270,7 +1274,7 @@ class Lookyloo(): stats[date_submission.year][date_submission.month]['uniq_urls'] = set() stats[date_submission.year][date_submission.month]['submissions'] += 1 stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url) - if len(cache.redirects) > 0: + if hasattr(cache, 'redirects') and len(cache.redirects) > 0: stats[date_submission.year][date_submission.month]['submissions_with_redirects'] += 1 stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects) 
stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects) @@ -1282,7 +1286,7 @@ class Lookyloo(): weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'] = set() weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1 weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url) - if len(cache.redirects) > 0: + if hasattr(cache, 'redirects') and len(cache.redirects) > 0: weeks_stats[date_submission.isocalendar()[1]]['submissions_with_redirects'] += 1 weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects) weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)
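
The heart of this patch is a logging.LoggerAdapter subclass, LookylooCacheLogAdapter, whose process() hook prefixes every message with the UUID of the capture being handled. That is what lets exceptions and warnings be traced back to a specific capture without repeating the UUID in every log call. A minimal, self-contained sketch of the pattern follows; the logger name and the sample UUID are placeholders, not values from the Lookyloo code base.

    import logging
    from typing import Any, MutableMapping, Tuple


    class LookylooCacheLogAdapter(logging.LoggerAdapter):
        """Prepend log entries with the UUID of the capture, as the patch does."""

        def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> Tuple[str, MutableMapping[str, Any]]:
            if self.extra:
                return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
            return msg, kwargs


    # Usage sketch: wrap a plain logger once per capture, then log normally.
    logging.basicConfig(level=logging.DEBUG, format='%(levelname)s:%(name)s:%(message)s')
    plain_logger = logging.getLogger('CaptureCache')
    capture_logger = LookylooCacheLogAdapter(plain_logger, {'uuid': '00000000-0000-0000-0000-000000000000'})
    capture_logger.debug('No redirects in cache')
    # DEBUG:CaptureCache:[00000000-0000-0000-0000-000000000000] No redirects in cache

Because the adapter is created once per capture (in CaptureCache.__init__ and in CapturesIndex, where it is then passed down to _create_pickle and __resolve_dns), every downstream log line carries the capture UUID automatically.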
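The lookyloo.py changes are defensive: CaptureCache declares __slots__, so attributes such as url and redirects simply do not exist on instances built from a broken or partial cache entry, and accessing them raises AttributeError instead of returning None. The hasattr() guards added to the module-response and statistics code account for that. A small stand-in example of the behaviour (the class below is illustrative, not the real CaptureCache):

    class SlottedCache:
        """Stand-in for CaptureCache: a slot only exists once it has been assigned."""
        __slots__ = ('uuid', 'url', 'redirects')

        def __init__(self, uuid: str):
            self.uuid = uuid  # 'url' and 'redirects' are deliberately left unset


    cache = SlottedCache('broken-capture')
    print(hasattr(cache, 'redirects'))   # False: the slot was never filled
    redirects = cache.redirects if hasattr(cache, 'redirects') else []  # safe access, as in the patch
    print(redirects)                     # []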