mirror of https://github.com/CIRCL/lookyloo
chg: Simplify tracing the exceptions related to a specific capture.
parent
afd383cfc3
commit
d970c924f8
|
@ -14,9 +14,9 @@ import time
|
||||||
from collections.abc import Mapping
|
from collections.abc import Mapping
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from logging import Logger
|
from logging import Logger, LoggerAdapter
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional, Tuple, Union, Set
|
from typing import Any, Dict, List, Optional, Tuple, Union, Set, MutableMapping
|
||||||
|
|
||||||
import dns.rdatatype
|
import dns.rdatatype
|
||||||
import dns.resolver
|
import dns.resolver
|
||||||
|
@ -32,19 +32,31 @@ from .exceptions import MissingCaptureDirectory, NoValidHarFile, MissingUUID, Tr
|
||||||
from .modules import Cloudflare
|
from .modules import Cloudflare
|
||||||
|
|
||||||
|
|
||||||
|
class LookylooCacheLogAdapter(LoggerAdapter):
    """
    Prepend log entry with the UUID of the capture

    The UUID is read from the ``uuid`` key of the ``extra`` mapping the
    adapter was created with. When ``extra`` is empty (or ``None``), or
    does not carry a ``uuid`` key, the message is passed through unchanged
    so that a logging call never fails because of a missing key.
    """

    def process(self, msg: str, kwargs: MutableMapping[str, Any]) -> Tuple[str, MutableMapping[str, Any]]:
        """Return ``(msg, kwargs)`` with ``msg`` prefixed by ``[<uuid>] `` when a UUID is known.

        :param msg: The log message handed to the adapter.
        :param kwargs: Keyword arguments of the logging call, returned untouched.
        """
        # Check the key explicitly: indexing a mapping without 'uuid' would
        # raise KeyError in the middle of a logging call.
        if self.extra and 'uuid' in self.extra:
            return '[{}] {}'.format(self.extra['uuid'], msg), kwargs
        return msg, kwargs
|
||||||
|
|
||||||
|
|
||||||
class CaptureCache():
|
class CaptureCache():
|
||||||
__slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir',
|
__slots__ = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir',
|
||||||
'error', 'incomplete_redirects', 'no_index', 'categories', 'parent',
|
'error', 'incomplete_redirects', 'no_index', 'categories', 'parent',
|
||||||
'user_agent', 'referer', 'logger')
|
'user_agent', 'referer', 'logger')
|
||||||
|
|
||||||
def __init__(self, cache_entry: Dict[str, Any]):
|
def __init__(self, cache_entry: Dict[str, Any]):
|
||||||
self.logger = logging.getLogger(f'{self.__class__.__name__}')
|
logger = logging.getLogger(f'{self.__class__.__name__}')
|
||||||
self.logger.setLevel(get_config('generic', 'loglevel'))
|
logger.setLevel(get_config('generic', 'loglevel'))
|
||||||
__default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp',
|
__default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp',
|
||||||
'url', 'redirects', 'capture_dir')
|
'url', 'redirects', 'capture_dir')
|
||||||
if 'uuid' not in cache_entry or 'capture_dir' not in cache_entry:
|
if 'uuid' not in cache_entry or 'capture_dir' not in cache_entry:
|
||||||
raise LookylooException(f'The capture is deeply broken: {cache_entry}')
|
raise LookylooException(f'The capture is deeply broken: {cache_entry}')
|
||||||
self.uuid: str = cache_entry['uuid']
|
self.uuid: str = cache_entry['uuid']
|
||||||
|
self.logger = LookylooCacheLogAdapter(logger, {'uuid': self.uuid})
|
||||||
|
|
||||||
self.capture_dir: Path = Path(cache_entry['capture_dir'])
|
self.capture_dir: Path = Path(cache_entry['capture_dir'])
|
||||||
|
|
||||||
if all(key in cache_entry.keys() for key in __default_cache_keys):
|
if all(key in cache_entry.keys() for key in __default_cache_keys):
|
||||||
|
@ -58,7 +70,7 @@ class CaptureCache():
|
||||||
if cache_entry.get('redirects'):
|
if cache_entry.get('redirects'):
|
||||||
self.redirects: List[str] = json.loads(cache_entry['redirects'])
|
self.redirects: List[str] = json.loads(cache_entry['redirects'])
|
||||||
else:
|
else:
|
||||||
self.logger.info(f'No redirects in cache for {self.uuid}')
|
self.logger.debug('No redirects in cache')
|
||||||
self.redirects = []
|
self.redirects = []
|
||||||
if not self.capture_dir.exists():
|
if not self.capture_dir.exists():
|
||||||
raise MissingCaptureDirectory(f'The capture {self.uuid} does not exists in {self.capture_dir}.')
|
raise MissingCaptureDirectory(f'The capture {self.uuid} does not exists in {self.capture_dir}.')
|
||||||
|
@ -104,7 +116,7 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> C
|
||||||
except EOFError:
|
except EOFError:
|
||||||
remove_pickle_tree(capture_dir)
|
remove_pickle_tree(capture_dir)
|
||||||
except Exception:
|
except Exception:
|
||||||
logger.exception('Unexpected exception when unpickling')
|
logger.exception('Unexpected exception when unpickling.')
|
||||||
remove_pickle_tree(capture_dir)
|
remove_pickle_tree(capture_dir)
|
||||||
|
|
||||||
if tree:
|
if tree:
|
||||||
|
@ -241,7 +253,7 @@ class CapturesIndex(Mapping):
|
||||||
raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
|
raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
|
||||||
raise MissingUUID(f'Unable to find UUID {uuid}.')
|
raise MissingUUID(f'Unable to find UUID {uuid}.')
|
||||||
|
|
||||||
def _create_pickle(self, capture_dir: Path) -> CrawledTree:
|
def _create_pickle(self, capture_dir: Path, logger: LookylooCacheLogAdapter) -> CrawledTree:
|
||||||
with (capture_dir / 'uuid').open() as f:
|
with (capture_dir / 'uuid').open() as f:
|
||||||
uuid = f.read().strip()
|
uuid = f.read().strip()
|
||||||
|
|
||||||
|
@ -254,14 +266,14 @@ class CapturesIndex(Mapping):
|
||||||
# The pickle is being created somewhere else, wait until it's done.
|
# The pickle is being created somewhere else, wait until it's done.
|
||||||
while is_locked(capture_dir):
|
while is_locked(capture_dir):
|
||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, self.logger)
|
return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger)
|
||||||
|
|
||||||
if not (har_files := sorted(capture_dir.glob('*.har'))):
|
if not (har_files := sorted(capture_dir.glob('*.har'))):
|
||||||
har_files = sorted(capture_dir.glob('*.har.gz'))
|
har_files = sorted(capture_dir.glob('*.har.gz'))
|
||||||
try:
|
try:
|
||||||
with self._timeout_context():
|
with self._timeout_context():
|
||||||
tree = CrawledTree(har_files, uuid)
|
tree = CrawledTree(har_files, uuid)
|
||||||
self.__resolve_dns(tree)
|
self.__resolve_dns(tree, logger)
|
||||||
if self.contextualizer:
|
if self.contextualizer:
|
||||||
self.contextualizer.contextualize_tree(tree)
|
self.contextualizer.contextualize_tree(tree)
|
||||||
except Har2TreeError as e:
|
except Har2TreeError as e:
|
||||||
|
@ -270,7 +282,7 @@ class CapturesIndex(Mapping):
|
||||||
har_file.rename(har_file.with_suffix('.broken'))
|
har_file.rename(har_file.with_suffix('.broken'))
|
||||||
raise NoValidHarFile(f'We got har files, but they are broken: {e}')
|
raise NoValidHarFile(f'We got har files, but they are broken: {e}')
|
||||||
except TimeoutError:
|
except TimeoutError:
|
||||||
self.logger.warning(f'Unable to rebuild the tree for {capture_dir}, the tree took too long.')
|
logger.warning(f'Unable to rebuild the tree for {capture_dir}, the tree took too long.')
|
||||||
for har_file in har_files:
|
for har_file in har_files:
|
||||||
har_file.rename(har_file.with_suffix('.broken'))
|
har_file.rename(har_file.with_suffix('.broken'))
|
||||||
raise NoValidHarFile(f'We got har files, but creating a tree took more than {self.timeout}s.')
|
raise NoValidHarFile(f'We got har files, but creating a tree took more than {self.timeout}s.')
|
||||||
|
@ -319,16 +331,17 @@ class CapturesIndex(Mapping):
|
||||||
with (capture_dir / 'uuid').open() as f:
|
with (capture_dir / 'uuid').open() as f:
|
||||||
uuid = f.read().strip()
|
uuid = f.read().strip()
|
||||||
|
|
||||||
|
logger = LookylooCacheLogAdapter(self.logger, {'uuid': uuid})
|
||||||
try:
|
try:
|
||||||
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, self.logger)
|
tree = load_pickle_tree(capture_dir, capture_dir.stat().st_mtime, logger)
|
||||||
except NoValidHarFile:
|
except NoValidHarFile:
|
||||||
self.logger.debug('Unable to rebuild the tree, the HAR files are broken.')
|
logger.debug('Unable to rebuild the tree, the HAR files are broken.')
|
||||||
except TreeNeedsRebuild:
|
except TreeNeedsRebuild:
|
||||||
try:
|
try:
|
||||||
tree = self._create_pickle(capture_dir)
|
tree = self._create_pickle(capture_dir, logger)
|
||||||
self.indexing.new_internal_uuids(tree)
|
self.indexing.new_internal_uuids(tree)
|
||||||
except NoValidHarFile:
|
except NoValidHarFile:
|
||||||
self.logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.')
|
logger.warning(f'Unable to rebuild the tree for {capture_dir}, the HAR files are broken.')
|
||||||
tree = None
|
tree = None
|
||||||
|
|
||||||
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str}
|
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str}
|
||||||
|
@ -368,7 +381,7 @@ class CapturesIndex(Mapping):
|
||||||
and isinstance(cache['error'], str)
|
and isinstance(cache['error'], str)
|
||||||
and 'HTTP Error' not in cache['error']
|
and 'HTTP Error' not in cache['error']
|
||||||
and "No har files in" not in cache['error']):
|
and "No har files in" not in cache['error']):
|
||||||
self.logger.info(cache['error'])
|
logger.info(cache['error'])
|
||||||
|
|
||||||
if (capture_dir / 'categories').exists():
|
if (capture_dir / 'categories').exists():
|
||||||
with (capture_dir / 'categories').open() as _categories:
|
with (capture_dir / 'categories').open() as _categories:
|
||||||
|
@ -395,7 +408,7 @@ class CapturesIndex(Mapping):
|
||||||
p.execute()
|
p.execute()
|
||||||
return CaptureCache(cache)
|
return CaptureCache(cache)
|
||||||
|
|
||||||
def __resolve_dns(self, ct: CrawledTree):
|
def __resolve_dns(self, ct: CrawledTree, logger: LookylooCacheLogAdapter):
|
||||||
'''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
|
'''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
|
||||||
and store them in ips.json and cnames.json, in the capture directory.
|
and store them in ips.json and cnames.json, in the capture directory.
|
||||||
Updates the nodes of the tree accordingly so the information is available.
|
Updates the nodes of the tree accordingly so the information is available.
|
||||||
|
@ -458,7 +471,7 @@ class CapturesIndex(Mapping):
|
||||||
try:
|
try:
|
||||||
response = dns.resolver.resolve(node.name, query_type, search=True, raise_on_no_answer=False)
|
response = dns.resolver.resolve(node.name, query_type, search=True, raise_on_no_answer=False)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f'Unable to resolve DNS: {e}')
|
logger.warning(f'Unable to resolve DNS: {e}')
|
||||||
continue
|
continue
|
||||||
for answer in response.response.answer:
|
for answer in response.response.answer:
|
||||||
name_to_cache = str(answer.name).rstrip('.')
|
name_to_cache = str(answer.name).rstrip('.')
|
||||||
|
@ -507,7 +520,7 @@ class CapturesIndex(Mapping):
|
||||||
try:
|
try:
|
||||||
self.ipasnhistory.mass_cache(ips)
|
self.ipasnhistory.mass_cache(ips)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(f'Unable to submit IPs to IPASNHistory: {e}')
|
logger.warning(f'Unable to submit IPs to IPASNHistory: {e}')
|
||||||
else:
|
else:
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
ipasn_responses = self.ipasnhistory.mass_query(ips)
|
ipasn_responses = self.ipasnhistory.mass_query(ips)
|
||||||
|
|
|
@ -323,24 +323,28 @@ class Lookyloo():
|
||||||
if not cache:
|
if not cache:
|
||||||
self.logger.warning(f'Unable to get the modules responses unless the capture {capture_uuid} is cached')
|
self.logger.warning(f'Unable to get the modules responses unless the capture {capture_uuid} is cached')
|
||||||
return None
|
return None
|
||||||
|
if not hasattr(cache, 'url'):
|
||||||
|
self.logger.warning(f'The capture {capture_uuid} does not have a URL in the cache, it is broken.')
|
||||||
|
return None
|
||||||
|
|
||||||
to_return: Dict[str, Any] = {}
|
to_return: Dict[str, Any] = {}
|
||||||
if self.vt.available:
|
if self.vt.available:
|
||||||
to_return['vt'] = {}
|
to_return['vt'] = {}
|
||||||
if cache.redirects:
|
if hasattr(cache, 'redirects') and cache.redirects:
|
||||||
for redirect in cache.redirects:
|
for redirect in cache.redirects:
|
||||||
to_return['vt'][redirect] = self.vt.get_url_lookup(redirect)
|
to_return['vt'][redirect] = self.vt.get_url_lookup(redirect)
|
||||||
else:
|
else:
|
||||||
to_return['vt'][cache.url] = self.vt.get_url_lookup(cache.url)
|
to_return['vt'][cache.url] = self.vt.get_url_lookup(cache.url)
|
||||||
if self.pi.available:
|
if self.pi.available:
|
||||||
to_return['pi'] = {}
|
to_return['pi'] = {}
|
||||||
if cache.redirects:
|
if hasattr(cache, 'redirects') and cache.redirects:
|
||||||
for redirect in cache.redirects:
|
for redirect in cache.redirects:
|
||||||
to_return['pi'][redirect] = self.pi.get_url_lookup(redirect)
|
to_return['pi'][redirect] = self.pi.get_url_lookup(redirect)
|
||||||
else:
|
else:
|
||||||
to_return['pi'][cache.url] = self.pi.get_url_lookup(cache.url)
|
to_return['pi'][cache.url] = self.pi.get_url_lookup(cache.url)
|
||||||
if self.phishtank.available:
|
if self.phishtank.available:
|
||||||
to_return['phishtank'] = {'urls': {}, 'ips_hits': {}}
|
to_return['phishtank'] = {'urls': {}, 'ips_hits': {}}
|
||||||
if cache.redirects:
|
if hasattr(cache, 'redirects') and cache.redirects:
|
||||||
for redirect in cache.redirects:
|
for redirect in cache.redirects:
|
||||||
to_return['phishtank']['urls'][redirect] = self.phishtank.get_url_lookup(redirect)
|
to_return['phishtank']['urls'][redirect] = self.phishtank.get_url_lookup(redirect)
|
||||||
else:
|
else:
|
||||||
|
@ -350,7 +354,7 @@ class Lookyloo():
|
||||||
to_return['phishtank']['ips_hits'] = ips_hits
|
to_return['phishtank']['ips_hits'] = ips_hits
|
||||||
if self.urlhaus.available:
|
if self.urlhaus.available:
|
||||||
to_return['urlhaus'] = {'urls': {}}
|
to_return['urlhaus'] = {'urls': {}}
|
||||||
if cache.redirects:
|
if hasattr(cache, 'redirects') and cache.redirects:
|
||||||
for redirect in cache.redirects:
|
for redirect in cache.redirects:
|
||||||
to_return['urlhaus']['urls'][redirect] = self.urlhaus.get_url_lookup(redirect)
|
to_return['urlhaus']['urls'][redirect] = self.urlhaus.get_url_lookup(redirect)
|
||||||
else:
|
else:
|
||||||
|
@ -379,7 +383,7 @@ class Lookyloo():
|
||||||
if self.riskiq.available:
|
if self.riskiq.available:
|
||||||
try:
|
try:
|
||||||
self.riskiq.capture_default_trigger(cache)
|
self.riskiq.capture_default_trigger(cache)
|
||||||
if cache.redirects:
|
if hasattr(cache, 'redirects') and cache.redirects:
|
||||||
hostname = urlparse(cache.redirects[-1]).hostname
|
hostname = urlparse(cache.redirects[-1]).hostname
|
||||||
else:
|
else:
|
||||||
hostname = urlparse(cache.url).hostname
|
hostname = urlparse(cache.url).hostname
|
||||||
|
@ -675,7 +679,7 @@ class Lookyloo():
|
||||||
initial_url = defang(cache.url, colon=True, all_dots=True)
|
initial_url = defang(cache.url, colon=True, all_dots=True)
|
||||||
else:
|
else:
|
||||||
initial_url = cache.url
|
initial_url = cache.url
|
||||||
if cache.redirects:
|
if hasattr(cache, 'redirects') and cache.redirects:
|
||||||
redirects = "Redirects:\n"
|
redirects = "Redirects:\n"
|
||||||
if email_config['defang_urls']:
|
if email_config['defang_urls']:
|
||||||
redirects += defang('\n'.join(cache.redirects), colon=True, all_dots=True)
|
redirects += defang('\n'.join(cache.redirects), colon=True, all_dots=True)
|
||||||
|
@ -1270,7 +1274,7 @@ class Lookyloo():
|
||||||
stats[date_submission.year][date_submission.month]['uniq_urls'] = set()
|
stats[date_submission.year][date_submission.month]['uniq_urls'] = set()
|
||||||
stats[date_submission.year][date_submission.month]['submissions'] += 1
|
stats[date_submission.year][date_submission.month]['submissions'] += 1
|
||||||
stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url)
|
stats[date_submission.year][date_submission.month]['uniq_urls'].add(cache.url)
|
||||||
if len(cache.redirects) > 0:
|
if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
|
||||||
stats[date_submission.year][date_submission.month]['submissions_with_redirects'] += 1
|
stats[date_submission.year][date_submission.month]['submissions_with_redirects'] += 1
|
||||||
stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects)
|
stats[date_submission.year][date_submission.month]['redirects'] += len(cache.redirects)
|
||||||
stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects)
|
stats[date_submission.year][date_submission.month]['uniq_urls'].update(cache.redirects)
|
||||||
|
@ -1282,7 +1286,7 @@ class Lookyloo():
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'] = set()
|
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'] = set()
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1
|
weeks_stats[date_submission.isocalendar()[1]]['submissions'] += 1
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url)
|
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].add(cache.url)
|
||||||
if len(cache.redirects) > 0:
|
if hasattr(cache, 'redirects') and len(cache.redirects) > 0:
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['submissions_with_redirects'] += 1
|
weeks_stats[date_submission.isocalendar()[1]]['submissions_with_redirects'] += 1
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects)
|
weeks_stats[date_submission.isocalendar()[1]]['redirects'] += len(cache.redirects)
|
||||||
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)
|
weeks_stats[date_submission.isocalendar()[1]]['uniq_urls'].update(cache.redirects)
|
||||||
|
|
Loading…
Reference in New Issue