From 32ee474be287406f27382e42cf7fe39389386985 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Wed, 22 Sep 2021 17:09:04 +0200
Subject: [PATCH] chg: Improve tree creation and cache

---
 bin/background_indexer.py |   2 +
 lookyloo/capturecache.py  | 307 +++++++++++++++++++++++++++++++++++-
 lookyloo/exceptions.py    |   4 +
 lookyloo/helpers.py       |  23 ---
 lookyloo/lookyloo.py      | 316 ++------------------------------------
 5 files changed, 326 insertions(+), 326 deletions(-)

diff --git a/bin/background_indexer.py b/bin/background_indexer.py
index bdc6677f..57ccf22e 100755
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -91,6 +91,8 @@ class BackgroundIndexer(AbstractManager):
             if not indexed[2]:
                 self.logger.info(f'Indexing cookies for {cache.uuid}')
                 self.lookyloo.indexing.index_cookies_capture(ct)
+            # NOTE: categories aren't taken into account here, should be fixed(?)
+            #       see indexing.index_categories_capture(capture_uuid, categories)
 
 
 def main():
diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py
index c001ce20..eb240287 100644
--- a/lookyloo/capturecache.py
+++ b/lookyloo/capturecache.py
@@ -2,11 +2,26 @@
 # -*- coding: utf-8 -*-
 
 import json
-from datetime import datetime
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+import logging
+import pickle
+import sys
+import time
 
-from .exceptions import LookylooException, MissingCaptureDirectory
+from collections.abc import Mapping
+from datetime import datetime
+from functools import lru_cache
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import dns.rdatatype
+import dns.resolver
+from har2tree import CrawledTree, Har2TreeError, HarFile
+from redis import Redis
+
+from .context import Context
+from .exceptions import (LookylooException, MissingCaptureDirectory, NoValidHarFile,
+                         MissingUUID, TreeNeedsRebuild)
+from .helpers import try_make_file, get_config
 
 
 class CaptureCache():
@@ -35,3 +50,287 @@ class CaptureCache():
         self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False
         self.categories: List[str] = json.loads(cache_entry['categories']) if cache_entry.get('categories') else []
         self.parent: Optional[str] = cache_entry.get('parent')
+
+    @property
+    def tree(self) -> CrawledTree:
+        try:
+            return load_pickle_tree(self.capture_dir)
+        except TreeNeedsRebuild:
+            # The pickle is missing; that shouldn't happen at this stage.
+            raise LookylooException(f'Unable to get pickle for {self.uuid} - {self.capture_dir}')
+
+
+def remove_pickle_tree(capture_dir: Path) -> None:
+    pickle_file = capture_dir / 'tree.pickle'
+    if pickle_file.exists():
+        pickle_file.unlink()
+
+
+@lru_cache(maxsize=1024)
+def load_pickle_tree(capture_dir: Path) -> CrawledTree:
+    pickle_file = capture_dir / 'tree.pickle'
+    if pickle_file.exists():
+        with pickle_file.open('rb') as _p:
+            try:
+                return pickle.load(_p)
+            except pickle.UnpicklingError:
+                remove_pickle_tree(capture_dir)
+            except EOFError:
+                remove_pickle_tree(capture_dir)
+            except Exception:
+                remove_pickle_tree(capture_dir)
+    raise TreeNeedsRebuild()
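
Reviewer note on the lru_cache above: load_pickle_tree is memoized on its Path argument, so deleting tree.pickle on disk does not evict an already-loaded tree from the cache; only load_pickle_tree.cache_clear() (or a process restart) does. A minimal standalone sketch of that behaviour; the load_tree function below is a hypothetical stand-in, not part of the patch:

    from functools import lru_cache
    from pathlib import Path

    @lru_cache(maxsize=1024)
    def load_tree(capture_dir: Path) -> str:
        # Stand-in for unpickling tree.pickle in capture_dir.
        return f'tree for {capture_dir}'

    d = Path('/tmp/capture')
    load_tree(d)                   # miss: loads and memoizes
    load_tree(d)                   # hit: served from the cache
    print(load_tree.cache_info())  # CacheInfo(hits=1, misses=1, maxsize=1024, currsize=1)
    load_tree.cache_clear()        # the only way to drop memoized trees

Callers that rebuild a pickle should therefore bear in mind that a previously memoized tree can outlive the file it came from.
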
+
+
+class CapturesIndex(Mapping):
+
+    def __init__(self, redis: Redis, contextualizer: Optional[Context]=None):
+        self.logger = logging.getLogger(f'{self.__class__.__name__}')
+        self.logger.setLevel(get_config('generic', 'loglevel'))
+        self.redis = redis
+        self.contextualizer = contextualizer
+        self.__cache: Dict[str, CaptureCache] = {}
+
+    def __getitem__(self, uuid: str) -> CaptureCache:
+        if uuid in self.__cache:
+            if (self.__cache[uuid].capture_dir.exists()
+                    and not self.__cache[uuid].incomplete_redirects):
+                return self.__cache[uuid]
+            del self.__cache[uuid]
+        capture_dir = self._get_capture_dir(uuid)
+        cached = self.redis.hgetall(str(capture_dir))
+        if cached:
+            cc = CaptureCache(cached)
+            # NOTE: checking for pickle to exist may be a bad idea here.
+            if (cc.capture_dir.exists()
+                    and (cc.capture_dir / 'tree.pickle').exists()
+                    and not cc.incomplete_redirects):
+                self.__cache[uuid] = cc
+                return self.__cache[uuid]
+        try:
+            tree = load_pickle_tree(capture_dir)
+        except TreeNeedsRebuild:
+            tree = self._create_pickle(capture_dir)
+        self.__cache[uuid] = self._set_capture_cache(capture_dir, tree)
+        return self.__cache[uuid]
+
+    def __iter__(self):
+        return iter(self.__cache)
+
+    def __len__(self):
+        return len(self.__cache)
+
+    def reload_cache(self, uuid: str) -> None:
+        if uuid in self.__cache:
+            del self.__cache[uuid]
+
+    def remove_pickle(self, uuid: str) -> None:
+        if uuid in self.__cache:
+            remove_pickle_tree(self.__cache[uuid].capture_dir)
+            del self.__cache[uuid]
+
+    def rebuild_all(self) -> None:
+        for uuid, cache in self.__cache.items():
+            remove_pickle_tree(cache.capture_dir)
+        self.redis.flushdb()
+        self.__cache = {}
+
+    def _get_capture_dir(self, uuid: str) -> Path:
+        # Try to get from the recent captures cache in redis
+        capture_dir = self.redis.hget('lookup_dirs', uuid)
+        if capture_dir:
+            to_return = Path(capture_dir)
+            if to_return.exists():
+                return to_return
+            # The capture was either removed or archived, cleaning up
+            self.redis.hdel('lookup_dirs', uuid)
+            self.redis.delete(capture_dir)
+
+        # Try to get from the archived captures cache in redis
+        capture_dir = self.redis.hget('lookup_dirs_archived', uuid)
+        if capture_dir:
+            to_return = Path(capture_dir)
+            if to_return.exists():
+                return to_return
+            self.redis.hdel('lookup_dirs_archived', uuid)
+            # The capture was removed, remove the UUID
+            self.logger.warning(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
+            raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
+
+        raise MissingUUID(f'Unable to find UUID {uuid}.')
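
Since CapturesIndex subclasses collections.abc.Mapping and implements only __getitem__, __iter__ and __len__, it inherits get, keys, items and __contains__ for free. One caveat: the default Mapping.__contains__ only swallows KeyError, and __getitem__ raises MissingUUID / MissingCaptureDirectory instead, so `uuid in index` propagates those exceptions for unknown captures rather than returning False. A hedged usage sketch; the Redis wiring and the UUID are illustrative, not from the patch:

    redis = Redis(unix_socket_path='cache.sock', decode_responses=True)  # illustrative
    index = CapturesIndex(redis)
    uuid = '6c1df4bc-2b7d-4cd9-a4af-f4a4e00c9d2e'  # hypothetical capture UUID
    try:
        cache = index[uuid]        # in-class dict -> redis hash -> pickle rebuild
        print(cache.capture_dir)   # plain CaptureCache attribute
        print(cache.tree.root_hartree.har.path)  # lazily loads the pickled tree
    except MissingUUID:
        print('unknown capture')   # a KeyError-style miss surfaces as MissingUUID
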
+
+    def _create_pickle(self, capture_dir: Path) -> CrawledTree:
+        with (capture_dir / 'uuid').open() as f:
+            uuid = f.read().strip()
+
+        lock_file = capture_dir / 'lock'
+        if try_make_file(lock_file):
+            # Lock created, we can process
+            with lock_file.open('w') as f:
+                f.write(datetime.now().isoformat())
+        else:
+            # The pickle is being created somewhere else, wait until it's done.
+            while lock_file.exists():
+                time.sleep(5)
+            return load_pickle_tree(capture_dir)
+
+        har_files = sorted(capture_dir.glob('*.har'))
+        pickle_file = capture_dir / 'tree.pickle'
+        try:
+            tree = CrawledTree(har_files, uuid)
+            self.__resolve_dns(tree)
+            if self.contextualizer:
+                self.contextualizer.contextualize_tree(tree)
+        except Har2TreeError as e:
+            raise NoValidHarFile(e)
+        except RecursionError as e:
+            raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
+        else:
+            with pickle_file.open('wb') as _p:
+                # Some pickles require a pretty high recursion limit, this kind of fixes it.
+                # If the capture is really broken (generally a refresh to self), the capture
+                # is discarded in the RecursionError above.
+                default_recursion_limit = sys.getrecursionlimit()
+                sys.setrecursionlimit(int(default_recursion_limit * 1.1))
+                try:
+                    pickle.dump(tree, _p)
+                except RecursionError as e:
+                    raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
+                sys.setrecursionlimit(default_recursion_limit)
+        finally:
+            lock_file.unlink(missing_ok=True)
+        return tree
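
The locking in _create_pickle hinges on try_make_file, imported from .helpers but not shown in this diff. The idiom it needs is an atomic create-if-absent, so exactly one concurrent worker wins the lock and the others fall into the sleep/poll branch. A plausible sketch of such a helper, offered as an assumption since helpers.py is only partially shown here:

    from pathlib import Path

    def try_make_file(filename: Path) -> bool:
        try:
            # exist_ok=False turns touch() into an atomic create: it raises
            # FileExistsError if another process already created the file.
            filename.touch(exist_ok=False)
            return True
        except FileExistsError:
            return False

The loser of the race polls lock_file.exists() every five seconds and then loads the pickle the winner wrote; the `finally: lock_file.unlink(missing_ok=True)` above guarantees the lock disappears even when tree building fails.
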
+
+    def _set_capture_cache(self, capture_dir: Path, tree: Optional[CrawledTree]=None) -> CaptureCache:
+        '''Populate the redis cache for a capture. Mostly used on the index page.
+        NOTE: Doesn't require the pickle.'''
+        with (capture_dir / 'uuid').open() as f:
+            uuid = f.read().strip()
+
+        cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': str(capture_dir)}
+        if (capture_dir / 'error.txt').exists():
+            # Something went wrong
+            with (capture_dir / 'error.txt').open() as _error:
+                content = _error.read()
+                try:
+                    error_to_cache = json.loads(content)
+                    if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
+                        error_to_cache = error_to_cache.get('details')
+                except json.decoder.JSONDecodeError:
+                    # old format
+                    error_to_cache = content
+                cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
+
+        if (har_files := sorted(capture_dir.glob('*.har'))):
+            try:
+                har = HarFile(har_files[0], uuid)
+                cache['title'] = har.initial_title
+                cache['timestamp'] = har.initial_start_time
+                cache['url'] = har.root_url
+                if har.initial_redirects and har.need_tree_redirects:
+                    if not tree:
+                        # try to load tree from disk
+                        tree = load_pickle_tree(capture_dir)
+                    # get redirects
+                    if tree:
+                        cache['redirects'] = json.dumps(tree.redirects)
+                        cache['incomplete_redirects'] = 0
+                    else:
+                        # Pickle not available
+                        cache['redirects'] = json.dumps(har.initial_redirects)
+                        cache['incomplete_redirects'] = 1
+                else:
+                    cache['redirects'] = json.dumps(har.initial_redirects)
+                    cache['incomplete_redirects'] = 0
+
+            except Har2TreeError as e:
+                cache['error'] = str(e)
+        else:
+            cache['error'] = f'No har files in {capture_dir.name}'
+
+        if (cache.get('error')
+                and isinstance(cache['error'], str)
+                and 'HTTP Error' not in cache['error']):
+            self.logger.warning(cache['error'])
+
+        if (capture_dir / 'categories').exists():
+            with (capture_dir / 'categories').open() as _categories:
+                cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()])
+
+        if (capture_dir / 'no_index').exists():
+            # If the folder claims anonymity
+            cache['no_index'] = 1
+
+        if (capture_dir / 'parent').exists():
+            # The capture was initiated from another one
+            with (capture_dir / 'parent').open() as f:
+                cache['parent'] = f.read().strip()
+
+        p = self.redis.pipeline()
+        p.hset('lookup_dirs', uuid, str(capture_dir))
+        p.hmset(str(capture_dir), cache)  # type: ignore
+        p.execute()
+        return CaptureCache(cache)
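
For reference, _set_capture_cache stores everything as flat strings in a per-capture Redis hash keyed by the directory, plus a lookup_dirs entry mapping UUID to directory; CaptureCache.__init__ parses the fields back out. A hedged round-trip sketch with illustrative values:

    redis.hset('lookup_dirs', uuid, str(capture_dir))
    redis.hmset(str(capture_dir), {           # hmset is legacy in redis-py but still works
        'uuid': uuid,
        'capture_dir': str(capture_dir),
        'title': 'Example page title',        # illustrative
        'timestamp': '2021-09-22T17:09:04+02:00',
        'url': 'https://www.example.com/',
        'redirects': json.dumps(['https://www.example.com/landing']),
        'incomplete_redirects': 0,
    })
    cached = redis.hgetall(str(capture_dir))  # Dict[str, str] back from redis
    cc = CaptureCache(cached)                 # json.loads the redirects, etc.
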
+
+    def __resolve_dns(self, ct: CrawledTree):
+        '''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries,
+        and stores them in ips.json and cnames.json, in the capture directory.
+        Updates the nodes of the tree accordingly so the information is available.
+        '''
+
+        def _build_cname_chain(known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
+            '''Returns a list of CNAMEs starting from one hostname.
+            The CNAME resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry,
+            and the CNAME entry can have another CNAME entry, and so on, multiple times.
+            This method loops over the hostnames until there are no more CNAMEs.'''
+            cnames: List[str] = []
+            to_search = hostname
+            while True:
+                if known_cnames.get(to_search) is None:
+                    break
+                # At this point, known_cnames[to_search] must exist and be a str
+                cnames.append(known_cnames[to_search])  # type: ignore
+                to_search = known_cnames[to_search]
+            return cnames
+
+        cnames_path = ct.root_hartree.har.path.parent / 'cnames.json'
+        ips_path = ct.root_hartree.har.path.parent / 'ips.json'
+        host_cnames: Dict[str, Optional[str]] = {}
+        if cnames_path.exists():
+            with cnames_path.open() as f:
+                host_cnames = json.load(f)
+
+        host_ips: Dict[str, List[str]] = {}
+        if ips_path.exists():
+            with ips_path.open() as f:
+                host_ips = json.load(f)
+
+        for node in ct.root_hartree.hostname_tree.traverse():
+            if node.name not in host_cnames or node.name not in host_ips:
+                # Resolve and cache
+                try:
+                    response = dns.resolver.resolve(node.name, search=True)
+                    for answer in response.response.answer:
+                        if answer.rdtype == dns.rdatatype.RdataType.CNAME:
+                            host_cnames[str(answer.name).rstrip('.')] = str(answer[0].target).rstrip('.')
+                        else:
+                            host_cnames[str(answer.name).rstrip('.')] = None
+
+                        if answer.rdtype in [dns.rdatatype.RdataType.A, dns.rdatatype.RdataType.AAAA]:
+                            host_ips[str(answer.name).rstrip('.')] = list(set(str(b) for b in answer))
+                except Exception:
+                    host_cnames[node.name] = None
+                    host_ips[node.name] = []
+            cnames = _build_cname_chain(host_cnames, node.name)
+            if cnames:
+                node.add_feature('cname', cnames)
+                if cnames[-1] in host_ips:
+                    node.add_feature('resolved_ips', host_ips[cnames[-1]])
+            elif node.name in host_ips:
+                node.add_feature('resolved_ips', host_ips[node.name])
+
+        with cnames_path.open('w') as f:
+            json.dump(host_cnames, f)
+        with ips_path.open('w') as f:
+            json.dump(host_ips, f)
+        return ct
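
_build_cname_chain is easy to misread, so a quick worked example: it walks known_cnames from the starting hostname until it reaches a host whose entry is missing or None, i.e. one that resolved straight to A/AAAA records. The hostnames below are made up for illustration:

    known_cnames = {
        'www.example.com': 'cdn.example.net',   # first hop
        'cdn.example.net': 'edge.example.org',  # second hop
        'edge.example.org': None,               # no further CNAME: chain ends
    }
    _build_cname_chain(known_cnames, 'www.example.com')
    # -> ['cdn.example.net', 'edge.example.org']

The node then gets the `cname` feature with that list, and `resolved_ips` is looked up under the last element of the chain.
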
diff --git a/lookyloo/exceptions.py b/lookyloo/exceptions.py
index 94731dcd..d6da41bd 100644
--- a/lookyloo/exceptions.py
+++ b/lookyloo/exceptions.py
@@ -28,3 +28,7 @@ class MissingUUID(LookylooException):
 
 class MissingCaptureDirectory(LookylooException):
     pass
+
+
+class TreeNeedsRebuild(LookylooException):
+    pass
diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index f0378302..5b551ec3 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -3,7 +3,6 @@
 import json
 import logging
 import os
-import pickle
 from datetime import datetime, timedelta
 from enum import IntEnum, unique
 from functools import lru_cache
@@ -216,28 +215,6 @@ def load_cookies(cookie_pseudofile: Optional[Union[BufferedIOBase, str]]=None) -
     return to_return
 
 
-def load_pickle_tree(capture_dir: Path) -> Optional[CrawledTree]:
-    pickle_file = capture_dir / 'tree.pickle'
-    if pickle_file.exists():
-        with pickle_file.open('rb') as _p:
-            try:
-                return pickle.load(_p)
-            except pickle.UnpicklingError:
-                remove_pickle_tree(capture_dir)
-            except EOFError:
-                remove_pickle_tree(capture_dir)
-            except Exception:
-                remove_pickle_tree(capture_dir)
-
-    return None
-
-
-def remove_pickle_tree(capture_dir: Path) -> None:
-    pickle_file = capture_dir / 'tree.pickle'
-    if pickle_file.exists():
-        pickle_file.unlink()
-
-
 def uniq_domains(uniq_urls):
     domains = set()
     for url in uniq_urls:
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index f2b5381e..9acf1ff1 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -5,10 +5,7 @@ import base64
 import json
 import logging
 import operator
-import pickle
 import smtplib
-import sys
-import time
 from collections import defaultdict
 from datetime import date, datetime
 from email.message import EmailMessage
@@ -19,9 +16,7 @@ from typing import (Any, Dict, Iterable, List, MutableMapping, Optional, Set,
 from uuid import uuid4
 from zipfile import ZipFile
 
-import dns.rdatatype
-import dns.resolver
-from har2tree import CrawledTree, Har2TreeError, HarFile, HostNode, URLNode
+from har2tree import CrawledTree, Har2TreeError, HostNode, URLNode
 from PIL import Image  # type: ignore
 from pymisp import MISPAttribute, MISPEvent, MISPObject
 from pymisp.tools import FileObject, URLObject
@@ -29,15 +24,13 @@ from redis import ConnectionPool, Redis
 from redis.connection import UnixDomainSocketConnection
 from werkzeug.useragents import UserAgent
 
-from .capturecache import CaptureCache
+from .capturecache import CaptureCache, CapturesIndex
 from .context import Context
 from .exceptions import (LookylooException, MissingCaptureDirectory,
-                         MissingUUID, NoValidHarFile)
+                         MissingUUID)
 from .helpers import (CaptureStatus, get_captures_dir, get_config,
                       get_email_template, get_homedir, get_resources_hashes,
-                      get_socket_path, get_splash_url, get_taxonomies,
-                      load_pickle_tree, remove_pickle_tree, try_make_file,
-                      uniq_domains)
+                      get_socket_path, get_splash_url, get_taxonomies, uniq_domains)
 from .indexing import Indexing
 from .modules import (MISP, PhishingInitiative, UniversalWhois, UrlScan,
                       VirusTotal, Phishtank)
@@ -86,7 +79,7 @@ class Lookyloo():
                 self.logger.warning('Unable to setup the Phishtank module')
 
         self.context = Context()
-        self._captures_index: Dict[str, CaptureCache] = {}
+        self._captures_index = CapturesIndex(self.redis, self.context)
 
     @property
     def redis(self):
@@ -94,233 +87,7 @@ class Lookyloo():
 
     def _get_capture_dir(self, capture_uuid: str, /) -> Path:
         '''Use the cache to get a capture directory from a capture UUID'''
-        capture_dir: Optional[str]
-        to_return: Path
-
-        # Try to get from the in-class cache
-        if capture_uuid in self._captures_index:
-            to_return = self._captures_index[capture_uuid].capture_dir
-            if to_return.exists():
-                return to_return
-            self.redis.delete(str(to_return))
-            self._captures_index.pop(capture_uuid)
-
-        # Try to get from the recent captures cache in redis
-        capture_dir = self.redis.hget('lookup_dirs', capture_uuid)
-        if capture_dir:
-            to_return = Path(capture_dir)
-            if to_return.exists():
-                return to_return
-            # The capture was either removed or archived, cleaning up
-            self.redis.hdel('lookup_dirs', capture_uuid)
-            self.redis.delete(capture_dir)
-
-        # Try to get from the archived captures cache in redis
-        capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
-        if capture_dir:
-            to_return = Path(capture_dir)
-            if to_return.exists():
-                return to_return
-            self.redis.hdel('lookup_dirs_archived', capture_uuid)
-            # The capture was removed, remove the UUID
-            self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}).')
-            raise MissingCaptureDirectory(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}).')
-
-        raise MissingUUID(f'Unable to find UUID {capture_uuid}.')
-
-    def _cache_capture(self, capture_uuid: str, /) -> CrawledTree:
-        '''Generate the pickle, set the cache, add capture in the indexes'''
-
-        capture_dir = self._get_capture_dir(capture_uuid)
-
-        har_files = sorted(capture_dir.glob('*.har'))
-        lock_file = capture_dir / 'lock'
-        pickle_file = capture_dir / 'tree.pickle'
-
-        if try_make_file(lock_file):
-            # Lock created, we can process
-            with lock_file.open('w') as f:
-                f.write(datetime.now().isoformat())
-        else:
-            # The pickle is being created somewhere else, wait until it's done.
-            while lock_file.exists():
-                time.sleep(5)
-            keep_going = 5
-            while (ct := load_pickle_tree(capture_dir)) is None:
-                keep_going -= 1
-                if not keep_going:
-                    raise LookylooException(f'Unable to get tree for {capture_uuid}')
-                time.sleep(5)
-            return ct
-
-        # NOTE: We only index the public captures
-        index = True
-        try:
-            ct = CrawledTree(har_files, capture_uuid)
-            self._resolve_dns(ct)
-            self.context.contextualize_tree(ct)
-            cache = self.capture_cache(capture_uuid)
-            if not cache:
-                raise LookylooException(f'Broken cache for {capture_dir}')
-            if self.is_public_instance:
-                if cache.no_index:
-                    index = False
-            if index:
-                self.indexing.index_cookies_capture(ct)
-                self.indexing.index_body_hashes_capture(ct)
-                self.indexing.index_url_capture(ct)
-                categories = list(self.categories_capture(capture_uuid).keys())
-                self.indexing.index_categories_capture(capture_uuid, categories)
-        except Har2TreeError as e:
-            raise NoValidHarFile(e)
-        except RecursionError as e:
-            raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
-        else:
-            with pickle_file.open('wb') as _p:
-                # Some pickles require a pretty high recursion limit, this kindof fixes it.
-                # If the capture is really broken (generally a refresh to self), the capture
-                # is discarded in the RecursionError above.
-                default_recursion_limit = sys.getrecursionlimit()
-                sys.setrecursionlimit(int(default_recursion_limit * 1.1))
-                try:
-                    pickle.dump(ct, _p)
-                except RecursionError as e:
-                    raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
-                sys.setrecursionlimit(default_recursion_limit)
-        finally:
-            lock_file.unlink(missing_ok=True)
-        return ct
-
-    def _set_capture_cache(self, capture_dir: Path):
-        '''Populate the redis cache for a capture. Mostly used on the index page.
-        NOTE: Doesn't require the pickle.'''
-        with (capture_dir / 'uuid').open() as f:
-            uuid = f.read().strip()
-
-        cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': str(capture_dir)}
-        if (capture_dir / 'error.txt').exists():
-            # Something went wrong
-            with (capture_dir / 'error.txt').open() as _error:
-                content = _error.read()
-                try:
-                    error_to_cache = json.loads(content)
-                    if isinstance(error_to_cache, dict) and error_to_cache.get('details'):
-                        error_to_cache = error_to_cache.get('details')
-                except json.decoder.JSONDecodeError:
-                    # old format
-                    error_to_cache = content
-                cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
-
-        if (har_files := sorted(capture_dir.glob('*.har'))):
-            try:
-                har = HarFile(har_files[0], uuid)
-                cache['title'] = har.initial_title
-                cache['timestamp'] = har.initial_start_time
-                cache['url'] = har.root_url
-                if har.initial_redirects and har.need_tree_redirects:
-                    # try to load tree from disk, get redirects
-                    if (ct := load_pickle_tree(capture_dir)):
-                        cache['redirects'] = json.dumps(ct.redirects)
-                        cache['incomplete_redirects'] = 0
-                    else:
-                        # Pickle not available
-                        cache['redirects'] = json.dumps(har.initial_redirects)
-                        cache['incomplete_redirects'] = 1
-                else:
-                    cache['redirects'] = json.dumps(har.initial_redirects)
-                    cache['incomplete_redirects'] = 0
-
-            except Har2TreeError as e:
-                cache['error'] = str(e)
-        else:
-            cache['error'] = f'No har files in {capture_dir.name}'
-
-        if (cache.get('error')
-                and isinstance(cache['error'], str)
-                and 'HTTP Error' not in cache['error']):
-            self.logger.warning(cache['error'])
-
-        if (capture_dir / 'categories').exists():
-            with (capture_dir / 'categories').open() as _categories:
-                cache['categories'] = json.dumps([c.strip() for c in _categories.readlines()])
-
-        if (capture_dir / 'no_index').exists():
-            # If the folders claims anonymity
-            cache['no_index'] = 1
-
-        if (capture_dir / 'parent').exists():
-            # The capture was initiated from an other one
-            with (capture_dir / 'parent').open() as f:
-                cache['parent'] = f.read().strip()
-
-        p = self.redis.pipeline()
-        p.hset('lookup_dirs', uuid, str(capture_dir))
-        p.hmset(str(capture_dir), cache)
-        p.execute()
-        self._captures_index[uuid] = CaptureCache(cache)
-
-    def _resolve_dns(self, ct: CrawledTree):
-        '''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
-        and store them in ips.json and cnames.json, in the capture directory.
-        Updates the nodes of the tree accordingly so the information is available.
-        '''
-
-        def _build_cname_chain(known_cnames: Dict[str, Optional[str]], hostname) -> List[str]:
-            '''Returns a list of CNAMEs starting from one hostname.
-            The CNAMEs resolutions are made in `_resolve_dns`. A hostname can have a CNAME entry
-            and the CNAME entry can have an other CNAME entry, and so on multiple times.
-            This method loops over the hostnames until there are no CNAMES.'''
-            cnames: List[str] = []
-            to_search = hostname
-            while True:
-                if known_cnames.get(to_search) is None:
-                    break
-                # At this point, known_cnames[to_search] must exist and be a str
-                cnames.append(known_cnames[to_search])  # type: ignore
-                to_search = known_cnames[to_search]
-            return cnames
-
-        cnames_path = ct.root_hartree.har.path.parent / 'cnames.json'
-        ips_path = ct.root_hartree.har.path.parent / 'ips.json'
-        host_cnames: Dict[str, Optional[str]] = {}
-        if cnames_path.exists():
-            with cnames_path.open() as f:
-                host_cnames = json.load(f)
-
-        host_ips: Dict[str, List[str]] = {}
-        if ips_path.exists():
-            with ips_path.open() as f:
-                host_ips = json.load(f)
-
-        for node in ct.root_hartree.hostname_tree.traverse():
-            if node.name not in host_cnames or node.name not in host_ips:
-                # Resolve and cache
-                try:
-                    response = dns.resolver.resolve(node.name, search=True)
-                    for answer in response.response.answer:
-                        if answer.rdtype == dns.rdatatype.RdataType.CNAME:
-                            host_cnames[str(answer.name).rstrip('.')] = str(answer[0].target).rstrip('.')
-                        else:
-                            host_cnames[str(answer.name).rstrip('.')] = None
-
-                        if answer.rdtype in [dns.rdatatype.RdataType.A, dns.rdatatype.RdataType.AAAA]:
-                            host_ips[str(answer.name).rstrip('.')] = list(set(str(b) for b in answer))
-                except Exception:
-                    host_cnames[node.name] = None
-                    host_ips[node.name] = []
-            cnames = _build_cname_chain(host_cnames, node.name)
-            if cnames:
-                node.add_feature('cname', cnames)
-                if cnames[-1] in host_ips:
-                    node.add_feature('resolved_ips', host_ips[cnames[-1]])
-            elif node.name in host_ips:
-                node.add_feature('resolved_ips', host_ips[node.name])
-
-        with cnames_path.open('w') as f:
-            json.dump(host_cnames, f)
-        with ips_path.open('w') as f:
-            json.dump(host_ips, f)
-        return ct
+        return self._captures_index[capture_uuid].capture_dir
 
     def add_context(self, capture_uuid: str, /, urlnode_uuid: str, *, ressource_hash: str,
                     legitimate: bool, malicious: bool, details: Dict[str, Dict[str, str]]):
@@ -338,8 +105,7 @@ class Lookyloo():
 
     def remove_pickle(self, capture_uuid: str, /) -> None:
         '''Remove the pickle from a specific capture.'''
-        capture_dir = self._get_capture_dir(capture_uuid)
-        remove_pickle_tree(capture_dir)
+        self._captures_index.remove_pickle(capture_uuid)
 
     def rebuild_cache(self) -> None:
         '''Flush and rebuild the redis cache. Doesn't remove the pickles.
@@ -349,8 +115,7 @@ class Lookyloo():
     def rebuild_all(self) -> None:
         '''Flush and rebuild the redis cache, and delete all the pickles.
        The captures will be rebuilt by the background indexer'''
-        [remove_pickle_tree(capture_dir) for capture_dir in self.capture_dir.iterdir() if capture_dir.is_dir()]  # type: ignore
-        self.rebuild_cache()
+        self._captures_index.rebuild_all()
 
     def get_urlnode_from_tree(self, capture_uuid: str, /, node_uuid: str) -> URLNode:
         '''Get a URL node from a tree, by UUID'''
@@ -515,11 +280,9 @@ class Lookyloo():
         """Add the capture in the hidden pool (not shown on the front page)
         NOTE: it won't remove the correlations until they are rebuilt.
""" - capture_dir = self._get_capture_dir(capture_uuid) - self.redis.hset(str(capture_dir), 'no_index', 1) - (capture_dir / 'no_index').touch() - if capture_uuid in self._captures_index: - self._captures_index[capture_uuid].no_index = True + self.redis.hset(str(self._get_capture_dir(capture_uuid)), 'no_index', 1) + (self._get_capture_dir(capture_uuid) / 'no_index').touch() + self._captures_index.reload_cache(capture_uuid) @property def capture_uuids(self) -> List[str]: @@ -535,31 +298,7 @@ class Lookyloo(): # No captures at all on the instance return [] - all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids - if (uuid in self._captures_index - and not self._captures_index[uuid].incomplete_redirects)] - - captures_to_get = set(capture_uuids) - set(self._captures_index.keys()) - if captures_to_get: - p = self.redis.pipeline() - for directory in self.redis.hmget('lookup_dirs', *captures_to_get): - if not directory: - continue - p.hgetall(directory) - for uuid, c in zip(captures_to_get, p.execute()): - try: - if not c: - c = self.capture_cache(uuid) - if not c: - continue - else: - c = CaptureCache(c) - except LookylooException as e: - self.logger.warning(e) - continue - if hasattr(c, 'timestamp'): - all_cache.append(c) - self._captures_index[c.uuid] = c + all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid)] all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True) return all_cache @@ -577,15 +316,8 @@ class Lookyloo(): def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]: """Get the cache from redis.""" - if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects: - return self._captures_index[capture_uuid] try: - capture_dir = self._get_capture_dir(capture_uuid) - cached = self.redis.hgetall(str(capture_dir)) - if not cached or cached.get('incomplete_redirects') == '1': - self._set_capture_cache(capture_dir) - else: - self._captures_index[capture_uuid] = CaptureCache(cached) + return self._captures_index[capture_uuid] except MissingCaptureDirectory as e: # The UUID is in the captures but the directory is not on the disk. self.logger.warning(e) @@ -600,17 +332,11 @@ class Lookyloo(): except Exception as e: self.logger.critical(e) return None - else: - return self._captures_index[capture_uuid] def get_crawled_tree(self, capture_uuid: str, /) -> CrawledTree: '''Get the generated tree in ETE Toolkit format. 
        Loads the pickle if it exists, creates it otherwise.'''
-        capture_dir = self._get_capture_dir(capture_uuid)
-        ct = load_pickle_tree(capture_dir)
-        if not ct:
-            ct = self._cache_capture(capture_uuid)
-        return ct
+        return self._captures_index[capture_uuid].tree
 
     def enqueue_capture(self, query: MutableMapping[str, Any], source: str, user: str, authenticated: bool) -> str:
         '''Enqueue a query in the capture queue (used by the UI and the API for asynchronous processing)'''
@@ -923,14 +649,6 @@ class Lookyloo():
         if not cache:
             return {'error': 'UUID missing in cache, try again later.'}
 
-        if cache.incomplete_redirects:
-            ct = self._cache_capture(capture_uuid)
-            cache = self.capture_cache(capture_uuid)
-            if not cache:
-                return {'error': 'UUID missing in cache, try again later.'}
-        else:
-            ct = self.get_crawled_tree(capture_uuid)
-
         event = MISPEvent()
         event.info = f'Lookyloo Capture ({cache.url})'
         lookyloo_link: MISPAttribute = event.add_attribute('link', f'https://{self.public_domain}/tree/{capture_uuid}')  # type: ignore
@@ -939,7 +657,7 @@ class Lookyloo():
 
         initial_url = URLObject(cache.url)
         initial_url.comment = 'Submitted URL'
-        self.__misp_add_ips_to_URLObject(initial_url, ct.root_hartree.hostname_tree)
+        self.__misp_add_ips_to_URLObject(initial_url, cache.tree.root_hartree.hostname_tree)
 
         redirects: List[URLObject] = []
         for nb, url in enumerate(cache.redirects):
@@ -947,7 +665,7 @@ class Lookyloo():
                 continue
             obj = URLObject(url)
             obj.comment = f'Redirect {nb}'
-            self.__misp_add_ips_to_URLObject(obj, ct.root_hartree.hostname_tree)
+            self.__misp_add_ips_to_URLObject(obj, cache.tree.root_hartree.hostname_tree)
             redirects.append(obj)
         if redirects:
             redirects[-1].comment = f'Last redirect ({nb})'
@@ -967,7 +685,7 @@ class Lookyloo():
         screenshot: MISPAttribute = event.add_attribute('attachment', 'screenshot_landing_page.png',
                                                         data=self.get_screenshot(capture_uuid),
                                                         disable_correlation=True)  # type: ignore
         try:
-            fo = FileObject(pseudofile=ct.root_hartree.rendered_node.body, filename=ct.root_hartree.rendered_node.filename)
+            fo = FileObject(pseudofile=cache.tree.root_hartree.rendered_node.body, filename=cache.tree.root_hartree.rendered_node.filename)
             fo.comment = 'Content received for the final redirect (before rendering)'
             fo.add_reference(final_redirect, 'loaded-by', 'URL loading that content')
             fo.add_reference(screenshot, 'rendered-as', 'Screenshot of the page')
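
Taken together, the public entry points now all funnel through CapturesIndex: capture_cache, get_crawled_tree and _get_capture_dir are one-liners over self._captures_index. A hedged end-to-end sketch of the resulting API; the query payload and instance wiring are illustrative, not from the patch:

    lookyloo = Lookyloo()
    uuid = lookyloo.enqueue_capture({'url': 'https://www.example.com'},
                                    source='api', user='demo', authenticated=False)
    # ... later, once the asynchronous capture has been processed ...
    cache = lookyloo.capture_cache(uuid)      # CaptureCache via CapturesIndex
    if cache:
        ct = lookyloo.get_crawled_tree(uuid)  # equivalent to cache.tree
        print(ct.root_hartree.har.path)
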