diff --git a/bin/background_build_captures.py b/bin/background_build_captures.py
index cbc153d..53847cb 100755
--- a/bin/background_build_captures.py
+++ b/bin/background_build_captures.py
@@ -9,10 +9,12 @@ import shutil
 from datetime import datetime, timedelta
 from pathlib import Path
 
+from redis import Redis
+
 from lookyloo import Lookyloo
-from lookyloo.default import AbstractManager, get_config
+from lookyloo.default import AbstractManager, get_config, get_socket_path
 from lookyloo.exceptions import MissingUUID, NoValidHarFile
-from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list
+from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list, get_captures_dir
 
 logging.config.dictConfig(get_config('logging'))
 
@@ -25,11 +27,17 @@ class BackgroundBuildCaptures(AbstractManager):
         self.lookyloo = Lookyloo()
         self.script_name = 'background_build_captures'
         # make sure discarded captures dir exists
-        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
+        self.captures_dir = get_captures_dir()
+        self.discarded_captures_dir = self.captures_dir.parent / 'discarded_captures'
         self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
 
+        # Redis connector so we don't use the one from Lookyloo
+        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
+
     def _to_run_forever(self) -> None:
         self._build_missing_pickles()
+        # Don't need the cache in this class.
+        self.lookyloo.clear_tree_cache()
 
     def _build_missing_pickles(self) -> bool:
         self.logger.debug('Build missing pickles...')
@@ -41,7 +49,7 @@ class BackgroundBuildCaptures(AbstractManager):
         # Initialize time where we do not want to build the pickles anymore.
         archive_interval = timedelta(days=get_config('generic', 'archive'))
         cut_time = (datetime.now() - archive_interval)
-        for month_dir in make_dirs_list(self.lookyloo.capture_dir):
+        for month_dir in make_dirs_list(self.captures_dir):
             __counter_shutdown = 0
             for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True):
                 __counter_shutdown += 1
@@ -65,11 +73,11 @@ class BackgroundBuildCaptures(AbstractManager):
                 with (path / 'uuid').open() as f:
                     uuid = f.read()
 
-                if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
+                if not self.redis.hexists('lookup_dirs', uuid):
                     # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
-                    self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
+                    self.redis.hset('lookup_dirs', uuid, str(path))
                 else:
-                    cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))  # type: ignore[arg-type]
+                    cached_path = Path(self.redis.hget('lookup_dirs', uuid))  # type: ignore[arg-type]
                     if cached_path != path:
                         # we have a duplicate UUID, it is proably related to some bad copy/paste
                         if cached_path.exists():
@@ -82,12 +90,15 @@ class BackgroundBuildCaptures(AbstractManager):
                             continue
                         else:
                             # The path in lookup_dirs for that UUID doesn't exists, just update it.
-                            self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
+                            self.redis.hset('lookup_dirs', uuid, str(path))
 
                 try:
                     self.logger.info(f'Build pickle for {uuid}: {path.name}')
                     self.lookyloo.get_crawled_tree(uuid)
-                    self.lookyloo.trigger_modules(uuid, auto_trigger=True)
+                    try:
+                        self.lookyloo.trigger_modules(uuid, auto_trigger=True)
+                    except Exception as e:
+                        self.logger.exception(f'Unable to trigger modules for {uuid}: {e}')
                     self.logger.info(f'Pickle for {uuid} built.')
                     got_new_captures = True
                     max_captures -= 1
@@ -102,7 +113,7 @@ class BackgroundBuildCaptures(AbstractManager):
                     # The capture is not working, moving it away.
                     try:
                         shutil.move(str(path), str(self.discarded_captures_dir / path.name))
-                        self.lookyloo.redis.hdel('lookup_dirs', uuid)
+                        self.redis.hdel('lookup_dirs', uuid)
                     except FileNotFoundError as e:
                         self.logger.warning(f'Unable to move capture: {e}')
                     continue
diff --git a/bin/background_indexer.py b/bin/background_indexer.py
index f02249a..b82a15a 100755
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -4,7 +4,6 @@ from __future__ import annotations
 
 import logging
 import logging.config
-import os
 
 from lookyloo import Lookyloo, Indexing
 from lookyloo.default import AbstractManager, get_config
@@ -25,13 +24,11 @@ class BackgroundIndexer(AbstractManager):
             self.script_name = 'background_full_indexer'
         else:
             self.script_name = 'background_indexer'
-        # make sure discarded captures dir exists
-        self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
-        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
 
     def _to_run_forever(self) -> None:
         self._check_indexes()
-        self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
+        # Don't need the cache in this class.
+        self.lookyloo.clear_tree_cache()
 
     def _check_indexes(self) -> None:
         if not self.indexing.can_index:
diff --git a/lookyloo/capturecache.py b/lookyloo/capturecache.py
index 60cbcfc..d9b839e 100644
--- a/lookyloo/capturecache.py
+++ b/lookyloo/capturecache.py
@@ -145,10 +145,14 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> C
             remove_pickle_tree(capture_dir)
 
     if tree:
-        if tree.root_hartree.har.path.exists():
-            return tree
-        else:
-            # The capture was moved.
+        try:
+            if tree.root_hartree.har.path.exists():
+                return tree
+            else:
+                # The capture was moved.
+                remove_pickle_tree(capture_dir)
+        except Exception as e:
+            logger.warning(f'The pickle is broken, removing: {e}')
             remove_pickle_tree(capture_dir)
 
     if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
@@ -239,6 +243,9 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
     def lru_cache_status(self) -> CacheInfo:
         return load_pickle_tree.cache_info()
 
+    def lru_cache_clear(self) -> None:
+        load_pickle_tree.cache_clear()
+
     def _quick_init(self) -> None:
         '''Initialize the cache with a list of UUIDs, with less back and forth with redis.
         Only get recent captures.'''
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index f4c3513..aee0b2c 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -464,6 +464,9 @@ class Lookyloo():
     def update_tree_cache_info(self, process_id: int, classname: str) -> None:
         self.redis.hset('tree_cache', f'{process_id}|{classname}', str(self._captures_index.lru_cache_status()))
 
+    def clear_tree_cache(self) -> None:
+        self._captures_index.lru_cache_clear()
+
     def sorted_capture_cache(self, capture_uuids: Iterable[str] | None=None, cached_captures_only: bool=True, index_cut_time: datetime | None=None) -> list[CaptureCache]:
         '''Get all the captures in the cache, sorted by timestamp (new -> old).
         By default, this method will only return the captures that are currently cached.'''
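
Note on the new clear_tree_cache() -> lru_cache_clear() plumbing: load_pickle_tree is memoized with functools.lru_cache (the patch already calls its cache_info() and now its cache_clear()), so clearing the tree cache is simply the standard cache_clear() that every lru_cache-wrapped function exposes. A minimal, self-contained sketch of that mechanism; load_tree below is a hypothetical stand-in for load_pickle_tree and is not part of this patch:

    from functools import lru_cache


    @lru_cache(maxsize=64)
    def load_tree(capture_dir: str) -> str:
        # Stand-in for the expensive pickle load, keyed by capture directory.
        return f'tree for {capture_dir}'


    load_tree('some_capture_dir')
    print(load_tree.cache_info())   # CacheInfo(hits=0, misses=1, maxsize=64, currsize=1)
    load_tree.cache_clear()         # what CapturesIndex.lru_cache_clear() does for load_pickle_tree
    print(load_tree.cache_info())   # CacheInfo(hits=0, misses=0, maxsize=64, currsize=0)

The background scripts build each tree once and never revisit it, which is presumably why they now call clear_tree_cache() after every pass instead of reporting cache statistics via update_tree_cache_info().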