chg: clear cache when it is not needed

pull/893/head
Raphaël Vinot 2024-03-08 15:50:47 +01:00
parent 935e809112
commit 24cc00fe96
4 changed files with 37 additions and 19 deletions

View File

@ -9,10 +9,12 @@ import shutil
from datetime import datetime, timedelta from datetime import datetime, timedelta
from pathlib import Path from pathlib import Path
from redis import Redis
from lookyloo import Lookyloo from lookyloo import Lookyloo
from lookyloo.default import AbstractManager, get_config from lookyloo.default import AbstractManager, get_config, get_socket_path
from lookyloo.exceptions import MissingUUID, NoValidHarFile from lookyloo.exceptions import MissingUUID, NoValidHarFile
from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list, get_captures_dir
logging.config.dictConfig(get_config('logging')) logging.config.dictConfig(get_config('logging'))
@ -25,11 +27,17 @@ class BackgroundBuildCaptures(AbstractManager):
self.lookyloo = Lookyloo() self.lookyloo = Lookyloo()
self.script_name = 'background_build_captures' self.script_name = 'background_build_captures'
# make sure discarded captures dir exists # make sure discarded captures dir exists
self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures' self.captures_dir = get_captures_dir()
self.discarded_captures_dir = self.captures_dir.parent / 'discarded_captures'
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True) self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
# Redis connector so we don't use the one from Lookyloo
self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
def _to_run_forever(self) -> None: def _to_run_forever(self) -> None:
self._build_missing_pickles() self._build_missing_pickles()
# Don't need the cache in this class.
self.lookyloo.clear_tree_cache()
def _build_missing_pickles(self) -> bool: def _build_missing_pickles(self) -> bool:
self.logger.debug('Build missing pickles...') self.logger.debug('Build missing pickles...')
@ -41,7 +49,7 @@ class BackgroundBuildCaptures(AbstractManager):
# Initialize time where we do not want to build the pickles anymore. # Initialize time where we do not want to build the pickles anymore.
archive_interval = timedelta(days=get_config('generic', 'archive')) archive_interval = timedelta(days=get_config('generic', 'archive'))
cut_time = (datetime.now() - archive_interval) cut_time = (datetime.now() - archive_interval)
for month_dir in make_dirs_list(self.lookyloo.capture_dir): for month_dir in make_dirs_list(self.captures_dir):
__counter_shutdown = 0 __counter_shutdown = 0
for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True): for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True):
__counter_shutdown += 1 __counter_shutdown += 1
@ -65,11 +73,11 @@ class BackgroundBuildCaptures(AbstractManager):
with (path / 'uuid').open() as f: with (path / 'uuid').open() as f:
uuid = f.read() uuid = f.read()
if not self.lookyloo.redis.hexists('lookup_dirs', uuid): if not self.redis.hexists('lookup_dirs', uuid):
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
self.lookyloo.redis.hset('lookup_dirs', uuid, str(path)) self.redis.hset('lookup_dirs', uuid, str(path))
else: else:
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) # type: ignore[arg-type] cached_path = Path(self.redis.hget('lookup_dirs', uuid)) # type: ignore[arg-type]
if cached_path != path: if cached_path != path:
# we have a duplicate UUID, it is probably related to some bad copy/paste # we have a duplicate UUID, it is probably related to some bad copy/paste
if cached_path.exists(): if cached_path.exists():
@ -82,12 +90,15 @@ class BackgroundBuildCaptures(AbstractManager):
continue continue
else: else:
# The path in lookup_dirs for that UUID doesn't exist, just update it. # The path in lookup_dirs for that UUID doesn't exist, just update it.
self.lookyloo.redis.hset('lookup_dirs', uuid, str(path)) self.redis.hset('lookup_dirs', uuid, str(path))
try: try:
self.logger.info(f'Build pickle for {uuid}: {path.name}') self.logger.info(f'Build pickle for {uuid}: {path.name}')
self.lookyloo.get_crawled_tree(uuid) self.lookyloo.get_crawled_tree(uuid)
self.lookyloo.trigger_modules(uuid, auto_trigger=True) try:
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
except Exception as e:
self.logger.exception(f'Unable to trigger modules for {uuid}: {e}')
self.logger.info(f'Pickle for {uuid} built.') self.logger.info(f'Pickle for {uuid} built.')
got_new_captures = True got_new_captures = True
max_captures -= 1 max_captures -= 1
@ -102,7 +113,7 @@ class BackgroundBuildCaptures(AbstractManager):
# The capture is not working, moving it away. # The capture is not working, moving it away.
try: try:
shutil.move(str(path), str(self.discarded_captures_dir / path.name)) shutil.move(str(path), str(self.discarded_captures_dir / path.name))
self.lookyloo.redis.hdel('lookup_dirs', uuid) self.redis.hdel('lookup_dirs', uuid)
except FileNotFoundError as e: except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}') self.logger.warning(f'Unable to move capture: {e}')
continue continue

View File

@ -4,7 +4,6 @@ from __future__ import annotations
import logging import logging
import logging.config import logging.config
import os
from lookyloo import Lookyloo, Indexing from lookyloo import Lookyloo, Indexing
from lookyloo.default import AbstractManager, get_config from lookyloo.default import AbstractManager, get_config
@ -25,13 +24,11 @@ class BackgroundIndexer(AbstractManager):
self.script_name = 'background_full_indexer' self.script_name = 'background_full_indexer'
else: else:
self.script_name = 'background_indexer' self.script_name = 'background_indexer'
# make sure discarded captures dir exists
self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
def _to_run_forever(self) -> None: def _to_run_forever(self) -> None:
self._check_indexes() self._check_indexes()
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name) # Don't need the cache in this class.
self.lookyloo.clear_tree_cache()
def _check_indexes(self) -> None: def _check_indexes(self) -> None:
if not self.indexing.can_index: if not self.indexing.can_index:

View File

@ -145,10 +145,14 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> C
remove_pickle_tree(capture_dir) remove_pickle_tree(capture_dir)
if tree: if tree:
if tree.root_hartree.har.path.exists(): try:
return tree if tree.root_hartree.har.path.exists():
else: return tree
# The capture was moved. else:
# The capture was moved.
remove_pickle_tree(capture_dir)
except Exception as e:
logger.warning(f'The pickle is broken, removing: {e}')
remove_pickle_tree(capture_dir) remove_pickle_tree(capture_dir)
if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')): if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
@ -239,6 +243,9 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
def lru_cache_status(self) -> CacheInfo: def lru_cache_status(self) -> CacheInfo:
return load_pickle_tree.cache_info() return load_pickle_tree.cache_info()
def lru_cache_clear(self) -> None:
load_pickle_tree.cache_clear()
def _quick_init(self) -> None: def _quick_init(self) -> None:
'''Initialize the cache with a list of UUIDs, with less back and forth with redis. '''Initialize the cache with a list of UUIDs, with less back and forth with redis.
Only get recent captures.''' Only get recent captures.'''

View File

@ -464,6 +464,9 @@ class Lookyloo():
def update_tree_cache_info(self, process_id: int, classname: str) -> None: def update_tree_cache_info(self, process_id: int, classname: str) -> None:
self.redis.hset('tree_cache', f'{process_id}|{classname}', str(self._captures_index.lru_cache_status())) self.redis.hset('tree_cache', f'{process_id}|{classname}', str(self._captures_index.lru_cache_status()))
def clear_tree_cache(self) -> None:
self._captures_index.lru_cache_clear()
def sorted_capture_cache(self, capture_uuids: Iterable[str] | None=None, cached_captures_only: bool=True, index_cut_time: datetime | None=None) -> list[CaptureCache]: def sorted_capture_cache(self, capture_uuids: Iterable[str] | None=None, cached_captures_only: bool=True, index_cut_time: datetime | None=None) -> list[CaptureCache]:
'''Get all the captures in the cache, sorted by timestamp (new -> old). '''Get all the captures in the cache, sorted by timestamp (new -> old).
By default, this method will only return the captures that are currently cached.''' By default, this method will only return the captures that are currently cached.'''