mirror of https://github.com/CIRCL/lookyloo
chg: clear cache when it is not needed
parent
935e809112
commit
24cc00fe96
|
@ -9,10 +9,12 @@ import shutil
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from redis import Redis
|
||||||
|
|
||||||
from lookyloo import Lookyloo
|
from lookyloo import Lookyloo
|
||||||
from lookyloo.default import AbstractManager, get_config
|
from lookyloo.default import AbstractManager, get_config, get_socket_path
|
||||||
from lookyloo.exceptions import MissingUUID, NoValidHarFile
|
from lookyloo.exceptions import MissingUUID, NoValidHarFile
|
||||||
from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list
|
from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list, get_captures_dir
|
||||||
|
|
||||||
|
|
||||||
logging.config.dictConfig(get_config('logging'))
|
logging.config.dictConfig(get_config('logging'))
|
||||||
|
@ -25,11 +27,17 @@ class BackgroundBuildCaptures(AbstractManager):
|
||||||
self.lookyloo = Lookyloo()
|
self.lookyloo = Lookyloo()
|
||||||
self.script_name = 'background_build_captures'
|
self.script_name = 'background_build_captures'
|
||||||
# make sure discarded captures dir exists
|
# make sure discarded captures dir exists
|
||||||
self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
|
self.captures_dir = get_captures_dir()
|
||||||
|
self.discarded_captures_dir = self.captures_dir.parent / 'discarded_captures'
|
||||||
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
|
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Redis connector so we don't use the one from Lookyloo
|
||||||
|
self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
|
||||||
|
|
||||||
def _to_run_forever(self) -> None:
|
def _to_run_forever(self) -> None:
|
||||||
self._build_missing_pickles()
|
self._build_missing_pickles()
|
||||||
|
# Don't need the cache in this class.
|
||||||
|
self.lookyloo.clear_tree_cache()
|
||||||
|
|
||||||
def _build_missing_pickles(self) -> bool:
|
def _build_missing_pickles(self) -> bool:
|
||||||
self.logger.debug('Build missing pickles...')
|
self.logger.debug('Build missing pickles...')
|
||||||
|
@ -41,7 +49,7 @@ class BackgroundBuildCaptures(AbstractManager):
|
||||||
# Initialize time where we do not want to build the pickles anymore.
|
# Initialize time where we do not want to build the pickles anymore.
|
||||||
archive_interval = timedelta(days=get_config('generic', 'archive'))
|
archive_interval = timedelta(days=get_config('generic', 'archive'))
|
||||||
cut_time = (datetime.now() - archive_interval)
|
cut_time = (datetime.now() - archive_interval)
|
||||||
for month_dir in make_dirs_list(self.lookyloo.capture_dir):
|
for month_dir in make_dirs_list(self.captures_dir):
|
||||||
__counter_shutdown = 0
|
__counter_shutdown = 0
|
||||||
for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True):
|
for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True):
|
||||||
__counter_shutdown += 1
|
__counter_shutdown += 1
|
||||||
|
@ -65,11 +73,11 @@ class BackgroundBuildCaptures(AbstractManager):
|
||||||
with (path / 'uuid').open() as f:
|
with (path / 'uuid').open() as f:
|
||||||
uuid = f.read()
|
uuid = f.read()
|
||||||
|
|
||||||
if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
|
if not self.redis.hexists('lookup_dirs', uuid):
|
||||||
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
|
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
|
||||||
self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
|
self.redis.hset('lookup_dirs', uuid, str(path))
|
||||||
else:
|
else:
|
||||||
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) # type: ignore[arg-type]
|
cached_path = Path(self.redis.hget('lookup_dirs', uuid)) # type: ignore[arg-type]
|
||||||
if cached_path != path:
|
if cached_path != path:
|
||||||
# we have a duplicate UUID, it is probably related to some bad copy/paste
|
# we have a duplicate UUID, it is probably related to some bad copy/paste
|
||||||
if cached_path.exists():
|
if cached_path.exists():
|
||||||
|
@ -82,12 +90,15 @@ class BackgroundBuildCaptures(AbstractManager):
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
# The path in lookup_dirs for that UUID doesn't exist, just update it.
|
# The path in lookup_dirs for that UUID doesn't exist, just update it.
|
||||||
self.lookyloo.redis.hset('lookup_dirs', uuid, str(path))
|
self.redis.hset('lookup_dirs', uuid, str(path))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.logger.info(f'Build pickle for {uuid}: {path.name}')
|
self.logger.info(f'Build pickle for {uuid}: {path.name}')
|
||||||
self.lookyloo.get_crawled_tree(uuid)
|
self.lookyloo.get_crawled_tree(uuid)
|
||||||
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
|
try:
|
||||||
|
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.exception(f'Unable to trigger modules for {uuid}: {e}')
|
||||||
self.logger.info(f'Pickle for {uuid} built.')
|
self.logger.info(f'Pickle for {uuid} built.')
|
||||||
got_new_captures = True
|
got_new_captures = True
|
||||||
max_captures -= 1
|
max_captures -= 1
|
||||||
|
@ -102,7 +113,7 @@ class BackgroundBuildCaptures(AbstractManager):
|
||||||
# The capture is not working, moving it away.
|
# The capture is not working, moving it away.
|
||||||
try:
|
try:
|
||||||
shutil.move(str(path), str(self.discarded_captures_dir / path.name))
|
shutil.move(str(path), str(self.discarded_captures_dir / path.name))
|
||||||
self.lookyloo.redis.hdel('lookup_dirs', uuid)
|
self.redis.hdel('lookup_dirs', uuid)
|
||||||
except FileNotFoundError as e:
|
except FileNotFoundError as e:
|
||||||
self.logger.warning(f'Unable to move capture: {e}')
|
self.logger.warning(f'Unable to move capture: {e}')
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -4,7 +4,6 @@ from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import logging.config
|
import logging.config
|
||||||
import os
|
|
||||||
|
|
||||||
from lookyloo import Lookyloo, Indexing
|
from lookyloo import Lookyloo, Indexing
|
||||||
from lookyloo.default import AbstractManager, get_config
|
from lookyloo.default import AbstractManager, get_config
|
||||||
|
@ -25,13 +24,11 @@ class BackgroundIndexer(AbstractManager):
|
||||||
self.script_name = 'background_full_indexer'
|
self.script_name = 'background_full_indexer'
|
||||||
else:
|
else:
|
||||||
self.script_name = 'background_indexer'
|
self.script_name = 'background_indexer'
|
||||||
# make sure discarded captures dir exists
|
|
||||||
self.discarded_captures_dir = self.lookyloo.capture_dir.parent / 'discarded_captures'
|
|
||||||
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
def _to_run_forever(self) -> None:
|
def _to_run_forever(self) -> None:
|
||||||
self._check_indexes()
|
self._check_indexes()
|
||||||
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
|
# Don't need the cache in this class.
|
||||||
|
self.lookyloo.clear_tree_cache()
|
||||||
|
|
||||||
def _check_indexes(self) -> None:
|
def _check_indexes(self) -> None:
|
||||||
if not self.indexing.can_index:
|
if not self.indexing.can_index:
|
||||||
|
|
|
@ -145,10 +145,14 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int, logger: Logger) -> C
|
||||||
remove_pickle_tree(capture_dir)
|
remove_pickle_tree(capture_dir)
|
||||||
|
|
||||||
if tree:
|
if tree:
|
||||||
if tree.root_hartree.har.path.exists():
|
try:
|
||||||
return tree
|
if tree.root_hartree.har.path.exists():
|
||||||
else:
|
return tree
|
||||||
# The capture was moved.
|
else:
|
||||||
|
# The capture was moved.
|
||||||
|
remove_pickle_tree(capture_dir)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f'The pickle is broken, removing: {e}')
|
||||||
remove_pickle_tree(capture_dir)
|
remove_pickle_tree(capture_dir)
|
||||||
|
|
||||||
if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
|
if list(capture_dir.rglob('*.har')) or list(capture_dir.rglob('*.har.gz')):
|
||||||
|
@ -239,6 +243,9 @@ class CapturesIndex(Mapping): # type: ignore[type-arg]
|
||||||
def lru_cache_status(self) -> CacheInfo:
|
def lru_cache_status(self) -> CacheInfo:
|
||||||
return load_pickle_tree.cache_info()
|
return load_pickle_tree.cache_info()
|
||||||
|
|
||||||
|
def lru_cache_clear(self) -> None:
|
||||||
|
load_pickle_tree.cache_clear()
|
||||||
|
|
||||||
def _quick_init(self) -> None:
|
def _quick_init(self) -> None:
|
||||||
'''Initialize the cache with a list of UUIDs, with less back and forth with redis.
|
'''Initialize the cache with a list of UUIDs, with less back and forth with redis.
|
||||||
Only get recent captures.'''
|
Only get recent captures.'''
|
||||||
|
|
|
@ -464,6 +464,9 @@ class Lookyloo():
|
||||||
def update_tree_cache_info(self, process_id: int, classname: str) -> None:
|
def update_tree_cache_info(self, process_id: int, classname: str) -> None:
|
||||||
self.redis.hset('tree_cache', f'{process_id}|{classname}', str(self._captures_index.lru_cache_status()))
|
self.redis.hset('tree_cache', f'{process_id}|{classname}', str(self._captures_index.lru_cache_status()))
|
||||||
|
|
||||||
|
def clear_tree_cache(self) -> None:
|
||||||
|
self._captures_index.lru_cache_clear()
|
||||||
|
|
||||||
def sorted_capture_cache(self, capture_uuids: Iterable[str] | None=None, cached_captures_only: bool=True, index_cut_time: datetime | None=None) -> list[CaptureCache]:
|
def sorted_capture_cache(self, capture_uuids: Iterable[str] | None=None, cached_captures_only: bool=True, index_cut_time: datetime | None=None) -> list[CaptureCache]:
|
||||||
'''Get all the captures in the cache, sorted by timestamp (new -> old).
|
'''Get all the captures in the cache, sorted by timestamp (new -> old).
|
||||||
By default, this method will only return the captures that are currently cached.'''
|
By default, this method will only return the captures that are currently cached.'''
|
||||||
|
|
Loading…
Reference in New Issue