lookyloo/bin/background_build_captures.py

137 lines
6.3 KiB
Python
Executable File

#!/usr/bin/env python3
from __future__ import annotations
import logging
import logging.config
import shutil
from datetime import datetime, timedelta
from pathlib import Path
from redis import Redis
from lookyloo import Lookyloo
from lookyloo.default import AbstractManager, get_config, get_socket_path
from lookyloo.exceptions import MissingUUID, NoValidHarFile
from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list, get_captures_dir
logging.config.dictConfig(get_config('logging'))
class BackgroundBuildCaptures(AbstractManager):
def __init__(self, loglevel: int | None=None):
super().__init__(loglevel)
self.lookyloo = Lookyloo(cache_max_size=1)
self.script_name = 'background_build_captures'
# make sure discarded captures dir exists
self.captures_dir = get_captures_dir()
self.discarded_captures_dir = self.captures_dir.parent / 'discarded_captures'
self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
# Redis connector so we don't use the one from Lookyloo
self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
def _to_run_forever(self) -> None:
self._build_missing_pickles()
# Don't need the cache in this class.
self.lookyloo.clear_tree_cache()
def _build_missing_pickles(self) -> bool:
self.logger.debug('Build missing pickles...')
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
# This value makes sure we break out of the loop and build pickles of the most recent captures
max_captures = 50
got_new_captures = False
# Initialize time where we do not want to build the pickles anymore.
archive_interval = timedelta(days=get_config('generic', 'archive'))
cut_time = (datetime.now() - archive_interval)
for month_dir in make_dirs_list(self.captures_dir):
__counter_shutdown = 0
for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True):
__counter_shutdown += 1
if __counter_shutdown % 10 and self.shutdown_requested():
self.logger.warning('Shutdown requested, breaking.')
return False
if ((path / 'tree.pickle.gz').exists() or (path / 'tree.pickle').exists()):
# We already have a pickle file
self.logger.debug(f'{path} has a pickle.')
continue
if not list(path.rglob('*.har.gz')) and not list(path.rglob('*.har')):
# No HAR file
self.logger.debug(f'{path} has no HAR file.')
continue
if is_locked(path):
# it is really locked
self.logger.debug(f'{path} is locked, pickle generated by another process.')
continue
with (path / 'uuid').open() as f:
uuid = f.read()
if not self.redis.hexists('lookup_dirs', uuid):
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
self.redis.hset('lookup_dirs', uuid, str(path))
else:
cached_path = Path(self.redis.hget('lookup_dirs', uuid)) # type: ignore[arg-type]
if cached_path != path:
# we have a duplicate UUID, it is proably related to some bad copy/paste
if cached_path.exists():
# Both paths exist, move the one that isn't in lookup_dirs
self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {path}, discarding the latest')
try:
shutil.move(str(path), str(self.discarded_captures_dir / path.name))
except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}')
continue
else:
# The path in lookup_dirs for that UUID doesn't exists, just update it.
self.redis.hset('lookup_dirs', uuid, str(path))
try:
self.logger.info(f'Build pickle for {uuid}: {path.name}')
self.lookyloo.get_crawled_tree(uuid)
try:
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
except Exception as e:
self.logger.exception(f'Unable to trigger modules for {uuid}: {e}')
self.logger.info(f'Pickle for {uuid} built.')
got_new_captures = True
max_captures -= 1
except MissingUUID:
self.logger.warning(f'Unable to find {uuid}. That should not happen.')
except NoValidHarFile as e:
self.logger.critical(f'There are no HAR files in the capture {uuid}: {path.name} - {e}')
except FileNotFoundError:
self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
except Exception:
self.logger.exception(f'Unable to build pickle for {uuid}: {path.name}')
# The capture is not working, moving it away.
try:
shutil.move(str(path), str(self.discarded_captures_dir / path.name))
self.redis.hdel('lookup_dirs', uuid)
except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}')
continue
if max_captures <= 0:
self.logger.info('Too many captures in the backlog, start from the beginning.')
return False
if got_new_captures:
self.logger.info('Finished building all missing pickles.')
# Only return True if we built new pickles.
return True
return False
def main() -> None:
i = BackgroundBuildCaptures()
i.run(sleep_in_sec=60)
if __name__ == '__main__':
main()