lookyloo/bin/background_build_captures.py

137 lines
6.3 KiB
Python
Raw Normal View History

2024-03-05 21:03:36 +01:00
#!/usr/bin/env python3
from __future__ import annotations
import logging
import logging.config
import shutil
from datetime import datetime, timedelta
from pathlib import Path
2024-03-08 15:50:47 +01:00
from redis import Redis
2024-03-05 21:03:36 +01:00
from lookyloo import Lookyloo
2024-03-08 15:50:47 +01:00
from lookyloo.default import AbstractManager, get_config, get_socket_path
2024-03-05 21:03:36 +01:00
from lookyloo.exceptions import MissingUUID, NoValidHarFile
2024-03-08 15:50:47 +01:00
from lookyloo.helpers import is_locked, get_sorted_captures_from_disk, make_dirs_list, get_captures_dir
2024-03-05 21:03:36 +01:00
# Apply the project-wide logging configuration (from the 'logging' config
# section) at import time, so every logger in this process is set up before
# the manager starts emitting messages.
logging.config.dictConfig(get_config('logging'))
class BackgroundBuildCaptures(AbstractManager):
    """Background manager that builds the tree pickles missing on disk.

    Walks the captures directory (most recent captures first), builds a
    pickle for every capture that has HAR files but no tree pickle yet,
    and keeps the 'lookup_dirs' redis hash in sync with what is actually
    on disk (registering missing entries, discarding duplicate UUIDs).
    """

    def __init__(self, loglevel: int | None=None):
        super().__init__(loglevel)
        self.lookyloo = Lookyloo(cache_max_size=1)
        self.script_name = 'background_build_captures'
        # make sure discarded captures dir exists
        self.captures_dir = get_captures_dir()
        self.discarded_captures_dir = self.captures_dir.parent / 'discarded_captures'
        self.discarded_captures_dir.mkdir(parents=True, exist_ok=True)
        # Redis connector so we don't use the one from Lookyloo
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

    def _to_run_forever(self) -> None:
        self._build_missing_pickles()
        # Don't need the cache in this class.
        self.lookyloo.clear_tree_cache()

    def _build_missing_pickles(self) -> bool:
        """Build the pickles for captures that are missing one.

        Returns True only when the whole backlog was processed and at least
        one new pickle was built; returns False when nothing was built, a
        shutdown was requested, or the backlog was too big (so the caller
        restarts from the most recent captures on the next run).
        """
        self.logger.debug('Build missing pickles...')
        # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
        # This value makes sure we break out of the loop and build pickles of the most recent captures
        max_captures = 50
        got_new_captures = False
        # Initialize time where we do not want to build the pickles anymore.
        archive_interval = timedelta(days=get_config('generic', 'archive'))
        cut_time = (datetime.now() - archive_interval)
        for month_dir in make_dirs_list(self.captures_dir):
            __counter_shutdown = 0
            for capture_time, path in sorted(get_sorted_captures_from_disk(month_dir, cut_time=cut_time, keep_more_recent=True), reverse=True):
                __counter_shutdown += 1
                # Poll for shutdown only every 10th capture to keep the loop cheap.
                # (Was `__counter_shutdown % 10`, which inverted the throttle and
                # polled on 9 iterations out of 10.)
                if __counter_shutdown % 10 == 0 and self.shutdown_requested():
                    self.logger.warning('Shutdown requested, breaking.')
                    return False
                if (path / 'tree.pickle.gz').exists() or (path / 'tree.pickle').exists():
                    # We already have a pickle file
                    self.logger.debug(f'{path} has a pickle.')
                    continue
                if not list(path.rglob('*.har.gz')) and not list(path.rglob('*.har')):
                    # No HAR file, nothing to build a tree from
                    self.logger.debug(f'{path} has no HAR file.')
                    continue
                if is_locked(path):
                    # it is really locked
                    self.logger.debug(f'{path} is locked, pickle generated by another process.')
                    continue
                with (path / 'uuid').open() as f:
                    # strip() guards against a trailing newline in the uuid file,
                    # which would otherwise corrupt the redis keys below.
                    uuid = f.read().strip()
                if not self.redis.hexists('lookup_dirs', uuid):
                    # The capture with this UUID exists, but it is for some reason missing in lookup_dirs
                    self.redis.hset('lookup_dirs', uuid, str(path))
                else:
                    cached_path = Path(self.redis.hget('lookup_dirs', uuid))  # type: ignore[arg-type]
                    if cached_path != path:
                        # we have a duplicate UUID, it is probably related to some bad copy/paste
                        if cached_path.exists():
                            # Both paths exist, move the one that isn't in lookup_dirs
                            self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {path}, discarding the latest')
                            try:
                                shutil.move(str(path), str(self.discarded_captures_dir / path.name))
                            except FileNotFoundError as e:
                                self.logger.warning(f'Unable to move capture: {e}')
                            continue
                        else:
                            # The path in lookup_dirs for that UUID doesn't exist, just update it.
                            self.redis.hset('lookup_dirs', uuid, str(path))
                try:
                    self.logger.info(f'Build pickle for {uuid}: {path.name}')
                    self.lookyloo.get_crawled_tree(uuid)
                    try:
                        self.lookyloo.trigger_modules(uuid, auto_trigger=True)
                    except Exception as e:
                        # Best effort: a failing module must not prevent the pickle build.
                        self.logger.exception(f'Unable to trigger modules for {uuid}: {e}')
                    self.logger.info(f'Pickle for {uuid} built.')
                    got_new_captures = True
                    max_captures -= 1
                except MissingUUID:
                    self.logger.warning(f'Unable to find {uuid}. That should not happen.')
                except NoValidHarFile as e:
                    self.logger.critical(f'There are no HAR files in the capture {uuid}: {path.name} - {e}')
                except FileNotFoundError:
                    self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
                except Exception:
                    self.logger.exception(f'Unable to build pickle for {uuid}: {path.name}')
                    # The capture is not working, moving it away.
                    try:
                        shutil.move(str(path), str(self.discarded_captures_dir / path.name))
                        self.redis.hdel('lookup_dirs', uuid)
                    except FileNotFoundError as e:
                        self.logger.warning(f'Unable to move capture: {e}')
                        continue
                if max_captures <= 0:
                    self.logger.info('Too many captures in the backlog, start from the beginning.')
                    return False
        if got_new_captures:
            self.logger.info('Finished building all missing pickles.')
            # Only return True if we built new pickles.
            return True
        return False
def main() -> None:
    """Entry point: start the background capture builder, waking up every minute."""
    manager = BackgroundBuildCaptures()
    manager.run(sleep_in_sec=60)


if __name__ == '__main__':
    main()