From 2920f796feb2e45d7f60b9a2473e627caf4b3c88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Mon, 9 Oct 2023 10:26:37 +0200 Subject: [PATCH] fix: Speedup generating pickles in BG --- bin/background_indexer.py | 141 +++++++++++++++++++++----------------- lookyloo/lookyloo.py | 2 +- 2 files changed, 80 insertions(+), 63 deletions(-) diff --git a/bin/background_indexer.py b/bin/background_indexer.py index 5238049b..441c27de 100755 --- a/bin/background_indexer.py +++ b/bin/background_indexer.py @@ -5,8 +5,9 @@ import logging.config import os import shutil +from datetime import date from pathlib import Path -from typing import Optional +from typing import Optional, List from lookyloo.default import AbstractManager, get_config from lookyloo.exceptions import MissingUUID, NoValidHarFile @@ -33,74 +34,90 @@ class BackgroundIndexer(AbstractManager): self._check_indexes() self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name) + def _make_dirs_list(self, root_dir: Path) -> List[Path]: + directories = [] + year_now = date.today().year + while True: + year_dir = root_dir / str(year_now) + if not year_dir.exists(): + # if we do not have a directory with this year, quit the loop + break + for month in range(12, 0, -1): + month_dir = year_dir / f'{month:02}' + if month_dir.exists(): + directories.append(month_dir) + year_now -= 1 + return directories + def _build_missing_pickles(self) -> bool: self.logger.debug('Build missing pickles...') # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time # This value makes sure we break out of the loop and build pickles of the most recent captures max_captures = 50 got_new_captures = False - for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/*/*/uuid'), reverse=True): - if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()): - # We already have a pickle file - self.logger.debug(f'{uuid_path.parent} has a pickle.') - continue - if not list(uuid_path.parent.rglob('*.har.gz')) and not list(uuid_path.parent.rglob('*.har')): - # No HAR file - self.logger.debug(f'{uuid_path.parent} has no HAR file.') - continue - - if is_locked(uuid_path.parent): - # it is really locked - self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.') - continue - - with uuid_path.open() as f: - uuid = f.read() - - if not self.lookyloo.redis.hexists('lookup_dirs', uuid): - # The capture with this UUID exists, but it is for some reason missing in lookup_dirs - self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent)) - else: - cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) - if cached_path != uuid_path.parent: - # we have a duplicate UUID, it is proably related to some bad copy/paste - if cached_path.exists(): - # Both paths exist, move the one that isn't in lookup_dirs - self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest') - try: - shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name)) - except FileNotFoundError as e: - self.logger.warning(f'Unable to move capture: {e}') - continue - else: - # The path in lookup_dirs for that UUID doesn't exists, just update it. - self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent)) - - try: - self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}') - self.lookyloo.get_crawled_tree(uuid) - self.lookyloo.trigger_modules(uuid, auto_trigger=True) - self.logger.info(f'Pickle for {uuid} build.') - got_new_captures = True - max_captures -= 1 - except MissingUUID: - self.logger.warning(f'Unable to find {uuid}. That should not happen.') - except NoValidHarFile as e: - self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}') - except FileNotFoundError: - self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.') - except Exception: - self.logger.exception(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}') - # The capture is not working, moving it away. - try: - shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name)) - self.lookyloo.redis.hdel('lookup_dirs', uuid) - except FileNotFoundError as e: - self.logger.warning(f'Unable to move capture: {e}') + for month_dir in self._make_dirs_list(self.lookyloo.capture_dir): + for uuid_path in sorted(month_dir.glob('*/uuid'), reverse=True): + if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()): + # We already have a pickle file + self.logger.debug(f'{uuid_path.parent} has a pickle.') continue - if max_captures <= 0: - self.logger.info('Too many captures in the backlog, start from the beginning.') - return False + if not list(uuid_path.parent.rglob('*.har.gz')) and not list(uuid_path.parent.rglob('*.har')): + # No HAR file + self.logger.debug(f'{uuid_path.parent} has no HAR file.') + continue + + if is_locked(uuid_path.parent): + # it is really locked + self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.') + continue + + with uuid_path.open() as f: + uuid = f.read() + + if not self.lookyloo.redis.hexists('lookup_dirs', uuid): + # The capture with this UUID exists, but it is for some reason missing in lookup_dirs + self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent)) + else: + cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid)) + if cached_path != uuid_path.parent: + # we have a duplicate UUID, it is proably related to some bad copy/paste + if cached_path.exists(): + # Both paths exist, move the one that isn't in lookup_dirs + self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest') + try: + shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name)) + except FileNotFoundError as e: + self.logger.warning(f'Unable to move capture: {e}') + continue + else: + # The path in lookup_dirs for that UUID doesn't exists, just update it. + self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent)) + + try: + self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}') + self.lookyloo.get_crawled_tree(uuid) + self.lookyloo.trigger_modules(uuid, auto_trigger=True) + self.logger.info(f'Pickle for {uuid} build.') + got_new_captures = True + max_captures -= 1 + except MissingUUID: + self.logger.warning(f'Unable to find {uuid}. That should not happen.') + except NoValidHarFile as e: + self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}') + except FileNotFoundError: + self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.') + except Exception: + self.logger.exception(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}') + # The capture is not working, moving it away. + try: + shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name)) + self.lookyloo.redis.hdel('lookup_dirs', uuid) + except FileNotFoundError as e: + self.logger.warning(f'Unable to move capture: {e}') + continue + if max_captures <= 0: + self.logger.info('Too many captures in the backlog, start from the beginning.') + return False if got_new_captures: self.logger.info('Finished building all missing pickles.') # Only return True if we built new pickles. diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 7b7cb074..350b47dd 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -1164,7 +1164,7 @@ class Lookyloo(): # and it has been archived. self.get_crawled_tree(capture_uuid) - # if the file submitted on lookyloo cannot be displayed (PDF), it willbe downloaded. + # if the file submitted on lookyloo cannot be displayed (PDF), it will be downloaded. # In the case, we want to have it as a FileObject in the export filename, pseudofile = self.get_data(capture_uuid) if filename: