fix: Speedup generating pickles in BG

pull/800/head
Raphaël Vinot 2023-10-09 10:26:37 +02:00
parent 6e1e4d831c
commit 2920f796fe
2 changed files with 80 additions and 63 deletions

View File

@ -5,8 +5,9 @@ import logging.config
import os import os
import shutil import shutil
from datetime import date
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional, List
from lookyloo.default import AbstractManager, get_config from lookyloo.default import AbstractManager, get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile from lookyloo.exceptions import MissingUUID, NoValidHarFile
@ -33,74 +34,90 @@ class BackgroundIndexer(AbstractManager):
self._check_indexes() self._check_indexes()
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name) self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
def _make_dirs_list(self, root_dir: Path) -> List[Path]:
directories = []
year_now = date.today().year
while True:
year_dir = root_dir / str(year_now)
if not year_dir.exists():
# if we do not have a directory with this year, quit the loop
break
for month in range(12, 0, -1):
month_dir = year_dir / f'{month:02}'
if month_dir.exists():
directories.append(month_dir)
year_now -= 1
return directories
def _build_missing_pickles(self) -> bool: def _build_missing_pickles(self) -> bool:
self.logger.debug('Build missing pickles...') self.logger.debug('Build missing pickles...')
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
# This value makes sure we break out of the loop and build pickles of the most recent captures # This value makes sure we break out of the loop and build pickles of the most recent captures
max_captures = 50 max_captures = 50
got_new_captures = False got_new_captures = False
for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/*/*/uuid'), reverse=True): for month_dir in self._make_dirs_list(self.lookyloo.capture_dir):
if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()): for uuid_path in sorted(month_dir.glob('*/uuid'), reverse=True):
# We already have a pickle file if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()):
self.logger.debug(f'{uuid_path.parent} has a pickle.') # We already have a pickle file
continue self.logger.debug(f'{uuid_path.parent} has a pickle.')
if not list(uuid_path.parent.rglob('*.har.gz')) and not list(uuid_path.parent.rglob('*.har')):
# No HAR file
self.logger.debug(f'{uuid_path.parent} has no HAR file.')
continue
if is_locked(uuid_path.parent):
# it is really locked
self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.')
continue
with uuid_path.open() as f:
uuid = f.read()
if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
else:
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
if cached_path != uuid_path.parent:
# we have a duplicate UUID, it is probably related to some bad copy/paste
if cached_path.exists():
# Both paths exist, move the one that isn't in lookup_dirs
self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest')
try:
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}')
continue
else:
# The path in lookup_dirs for that UUID doesn't exist, just update it.
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
try:
self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
self.lookyloo.get_crawled_tree(uuid)
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
self.logger.info(f'Pickle for {uuid} build.')
got_new_captures = True
max_captures -= 1
except MissingUUID:
self.logger.warning(f'Unable to find {uuid}. That should not happen.')
except NoValidHarFile as e:
self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}')
except FileNotFoundError:
self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
except Exception:
self.logger.exception(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
# The capture is not working, moving it away.
try:
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
self.lookyloo.redis.hdel('lookup_dirs', uuid)
except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}')
continue continue
if max_captures <= 0: if not list(uuid_path.parent.rglob('*.har.gz')) and not list(uuid_path.parent.rglob('*.har')):
self.logger.info('Too many captures in the backlog, start from the beginning.') # No HAR file
return False self.logger.debug(f'{uuid_path.parent} has no HAR file.')
continue
if is_locked(uuid_path.parent):
# it is really locked
self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.')
continue
with uuid_path.open() as f:
uuid = f.read()
if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
else:
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
if cached_path != uuid_path.parent:
# we have a duplicate UUID, it is probably related to some bad copy/paste
if cached_path.exists():
# Both paths exist, move the one that isn't in lookup_dirs
self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest')
try:
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}')
continue
else:
# The path in lookup_dirs for that UUID doesn't exist, just update it.
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
try:
self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
self.lookyloo.get_crawled_tree(uuid)
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
self.logger.info(f'Pickle for {uuid} build.')
got_new_captures = True
max_captures -= 1
except MissingUUID:
self.logger.warning(f'Unable to find {uuid}. That should not happen.')
except NoValidHarFile as e:
self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}')
except FileNotFoundError:
self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
except Exception:
self.logger.exception(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
# The capture is not working, moving it away.
try:
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
self.lookyloo.redis.hdel('lookup_dirs', uuid)
except FileNotFoundError as e:
self.logger.warning(f'Unable to move capture: {e}')
continue
if max_captures <= 0:
self.logger.info('Too many captures in the backlog, start from the beginning.')
return False
if got_new_captures: if got_new_captures:
self.logger.info('Finished building all missing pickles.') self.logger.info('Finished building all missing pickles.')
# Only return True if we built new pickles. # Only return True if we built new pickles.

View File

@ -1164,7 +1164,7 @@ class Lookyloo():
# and it has been archived. # and it has been archived.
self.get_crawled_tree(capture_uuid) self.get_crawled_tree(capture_uuid)
# if the file submitted on lookyloo cannot be displayed (PDF), it willbe downloaded. # if the file submitted on lookyloo cannot be displayed (PDF), it will be downloaded.
# In the case, we want to have it as a FileObject in the export # In the case, we want to have it as a FileObject in the export
filename, pseudofile = self.get_data(capture_uuid) filename, pseudofile = self.get_data(capture_uuid)
if filename: if filename: