fix: Speedup generating pickles in BG

pull/800/head
Raphaël Vinot 2023-10-09 10:26:37 +02:00
parent 6e1e4d831c
commit 2920f796fe
2 changed files with 80 additions and 63 deletions

View File

@@ -5,8 +5,9 @@ import logging.config
import os import os
import shutil import shutil
from datetime import date
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Optional, List
from lookyloo.default import AbstractManager, get_config from lookyloo.default import AbstractManager, get_config
from lookyloo.exceptions import MissingUUID, NoValidHarFile from lookyloo.exceptions import MissingUUID, NoValidHarFile
@ -33,13 +34,29 @@ class BackgroundIndexer(AbstractManager):
self._check_indexes() self._check_indexes()
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name) self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
def _make_dirs_list(self, root_dir: Path) -> List[Path]:
directories = []
year_now = date.today().year
while True:
year_dir = root_dir / str(year_now)
if not year_dir.exists():
# if we do not have a directory with this year, quit the loop
break
for month in range(12, 0, -1):
month_dir = year_dir / f'{month:02}'
if month_dir.exists():
directories.append(month_dir)
year_now -= 1
return directories
def _build_missing_pickles(self) -> bool: def _build_missing_pickles(self) -> bool:
self.logger.debug('Build missing pickles...') self.logger.debug('Build missing pickles...')
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
# This value makes sure we break out of the loop and build pickles of the most recent captures # This value makes sure we break out of the loop and build pickles of the most recent captures
max_captures = 50 max_captures = 50
got_new_captures = False got_new_captures = False
for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/*/*/uuid'), reverse=True): for month_dir in self._make_dirs_list(self.lookyloo.capture_dir):
for uuid_path in sorted(month_dir.glob('*/uuid'), reverse=True):
if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()): if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()):
# We already have a pickle file # We already have a pickle file
self.logger.debug(f'{uuid_path.parent} has a pickle.') self.logger.debug(f'{uuid_path.parent} has a pickle.')

View File

@@ -1164,7 +1164,7 @@ class Lookyloo():
# and it has been archived. # and it has been archived.
self.get_crawled_tree(capture_uuid) self.get_crawled_tree(capture_uuid)
# if the file submitted on lookyloo cannot be displayed (PDF), it willbe downloaded. # if the file submitted on lookyloo cannot be displayed (PDF), it will be downloaded.
# In the case, we want to have it as a FileObject in the export # In the case, we want to have it as a FileObject in the export
filename, pseudofile = self.get_data(capture_uuid) filename, pseudofile = self.get_data(capture_uuid)
if filename: if filename: