mirror of https://github.com/CIRCL/lookyloo
fix: Speedup generating pickles in BG
parent
6e1e4d831c
commit
2920f796fe
|
@ -5,8 +5,9 @@ import logging.config
|
||||||
import os
|
import os
|
||||||
import shutil
|
import shutil
|
||||||
|
|
||||||
|
from datetime import date
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional, List
|
||||||
|
|
||||||
from lookyloo.default import AbstractManager, get_config
|
from lookyloo.default import AbstractManager, get_config
|
||||||
from lookyloo.exceptions import MissingUUID, NoValidHarFile
|
from lookyloo.exceptions import MissingUUID, NoValidHarFile
|
||||||
|
@ -33,13 +34,29 @@ class BackgroundIndexer(AbstractManager):
|
||||||
self._check_indexes()
|
self._check_indexes()
|
||||||
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
|
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
|
||||||
|
|
||||||
|
def _make_dirs_list(self, root_dir: Path) -> List[Path]:
|
||||||
|
directories = []
|
||||||
|
year_now = date.today().year
|
||||||
|
while True:
|
||||||
|
year_dir = root_dir / str(year_now)
|
||||||
|
if not year_dir.exists():
|
||||||
|
# if we do not have a directory with this year, quit the loop
|
||||||
|
break
|
||||||
|
for month in range(12, 0, -1):
|
||||||
|
month_dir = year_dir / f'{month:02}'
|
||||||
|
if month_dir.exists():
|
||||||
|
directories.append(month_dir)
|
||||||
|
year_now -= 1
|
||||||
|
return directories
|
||||||
|
|
||||||
def _build_missing_pickles(self) -> bool:
|
def _build_missing_pickles(self) -> bool:
|
||||||
self.logger.debug('Build missing pickles...')
|
self.logger.debug('Build missing pickles...')
|
||||||
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
|
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
|
||||||
# This value makes sure we break out of the loop and build pickles of the most recent captures
|
# This value makes sure we break out of the loop and build pickles of the most recent captures
|
||||||
max_captures = 50
|
max_captures = 50
|
||||||
got_new_captures = False
|
got_new_captures = False
|
||||||
for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/*/*/uuid'), reverse=True):
|
for month_dir in self._make_dirs_list(self.lookyloo.capture_dir):
|
||||||
|
for uuid_path in sorted(month_dir.glob('*/uuid'), reverse=True):
|
||||||
if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()):
|
if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()):
|
||||||
# We already have a pickle file
|
# We already have a pickle file
|
||||||
self.logger.debug(f'{uuid_path.parent} has a pickle.')
|
self.logger.debug(f'{uuid_path.parent} has a pickle.')
|
||||||
|
|
|
@ -1164,7 +1164,7 @@ class Lookyloo():
|
||||||
# and it has been archived.
|
# and it has been archived.
|
||||||
self.get_crawled_tree(capture_uuid)
|
self.get_crawled_tree(capture_uuid)
|
||||||
|
|
||||||
# if the file submitted on lookyloo cannot be displayed (PDF), it willbe downloaded.
|
# if the file submitted on lookyloo cannot be displayed (PDF), it will be downloaded.
|
||||||
# In the case, we want to have it as a FileObject in the export
|
# In the case, we want to have it as a FileObject in the export
|
||||||
filename, pseudofile = self.get_data(capture_uuid)
|
filename, pseudofile = self.get_data(capture_uuid)
|
||||||
if filename:
|
if filename:
|
||||||
|
|
Loading…
Reference in New Issue