mirror of https://github.com/CIRCL/lookyloo
fix: Speedup generating pickles in BG
parent
6e1e4d831c
commit
2920f796fe
lookyloo
|
@ -5,8 +5,9 @@ import logging.config
|
|||
import os
|
||||
import shutil
|
||||
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Optional, List
|
||||
|
||||
from lookyloo.default import AbstractManager, get_config
|
||||
from lookyloo.exceptions import MissingUUID, NoValidHarFile
|
||||
|
@ -33,74 +34,90 @@ class BackgroundIndexer(AbstractManager):
|
|||
self._check_indexes()
|
||||
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
|
||||
|
||||
def _make_dirs_list(self, root_dir: Path) -> List[Path]:
    """List the existing ``<root_dir>/<year>/<month>`` directories, newest first.

    Years are visited in descending order, and months from 12 down to 01
    inside each year. Only year directories whose name is a number not
    greater than the current year are considered.

    Unlike a plain countdown from the current year that stops at the first
    missing year, this looks at the year directories actually present. A
    missing directory for the current year (e.g. no capture yet in January)
    or a gap between two years therefore no longer hides the older months.

    :param root_dir: Directory containing one sub-directory per year.
    :return: The month directories that exist, most recent first. Empty if
             root_dir is not a directory or holds no matching year.
    """
    if not root_dir.is_dir():
        return []

    current_year = date.today().year
    # Keep the Path entries (not just the numbers) so the exact on-disk
    # directory name is reused when building the month paths.
    year_dirs = sorted(
        (entry for entry in root_dir.iterdir()
         if entry.name.isdecimal() and entry.is_dir() and int(entry.name) <= current_year),
        key=lambda entry: int(entry.name),
        reverse=True)

    directories: List[Path] = []
    for year_dir in year_dirs:
        for month in range(12, 0, -1):
            month_dir = year_dir / f'{month:02}'
            if month_dir.exists():
                directories.append(month_dir)
    return directories
|
||||
|
||||
def _build_missing_pickles(self) -> bool:
|
||||
self.logger.debug('Build missing pickles...')
|
||||
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
|
||||
# This value makes sure we break out of the loop and build pickles of the most recent captures
|
||||
max_captures = 50
|
||||
got_new_captures = False
|
||||
for uuid_path in sorted(self.lookyloo.capture_dir.glob('*/*/*/uuid'), reverse=True):
|
||||
if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()):
|
||||
# We already have a pickle file
|
||||
self.logger.debug(f'{uuid_path.parent} has a pickle.')
|
||||
continue
|
||||
if not list(uuid_path.parent.rglob('*.har.gz')) and not list(uuid_path.parent.rglob('*.har')):
|
||||
# No HAR file
|
||||
self.logger.debug(f'{uuid_path.parent} has no HAR file.')
|
||||
continue
|
||||
|
||||
if is_locked(uuid_path.parent):
|
||||
# it is really locked
|
||||
self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.')
|
||||
continue
|
||||
|
||||
with uuid_path.open() as f:
|
||||
uuid = f.read()
|
||||
|
||||
if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
|
||||
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
|
||||
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
|
||||
else:
|
||||
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
|
||||
if cached_path != uuid_path.parent:
|
||||
# we have a duplicate UUID, it is proably related to some bad copy/paste
|
||||
if cached_path.exists():
|
||||
# Both paths exist, move the one that isn't in lookup_dirs
|
||||
self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest')
|
||||
try:
|
||||
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
|
||||
except FileNotFoundError as e:
|
||||
self.logger.warning(f'Unable to move capture: {e}')
|
||||
continue
|
||||
else:
|
||||
# The path in lookup_dirs for that UUID doesn't exists, just update it.
|
||||
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
|
||||
|
||||
try:
|
||||
self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
|
||||
self.lookyloo.get_crawled_tree(uuid)
|
||||
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
|
||||
self.logger.info(f'Pickle for {uuid} build.')
|
||||
got_new_captures = True
|
||||
max_captures -= 1
|
||||
except MissingUUID:
|
||||
self.logger.warning(f'Unable to find {uuid}. That should not happen.')
|
||||
except NoValidHarFile as e:
|
||||
self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}')
|
||||
except FileNotFoundError:
|
||||
self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
|
||||
except Exception:
|
||||
self.logger.exception(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
|
||||
# The capture is not working, moving it away.
|
||||
try:
|
||||
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
|
||||
self.lookyloo.redis.hdel('lookup_dirs', uuid)
|
||||
except FileNotFoundError as e:
|
||||
self.logger.warning(f'Unable to move capture: {e}')
|
||||
for month_dir in self._make_dirs_list(self.lookyloo.capture_dir):
|
||||
for uuid_path in sorted(month_dir.glob('*/uuid'), reverse=True):
|
||||
if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()):
|
||||
# We already have a pickle file
|
||||
self.logger.debug(f'{uuid_path.parent} has a pickle.')
|
||||
continue
|
||||
if max_captures <= 0:
|
||||
self.logger.info('Too many captures in the backlog, start from the beginning.')
|
||||
return False
|
||||
if not list(uuid_path.parent.rglob('*.har.gz')) and not list(uuid_path.parent.rglob('*.har')):
|
||||
# No HAR file
|
||||
self.logger.debug(f'{uuid_path.parent} has no HAR file.')
|
||||
continue
|
||||
|
||||
if is_locked(uuid_path.parent):
|
||||
# it is really locked
|
||||
self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.')
|
||||
continue
|
||||
|
||||
with uuid_path.open() as f:
|
||||
uuid = f.read()
|
||||
|
||||
if not self.lookyloo.redis.hexists('lookup_dirs', uuid):
|
||||
# The capture with this UUID exists, but it is for some reason missing in lookup_dirs
|
||||
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
|
||||
else:
|
||||
cached_path = Path(self.lookyloo.redis.hget('lookup_dirs', uuid))
|
||||
if cached_path != uuid_path.parent:
|
||||
# we have a duplicate UUID, it is probably related to some bad copy/paste
|
||||
if cached_path.exists():
|
||||
# Both paths exist, move the one that isn't in lookup_dirs
|
||||
self.logger.critical(f'Duplicate UUID for {uuid} in {cached_path} and {uuid_path.parent}, discarding the latest')
|
||||
try:
|
||||
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
|
||||
except FileNotFoundError as e:
|
||||
self.logger.warning(f'Unable to move capture: {e}')
|
||||
continue
|
||||
else:
|
||||
# The path in lookup_dirs for that UUID doesn't exist, just update it.
|
||||
self.lookyloo.redis.hset('lookup_dirs', uuid, str(uuid_path.parent))
|
||||
|
||||
try:
|
||||
self.logger.info(f'Build pickle for {uuid}: {uuid_path.parent.name}')
|
||||
self.lookyloo.get_crawled_tree(uuid)
|
||||
self.lookyloo.trigger_modules(uuid, auto_trigger=True)
|
||||
self.logger.info(f'Pickle for {uuid} build.')
|
||||
got_new_captures = True
|
||||
max_captures -= 1
|
||||
except MissingUUID:
|
||||
self.logger.warning(f'Unable to find {uuid}. That should not happen.')
|
||||
except NoValidHarFile as e:
|
||||
self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}')
|
||||
except FileNotFoundError:
|
||||
self.logger.warning(f'Capture {uuid} disappeared during processing, probably archived.')
|
||||
except Exception:
|
||||
self.logger.exception(f'Unable to build pickle for {uuid}: {uuid_path.parent.name}')
|
||||
# The capture is not working, moving it away.
|
||||
try:
|
||||
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
|
||||
self.lookyloo.redis.hdel('lookup_dirs', uuid)
|
||||
except FileNotFoundError as e:
|
||||
self.logger.warning(f'Unable to move capture: {e}')
|
||||
continue
|
||||
if max_captures <= 0:
|
||||
self.logger.info('Too many captures in the backlog, start from the beginning.')
|
||||
return False
|
||||
if got_new_captures:
|
||||
self.logger.info('Finished building all missing pickles.')
|
||||
# Only return True if we built new pickles.
|
||||
|
|
|
@ -1164,7 +1164,7 @@ class Lookyloo():
|
|||
# and it has been archived.
|
||||
self.get_crawled_tree(capture_uuid)
|
||||
|
||||
# if the file submitted on lookyloo cannot be displayed (PDF), it willbe downloaded.
|
||||
# if the file submitted on lookyloo cannot be displayed (PDF), it will be downloaded.
|
||||
# In the case, we want to have it as a FileObject in the export
|
||||
filename, pseudofile = self.get_data(capture_uuid)
|
||||
if filename:
|
||||
|
|
Loading…
Reference in New Issue