chg: Better use of cache, sanity checks

pull/251/head
Raphaël Vinot 2021-08-23 12:17:44 +02:00
parent 9eb5076133
commit d359bc7521
4 changed files with 32 additions and 10 deletions

View File

@ -45,6 +45,11 @@ class Archiver(AbstractManager):
with (capture_path / 'uuid').open() as _f:
uuid = _f.read().strip()
to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
self.logger.info(f'Archiving {capture_path}.')
if not to_archive:
self.logger.info('Nothing to archive.')
return
archived_uuids = {}
for year, month_captures in to_archive.items():

View File

@ -6,7 +6,7 @@ import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from .exceptions import LookylooException
from .exceptions import LookylooException, MissingCaptureDirectory
class CaptureCache():
@ -22,6 +22,8 @@ class CaptureCache():
self.url: str = cache_entry['url']
self.redirects: List[str] = json.loads(cache_entry['redirects'])
self.capture_dir: Path = Path(cache_entry['capture_dir'])
if not self.capture_dir.exists():
raise MissingCaptureDirectory(f'The capture {self.uuid} does not exists in {self.capture_dir}.')
elif not cache_entry.get('error'):
missing = set(__default_cache_keys) - set(cache_entry.keys())
raise LookylooException(f'Missing keys ({missing}), no error message. It should not happen.')

View File

@ -24,3 +24,7 @@ class ConfigError(LookylooException):
class MissingUUID(LookylooException):
pass
class MissingCaptureDirectory(LookylooException):
pass

View File

@ -36,7 +36,7 @@ from redis.connection import UnixDomainSocketConnection
from scrapysplashwrapper import crawl
from werkzeug.useragents import UserAgent
from .exceptions import NoValidHarFile, MissingUUID, LookylooException
from .exceptions import NoValidHarFile, MissingUUID, LookylooException, MissingCaptureDirectory
from .helpers import (get_homedir, get_socket_path, load_cookies, get_config,
safe_create_dir, get_email_template, load_pickle_tree,
remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains,
@ -581,7 +581,11 @@ class Lookyloo():
for c in p.execute():
if not c:
continue
c = CaptureCache(c)
try:
c = CaptureCache(c)
except LookylooException as e:
self.logger.warning(e)
continue
if hasattr(c, 'timestamp'):
all_cache.append(c)
self._captures_index[c.uuid] = c
@ -593,12 +597,7 @@ class Lookyloo():
def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
"""Get the cache from redis."""
if capture_uuid in self._captures_index:
if self._captures_index[capture_uuid].incomplete_redirects:
# Try to rebuild the cache
capture_dir = self._get_capture_dir(capture_uuid)
cached = self._set_capture_cache(capture_dir)
self._captures_index[capture_uuid] = CaptureCache(cached)
if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
return self._captures_index[capture_uuid]
capture_dir = self._get_capture_dir(capture_uuid)
if not capture_dir:
@ -611,13 +610,25 @@ class Lookyloo():
try:
self._captures_index[capture_uuid] = CaptureCache(cached)
return self._captures_index[capture_uuid]
except MissingCaptureDirectory:
self.logger.warning(f'Cache ({capture_dir}) is missing.')
return None
except LookylooException as e:
self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
return None
def _get_capture_dir(self, capture_uuid: str, /) -> Path:
'''Use the cache to get a capture directory from a capture UUID'''
if capture_uuid in self._captures_index:
capture_dir = self._captures_index[capture_uuid].capture_dir
if capture_dir.exists():
return capture_dir
self._captures_index.pop(capture_uuid)
capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)
if capture_dir and not Path(capture_dir).exists():
# The capture was either removed or archived, cleaning up
self.redis.hdel('lookup_dirs', capture_uuid)
capture_dir = None
if not capture_dir:
# Try in the archive
capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
@ -626,7 +637,7 @@ class Lookyloo():
to_return = Path(capture_dir)
if not to_return.exists():
# The capture was removed, remove the UUID
self.redis.hdel('lookup_dirs', capture_uuid)
self.redis.hdel('lookup_dirs_archived', capture_uuid)
self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
raise NoValidHarFile(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
return to_return