chg: Better use of cache, sanity checks

pull/251/head
Raphaël Vinot 2021-08-23 12:17:44 +02:00
parent 9eb5076133
commit d359bc7521
4 changed files with 32 additions and 10 deletions

View File

@ -45,6 +45,11 @@ class Archiver(AbstractManager):
with (capture_path / 'uuid').open() as _f: with (capture_path / 'uuid').open() as _f:
uuid = _f.read().strip() uuid = _f.read().strip()
to_archive[timestamp.year][timestamp.month].append((capture_path, uuid)) to_archive[timestamp.year][timestamp.month].append((capture_path, uuid))
self.logger.info(f'Archiving {capture_path}.')
if not to_archive:
self.logger.info('Nothing to archive.')
return
archived_uuids = {} archived_uuids = {}
for year, month_captures in to_archive.items(): for year, month_captures in to_archive.items():

View File

@ -6,7 +6,7 @@ import json
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
from .exceptions import LookylooException from .exceptions import LookylooException, MissingCaptureDirectory
class CaptureCache(): class CaptureCache():
@ -22,6 +22,8 @@ class CaptureCache():
self.url: str = cache_entry['url'] self.url: str = cache_entry['url']
self.redirects: List[str] = json.loads(cache_entry['redirects']) self.redirects: List[str] = json.loads(cache_entry['redirects'])
self.capture_dir: Path = Path(cache_entry['capture_dir']) self.capture_dir: Path = Path(cache_entry['capture_dir'])
if not self.capture_dir.exists():
raise MissingCaptureDirectory(f'The capture {self.uuid} does not exists in {self.capture_dir}.')
elif not cache_entry.get('error'): elif not cache_entry.get('error'):
missing = set(__default_cache_keys) - set(cache_entry.keys()) missing = set(__default_cache_keys) - set(cache_entry.keys())
raise LookylooException(f'Missing keys ({missing}), no error message. It should not happen.') raise LookylooException(f'Missing keys ({missing}), no error message. It should not happen.')

View File

@ -24,3 +24,7 @@ class ConfigError(LookylooException):
class MissingUUID(LookylooException): class MissingUUID(LookylooException):
pass pass
class MissingCaptureDirectory(LookylooException):
pass

View File

@ -36,7 +36,7 @@ from redis.connection import UnixDomainSocketConnection
from scrapysplashwrapper import crawl from scrapysplashwrapper import crawl
from werkzeug.useragents import UserAgent from werkzeug.useragents import UserAgent
from .exceptions import NoValidHarFile, MissingUUID, LookylooException from .exceptions import NoValidHarFile, MissingUUID, LookylooException, MissingCaptureDirectory
from .helpers import (get_homedir, get_socket_path, load_cookies, get_config, from .helpers import (get_homedir, get_socket_path, load_cookies, get_config,
safe_create_dir, get_email_template, load_pickle_tree, safe_create_dir, get_email_template, load_pickle_tree,
remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains, remove_pickle_tree, get_resources_hashes, get_taxonomies, uniq_domains,
@ -581,7 +581,11 @@ class Lookyloo():
for c in p.execute(): for c in p.execute():
if not c: if not c:
continue continue
c = CaptureCache(c) try:
c = CaptureCache(c)
except LookylooException as e:
self.logger.warning(e)
continue
if hasattr(c, 'timestamp'): if hasattr(c, 'timestamp'):
all_cache.append(c) all_cache.append(c)
self._captures_index[c.uuid] = c self._captures_index[c.uuid] = c
@ -593,12 +597,7 @@ class Lookyloo():
def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]: def capture_cache(self, capture_uuid: str, /) -> Optional[CaptureCache]:
"""Get the cache from redis.""" """Get the cache from redis."""
if capture_uuid in self._captures_index: if capture_uuid in self._captures_index and not self._captures_index[capture_uuid].incomplete_redirects:
if self._captures_index[capture_uuid].incomplete_redirects:
# Try to rebuild the cache
capture_dir = self._get_capture_dir(capture_uuid)
cached = self._set_capture_cache(capture_dir)
self._captures_index[capture_uuid] = CaptureCache(cached)
return self._captures_index[capture_uuid] return self._captures_index[capture_uuid]
capture_dir = self._get_capture_dir(capture_uuid) capture_dir = self._get_capture_dir(capture_uuid)
if not capture_dir: if not capture_dir:
@ -611,13 +610,25 @@ class Lookyloo():
try: try:
self._captures_index[capture_uuid] = CaptureCache(cached) self._captures_index[capture_uuid] = CaptureCache(cached)
return self._captures_index[capture_uuid] return self._captures_index[capture_uuid]
except MissingCaptureDirectory:
self.logger.warning(f'Cache ({capture_dir}) is missing.')
return None
except LookylooException as e: except LookylooException as e:
self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}') self.logger.warning(f'Cache ({capture_dir}) is invalid ({e}): {json.dumps(cached, indent=2)}')
return None return None
def _get_capture_dir(self, capture_uuid: str, /) -> Path: def _get_capture_dir(self, capture_uuid: str, /) -> Path:
'''Use the cache to get a capture directory from a capture UUID''' '''Use the cache to get a capture directory from a capture UUID'''
if capture_uuid in self._captures_index:
capture_dir = self._captures_index[capture_uuid].capture_dir
if capture_dir.exists():
return capture_dir
self._captures_index.pop(capture_uuid)
capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid) capture_dir: str = self.redis.hget('lookup_dirs', capture_uuid)
if capture_dir and not Path(capture_dir).exists():
# The capture was either removed or archived, cleaning up
self.redis.hdel('lookup_dirs', capture_uuid)
capture_dir = None
if not capture_dir: if not capture_dir:
# Try in the archive # Try in the archive
capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid) capture_dir = self.redis.hget('lookup_dirs_archived', capture_uuid)
@ -626,7 +637,7 @@ class Lookyloo():
to_return = Path(capture_dir) to_return = Path(capture_dir)
if not to_return.exists(): if not to_return.exists():
# The capture was removed, remove the UUID # The capture was removed, remove the UUID
self.redis.hdel('lookup_dirs', capture_uuid) self.redis.hdel('lookup_dirs_archived', capture_uuid)
self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.') self.logger.warning(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
raise NoValidHarFile(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.') raise NoValidHarFile(f'UUID ({capture_uuid}) linked to a missing directory ({capture_dir}). Removed now.')
return to_return return to_return