fix: Exception when a formerly broken capture is re-processed and works

pull/746/head
Raphaël Vinot 2023-07-27 14:56:39 +02:00
parent db639d9dde
commit ebfc2f00a5
3 changed files with 32 additions and 15 deletions

View File

@ -34,10 +34,11 @@ class BackgroundIndexer(AbstractManager):
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
def _build_missing_pickles(self) -> bool:
self.logger.info('Build missing pickles...')
self.logger.debug('Build missing pickles...')
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
# This value makes sure we break out of the loop and build pickles of the most recent captures
max_captures = 50
got_new_captures = False
for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()):
# We already have a pickle file
@ -53,6 +54,7 @@ class BackgroundIndexer(AbstractManager):
self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.')
continue
got_new_captures = True
max_captures -= 1
with uuid_path.open() as f:
uuid = f.read()
@ -88,11 +90,12 @@ class BackgroundIndexer(AbstractManager):
self.lookyloo.redis.hdel('lookup_dirs', uuid)
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
if max_captures <= 0:
break
else:
self.logger.info('... done.')
self.logger.info('Too many captures in the backlog, start from the beginning.')
return False
if got_new_captures:
self.logger.info('Finished building all missing pickles.')
# Only return True if we built new pickles.
return True
self.logger.info('... too many captures in the backlog, start from the beginning.')
return False
def _check_indexes(self):

View File

@ -65,24 +65,34 @@ class CaptureCache():
# This entry *should* be present even if there is an error.
self.url: str = url
if all(key in cache_entry.keys() for key in __default_cache_keys):
# if the cache doesn't have the keys in __default_cache_keys, it must have an error.
# if it has neither all the expected entries nor an error, we must raise an exception
if (not all(key in cache_entry.keys() for key in __default_cache_keys)
and not cache_entry.get('error')):
missing = set(__default_cache_keys) - set(cache_entry.keys())
raise LookylooException(f'Missing keys ({missing}), no error message. It should not happen.')
if cache_entry.get('title') is not None:
self.title: str = cache_entry['title']
else:
# This shouldn't happen, but if it does, we need the key to exist.
self.logger.warning(f'Title missing in cache for {self.uuid}.')
self.title = ''
if cache_entry.get('timestamp'):
try:
self.timestamp: datetime = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z')
except ValueError:
# If the microsecond is missing (0), it fails
self.timestamp = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S%z')
if cache_entry.get('redirects'):
self.redirects: List[str] = json.loads(cache_entry['redirects'])
else:
self.logger.debug('No redirects in cache')
self.redirects = []
elif not cache_entry.get('error'):
missing = set(__default_cache_keys) - set(cache_entry.keys())
raise LookylooException(f'Missing keys ({missing}), no error message. It should not happen.')
else:
# This shouldn't happen, but if it does, we need the key to exist.
self.logger.warning(f'Timestamp missing in cache for {self.uuid}.')
self.timestamp = datetime.fromtimestamp(0)
self.redirects: List[str] = json.loads(cache_entry['redirects']) if cache_entry.get('redirects') else []
# Error without all the keys in __default_cache_keys was fatal.
# if the keys in __default_cache_keys are present, it was an HTTP error
# if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along
self.error: Optional[str] = cache_entry.get('error')
self.incomplete_redirects: bool = True if cache_entry.get('incomplete_redirects') in [1, '1'] else False
self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False

View File

@ -522,6 +522,10 @@ class Lookyloo():
if not cache.user_agent and not cache.error:
# 2022-12-07: New cache format, store the user agent and referers.
needs_update = True
if not hasattr(cache, 'title') or not cache.title:
# 2023-07-27: The title should *always* be there,
# unless the HAR file is missing or broken
needs_update = True
if needs_update:
self._captures_index.reload_cache(capture_uuid)
cache = self._captures_index[capture_uuid]