mirror of https://github.com/CIRCL/lookyloo
fix: Exception when a formerly broken capture is re-processed and works
parent
db639d9dde
commit
ebfc2f00a5
|
@ -34,10 +34,11 @@ class BackgroundIndexer(AbstractManager):
|
|||
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
|
||||
|
||||
def _build_missing_pickles(self) -> bool:
|
||||
self.logger.info('Build missing pickles...')
|
||||
self.logger.debug('Build missing pickles...')
|
||||
# Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
|
||||
# This value makes sure we break out of the loop and build pickles of the most recent captures
|
||||
max_captures = 50
|
||||
got_new_captures = False
|
||||
for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
|
||||
if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()):
|
||||
# We already have a pickle file
|
||||
|
@ -53,6 +54,7 @@ class BackgroundIndexer(AbstractManager):
|
|||
self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.')
|
||||
continue
|
||||
|
||||
got_new_captures = True
|
||||
max_captures -= 1
|
||||
with uuid_path.open() as f:
|
||||
uuid = f.read()
|
||||
|
@ -88,11 +90,12 @@ class BackgroundIndexer(AbstractManager):
|
|||
self.lookyloo.redis.hdel('lookup_dirs', uuid)
|
||||
shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
|
||||
if max_captures <= 0:
|
||||
break
|
||||
else:
|
||||
self.logger.info('... done.')
|
||||
self.logger.info('Too many captures in the backlog, start from the beginning.')
|
||||
return False
|
||||
if got_new_captures:
|
||||
self.logger.info('Finished building all missing pickles.')
|
||||
# Only return True if we built new pickles.
|
||||
return True
|
||||
self.logger.info('... too many captures in the backlog, start from the beginning.')
|
||||
return False
|
||||
|
||||
def _check_indexes(self):
|
||||
|
|
|
@ -65,24 +65,34 @@ class CaptureCache():
|
|||
# This entry *should* be present even if there is an error.
|
||||
self.url: str = url
|
||||
|
||||
if all(key in cache_entry.keys() for key in __default_cache_keys):
|
||||
# if the cache doesn't have the keys in __default_cache_keys, it must have an error.
|
||||
# If it has neither all the expected entries nor an error message, we must raise an exception.
|
||||
if (not all(key in cache_entry.keys() for key in __default_cache_keys)
|
||||
and not cache_entry.get('error')):
|
||||
missing = set(__default_cache_keys) - set(cache_entry.keys())
|
||||
raise LookylooException(f'Missing keys ({missing}), no error message. It should not happen.')
|
||||
|
||||
if cache_entry.get('title') is not None:
|
||||
self.title: str = cache_entry['title']
|
||||
else:
|
||||
# This shouldn't happen, but if it does, we need the key to exist.
|
||||
self.logger.warning(f'Title missing in cache for {self.uuid}.')
|
||||
self.title = ''
|
||||
if cache_entry.get('timestamp'):
|
||||
try:
|
||||
self.timestamp: datetime = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z')
|
||||
except ValueError:
|
||||
# If the microsecond is missing (0), it fails
|
||||
self.timestamp = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S%z')
|
||||
if cache_entry.get('redirects'):
|
||||
self.redirects: List[str] = json.loads(cache_entry['redirects'])
|
||||
else:
|
||||
self.logger.debug('No redirects in cache')
|
||||
self.redirects = []
|
||||
elif not cache_entry.get('error'):
|
||||
missing = set(__default_cache_keys) - set(cache_entry.keys())
|
||||
raise LookylooException(f'Missing keys ({missing}), no error message. It should not happen.')
|
||||
else:
|
||||
# This shouldn't happen, but if it does, we need the key to exist.
|
||||
self.logger.warning(f'Timestamp missing in cache for {self.uuid}.')
|
||||
self.timestamp = datetime.fromtimestamp(0)
|
||||
|
||||
self.redirects: List[str] = json.loads(cache_entry['redirects']) if cache_entry.get('redirects') else []
|
||||
|
||||
# Error without all the keys in __default_cache_keys was fatal.
|
||||
# if the keys in __default_cache_keys are present, it was an HTTP error
|
||||
# if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along
|
||||
self.error: Optional[str] = cache_entry.get('error')
|
||||
self.incomplete_redirects: bool = True if cache_entry.get('incomplete_redirects') in [1, '1'] else False
|
||||
self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False
|
||||
|
|
|
@ -522,6 +522,10 @@ class Lookyloo():
|
|||
if not cache.user_agent and not cache.error:
|
||||
# 2022-12-07: New cache format, store the user agent and referers.
|
||||
needs_update = True
|
||||
if not hasattr(cache, 'title') or not cache.title:
|
||||
# 2023-07-27: The title should *always* be there,
|
||||
# unless the HAR file is missing or broken
|
||||
needs_update = True
|
||||
if needs_update:
|
||||
self._captures_index.reload_cache(capture_uuid)
|
||||
cache = self._captures_index[capture_uuid]
|
||||
|
|
Loading…
Reference in New Issue