mirror of https://github.com/CIRCL/lookyloo
fix: Exception when a formerly broken capture is re-processed and works
parent db639d9dde
commit ebfc2f00a5
@@ -34,10 +34,11 @@ class BackgroundIndexer(AbstractManager):
         self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
 
     def _build_missing_pickles(self) -> bool:
-        self.logger.info('Build missing pickles...')
+        self.logger.debug('Build missing pickles...')
         # Sometimes, we have a huge backlog and the process might get stuck on old captures for a very long time
         # This value makes sure we break out of the loop and build pickles of the most recent captures
         max_captures = 50
+        got_new_captures = False
         for uuid_path in sorted(self.lookyloo.capture_dir.glob('**/uuid'), reverse=True):
             if ((uuid_path.parent / 'tree.pickle.gz').exists() or (uuid_path.parent / 'tree.pickle').exists()):
                 # We already have a pickle file
@@ -53,6 +54,7 @@ class BackgroundIndexer(AbstractManager):
                 self.logger.debug(f'{uuid_path.parent} is locked, pickle generated by another process.')
                 continue
 
+            got_new_captures = True
             max_captures -= 1
             with uuid_path.open() as f:
                 uuid = f.read()
@@ -88,11 +90,12 @@ class BackgroundIndexer(AbstractManager):
                 self.lookyloo.redis.hdel('lookup_dirs', uuid)
                 shutil.move(str(uuid_path.parent), str(self.discarded_captures_dir / uuid_path.parent.name))
             if max_captures <= 0:
-                break
-        else:
-            self.logger.info('... done.')
+                self.logger.info('Too many captures in the backlog, start from the beginning.')
+                return False
+        if got_new_captures:
+            self.logger.info('Finished building all missing pickles.')
+            # Only return True if we built new pickles.
             return True
-        self.logger.info('... too many captures in the backlog, start from the beginning.')
         return False
 
     def _check_indexes(self):
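Taken together, the three hunks above rework _build_missing_pickles so it only reports success when a pass actually built new pickles: got_new_captures starts out False, is flipped the first time an unpickled capture is handled, and max_captures still bounds how much backlog a single pass may chew through. Below is a minimal, self-contained sketch of that control flow; the capture iteration and the pickle build itself are hypothetical stand-ins, not the real Lookyloo internals.

# Minimal sketch of the control flow introduced above. `pending` stands in for
# the capture directories that still lack a pickle; the real tree build is elided.
from typing import Iterable


def build_missing_pickles(pending: Iterable[str], max_captures: int = 50) -> bool:
    got_new_captures = False
    for capture in pending:
        got_new_captures = True
        max_captures -= 1
        print(f'building pickle for {capture}')  # stand-in for the real work
        if max_captures <= 0:
            # Huge backlog: stop here so the next pass starts again from the most recent captures.
            print('Too many captures in the backlog, start from the beginning.')
            return False
    if got_new_captures:
        # Only return True if we built new pickles.
        return True
    return False


print(build_missing_pickles([]))                    # False: nothing new was built
print(build_missing_pickles(['uuid-1', 'uuid-2']))  # True: backlog fully processed

In this sketch, as in the diff, hitting the cap returns False even though work was done, so a later pass picks up from the most recent captures again.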
@@ -65,24 +65,34 @@ class CaptureCache():
         # This entry *should* be present even if there is an error.
         self.url: str = url
 
-        if all(key in cache_entry.keys() for key in __default_cache_keys):
+        # if the cache doesn't have the keys in __default_cache_keys, it must have an error.
+        # if it has neither all the expected entries, nor error, we must raise an exception
+        if (not all(key in cache_entry.keys() for key in __default_cache_keys)
+                and not cache_entry.get('error')):
+            missing = set(__default_cache_keys) - set(cache_entry.keys())
+            raise LookylooException(f'Missing keys ({missing}), no error message. It should not happen.')
+
+        if cache_entry.get('title') is not None:
             self.title: str = cache_entry['title']
+        else:
+            # This shouldn't happen, but if it does, we need the key to exist.
+            self.logger.warning(f'Title missing in cache for {self.uuid}.')
+            self.title = ''
+        if cache_entry.get('timestamp'):
             try:
                 self.timestamp: datetime = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z')
             except ValueError:
                 # If the microsecond is missing (0), it fails
                 self.timestamp = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S%z')
-            if cache_entry.get('redirects'):
-                self.redirects: List[str] = json.loads(cache_entry['redirects'])
-            else:
-                self.logger.debug('No redirects in cache')
-                self.redirects = []
-        elif not cache_entry.get('error'):
-            missing = set(__default_cache_keys) - set(cache_entry.keys())
-            raise LookylooException(f'Missing keys ({missing}), no error message. It should not happen.')
+        else:
+            # This shouldn't happen, but if it does, we need the key to exist.
+            self.logger.warning(f'Timestamp missing in cache for {self.uuid}.')
+            self.timestamp = datetime.fromtimestamp(0)
+
+        self.redirects: List[str] = json.loads(cache_entry['redirects']) if cache_entry.get('redirects') else []
 
         # Error without all the keys in __default_cache_keys was fatal.
-        # if the keys in __default_cache_keys are present, it was an HTTP error
+        # if the keys in __default_cache_keys are present, it was an HTTP error and we still need to pass the error along
         self.error: Optional[str] = cache_entry.get('error')
         self.incomplete_redirects: bool = True if cache_entry.get('incomplete_redirects') in [1, '1'] else False
         self.no_index: bool = True if cache_entry.get('no_index') in [1, '1'] else False
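The rewritten CaptureCache constructor now raises only when a cache entry has neither the expected keys nor an error message, and it fills in safe defaults for a missing title or timestamp instead of leaving those attributes unset. Below is a self-contained sketch of that validation and fallback logic as a plain function; the key set _default_cache_keys and the dict-based entry are assumptions made for the example, not the real class.

# Sketch of the validation/fallback logic from the hunk above, as a standalone function.
import json
import logging
from datetime import datetime
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)
# Assumed key set for the example; the real __default_cache_keys lives in the class.
_default_cache_keys = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir')


def load_cache_entry(cache_entry: Dict[str, str]) -> Dict[str, Any]:
    # Raise only if the expected keys are missing *and* there is no error message.
    if (not all(key in cache_entry for key in _default_cache_keys)
            and not cache_entry.get('error')):
        missing = set(_default_cache_keys) - set(cache_entry.keys())
        raise ValueError(f'Missing keys ({missing}), no error message.')

    title: str = cache_entry.get('title') or ''
    if not title:
        logger.warning('Title missing in cache entry.')

    if cache_entry.get('timestamp'):
        try:
            timestamp = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z')
        except ValueError:
            # If the microsecond part is 0 it is omitted, and the first format fails.
            timestamp = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S%z')
    else:
        logger.warning('Timestamp missing in cache entry.')
        timestamp = datetime.fromtimestamp(0)

    redirects: List[str] = json.loads(cache_entry['redirects']) if cache_entry.get('redirects') else []
    error: Optional[str] = cache_entry.get('error')
    return {'title': title, 'timestamp': timestamp, 'redirects': redirects, 'error': error}


# A formerly broken capture may only carry an error message: the entry is accepted
# and every field gets a default instead of being left unset, as in the old code.
print(load_cache_entry({'error': 'Unable to build the tree'}))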
@@ -522,6 +522,10 @@ class Lookyloo():
         if not cache.user_agent and not cache.error:
             # 2022-12-07: New cache format, store the user agent and referers.
             needs_update = True
+        if not hasattr(cache, 'title') or not cache.title:
+            # 2023-17-27: The title should *always* be there,
+            # unless the HAR file is missing or broken
+            needs_update = True
         if needs_update:
             self._captures_index.reload_cache(capture_uuid)
             cache = self._captures_index[capture_uuid]
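The final hunk broadens the staleness check in Lookyloo: a cached capture is now also reloaded when its title is missing or empty, which can happen when the entry was cached while the capture was still broken (no usable HAR, hence no title). A sketch of that reload-on-stale-entry pattern follows; Cache and CaptureIndex are hypothetical stand-ins, not the real Lookyloo classes.

# Hedged sketch of the reload-on-stale-cache pattern from the hunk above.
from dataclasses import dataclass
from typing import Dict


@dataclass
class Cache:
    uuid: str
    user_agent: str = ''
    error: str = ''
    title: str = ''


class CaptureIndex:
    def __init__(self) -> None:
        self._entries: Dict[str, Cache] = {}

    def add(self, cache: Cache) -> None:
        self._entries[cache.uuid] = cache

    def __getitem__(self, uuid: str) -> Cache:
        return self._entries[uuid]

    def reload_cache(self, uuid: str) -> None:
        # Pretend the reload re-reads the capture directory and fills in the missing fields.
        self._entries[uuid] = Cache(uuid, user_agent='Mozilla/5.0', title='Example title')


index = CaptureIndex()
index.add(Cache('abc'))  # stale entry: no user agent, no title
cache = index['abc']

needs_update = False
if not cache.user_agent and not cache.error:
    needs_update = True  # older cache format: user agent was not stored
if not hasattr(cache, 'title') or not cache.title:
    needs_update = True  # the title should always be there unless the HAR is missing or broken
if needs_update:
    index.reload_cache('abc')
    cache = index['abc']
print(cache)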
|