fix: Better handling of half-broken captures without HAR files

pull/526/head
Raphaël Vinot 2022-09-26 14:58:30 +02:00
parent 50e59bdf31
commit 31261e84c2
4 changed files with 29 additions and 18 deletions

View File

@ -59,6 +59,8 @@ class BackgroundIndexer(AbstractManager):
except MissingUUID:
self.logger.warning(f'Unable to find {uuid}. That should not happen.')
except NoValidHarFile as e:
self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}')
except Exception as e:
self.logger.critical(f'Unable to build pickle for {uuid}: {uuid_path.parent.name} - {e}')
# The capture is not working, moving it away.
self.lookyloo.redis.hdel('lookup_dirs', uuid)

View File

@ -30,9 +30,14 @@ class CaptureCache():
'error', 'incomplete_redirects', 'no_index', 'categories', 'parent')
def __init__(self, cache_entry: Dict[str, Any]):
__default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir')
__default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp',
'url', 'redirects', 'capture_dir')
if 'uuid' not in cache_entry or 'capture_dir' not in cache_entry:
raise LookylooException(f'The capture is deeply broken: {cache_entry}')
self.uuid: str = cache_entry['uuid']
self.capture_dir: Path = Path(cache_entry['capture_dir'])
if all(key in cache_entry.keys() for key in __default_cache_keys):
self.uuid: str = cache_entry['uuid']
self.title: str = cache_entry['title']
try:
self.timestamp: datetime = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z')
@ -41,7 +46,6 @@ class CaptureCache():
self.timestamp = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S%z')
self.url: str = cache_entry['url']
self.redirects: List[str] = json.loads(cache_entry['redirects'])
self.capture_dir: Path = Path(cache_entry['capture_dir'])
if not self.capture_dir.exists():
raise MissingCaptureDirectory(f'The capture {self.uuid} does not exists in {self.capture_dir}.')
elif not cache_entry.get('error'):
@ -85,7 +89,10 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int) -> CrawledTree:
remove_pickle_tree(capture_dir)
except Exception:
remove_pickle_tree(capture_dir)
raise TreeNeedsRebuild()
if list(capture_dir.rglob('*.har')):
raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
# The tree doesn't need to be rebuilt if there are no HAR files.
raise NoValidHarFile("Couldn't find HAR files")
class CapturesIndex(Mapping):
@ -160,7 +167,7 @@ class CapturesIndex(Mapping):
try:
cc = CaptureCache(cache)
except LookylooException as e:
self.logger.warning(e)
self.logger.warning(f'Unable to initialize the cache: {e}')
continue
self.__cache[cc.uuid] = cc
@ -201,14 +208,17 @@ class CapturesIndex(Mapping):
time.sleep(5)
return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
har_files = sorted(capture_dir.glob('*.har*'))
har_files = sorted(capture_dir.glob('*.har'))
try:
tree = CrawledTree(har_files, uuid)
self.__resolve_dns(tree)
if self.contextualizer:
self.contextualizer.contextualize_tree(tree)
except Har2TreeError as e:
raise NoValidHarFile(e)
# unable to use the HAR files, get them out of the way
for har_file in har_files:
har_file.rename(har_file.with_suffix('.broken'))
raise NoValidHarFile(f'We got har files, but they are broken: {e}')
except RecursionError as e:
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
else:
@ -241,8 +251,7 @@ class CapturesIndex(Mapping):
tree = self._create_pickle(capture_dir)
self.indexing.new_internal_uuids(tree)
except NoValidHarFile:
# We may not have a HAR file, the reason will be in the error file.
pass
self.logger.warning('Unable to rebuild the tree, the HAR files are broken.')
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str}
if (capture_dir / 'error.txt').exists():
@ -258,7 +267,7 @@ class CapturesIndex(Mapping):
error_to_cache = content
cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
if (har_files := sorted(capture_dir.glob('*.har*'))):
if (har_files := sorted(capture_dir.rglob('*.har'))):
try:
har = HarFile(har_files[0], uuid)
cache['title'] = har.initial_title

View File

@ -366,7 +366,7 @@ class Lookyloo():
# Do not try to build pickles
capture_uuids = set(capture_uuids) & self._captures_index.cached_captures
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid)]
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid) and hasattr(self._captures_index[uuid], 'timestamp')]
all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
return all_cache
@ -393,14 +393,14 @@ class Lookyloo():
return self._captures_index[capture_uuid]
except MissingCaptureDirectory as e:
# The UUID is in the captures but the directory is not on the disk.
self.logger.warning(e)
self.logger.warning(f'Missing Directory: {e}')
return None
except MissingUUID:
if self.get_capture_status(capture_uuid) not in [CaptureStatusCore.QUEUED, CaptureStatusCore.ONGOING]:
self.logger.warning(f'Unable to find {capture_uuid} (not in the cache and/or missing capture directory).')
return None
except LookylooException as e:
self.logger.warning(e)
self.logger.warning(f'Lookyloo Exception: {e}')
return None
except Exception as e:
self.logger.exception(e)

View File

@ -650,9 +650,6 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
message = "The capture is ongoing."
return render_template('tree_wait.html', message=message, tree_uuid=tree_uuid)
if cache.error:
flash(cache.error, 'warning')
try:
ct = lookyloo.get_crawled_tree(tree_uuid)
b64_thumbnail = lookyloo.get_screenshot_thumbnail(tree_uuid, for_datauri=True)
@ -674,6 +671,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
except IndexError as e:
print(e)
pass
if cache.error:
flash(cache.error, 'warning')
return render_template('tree.html', tree_json=ct.to_json(),
info=info,
tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
@ -691,8 +690,9 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
parent_uuid=cache.parent,
has_redirects=True if cache.redirects else False)
except NoValidHarFile as e:
return render_template('error.html', error_message=e)
except NoValidHarFile:
flash(f'Unable to build a tree for {tree_uuid}: {cache.error}.', 'warning')
return index_generic()
finally:
lookyloo.update_tree_cache_info(os.getpid(), 'website')