mirror of https://github.com/CIRCL/lookyloo
fix: Better handling of half broken captures without HAR files
parent 50e59bdf31
commit 31261e84c2
@@ -59,6 +59,8 @@ class BackgroundIndexer(AbstractManager):
             except MissingUUID:
                 self.logger.warning(f'Unable to find {uuid}. That should not happen.')
             except NoValidHarFile as e:
+                self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}')
+            except Exception as e:
                 self.logger.critical(f'Unable to build pickle for {uuid}: {uuid_path.parent.name} - {e}')
                 # The capture is not working, moving it away.
                 self.lookyloo.redis.hdel('lookup_dirs', uuid)
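The ordering here matters: the new NoValidHarFile handler must sit above the catch-all Exception handler, otherwise captures without HAR files would keep being treated as generic failures. A minimal, self-contained sketch of the same except-ladder, using placeholder exception classes and a plain logger rather than Lookyloo's own objects:

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('indexer-sketch')


class MissingUUID(Exception):
    ...


class NoValidHarFile(Exception):
    ...


def build_pickle(uuid: str) -> None:
    # Stand-in for the real tree build; here it always fails for lack of HAR files.
    raise NoValidHarFile('no HAR files on disk')


def index_one(uuid: str) -> None:
    try:
        build_pickle(uuid)
    except MissingUUID:
        logger.warning(f'Unable to find {uuid}. That should not happen.')
    except NoValidHarFile as e:
        # Specific failure: the capture exists but has no usable HAR files.
        logger.critical(f'There are no HAR files in the capture {uuid}: {e}')
    except Exception as e:
        # Anything else is treated as a broken capture and moved out of the way.
        logger.critical(f'Unable to build pickle for {uuid}: {e}')


index_one('11111111-2222-3333-4444-555555555555')
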
@@ -30,9 +30,14 @@ class CaptureCache():
                  'error', 'incomplete_redirects', 'no_index', 'categories', 'parent')
 
     def __init__(self, cache_entry: Dict[str, Any]):
-        __default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir')
+        __default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp',
+                                                                     'url', 'redirects', 'capture_dir')
+        if 'uuid' not in cache_entry or 'capture_dir' not in cache_entry:
+            raise LookylooException(f'The capture is deeply broken: {cache_entry}')
+        self.uuid: str = cache_entry['uuid']
+        self.capture_dir: Path = Path(cache_entry['capture_dir'])
+
         if all(key in cache_entry.keys() for key in __default_cache_keys):
-            self.uuid: str = cache_entry['uuid']
             self.title: str = cache_entry['title']
             try:
                 self.timestamp: datetime = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z')
@@ -41,7 +46,6 @@ class CaptureCache():
                 self.timestamp = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S%z')
             self.url: str = cache_entry['url']
             self.redirects: List[str] = json.loads(cache_entry['redirects'])
-            self.capture_dir: Path = Path(cache_entry['capture_dir'])
             if not self.capture_dir.exists():
                 raise MissingCaptureDirectory(f'The capture {self.uuid} does not exists in {self.capture_dir}.')
         elif not cache_entry.get('error'):
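To make the new guard concrete, here is a hedged sketch of the constructor logic: a minimal stand-in class (not the real CaptureCache) that accepts a half-broken entry as long as 'uuid' and 'capture_dir' are present, and rejects anything without them:

from pathlib import Path
from typing import Any, Dict


class DeeplyBrokenCapture(Exception):
    # Stand-in for LookylooException in this sketch.
    ...


class MiniCache:
    """Simplified version of the constructor logic in the hunk above."""

    def __init__(self, cache_entry: Dict[str, Any]):
        if 'uuid' not in cache_entry or 'capture_dir' not in cache_entry:
            raise DeeplyBrokenCapture(f'The capture is deeply broken: {cache_entry}')
        # uuid and capture_dir are always set, even for half-broken captures ...
        self.uuid: str = cache_entry['uuid']
        self.capture_dir: Path = Path(cache_entry['capture_dir'])
        # ... while title/timestamp/url/redirects stay optional.
        self.title = cache_entry.get('title')


MiniCache({'uuid': 'abc', 'capture_dir': '/tmp/captures/abc', 'error': 'no HAR files'})

try:
    MiniCache({'title': 'orphan entry'})
except DeeplyBrokenCapture as e:
    print(e)
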
@@ -85,7 +89,10 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int) -> CrawledTree:
             remove_pickle_tree(capture_dir)
         except Exception:
             remove_pickle_tree(capture_dir)
-    raise TreeNeedsRebuild()
+    if list(capture_dir.rglob('*.har')):
+        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
+    # The tree doesn't need to be rebuilt if there are no HAR files.
+    raise NoValidHarFile("Couldn't find HAR files")
 
 
 class CapturesIndex(Mapping):
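The split lets callers tell "pickle missing but rebuildable from HAR files" apart from "nothing to rebuild from". A rough sketch of that calling pattern, with placeholder exception classes and a dummy loader standing in for load_pickle_tree:

from pathlib import Path


class TreeNeedsRebuild(Exception):
    ...


class NoValidHarFile(Exception):
    ...


def load_tree(capture_dir: Path):
    # Placeholder loader: no cached pickle here, so decide which exception applies.
    if list(capture_dir.rglob('*.har')):
        raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
    raise NoValidHarFile("Couldn't find HAR files")


def get_tree(capture_dir: Path):
    try:
        return load_tree(capture_dir)
    except TreeNeedsRebuild:
        print(f'{capture_dir}: rebuilding from HAR files...')
        return None  # a real caller would rebuild and cache the tree here
    except NoValidHarFile as e:
        print(f'{capture_dir}: half-broken capture, nothing to rebuild: {e}')
        return None


get_tree(Path('.'))
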
@@ -160,7 +167,7 @@ class CapturesIndex(Mapping):
             try:
                 cc = CaptureCache(cache)
             except LookylooException as e:
-                self.logger.warning(e)
+                self.logger.warning(f'Unable to initialize the cache: {e}')
                 continue
             self.__cache[cc.uuid] = cc
 
@@ -201,14 +208,17 @@ class CapturesIndex(Mapping):
                 time.sleep(5)
                 return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
 
-        har_files = sorted(capture_dir.glob('*.har*'))
+        har_files = sorted(capture_dir.glob('*.har'))
         try:
             tree = CrawledTree(har_files, uuid)
             self.__resolve_dns(tree)
             if self.contextualizer:
                 self.contextualizer.contextualize_tree(tree)
         except Har2TreeError as e:
-            raise NoValidHarFile(e)
+            # unable to use the HAR files, get them out of the way
+            for har_file in har_files:
+                har_file.rename(har_file.with_suffix('.broken'))
+            raise NoValidHarFile(f'We got har files, but they are broken: {e}')
         except RecursionError as e:
             raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
         else:
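Renaming the unusable files to *.broken keeps later glob('*.har') calls from picking them up again, so a half-broken capture stops failing on every rebuild. A small sketch of that quarantine step against a temporary directory instead of a real capture:

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    capture_dir = Path(tmp)
    (capture_dir / '0.har').write_text('{not valid json')  # simulate a broken HAR file

    har_files = sorted(capture_dir.glob('*.har'))
    for har_file in har_files:
        # Move the broken files out of the way so later glob('*.har') calls skip them.
        har_file.rename(har_file.with_suffix('.broken'))

    print(sorted(p.name for p in capture_dir.iterdir()))  # ['0.broken']
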
@@ -241,8 +251,7 @@ class CapturesIndex(Mapping):
             tree = self._create_pickle(capture_dir)
             self.indexing.new_internal_uuids(tree)
         except NoValidHarFile:
-            # We may not have a HAR file, the reason will be in the error file.
-            pass
+            self.logger.warning('Unable to rebuild the tree, the HAR files are broken.')
 
         cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str}
         if (capture_dir / 'error.txt').exists():
@@ -258,7 +267,7 @@ class CapturesIndex(Mapping):
                     error_to_cache = content
             cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
 
-        if (har_files := sorted(capture_dir.glob('*.har*'))):
+        if (har_files := sorted(capture_dir.rglob('*.har'))):
             try:
                 har = HarFile(har_files[0], uuid)
                 cache['title'] = har.initial_title
@@ -366,7 +366,7 @@ class Lookyloo():
             # Do not try to build pickles
             capture_uuids = set(capture_uuids) & self._captures_index.cached_captures
 
-        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid)]
+        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid) and hasattr(self._captures_index[uuid], 'timestamp')]
         all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
         return all_cache
 
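Half-broken captures can now sit in the index without a timestamp attribute, and sorting on it would raise AttributeError; the added hasattr() check filters them out first. A toy reproduction of that filter-then-sort step, with SimpleNamespace objects standing in for CaptureCache instances:

import operator
from datetime import datetime, timezone
from types import SimpleNamespace

# Two complete caches and one half-broken one without a timestamp.
caches = {
    'a': SimpleNamespace(uuid='a', timestamp=datetime(2021, 9, 1, tzinfo=timezone.utc)),
    'b': SimpleNamespace(uuid='b'),  # no HAR files -> no timestamp
    'c': SimpleNamespace(uuid='c', timestamp=datetime(2021, 9, 7, tzinfo=timezone.utc)),
}

all_cache = [caches[uuid] for uuid in caches if hasattr(caches[uuid], 'timestamp')]
all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
print([c.uuid for c in all_cache])  # ['c', 'a']
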
@@ -393,14 +393,14 @@ class Lookyloo():
             return self._captures_index[capture_uuid]
         except MissingCaptureDirectory as e:
             # The UUID is in the captures but the directory is not on the disk.
-            self.logger.warning(e)
+            self.logger.warning(f'Missing Directory: {e}')
             return None
         except MissingUUID:
             if self.get_capture_status(capture_uuid) not in [CaptureStatusCore.QUEUED, CaptureStatusCore.ONGOING]:
                 self.logger.warning(f'Unable to find {capture_uuid} (not in the cache and/or missing capture directory).')
             return None
         except LookylooException as e:
-            self.logger.warning(e)
+            self.logger.warning(f'Lookyloo Exception: {e}')
             return None
         except Exception as e:
             self.logger.exception(e)
@@ -650,9 +650,6 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
             message = "The capture is ongoing."
         return render_template('tree_wait.html', message=message, tree_uuid=tree_uuid)
 
-    if cache.error:
-        flash(cache.error, 'warning')
-
     try:
         ct = lookyloo.get_crawled_tree(tree_uuid)
         b64_thumbnail = lookyloo.get_screenshot_thumbnail(tree_uuid, for_datauri=True)
@@ -674,6 +671,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
         except IndexError as e:
             print(e)
             pass
+        if cache.error:
+            flash(cache.error, 'warning')
         return render_template('tree.html', tree_json=ct.to_json(),
                                info=info,
                                tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
@@ -691,8 +690,9 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
                                parent_uuid=cache.parent,
                                has_redirects=True if cache.redirects else False)
 
-    except NoValidHarFile as e:
-        return render_template('error.html', error_message=e)
+    except NoValidHarFile:
+        flash(f'Unable to build a tree for {tree_uuid}: {cache.error}.', 'warning')
+        return index_generic()
     finally:
         lookyloo.update_tree_cache_info(os.getpid(), 'website')
 
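Rather than rendering a dead-end error page, the route now flashes the cached error and sends the user back to the index. A minimal Flask sketch of that flash-and-fall-back pattern; the route names, the JSON index response and the build_tree() helper are invented for the example and are not Lookyloo's actual web code. In the real route the index is rendered directly; the redirect here just keeps the sketch self-contained.

from flask import Flask, flash, get_flashed_messages, redirect, url_for

app = Flask(__name__)
app.secret_key = 'sketch-only'  # required for flash()


class NoValidHarFile(Exception):
    ...


def build_tree(tree_uuid: str):
    # Stand-in for the tree build; always fails in this sketch.
    raise NoValidHarFile("Couldn't find HAR files")


@app.route('/')
def index():
    # A real index template would display the flashed warnings.
    return {'flashed': get_flashed_messages()}


@app.route('/tree/<tree_uuid>')
def tree(tree_uuid: str):
    try:
        build_tree(tree_uuid)
        return f'tree for {tree_uuid}'
    except NoValidHarFile:
        # Flash a warning and fall back to the index instead of a bare error page.
        flash(f'Unable to build a tree for {tree_uuid}.', 'warning')
        return redirect(url_for('index'))


if __name__ == '__main__':
    with app.test_client() as client:
        response = client.get('/tree/some-uuid', follow_redirects=True)
        print(response.get_json())
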