mirror of https://github.com/CIRCL/lookyloo
fix: Better handling of half-broken captures without HAR files
parent
50e59bdf31
commit
31261e84c2
|
@ -59,6 +59,8 @@ class BackgroundIndexer(AbstractManager):
|
|||
except MissingUUID:
|
||||
self.logger.warning(f'Unable to find {uuid}. That should not happen.')
|
||||
except NoValidHarFile as e:
|
||||
self.logger.critical(f'There are no HAR files in the capture {uuid}: {uuid_path.parent.name} - {e}')
|
||||
except Exception as e:
|
||||
self.logger.critical(f'Unable to build pickle for {uuid}: {uuid_path.parent.name} - {e}')
|
||||
# The capture is not working, moving it away.
|
||||
self.lookyloo.redis.hdel('lookup_dirs', uuid)
|
||||
|
|
|
@ -30,9 +30,14 @@ class CaptureCache():
|
|||
'error', 'incomplete_redirects', 'no_index', 'categories', 'parent')
|
||||
|
||||
def __init__(self, cache_entry: Dict[str, Any]):
|
||||
__default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp', 'url', 'redirects', 'capture_dir')
|
||||
__default_cache_keys: Tuple[str, str, str, str, str, str] = ('uuid', 'title', 'timestamp',
|
||||
'url', 'redirects', 'capture_dir')
|
||||
if 'uuid' not in cache_entry or 'capture_dir' not in cache_entry:
|
||||
raise LookylooException(f'The capture is deeply broken: {cache_entry}')
|
||||
self.uuid: str = cache_entry['uuid']
|
||||
self.capture_dir: Path = Path(cache_entry['capture_dir'])
|
||||
|
||||
if all(key in cache_entry.keys() for key in __default_cache_keys):
|
||||
self.uuid: str = cache_entry['uuid']
|
||||
self.title: str = cache_entry['title']
|
||||
try:
|
||||
self.timestamp: datetime = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S.%f%z')
|
||||
|
@ -41,7 +46,6 @@ class CaptureCache():
|
|||
self.timestamp = datetime.strptime(cache_entry['timestamp'], '%Y-%m-%dT%H:%M:%S%z')
|
||||
self.url: str = cache_entry['url']
|
||||
self.redirects: List[str] = json.loads(cache_entry['redirects'])
|
||||
self.capture_dir: Path = Path(cache_entry['capture_dir'])
|
||||
if not self.capture_dir.exists():
|
||||
raise MissingCaptureDirectory(f'The capture {self.uuid} does not exists in {self.capture_dir}.')
|
||||
elif not cache_entry.get('error'):
|
||||
|
@ -85,7 +89,10 @@ def load_pickle_tree(capture_dir: Path, last_mod_time: int) -> CrawledTree:
|
|||
remove_pickle_tree(capture_dir)
|
||||
except Exception:
|
||||
remove_pickle_tree(capture_dir)
|
||||
raise TreeNeedsRebuild()
|
||||
if list(capture_dir.rglob('*.har')):
|
||||
raise TreeNeedsRebuild('We have HAR files and need to rebuild the tree.')
|
||||
# The tree doesn't need to be rebuilt if there are no HAR files.
|
||||
raise NoValidHarFile("Couldn't find HAR files")
|
||||
|
||||
|
||||
class CapturesIndex(Mapping):
|
||||
|
@ -160,7 +167,7 @@ class CapturesIndex(Mapping):
|
|||
try:
|
||||
cc = CaptureCache(cache)
|
||||
except LookylooException as e:
|
||||
self.logger.warning(e)
|
||||
self.logger.warning(f'Unable to initialize the cache: {e}')
|
||||
continue
|
||||
self.__cache[cc.uuid] = cc
|
||||
|
||||
|
@ -201,14 +208,17 @@ class CapturesIndex(Mapping):
|
|||
time.sleep(5)
|
||||
return load_pickle_tree(capture_dir, capture_dir.stat().st_mtime)
|
||||
|
||||
har_files = sorted(capture_dir.glob('*.har*'))
|
||||
har_files = sorted(capture_dir.glob('*.har'))
|
||||
try:
|
||||
tree = CrawledTree(har_files, uuid)
|
||||
self.__resolve_dns(tree)
|
||||
if self.contextualizer:
|
||||
self.contextualizer.contextualize_tree(tree)
|
||||
except Har2TreeError as e:
|
||||
raise NoValidHarFile(e)
|
||||
# unable to use the HAR files, get them out of the way
|
||||
for har_file in har_files:
|
||||
har_file.rename(har_file.with_suffix('.broken'))
|
||||
raise NoValidHarFile(f'We got har files, but they are broken: {e}')
|
||||
except RecursionError as e:
|
||||
raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
|
||||
else:
|
||||
|
@ -241,8 +251,7 @@ class CapturesIndex(Mapping):
|
|||
tree = self._create_pickle(capture_dir)
|
||||
self.indexing.new_internal_uuids(tree)
|
||||
except NoValidHarFile:
|
||||
# We may not have a HAR file, the reason will be in the error file.
|
||||
pass
|
||||
self.logger.warning('Unable to rebuild the tree, the HAR files are broken.')
|
||||
|
||||
cache: Dict[str, Union[str, int]] = {'uuid': uuid, 'capture_dir': capture_dir_str}
|
||||
if (capture_dir / 'error.txt').exists():
|
||||
|
@ -258,7 +267,7 @@ class CapturesIndex(Mapping):
|
|||
error_to_cache = content
|
||||
cache['error'] = f'The capture {capture_dir.name} has an error: {error_to_cache}'
|
||||
|
||||
if (har_files := sorted(capture_dir.glob('*.har*'))):
|
||||
if (har_files := sorted(capture_dir.rglob('*.har'))):
|
||||
try:
|
||||
har = HarFile(har_files[0], uuid)
|
||||
cache['title'] = har.initial_title
|
||||
|
|
|
@ -366,7 +366,7 @@ class Lookyloo():
|
|||
# Do not try to build pickles
|
||||
capture_uuids = set(capture_uuids) & self._captures_index.cached_captures
|
||||
|
||||
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid)]
|
||||
all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if self.capture_cache(uuid) and hasattr(self._captures_index[uuid], 'timestamp')]
|
||||
all_cache.sort(key=operator.attrgetter('timestamp'), reverse=True)
|
||||
return all_cache
|
||||
|
||||
|
@ -393,14 +393,14 @@ class Lookyloo():
|
|||
return self._captures_index[capture_uuid]
|
||||
except MissingCaptureDirectory as e:
|
||||
# The UUID is in the captures but the directory is not on the disk.
|
||||
self.logger.warning(e)
|
||||
self.logger.warning(f'Missing Directory: {e}')
|
||||
return None
|
||||
except MissingUUID:
|
||||
if self.get_capture_status(capture_uuid) not in [CaptureStatusCore.QUEUED, CaptureStatusCore.ONGOING]:
|
||||
self.logger.warning(f'Unable to find {capture_uuid} (not in the cache and/or missing capture directory).')
|
||||
return None
|
||||
except LookylooException as e:
|
||||
self.logger.warning(e)
|
||||
self.logger.warning(f'Lookyloo Exception: {e}')
|
||||
return None
|
||||
except Exception as e:
|
||||
self.logger.exception(e)
|
||||
|
|
|
@ -650,9 +650,6 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
|
|||
message = "The capture is ongoing."
|
||||
return render_template('tree_wait.html', message=message, tree_uuid=tree_uuid)
|
||||
|
||||
if cache.error:
|
||||
flash(cache.error, 'warning')
|
||||
|
||||
try:
|
||||
ct = lookyloo.get_crawled_tree(tree_uuid)
|
||||
b64_thumbnail = lookyloo.get_screenshot_thumbnail(tree_uuid, for_datauri=True)
|
||||
|
@ -674,6 +671,8 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
|
|||
except IndexError as e:
|
||||
print(e)
|
||||
pass
|
||||
if cache.error:
|
||||
flash(cache.error, 'warning')
|
||||
return render_template('tree.html', tree_json=ct.to_json(),
|
||||
info=info,
|
||||
tree_uuid=tree_uuid, public_domain=lookyloo.public_domain,
|
||||
|
@ -691,8 +690,9 @@ def tree(tree_uuid: str, node_uuid: Optional[str]=None):
|
|||
parent_uuid=cache.parent,
|
||||
has_redirects=True if cache.redirects else False)
|
||||
|
||||
except NoValidHarFile as e:
|
||||
return render_template('error.html', error_message=e)
|
||||
except NoValidHarFile:
|
||||
flash(f'Unable to build a tree for {tree_uuid}: {cache.error}.', 'warning')
|
||||
return index_generic()
|
||||
finally:
|
||||
lookyloo.update_tree_cache_info(os.getpid(), 'website')
|
||||
|
||||
|
|
Loading…
Reference in New Issue