fix: re-cache captures with incomplete redirects, code cleanup

pull/219/head
Raphaël Vinot 2021-06-15 16:03:33 -07:00
parent b6a636697f
commit 4a8db1fc6e
3 changed files with 11 additions and 11 deletions


@@ -33,7 +33,7 @@ class Context():
         p = self.redis.pipeline()
         if filename == 'generic':
             # 1px images, files with spaces, empty => non-relevant stuff
-            for k, type_content in file_content.items():
+            for _, type_content in file_content.items():
                 p.hmset('known_content', {h: type_content['description'] for h in type_content['entries']})
         elif filename == 'malicious':
             # User defined as malicious
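
Most of the cleanup in this commit is the same one-character change: loop variables that are bound but never read are renamed to _, the conventional Python placeholder for a deliberately ignored value (it also keeps linters quiet). A minimal sketch of the pattern with a hypothetical dict; when the keys are never needed, iterating over .values() is an equivalent alternative:

# A hypothetical dict standing in for file_content, details, etc.
type_contents = {'a': {'description': 'empty file'}, 'b': {'description': '1px image'}}

# Before: 'k' is bound but never used
for k, type_content in type_contents.items():
    print(type_content['description'])

# After: '_' signals that the key is deliberately ignored
for _, type_content in type_contents.items():
    print(type_content['description'])

# Equivalent, and arguably clearer, when the keys are never needed at all
for type_content in type_contents.values():
    print(type_content['description'])
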
@@ -133,7 +133,7 @@ class Context():
                 # this is the hash of an embeded content so it won't have a filename but has a different mimetype
                 # FIXME: this is ugly.
                 for ressource_mimetype, blobs in urlnode.embedded_ressources.items():
-                    for ressource_h, b in blobs:
+                    for ressource_h, _ in blobs:
                         if ressource_h == h:
                             mimetype = ressource_mimetype.split(';')[0]
                             break


@@ -128,7 +128,7 @@ class Lookyloo():
         to_store: Dict[str, Any] = {'by_frequency': []}
         uas = Counter([entry.split('|', 1)[1] for entry in entries])
-        for ua, count in uas.most_common():
+        for ua, _ in uas.most_common():
             parsed_ua = UserAgent(ua)
             if not parsed_ua.platform or not parsed_ua.browser:
                 continue
@@ -191,7 +191,7 @@ class Lookyloo():
             categories = list(self.categories_capture(capture_uuid).keys())
             self.indexing.index_categories_capture(capture_uuid, categories)
         except Har2TreeError as e:
-            raise NoValidHarFile(e.message)
+            raise NoValidHarFile(e)
         except RecursionError as e:
             raise NoValidHarFile(f'Tree too deep, probably a recursive refresh: {e}.\n Append /export to the URL to get the files.')
         else:
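
This hunk and the one below drop the non-standard .message attribute. Built-in Python 3 exceptions only expose .args and str(e); .message exists solely when an exception class defines it itself, so reading it can raise AttributeError. Passing the exception object (or str(e)) is always safe. A self-contained illustration with a stand-in exception class:

class Har2TreeError(Exception):  # stand-in, not the real har2tree class
    pass

try:
    raise Har2TreeError('no usable HAR file in the capture')
except Har2TreeError as e:
    print(hasattr(e, 'message'))  # False: plain Exception subclasses define no .message
    print(str(e))                 # 'no usable HAR file in the capture' -- always available
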
@@ -464,7 +464,7 @@ class Lookyloo():
             try:
                 har = HarFile(har_files[0], uuid)
             except Har2TreeError as e:
-                error_cache['error'] = e.message
+                error_cache['error'] = str(e)
                 fatal_error = True
         else:
             error_cache['error'] = f'No har files in {capture_dir.name}'
@@ -534,7 +534,7 @@ class Lookyloo():
         '''All the capture UUIDs present in the cache.'''
         return self.redis.hkeys('lookup_dirs')

-    def sorted_capture_cache(self, capture_uuids: Iterable[str]=[]) -> List[CaptureCache]:
+    def sorted_capture_cache(self, capture_uuids: Optional[Iterable[str]]=None) -> List[CaptureCache]:
         '''Get all the captures in the cache, sorted by timestamp (new -> old).'''
         if not capture_uuids:
             # Sort all captures
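
The new signature replaces a mutable default argument (=[]) with Optional[...] = None. Default values are evaluated once, at function definition time, so a default list is shared across every call; even when it is never mutated, as is presumably the case here, None makes the intent explicit and removes the trap. A minimal sketch of the pitfall and the fix (function names are illustrative, not from Lookyloo):

from typing import Iterable, List, Optional

def buggy(items: List[str] = []) -> List[str]:
    items.append('x')  # mutates the single list created at definition time
    return items

print(buggy())  # ['x']
print(buggy())  # ['x', 'x'] -- state leaked between calls

def fixed(items: Optional[Iterable[str]] = None) -> List[str]:
    collected = list(items) if items else []  # a fresh list on every call
    collected.append('x')
    return collected

print(fixed())  # ['x']
print(fixed())  # ['x']
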
@@ -543,7 +543,7 @@ class Lookyloo():
             # No captures at all on the instance
             return []
-        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if uuid in self._captures_index]
+        all_cache: List[CaptureCache] = [self._captures_index[uuid] for uuid in capture_uuids if uuid in self._captures_index and not self._captures_index[uuid].incomplete_redirects]
         captures_to_get = set(capture_uuids) - set(self._captures_index.keys())
         if captures_to_get:
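
This one-line filter is the fix named in the commit title: index entries whose redirect chain was never fully resolved are no longer returned as valid cache hits, so they can be re-cached instead of being served stale. A rough, self-contained sketch of the filter, assuming a CaptureCache with an incomplete_redirects flag as the diff suggests (the actual re-caching happens outside this hunk):

from dataclasses import dataclass
from typing import Dict, Iterable, List

@dataclass
class CaptureCache:  # simplified stand-in for the real class
    uuid: str
    incomplete_redirects: bool = False

_captures_index: Dict[str, CaptureCache] = {
    'aaa': CaptureCache('aaa'),
    'bbb': CaptureCache('bbb', incomplete_redirects=True),  # stale entry
}

def cached_hits(capture_uuids: Iterable[str]) -> List[CaptureCache]:
    # Same condition as the diff: known to the index AND fully resolved.
    return [_captures_index[uuid] for uuid in capture_uuids
            if uuid in _captures_index
            and not _captures_index[uuid].incomplete_redirects]

print([c.uuid for c in cached_hits(['aaa', 'bbb'])])  # ['aaa'] -- 'bbb' must be re-cached
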
@@ -947,7 +947,7 @@ class Lookyloo():
         details = self.indexing.get_body_hash_urls(body_hash)
         body_content = BytesIO()
         # get the body from the first entry in the details list
-        for url, entries in details.items():
+        for _, entries in details.items():
             ct = self.get_crawled_tree(entries[0]['capture'])
             urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode'])
             if urlnode.body_hash == body_hash:
@@ -955,7 +955,7 @@ class Lookyloo():
                 body_content = urlnode.body
             else:
                 # The hash is an embedded resource
-                for mimetype, blobs in urlnode.body_hash.embedded_ressources.items():
+                for _, blobs in urlnode.body_hash.embedded_ressources.items():
                     for h, b in blobs:
                         if h == body_hash:
                             body_content = b


@@ -363,7 +363,7 @@ class PhishingInitiative():
         if not force and pi_file.exists():
             return

-        for i in range(3):
+        for _ in range(3):
             url_information = self.client.lookup(url)
             if not url_information['results']:
                 # No results, that should not happen (?)
@@ -457,7 +457,7 @@ class VirusTotal():
         if not force and vt_file.exists():
             return

-        for i in range(3):
+        for _ in range(3):
             try:
                 url_information = self.client.get_object(f"/urls/{url_id}")
                 with vt_file.open('w') as _f:
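
In both third-party modules the loop only bounds the number of lookup attempts and never reads the counter, hence the rename to _. A generic sketch of the same bounded-retry shape; the lookup function and exception below are placeholders, not the real PhishingInitiative or VirusTotal client API:

import time

class TransientAPIError(Exception):  # placeholder client-side error
    pass

def lookup(url: str) -> dict:  # placeholder for self.client.lookup(url)
    return {'results': [url]}

def lookup_with_retries(url: str, attempts: int = 3) -> dict:
    for _ in range(attempts):  # the attempt index itself is irrelevant
        try:
            result = lookup(url)
            if result['results']:
                return result
        except TransientAPIError:
            pass
        time.sleep(1)  # brief pause before the next attempt
    return {'results': []}  # all attempts exhausted

print(lookup_with_retries('http://example.com'))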