chg: speed up rendering of very big hostnode popups

pull/954/head
Raphaël Vinot 2024-10-08 23:10:46 +02:00
parent 17e19a5f27
commit acd4cb8da4
3 changed files with 22 additions and 9 deletions

View File

@ -234,6 +234,7 @@ class Indexing():
if self.redis.type(f'cookies_names|{cookie_name}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_cookies', *[entry.split('|')[0] for entry in self.redis.smembers(f'cn|{cookie_name}|captures')])
self.redis.delete(f'cookies_names|{cookie_name}|captures')
return []
return self.redis.zrevrangebyscore(f'cookies_names|{cookie_name}|captures', max_score, min_score, withscores=True)
@ -310,6 +311,7 @@ class Indexing():
if self.redis.type(f'bh|{h}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_body_hashes', *self.redis.smembers(f'bh|{h}|captures'))
self.redis.delete(f'bh|{h}|captures')
return 0
return self.redis.zcard(f'body_hashes|{h}|captures')
@ -408,6 +410,7 @@ class Indexing():
if self.redis.type(f'hhhashes|{hhh}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_hhhashes', *self.redis.smembers(f'hhhashes|{hhh}|captures'))
self.redis.delete(f'hhhashes|{hhh}|captures')
return []
return self.redis.zrevrangebyscore(f'hhhashes|{hhh}|captures', max_score, min_score, withscores=True)
@ -499,6 +502,7 @@ class Indexing():
if self.redis.type(f'urls|{md5}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_urls', *self.redis.smembers(f'urls|{md5}|captures'))
self.redis.delete(f'urls|{md5}|captures')
return []
return self.redis.zrevrangebyscore(f'urls|{md5}|captures', max_score, min_score, withscores=True)
@ -507,6 +511,7 @@ class Indexing():
if self.redis.type(f'urls|{md5}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_urls', *self.redis.smembers(f'urls|{md5}|captures'))
self.redis.delete(f'urls|{md5}|captures')
return 0
return self.redis.zcard(f'urls|{md5}|captures')
@ -523,6 +528,7 @@ class Indexing():
if self.redis.type(f'hostnames|{hostname}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_urls', *self.redis.smembers(f'hostnames|{hostname}|captures'))
self.redis.delete(f'hostnames|{hostname}|captures')
return []
return self.redis.zrevrangebyscore(f'hostnames|{hostname}|captures', max_score, min_score, withscores=True)
@ -530,6 +536,7 @@ class Indexing():
if self.redis.type(f'hostnames|{hostname}|captures') == 'set': # type: ignore[no-untyped-call]
# triggers the re-index soon.
self.redis.srem('indexed_urls', *self.redis.smembers(f'hostnames|{hostname}|captures'))
self.redis.delete(f'hostnames|{hostname}|captures')
return 0
return self.redis.zcard(f'hostnames|{hostname}|captures')

View File

@ -512,14 +512,16 @@ def hash_lookup(blob_hash: str, url: str, current_capture_uuid: str) -> tuple[in
Capture UUID avoids duplicates on the same capture'''
captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
cached_captures = lookyloo.sorted_capture_cache(
[uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_body_hash(blob_hash)],
[uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_body_hash(blob_hash,
oldest_capture=datetime.now() - timedelta(**time_delta_on_index))],
cached_captures_only=True)
for cache in cached_captures:
if cache.uuid == current_capture_uuid:
continue
for urlnode_uuid in get_indexing(flask_login.current_user).get_capture_body_hash_nodes(cache.uuid, blob_hash):
urlnodes = get_indexing(flask_login.current_user).get_capture_body_hash_nodes(cache.uuid, blob_hash)
for urlnode_uuid in urlnodes:
try:
urlnode = lookyloo.get_urlnode_from_tree(cache.uuid, urlnode_uuid)
urlnode = cache.tree.root_hartree.get_url_node_by_uuid(urlnode_uuid)
except IndexError:
continue
if url == urlnode.name:
@ -578,8 +580,9 @@ def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[Hos
# Index lookup
# %%% Full body %%%
if freq := get_indexing(flask_login.current_user).get_captures_body_hash_count(url.body_hash):
to_append['body_hash_details'] = {'hash_freq': freq}
to_append['body_hash_details']['other_captures'] = hash_lookup(url.body_hash, url.name, capture_uuid)
to_append['body_hash_details'] = {'hash_freq': freq, 'other_captures': (freq, {'same_url': [], 'different_url': []})}
if freq > 1:
to_append['body_hash_details']['other_captures'] = hash_lookup(url.body_hash, url.name, capture_uuid)
# %%% Embedded ressources %%%
if hasattr(url, 'embedded_ressources') and url.embedded_ressources:
@ -589,10 +592,13 @@ def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[Hos
if h in to_append['embedded_ressources']:
# Skip duplicates
continue
to_append['embedded_ressources'][h] = {'body_size': blob.getbuffer().nbytes, 'type': mimetype}
to_append['embedded_ressources'][h] = {'body_size': blob.getbuffer().nbytes,
'type': mimetype}
if freq := get_indexing(flask_login.current_user).get_captures_body_hash_count(h):
to_append['embedded_ressources'][h]['hash_freq'] = freq
to_append['embedded_ressources'][h]['other_captures'] = hash_lookup(h, url.name, capture_uuid)
to_append['embedded_ressources'][h]['other_captures'] = (freq, {'same_url': [], 'different_url': []})
if freq > 1:
to_append['embedded_ressources'][h]['other_captures'] = hash_lookup(h, url.name, capture_uuid)
for h in to_append['embedded_ressources'].keys():
known, legitimate = normalize_known_content(h, known_content, url)
if known:

View File

@ -192,7 +192,7 @@
{% set total_captures = details[0] %}
{% set other_captures = details[1] %}
{# Only show details if the hits are in another capture #}
{% if total_captures > 1 %}
{% if total_captures > 1 %}
<p>
The same file was seen in <b>{{ total_captures - 1 }}</b> other captures.
</br>
@ -206,7 +206,7 @@
{# Lists of other captures loading the same content... #}
<div class="collapse" id="captureslist_{{ identifier_for_toggle }}">
<div class="card card-body">
Note that only the most recent cached captures are displayed here.
Note that only the most recent cached captures are displayed here, click on the link below to see more.
{% if other_captures['different_url']|length > 0 %}
{# ... on other URLs #}
<div>