diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index 96cab5de..c0ea4d4e 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -234,6 +234,7 @@ class Indexing():
         if self.redis.type(f'cookies_names|{cookie_name}|captures') == 'set':  # type: ignore[no-untyped-call]
             # triggers the re-index soon.
             self.redis.srem('indexed_cookies', *[entry.split('|')[0] for entry in self.redis.smembers(f'cn|{cookie_name}|captures')])
+            self.redis.delete(f'cookies_names|{cookie_name}|captures')
             return []
         return self.redis.zrevrangebyscore(f'cookies_names|{cookie_name}|captures', max_score, min_score, withscores=True)
 
@@ -310,6 +311,7 @@ class Indexing():
         if self.redis.type(f'bh|{h}|captures') == 'set':  # type: ignore[no-untyped-call]
             # triggers the re-index soon.
             self.redis.srem('indexed_body_hashes', *self.redis.smembers(f'bh|{h}|captures'))
+            self.redis.delete(f'bh|{h}|captures')
             return 0
         return self.redis.zcard(f'body_hashes|{h}|captures')
 
@@ -408,6 +410,7 @@ class Indexing():
         if self.redis.type(f'hhhashes|{hhh}|captures') == 'set':  # type: ignore[no-untyped-call]
             # triggers the re-index soon.
             self.redis.srem('indexed_hhhashes', *self.redis.smembers(f'hhhashes|{hhh}|captures'))
+            self.redis.delete(f'hhhashes|{hhh}|captures')
             return []
         return self.redis.zrevrangebyscore(f'hhhashes|{hhh}|captures', max_score, min_score, withscores=True)
 
@@ -499,6 +502,7 @@ class Indexing():
         if self.redis.type(f'urls|{md5}|captures') == 'set':  # type: ignore[no-untyped-call]
             # triggers the re-index soon.
             self.redis.srem('indexed_urls', *self.redis.smembers(f'urls|{md5}|captures'))
+            self.redis.delete(f'urls|{md5}|captures')
             return []
         return self.redis.zrevrangebyscore(f'urls|{md5}|captures', max_score, min_score, withscores=True)
 
@@ -507,6 +511,7 @@ class Indexing():
         if self.redis.type(f'urls|{md5}|captures') == 'set':  # type: ignore[no-untyped-call]
             # triggers the re-index soon.
             self.redis.srem('indexed_urls', *self.redis.smembers(f'urls|{md5}|captures'))
+            self.redis.delete(f'urls|{md5}|captures')
             return 0
         return self.redis.zcard(f'urls|{md5}|captures')
 
@@ -523,6 +528,7 @@ class Indexing():
         if self.redis.type(f'hostnames|{hostname}|captures') == 'set':  # type: ignore[no-untyped-call]
             # triggers the re-index soon.
             self.redis.srem('indexed_urls', *self.redis.smembers(f'hostnames|{hostname}|captures'))
+            self.redis.delete(f'hostnames|{hostname}|captures')
             return []
         return self.redis.zrevrangebyscore(f'hostnames|{hostname}|captures', max_score, min_score, withscores=True)
 
@@ -530,6 +536,7 @@ class Indexing():
         if self.redis.type(f'hostnames|{hostname}|captures') == 'set':  # type: ignore[no-untyped-call]
             # triggers the re-index soon.
             self.redis.srem('indexed_urls', *self.redis.smembers(f'hostnames|{hostname}|captures'))
+            self.redis.delete(f'hostnames|{hostname}|captures')
             return 0
         return self.redis.zcard(f'hostnames|{hostname}|captures')
 
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 88c6d536..a814812c 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -512,14 +512,16 @@ def hash_lookup(blob_hash: str, url: str, current_capture_uuid: str) -> tuple[in
     Capture UUID avoids duplicates on the same capture'''
     captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
     cached_captures = lookyloo.sorted_capture_cache(
-        [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_body_hash(blob_hash)],
+        [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_body_hash(blob_hash,
+                                                                                           oldest_capture=datetime.now() - timedelta(**time_delta_on_index))],
         cached_captures_only=True)
     for cache in cached_captures:
         if cache.uuid == current_capture_uuid:
             continue
-        for urlnode_uuid in get_indexing(flask_login.current_user).get_capture_body_hash_nodes(cache.uuid, blob_hash):
+        urlnodes = get_indexing(flask_login.current_user).get_capture_body_hash_nodes(cache.uuid, blob_hash)
+        for urlnode_uuid in urlnodes:
             try:
-                urlnode = lookyloo.get_urlnode_from_tree(cache.uuid, urlnode_uuid)
+                urlnode = cache.tree.root_hartree.get_url_node_by_uuid(urlnode_uuid)
             except IndexError:
                 continue
             if url == urlnode.name:
@@ -578,8 +580,9 @@ def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[Hos
         # Index lookup
         # %%% Full body %%%
         if freq := get_indexing(flask_login.current_user).get_captures_body_hash_count(url.body_hash):
-            to_append['body_hash_details'] = {'hash_freq': freq}
-            to_append['body_hash_details']['other_captures'] = hash_lookup(url.body_hash, url.name, capture_uuid)
+            to_append['body_hash_details'] = {'hash_freq': freq, 'other_captures': (freq, {'same_url': [], 'different_url': []})}
+            if freq > 1:
+                to_append['body_hash_details']['other_captures'] = hash_lookup(url.body_hash, url.name, capture_uuid)
 
         # %%% Embedded ressources %%%
         if hasattr(url, 'embedded_ressources') and url.embedded_ressources:
@@ -589,10 +592,13 @@ def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[Hos
                     if h in to_append['embedded_ressources']:
                         # Skip duplicates
                         continue
-                    to_append['embedded_ressources'][h] = {'body_size': blob.getbuffer().nbytes, 'type': mimetype}
+                    to_append['embedded_ressources'][h] = {'body_size': blob.getbuffer().nbytes,
+                                                           'type': mimetype}
                     if freq := get_indexing(flask_login.current_user).get_captures_body_hash_count(h):
                         to_append['embedded_ressources'][h]['hash_freq'] = freq
-                        to_append['embedded_ressources'][h]['other_captures'] = hash_lookup(h, url.name, capture_uuid)
+                        to_append['embedded_ressources'][h]['other_captures'] = (freq, {'same_url': [], 'different_url': []})
+                        if freq > 1:
+                            to_append['embedded_ressources'][h]['other_captures'] = hash_lookup(h, url.name, capture_uuid)
             for h in to_append['embedded_ressources'].keys():
                 known, legitimate = normalize_known_content(h, known_content, url)
                 if known:
diff --git a/website/web/templates/macros.html b/website/web/templates/macros.html
index 07a53c3e..5ee03d2c 100644
--- a/website/web/templates/macros.html
+++ b/website/web/templates/macros.html
@@ -192,7 +192,7 @@
 {% set total_captures = details[0] %}
 {% set other_captures = details[1] %}
 {# Only show details if the hits are in an other capture #}
-{% if total_captures > 1 %}
+{% if total_captures > 1 %}
   The same file was seen in {{ total_captures - 1 }} other captures.
@@ -206,7 +206,7 @@
 {# Lists of other captures loading the same content... #}
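
All of the indexing.py hunks extend the same lazy-migration pattern: when an index key is still stored as a plain Redis set (the legacy layout), the affected captures are removed from the corresponding indexed_* marker set so the background indexer re-indexes them into the new sorted-set layout, and the added self.redis.delete(...) calls now also drop the stale key so it cannot shadow the rebuilt sorted set. A minimal standalone sketch of that pattern, assuming a redis-py client with decode_responses=True (the get_captures_hostname helper and connection settings below are illustrative, not part of the patch):

    from redis import Redis

    # Illustrative connection; Lookyloo manages its own Redis instances.
    redis_client = Redis(host='localhost', port=6379, decode_responses=True)


    def get_captures_hostname(hostname: str) -> list[tuple[str, float]]:
        key = f'hostnames|{hostname}|captures'
        if redis_client.type(key) == 'set':
            # Legacy layout: the key is a plain set of capture entries.
            # Un-mark them so the async indexer re-indexes those captures...
            if members := redis_client.smembers(key):
                redis_client.srem('indexed_urls', *members)
            # ...and drop the stale set so it cannot shadow the rebuilt sorted set
            # (this is what the added self.redis.delete(...) calls do).
            redis_client.delete(key)
            return []
        # New layout: a sorted set of capture UUIDs scored by capture timestamp.
        return redis_client.zrevrangebyscore(key, '+Inf', '-Inf', withscores=True)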
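
The hash_lookup() change also bounds the body-hash index lookup to recent captures via the new oldest_capture argument instead of walking the whole index. A sketch of how that cut-off maps onto the sorted-set score range used by the Indexing methods above, assuming time_delta_on_index is a config dict of timedelta keyword arguments (the {'weeks': 1} value is only an example):

    from datetime import datetime, timedelta

    # Hypothetical config value; in Lookyloo it comes from the instance configuration.
    time_delta_on_index = {'weeks': 1}

    oldest_capture = datetime.now() - timedelta(**time_delta_on_index)

    # The cut-off becomes the minimum score of the ZREVRANGEBYSCORE call,
    # so only captures newer than it are returned.
    min_score = oldest_capture.timestamp()
    max_score = '+Inf'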
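
Finally, get_hostnode_investigator() now only calls hash_lookup() when a resource was seen in more than one capture; otherwise it stores an empty result in the shape macros.html reads (details[0] is the total number of captures, details[1] the per-URL buckets), so the {% if total_captures > 1 %} check short-circuits without extra index work. A trimmed-down, hypothetical illustration of that guard (the hash_lookup stub stands in for the real function in website/web/__init__.py):

    from typing import Any


    def hash_lookup(blob_hash: str, url: str, capture_uuid: str) -> tuple[int, dict[str, list[Any]]]:
        # Stand-in for the real (expensive) index lookup.
        return 3, {'same_url': [], 'different_url': []}


    def body_hash_details(freq: int, body_hash: str, url_name: str, capture_uuid: str) -> dict[str, Any]:
        details: dict[str, Any] = {
            'hash_freq': freq,
            # Default: the tuple shape the template expects, with nothing to display.
            'other_captures': (freq, {'same_url': [], 'different_url': []}),
        }
        if freq > 1:  # only pay for the full lookup when other captures exist
            details['other_captures'] = hash_lookup(body_hash, url_name, capture_uuid)
        return details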