From c68080431da505a38e741f6cb5c496e231e7e5af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Mon, 7 Oct 2024 13:15:15 +0200 Subject: [PATCH] chg: Migrate ressources/body hashes to new index that allows pagination on capture time --- .pre-commit-config.yaml | 2 +- lookyloo/indexing.py | 168 ++++++++++------------ lookyloo/lookyloo.py | 5 + website/web/__init__.py | 129 +++++++---------- website/web/genericapi.py | 18 +-- website/web/templates/body_hash.html | 26 +--- website/web/templates/hostname_popup.html | 9 +- website/web/templates/macros.html | 11 +- website/web/templates/ressources.html | 6 +- 9 files changed, 159 insertions(+), 215 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 52457d8f..e4484308 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ exclude: "user_agents|website/web/sri.txt" repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.6.0 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py index 0b1b9e62..a433a1c3 100644 --- a/lookyloo/indexing.py +++ b/lookyloo/indexing.py @@ -7,9 +7,7 @@ import hashlib import logging from io import BytesIO -from collections import defaultdict from datetime import datetime, timedelta -from urllib.parse import urlsplit from zipfile import ZipFile import mmh3 @@ -49,12 +47,12 @@ class Indexing(): self.redis.flushdb() @property - def redis_bytes(self) -> Redis: # type: ignore[type-arg] + def redis_bytes(self) -> Redis[bytes]: return Redis(connection_pool=self.__redis_pool_bytes) @property - def redis(self) -> Redis: # type: ignore[type-arg] - return Redis(connection_pool=self.__redis_pool) + def redis(self) -> Redis[str]: + return Redis(connection_pool=self.__redis_pool) # type: ignore[return-value] def can_index(self, capture_uuid: str | None=None) -> bool: if capture_uuid: @@ -83,6 +81,7 @@ class Indexing(): for hash_type in self.captures_hashes_types(): p.srem(f'indexed_hash_type|{hash_type}', capture_uuid) for internal_index in self.redis.smembers(f'capture_indexes|{capture_uuid}'): + # NOTE: these ones need to be removed because the node UUIDs are recreated on tree rebuild # internal_index can be "tlds" for entry in self.redis.smembers(f'capture_indexes|{capture_uuid}|{internal_index}'): # entry can be a "com", we delete a set of UUIDs, remove from the captures set @@ -185,7 +184,7 @@ class Indexing(): return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True) def get_cookies_names_captures(self, cookie_name: str) -> list[tuple[str, str]]: - return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')] + return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')] # type: ignore[misc] def index_cookies_capture(self, crawled_tree: CrawledTree) -> None: if self.redis.sismember('indexed_cookies', crawled_tree.uuid): @@ -228,24 +227,25 @@ class Indexing(): # ###### Body hashes ###### - @property - def ressources(self) -> list[tuple[str, float]]: - return self.redis.zrevrange('body_hashes', 0, 200, withscores=True) - - def ressources_number_domains(self, h: str) -> int: - return self.redis.zcard(f'bh|{h}') - - def body_hash_fequency(self, body_hash: str) -> dict[str, int]: + def _reindex_ressources(self, h: str) -> None: + # We changed the format of the indexes, so we need to make sure they're re-triggered. 
pipeline = self.redis.pipeline() - pipeline.zscore('body_hashes', body_hash) - pipeline.zcard(f'bh|{body_hash}') - hash_freq, hash_domains_freq = pipeline.execute() - to_return = {'hash_freq': 0, 'hash_domains_freq': 0} - if hash_freq: - to_return['hash_freq'] = int(hash_freq) - if hash_domains_freq: - to_return['hash_domains_freq'] = int(hash_domains_freq) - return to_return + if self.redis.type(f'bh|{h}|captures') == 'set': # type: ignore[no-untyped-call] + uuids_to_reindex = self.redis.smembers(f'bh|{h}|captures') + pipeline.srem('indexed_body_hashes', *uuids_to_reindex) + # deprecated index + pipeline.delete(*[f'bh|{h}|captures|{uuid}' for uuid in uuids_to_reindex]) + pipeline.delete(f'bh|{h}|captures') + if self.redis.type(f'bh|{h}') == 'zset': # type: ignore[no-untyped-call] + pipeline.delete(f'bh|{h}') + + if self.redis.type('body_hashes') == 'zset': # type: ignore[no-untyped-call] + pipeline.delete('body_hashes') + pipeline.execute() + + @property + def ressources(self) -> set[str]: + return self.redis.smembers('body_hashes') def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None: if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid): @@ -253,84 +253,74 @@ class Indexing(): return self.redis.sadd('indexed_body_hashes', crawled_tree.uuid) self.logger.debug(f'Indexing body hashes for {crawled_tree.uuid} ... ') - - cleaned_up_hashes: set[str] = set() pipeline = self.redis.pipeline() - is_reindex = False + + # Add the body hashes key in internal indexes set + internal_index = f'capture_indexes|{crawled_tree.uuid}' + pipeline.sadd(internal_index, 'body_hashes') + + already_indexed_global: set[str] = set() for urlnode in crawled_tree.root_hartree.url_tree.traverse(): for h in urlnode.resources_hashes: - if h not in cleaned_up_hashes: - # Delete the hash for that capture the first time we see it. - if self.redis.exists(f'bh|{h}|captures|{crawled_tree.uuid}'): - pipeline.delete(f'bh|{h}|captures|{crawled_tree.uuid}') - cleaned_up_hashes.add(h) - is_reindex = True - self.logger.debug(f'reindexing body hashes for {crawled_tree.uuid} ... 
') - # ZSet of all urlnode_UUIDs|full_url - pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1, - f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}') - if not is_reindex: - pipeline.zincrby('body_hashes', 1, h) - pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname) - # set of all captures with this hash - pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid) + + self._reindex_ressources(h) + + if h not in already_indexed_global: + # The hash hasn't been indexed in that run yet + already_indexed_global.add(h) + pipeline.sadd(f'{internal_index}|body_hashes', h) # Only used to delete index + pipeline.sadd('body_hashes', h) + pipeline.zadd(f'body_hashes|{h}|captures', + mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()}) + + # Add hostnode UUID in internal index + pipeline.sadd(f'{internal_index}|body_hashes|{h}', urlnode.uuid) + pipeline.execute() self.logger.debug(f'done with body hashes for {crawled_tree.uuid}.') - def get_hash_uuids(self, body_hash: str) -> tuple[str, str, str]: - """Use that to get a reference allowing to fetch a resource from one of the capture.""" - capture_uuid = str(self.redis.srandmember(f'bh|{body_hash}|captures')) - entry = self.redis.zrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, 1)[0] - urlnode_uuid, hostnode_uuid, url = entry.split('|', 2) - return capture_uuid, urlnode_uuid, hostnode_uuid + def get_captures_body_hash_count(self, h: str) -> int: + # NOTE: the old name was bh instead of body_hashes + if self.redis.type(f'bh|{h}|captures') == 'set': # type: ignore[no-untyped-call] + # triggers the re-index soon. + self.redis.srem('indexed_body_hashes', *self.redis.smembers(f'bh|{h}|captures')) + return 0 + return self.redis.zcard(f'body_hashes|{h}|captures') - def get_body_hash_captures(self, body_hash: str, filter_url: str | None=None, - filter_capture_uuid: str | None=None, - limit: int=20, - prefered_uuids: set[str]=set()) -> tuple[int, list[tuple[str, str, str, bool, str]]]: + def get_hash_uuids(self, body_hash: str) -> tuple[str, str] | None: + """Use that to get a reference allowing to fetch a resource from one of the capture.""" + if capture_uuids := self.redis.zrevrange(f'body_hashes|{body_hash}|captures', 0, 0, withscores=False): + capture_uuid = capture_uuids[0] + internal_index = f'capture_indexes|{capture_uuid}' + if urlnode_uuid := self.redis.srandmember(f'{internal_index}|body_hashes|{body_hash}'): + return str(capture_uuid), str(urlnode_uuid) + return None + + def get_captures_body_hash(self, body_hash: str, most_recent_capture: datetime | None = None, + oldest_capture: datetime | None = None) -> list[tuple[str, float]]: '''Get the captures matching the hash. - :param filter_url: URL of the hash we're searching for + :param body_hash: The hash to search for :param filter_capture_uuid: UUID of the capture the hash was found in - :param limit: Max matching captures to return, -1 means unlimited. - :param prefered_uuids: UUID cached right now, so we don't rebuild trees. 
''' - to_return: list[tuple[str, str, str, bool, str]] = [] - len_captures = self.redis.scard(f'bh|{body_hash}|captures') - unlimited = False - if limit == -1: - unlimited = True - for capture_uuid in self.redis.sscan_iter(f'bh|{body_hash}|captures'): - if capture_uuid == filter_capture_uuid: - # Used to skip hits in current capture - len_captures -= 1 - continue - if prefered_uuids and capture_uuid not in prefered_uuids: - continue - if not unlimited: - limit -= 1 - for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1): - url_uuid, hostnode_uuid, url = entry.split('|', 2) - hostname: str = urlsplit(url).hostname - if filter_url: - to_return.append((capture_uuid, hostnode_uuid, hostname, url == filter_url, url)) - else: - to_return.append((capture_uuid, hostnode_uuid, hostname, False, url)) - if not unlimited and limit <= 0: - break - return len_captures, to_return + max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf' + min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=15)).timestamp() - def get_body_hash_domains(self, body_hash: str) -> list[tuple[str, float]]: - return self.redis.zrevrange(f'bh|{body_hash}', 0, -1, withscores=True) + if self.redis.type(f'bh|{body_hash}|captures') == 'set': # type: ignore[no-untyped-call] + # triggers the re-index soon. + self.redis.srem('indexed_body_hashes', *self.redis.smembers(f'bh|{body_hash}|captures')) + self.redis.delete(f'bh|{body_hash}|captures') + return [] + return self.redis.zrevrangebyscore(f'body_hashes|{body_hash}|captures', max_score, min_score, withscores=True) - def get_body_hash_urls(self, body_hash: str) -> dict[str, list[dict[str, str]]]: - all_captures: set[str] = self.redis.smembers(f'bh|{body_hash}|captures') - urls = defaultdict(list) - for capture_uuid in list(all_captures): - for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1): - url_uuid, hostnode_uuid, url = entry.split('|', 2) - urls[url].append({'capture': capture_uuid, 'hostnode': hostnode_uuid, 'urlnode': url_uuid}) - return urls + def get_capture_body_hash_nodes(self, capture_uuid: str, body_hash: str) -> set[str]: + if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|body_hashes|{body_hash}'): + return set(url_nodes) + return set() + + def get_body_hash_urlnodes(self, body_hash: str) -> dict[str, set[str]]: + return {capture_uuid: self.redis.smembers(f'capture_indexes|{capture_uuid}|body_hashes|{body_hash}') + for capture_uuid, capture_ts in self.get_captures_body_hash(body_hash)} # ###### HTTP Headers Hashes ###### @@ -342,7 +332,7 @@ class Indexing(): return self.redis.scard(f'hhhashes|{hhh}|captures') def get_http_headers_hashes_captures(self, hhh: str) -> list[tuple[str, str]]: - return [uuids.split('|') for uuids in self.redis.smembers(f'hhhashes|{hhh}|captures')] + return [uuids.split('|') for uuids in self.redis.smembers(f'hhhashes|{hhh}|captures')] # type: ignore[misc] def index_http_headers_hashes_capture(self, crawled_tree: CrawledTree) -> None: if self.redis.sismember('indexed_hhhashes', crawled_tree.uuid): diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 5f8d7db3..2c06f009 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -1103,6 +1103,11 @@ class Lookyloo(): def get_ressource(self, tree_uuid: str, /, urlnode_uuid: str, h: str | None) -> tuple[str, BytesIO, str] | None: '''Get a specific resource from a URL node. 
If a hash s also given, we want an embeded resource''' + + # Break immediately if we have the hash of the empty file + if h == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e': + return ('empty', BytesIO(), 'inode/x-empty') + try: url = self.get_urlnode_from_tree(tree_uuid, urlnode_uuid) except IndexError: diff --git a/website/web/__init__.py b/website/web/__init__.py index 18247206..3fede2b0 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -342,55 +342,18 @@ def handle_pydandic_validation_exception(error: CaptureSettingsError) -> Respons # ##### Methods querying the indexes ##### -def _get_body_hash_investigator(body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]: +def _get_body_hash_investigator(body_hash: str, /) -> list[tuple[str, str, datetime, str, str]]: '''Returns all the captures related to a hash (sha512), used in the web interface.''' - total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(body_hash, limit=-1) + _captures = get_indexing(flask_login.current_user).get_captures_body_hash(body_hash) captures = [] - for capture_uuid, hostnode_uuid, hostname, _, url in details: + for capture_uuid, capture_ts in _captures: cache = lookyloo.capture_cache(capture_uuid) if not cache: continue - captures.append((cache.uuid, cache.title, cache.timestamp, hostnode_uuid, url)) - domains = get_indexing(flask_login.current_user).get_body_hash_domains(body_hash) - return captures, domains - - -def get_body_hash_full(body_hash: str, /) -> tuple[dict[str, list[dict[str, str]]], BytesIO]: - '''Returns a lot of information about the hash (sha512) and the hits in the instance. - Also contains the data (base64 encoded)''' - details = get_indexing(flask_login.current_user).get_body_hash_urls(body_hash) - - # Break immediately if we have the hash of the empty file - if body_hash == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e': - return details, BytesIO() - - # get the body from the first entry in the details list - for _, entries in details.items(): - if not entries: - continue - ct = lookyloo.get_crawled_tree(entries[0]['capture']) - try: - urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode']) - except Exception: - # Unable to find URLnode in the tree, it probably has been rebuild. - # TODO throw a log line or something - # self.logger.warning(f'Unable to find {entries[0]["urlnode"]} in entries[0]["capture"]') - # lookyloo._captures_index.remove_pickle() - continue - - # From that point, we just try to get the content. Break as soon as we found one. - if urlnode.body_hash == body_hash: - # the hash we're looking for is the whole file - return details, urlnode.body - else: - # The hash is an embedded resource - for _, blobs in urlnode.embedded_ressources.items(): - for h, b in blobs: - if h == body_hash: - return details, b - - # TODO: Couldn't find the file anywhere. Maybe return a warning in the file? 
- return details, BytesIO() + for urlnode_uuid in get_indexing(flask_login.current_user).get_capture_body_hash_nodes(capture_uuid, body_hash): + urlnode = lookyloo.get_urlnode_from_tree(capture_uuid, urlnode_uuid) + captures.append((cache.uuid, cache.title, cache.timestamp, urlnode.hostnode_uuid, urlnode.name)) + return captures def get_all_body_hashes(capture_uuid: str, /) -> dict[str, dict[str, URLNode | int]]: @@ -400,8 +363,7 @@ def get_all_body_hashes(capture_uuid: str, /) -> dict[str, dict[str, URLNode | i if node.empty_response or node.body_hash in to_return: # If we have the same hash more than once, skip continue - total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(node.body_hash, limit=-1) - # Note for future: mayeb get url, capture title, something better than just the hash to show to the user + total_captures = get_indexing(flask_login.current_user).get_captures_body_hash_count(node.body_hash) to_return[node.body_hash] = {'node': node, 'total_captures': total_captures} return to_return @@ -539,23 +501,28 @@ def get_hhh_investigator(hhh: str, /) -> tuple[list[tuple[str, str, str, str]], return [], [] -def hash_lookup(blob_hash: str, url: str, capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]: +def hash_lookup(blob_hash: str, url: str, current_capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]: '''Search all the captures a specific hash was seen. If a URL is given, it splits the results if the hash is seen on the same URL or an other one. Capture UUID avoids duplicates on the same capture''' captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []} - total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1, - prefered_uuids=set(lookyloo._captures_index.keys())) - for h_capture_uuid, url_uuid, url_hostname, same_url, url in details: - cache = lookyloo.capture_cache(h_capture_uuid) - if cache and hasattr(cache, 'title'): - if same_url: - captures_list['same_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname)) + _captures = get_indexing(flask_login.current_user).get_captures_body_hash(blob_hash) + for capture_uuid, capture_ts in _captures: + if capture_uuid == current_capture_uuid: + continue + cache = lookyloo.capture_cache(capture_uuid) + if not cache: + continue + for urlnode_uuid in get_indexing(flask_login.current_user).get_capture_body_hash_nodes(capture_uuid, blob_hash): + urlnode = lookyloo.get_urlnode_from_tree(capture_uuid, urlnode_uuid) + if url == urlnode.name: + captures_list['same_url'].append((capture_uuid, urlnode_uuid, cache.title, cache.timestamp.isoformat(), urlnode.hostname)) else: - captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname)) + captures_list['different_url'].append((capture_uuid, urlnode_uuid, cache.title, cache.timestamp.isoformat(), urlnode.hostname)) # Sort by timestamp by default captures_list['same_url'].sort(key=lambda y: y[3]) captures_list['different_url'].sort(key=lambda y: y[3]) + total_captures = get_indexing(flask_login.current_user).get_captures_body_hash_count(blob_hash) return total_captures, captures_list @@ -603,9 +570,8 @@ def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[Hos if not url.empty_response: # Index lookup # %%% Full body %%% - freq = 
get_indexing(flask_login.current_user).body_hash_fequency(url.body_hash) - to_append['body_hash_details'] = freq - if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1: + if freq := get_indexing(flask_login.current_user).get_captures_body_hash_count(url.body_hash): + to_append['body_hash_details'] = {'hash_freq': freq} to_append['body_hash_details']['other_captures'] = hash_lookup(url.body_hash, url.name, capture_uuid) # %%% Embedded ressources %%% @@ -616,11 +582,9 @@ def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[Hos if h in to_append['embedded_ressources']: # Skip duplicates continue - freq_embedded = get_indexing(flask_login.current_user).body_hash_fequency(h) - to_append['embedded_ressources'][h] = freq_embedded - to_append['embedded_ressources'][h]['body_size'] = blob.getbuffer().nbytes - to_append['embedded_ressources'][h]['type'] = mimetype - if freq_embedded['hash_freq'] > 1: + to_append['embedded_ressources'][h] = {'body_size': blob.getbuffer().nbytes, 'type': mimetype} + if freq := get_indexing(flask_login.current_user).get_captures_body_hash_count(h): + to_append['embedded_ressources'][h]['hash_freq'] = freq to_append['embedded_ressources'][h]['other_captures'] = hash_lookup(h, url.name, capture_uuid) for h in to_append['embedded_ressources'].keys(): known, legitimate = normalize_known_content(h, known_content, url) @@ -1487,18 +1451,19 @@ def favicons_lookup() -> str: @app.route('/ressources', methods=['GET']) def ressources() -> str: ressources = [] - for h, freq in get_indexing(flask_login.current_user).ressources: - domain_freq = get_indexing(flask_login.current_user).ressources_number_domains(h) + for h in get_indexing(flask_login.current_user).ressources: + freq = get_indexing(flask_login.current_user).get_captures_body_hash_count(h) context = lookyloo.context.find_known_content(h) - capture_uuid, url_uuid, hostnode_uuid = get_indexing(flask_login.current_user).get_hash_uuids(h) - try: - ressource = lookyloo.get_ressource(capture_uuid, url_uuid, h) - except MissingUUID: - pass - if ressource: - ressources.append((h, freq, domain_freq, context.get(h), capture_uuid, url_uuid, hostnode_uuid, ressource[0], ressource[2])) - else: - ressources.append((h, freq, domain_freq, context.get(h), capture_uuid, url_uuid, hostnode_uuid, 'unknown', 'unknown')) + # Only get the recent captures + for capture_uuid, capture_ts in get_indexing(flask_login.current_user).get_captures_body_hash(h): + url_nodes = get_indexing(flask_login.current_user).get_capture_body_hash_nodes(capture_uuid, h) + print(url_nodes) + url_node = url_nodes.pop() + print(capture_uuid, url_node, h) + ressource = lookyloo.get_ressource(capture_uuid, url_node, h) + if not ressource: + continue + ressources.append((h, freq, context.get(h), capture_uuid, url_node, ressource[0], ressource[2])) return render_template('ressources.html', ressources=ressources) @@ -1563,8 +1528,14 @@ def recapture(tree_uuid: str) -> str | Response | WerkzeugResponse: @app.route('/ressource_by_hash/', methods=['GET']) @file_response # type: ignore[misc] def ressource_by_hash(sha512: str) -> Response: - details, body = get_body_hash_full(sha512) - return send_file(body, as_attachment=True, download_name='ressource.bin') + if uuids := get_indexing(flask_login.current_user).get_hash_uuids(sha512): + # got UUIDs for this hash + capture_uuid, urlnode_uuid = uuids + if ressource := lookyloo.get_ressource(capture_uuid, urlnode_uuid, sha512): + filename, body, mimetype = ressource + return 
send_file(body, as_attachment=True, download_name=filename) + + return send_file(f'Unable to find {sha512}', as_attachment=True, download_name='Hash unknown.') # ################## Submit existing capture ################## @@ -1811,8 +1782,8 @@ def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str: @app.route('/body_hashes/', methods=['GET']) def body_hash_details(body_hash: str) -> str: from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False - captures, domains = _get_body_hash_investigator(body_hash.strip()) - return render_template('body_hash.html', body_hash=body_hash, domains=domains, captures=captures, from_popup=from_popup) + captures = _get_body_hash_investigator(body_hash.strip()) + return render_template('body_hash.html', body_hash=body_hash, captures=captures, from_popup=from_popup) @app.route('/urls/', methods=['GET']) @@ -1976,7 +1947,6 @@ def add_context(tree_uuid: str, node_uuid: str) -> WerkzeugResponse | None: context_data = request.form ressource_hash: str = context_data['hash_to_contextualize'] - hostnode_uuid: str = context_data['hostnode_uuid'] callback_str: str = context_data['callback_str'] legitimate: bool = True if context_data.get('legitimate') else False malicious: bool = True if context_data.get('malicious') else False @@ -1998,6 +1968,7 @@ def add_context(tree_uuid: str, node_uuid: str) -> WerkzeugResponse | None: lookyloo.add_context(tree_uuid, urlnode_uuid=node_uuid, ressource_hash=ressource_hash, legitimate=legitimate, malicious=malicious, details=details) if callback_str == 'hostnode_popup': + hostnode_uuid = lookyloo.get_urlnode_from_tree(tree_uuid, node_uuid).hostnode_uuid return redirect(url_for('hostnode_popup', tree_uuid=tree_uuid, node_uuid=hostnode_uuid)) elif callback_str == 'ressources': return redirect(url_for('ressources')) diff --git a/website/web/genericapi.py b/website/web/genericapi.py index 3d7eb0cd..ad8b4eda 100644 --- a/website/web/genericapi.py +++ b/website/web/genericapi.py @@ -324,14 +324,16 @@ class ModulesResponse(Resource): # type: ignore[misc] params={'h': 'The hash (sha512)'}) class HashInfo(Resource): # type: ignore[misc] def get(self, h: str) -> dict[str, Any] | tuple[dict[str, Any], int]: - from . import get_body_hash_full - - details, body = get_body_hash_full(h) - if not details: - return {'error': 'Unknown Hash.'}, 400 - to_return: dict[str, Any] = {'response': {'hash': h, 'details': details, - 'body': base64.b64encode(body.getvalue()).decode()}} - return to_return + if uuids := get_indexing(flask_login.current_user).get_hash_uuids(h): + # got UUIDs for this hash + capture_uuid, urlnode_uuid = uuids + if ressource := lookyloo.get_ressource(capture_uuid, urlnode_uuid, h): + filename, body, mimetype = ressource + details = get_indexing(flask_login.current_user).get_body_hash_urlnodes(h) + return {'response': {'hash': h, 'details': details, + 'body': base64.b64encode(body.getvalue()).decode()}} + return {'error': 'Unable to get ressource'}, 400 + return {'error': 'Unknown Hash.'}, 400 def get_url_occurrences(url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]: diff --git a/website/web/templates/body_hash.html b/website/web/templates/body_hash.html index 2777d59e..3e08dafc 100644 --- a/website/web/templates/body_hash.html +++ b/website/web/templates/body_hash.html @@ -38,16 +38,10 @@
{{ body_hash }}
+
Only the most recent captures are listed below; this will change soon.
Download
- - - - - - - - - - {% for domain, freq in domains %} - - - - - {% endfor %} - -
FrequencyHostname
{{ freq }}{{ domain }}
-

The same file was seen in these captures:

+

The same file was recently seen in these captures:

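The listing above is limited to recent captures because the handlers now read from a time-scored index, and keys in the old format are only migrated lazily when a hash is queried (see _reindex_ressources and get_captures_body_hash_count in the indexing.py hunks above). A minimal sketch of that lazy migration, assuming a redis-py client created with decode_responses=True; the helper name is illustrative, the key names follow the patch:

    from redis import Redis

    def ensure_new_body_hash_index(r: Redis, h: str) -> None:
        # Old layout: 'bh|{h}|captures' was a plain set, plus one zset per capture.
        # If those keys still exist, drop them and force the affected captures to be
        # re-indexed on their next visit by removing them from 'indexed_body_hashes'.
        if r.type(f'bh|{h}|captures') == 'set':
            old_uuids = r.smembers(f'bh|{h}|captures')
            if old_uuids:
                r.srem('indexed_body_hashes', *old_uuids)
                r.delete(*[f'bh|{h}|captures|{uuid}' for uuid in old_uuids])
            r.delete(f'bh|{h}|captures')
        if r.type(f'bh|{h}') == 'zset':
            r.delete(f'bh|{h}')
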
diff --git a/website/web/templates/hostname_popup.html b/website/web/templates/hostname_popup.html index 2b5a27b6..fcd9b7b3 100644 --- a/website/web/templates/hostname_popup.html +++ b/website/web/templates/hostname_popup.html @@ -256,7 +256,7 @@ {% if url['body_hash_details'] and url['body_hash_details']['hash_freq'] %}
This file can be found {{ url['body_hash_details']['hash_freq'] }} times - across all the captures on this lookyloo instance, in {{ url['body_hash_details']['hash_domains_freq'] }} unique domains. + across all the captures on this lookyloo instance. {# other captures related with the same content #} {% if 'other_captures' in url['body_hash_details'] %} @@ -281,7 +281,8 @@ {% endif %} {% if enable_context_by_users %}
- {{ context_form(tree_uuid, url['url_object'].uuid, hostnode_uuid, url['url_object'].body_hash, 'hostnode_popup') }} + {{ context_form(tree_uuid, url['url_object'].uuid, + url['url_object'].body_hash, 'hostnode_popup') }} {% endif %} {% if url['embedded_ressources'] %} @@ -306,13 +307,13 @@ {% endif %}
This file {% if details['type'] %}({{ details['type'] }}){% endif %} can be found {{ details['hash_freq'] }} times - across all the captures on this lookyloo instance, in {{ details['hash_domains_freq'] }} unique domains. + across all the captures on this lookyloo instance. {{ get_ressource_button(tree_uuid, url['url_object'].uuid, hash, 'Download the embedded ressource', details['type'] and details['type'].startswith('image')) }}
{% if enable_context_by_users %} - {{ context_form(tree_uuid, url['url_object'].uuid, hostnode_uuid, hash, 'hostnode_popup') }} + {{ context_form(tree_uuid, url['url_object'].uuid, hash, 'hostnode_popup') }} {% endif %} {% if 'other_captures' in details %} diff --git a/website/web/templates/macros.html b/website/web/templates/macros.html index 5e7026fc..07a53c3e 100644 --- a/website/web/templates/macros.html +++ b/website/web/templates/macros.html @@ -94,7 +94,7 @@
{% endmacro %} -{% macro context_form(tree_uuid, urlnode_uuid, hostnode_uuid, hash, callback_str) %} +{% macro context_form(tree_uuid, urlnode_uuid, hash, callback_str) %}
- @@ -193,15 +192,15 @@ {% set total_captures = details[0] %} {% set other_captures = details[1] %} {# Only show details if the hits are in an other capture #} -{% if total_captures > 0 %} +{% if total_captures > 1 %}

- The same file was seen in {{ total_captures }} other captures. + The same file was seen in {{ total_captures - 1 }} other captures.

{# Lists of other captures loading the same content... #} diff --git a/website/web/templates/ressources.html b/website/web/templates/ressources.html index d00fe0d5..6ba0a7b3 100644 --- a/website/web/templates/ressources.html +++ b/website/web/templates/ressources.html @@ -32,23 +32,21 @@ - - {% for h, freq, number_domains, context, capture_uuid, urlnode_uuid, hostnode_uuid, filename, mimetype in ressources %} + {% for h, freq, context, capture_uuid, urlnode_uuid, filename, mimetype in ressources %} -
SHA 512 FrequencyNumber unique domains Context Mimetype Filename
{{ shorten_string(h, 10) }}
{{ get_ressource_button(capture_uuid, urlnode_uuid, h, 'Download sample', mimetype and mimetype.startswith('image')) }}
{{ freq }}{{ number_domains }} {{ context['type'] }} - {{ context['details'] }}
- {{ context_form(capture_uuid, urlnode_uuid, hostnode_uuid, h, 'ressources') }} + {{ context_form(capture_uuid, urlnode_uuid, h, 'ressources') }}
{{ mimetype }} {{ shorten_string(filename, 10) }}
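
Taken together, the indexing changes replace the old frequency-oriented keys with three kinds of keys per body hash: a global 'body_hashes' set of known hashes, a per-hash sorted set 'body_hashes|{h}|captures' scored by the capture start time, and per-capture sets 'capture_indexes|{uuid}|body_hashes|{h}' pointing at the URL nodes carrying that hash. Scoring by capture time is what makes pagination possible: callers ask for a time window instead of walking every capture that ever served a popular resource. A minimal sketch of the write and read paths, assuming a redis-py client with decode_responses=True; the function names are illustrative, only the key names and defaults mirror the patch:

    from datetime import datetime, timedelta
    from redis import Redis

    def index_body_hash(r: Redis, capture_uuid: str, urlnode_uuid: str,
                        h: str, capture_start: datetime) -> None:
        # Global set of every body hash seen on the instance.
        r.sadd('body_hashes', h)
        # Captures containing this hash, scored by capture start time.
        r.zadd(f'body_hashes|{h}|captures',
               mapping={capture_uuid: capture_start.timestamp()})
        # Reverse index so a capture's entries can be found (and cleaned up) later.
        r.sadd(f'capture_indexes|{capture_uuid}', 'body_hashes')
        r.sadd(f'capture_indexes|{capture_uuid}|body_hashes', h)
        r.sadd(f'capture_indexes|{capture_uuid}|body_hashes|{h}', urlnode_uuid)

    def captures_for_hash(r: Redis, h: str, most_recent: datetime | None = None,
                          oldest: datetime | None = None) -> list[tuple[str, float]]:
        # Newest first, bounded by timestamps; like the patch, default to the last 15 days.
        max_score: str | float = most_recent.timestamp() if most_recent else '+Inf'
        min_score: str | float = oldest.timestamp() if oldest else (datetime.now() - timedelta(days=15)).timestamp()
        return r.zrevrangebyscore(f'body_hashes|{h}|captures', max_score, min_score, withscores=True)

Because the sorted set is keyed by capture start time, the web views and API endpoints above can ask only for recent hits (or a bounded window) rather than materialising every capture that ever contained a given resource, which is what the old per-capture zsets forced them to do.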