chg: Migrate ressources/body hashes to new index that allows pagination on capture time

pull/952/head
Raphaël Vinot 2024-10-07 13:15:15 +02:00
parent e54c580cce
commit c68080431d
9 changed files with 159 additions and 215 deletions
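The gist of the change: the per-hash hit-count zsets are replaced by a layout keyed on capture start time, so lookups can be windowed and paginated. A rough map of the Redis keys, as far as it can be read from the diff below ({h} is a resource sha512, {uuid} a capture UUID; a sketch, not authoritative):

    # Old layout (dropped lazily by _reindex_ressources when a read hits it):
    #   body_hashes                 ZSET  hash -> hit count
    #   bh|{h}                      ZSET  hostname -> hit count
    #   bh|{h}|captures             SET   capture UUIDs
    #   bh|{h}|captures|{uuid}      ZSET  'urlnode|hostnode|url' -> hit count
    #
    # New layout (pagination-friendly):
    #   body_hashes                             SET   all indexed hashes
    #   body_hashes|{h}|captures                ZSET  capture UUID -> capture start timestamp
    #   capture_indexes|{uuid}|body_hashes      SET   hashes seen in the capture (cleanup index)
    #   capture_indexes|{uuid}|body_hashes|{h}  SET   URL node UUIDs carrying the hash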

View File

@@ -3,7 +3,7 @@
 exclude: "user_agents|website/web/sri.txt"
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.6.0
+    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer

View File

@@ -7,9 +7,7 @@
 import hashlib
 import logging
 from io import BytesIO
-from collections import defaultdict
 from datetime import datetime, timedelta
-from urllib.parse import urlsplit
 from zipfile import ZipFile

 import mmh3
@@ -49,12 +47,12 @@ class Indexing():
         self.redis.flushdb()

     @property
-    def redis_bytes(self) -> Redis:  # type: ignore[type-arg]
+    def redis_bytes(self) -> Redis[bytes]:
         return Redis(connection_pool=self.__redis_pool_bytes)

     @property
-    def redis(self) -> Redis:  # type: ignore[type-arg]
-        return Redis(connection_pool=self.__redis_pool)
+    def redis(self) -> Redis[str]:
+        return Redis(connection_pool=self.__redis_pool)  # type: ignore[return-value]

     def can_index(self, capture_uuid: str | None=None) -> bool:
         if capture_uuid:
@@ -83,6 +81,7 @@ class Indexing():
         for hash_type in self.captures_hashes_types():
             p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
         for internal_index in self.redis.smembers(f'capture_indexes|{capture_uuid}'):
+            # NOTE: these ones need to be removed because the node UUIDs are recreated on tree rebuild
             # internal_index can be "tlds"
             for entry in self.redis.smembers(f'capture_indexes|{capture_uuid}|{internal_index}'):
                 # entry can be a "com", we delete a set of UUIDs, remove from the captures set
@@ -185,7 +184,7 @@ class Indexing():
         return self.redis.zrevrange(f'cn|{cookie_name}', 0, -1, withscores=True)

     def get_cookies_names_captures(self, cookie_name: str) -> list[tuple[str, str]]:
-        return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
+        return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]  # type: ignore[misc]

     def index_cookies_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_cookies', crawled_tree.uuid):
@@ -228,24 +227,25 @@ class Indexing():

     # ###### Body hashes ######

-    @property
-    def ressources(self) -> list[tuple[str, float]]:
-        return self.redis.zrevrange('body_hashes', 0, 200, withscores=True)
-
-    def ressources_number_domains(self, h: str) -> int:
-        return self.redis.zcard(f'bh|{h}')
-
-    def body_hash_fequency(self, body_hash: str) -> dict[str, int]:
+    def _reindex_ressources(self, h: str) -> None:
+        # We changed the format of the indexes, so we need to make sure they're re-triggered.
         pipeline = self.redis.pipeline()
-        pipeline.zscore('body_hashes', body_hash)
-        pipeline.zcard(f'bh|{body_hash}')
-        hash_freq, hash_domains_freq = pipeline.execute()
-        to_return = {'hash_freq': 0, 'hash_domains_freq': 0}
-        if hash_freq:
-            to_return['hash_freq'] = int(hash_freq)
-        if hash_domains_freq:
-            to_return['hash_domains_freq'] = int(hash_domains_freq)
-        return to_return
+        if self.redis.type(f'bh|{h}|captures') == 'set':  # type: ignore[no-untyped-call]
+            uuids_to_reindex = self.redis.smembers(f'bh|{h}|captures')
+            pipeline.srem('indexed_body_hashes', *uuids_to_reindex)
+            # deprecated index
+            pipeline.delete(*[f'bh|{h}|captures|{uuid}' for uuid in uuids_to_reindex])
+            pipeline.delete(f'bh|{h}|captures')
+        if self.redis.type(f'bh|{h}') == 'zset':  # type: ignore[no-untyped-call]
+            pipeline.delete(f'bh|{h}')
+        if self.redis.type('body_hashes') == 'zset':  # type: ignore[no-untyped-call]
+            pipeline.delete('body_hashes')
+        pipeline.execute()
+
+    @property
+    def ressources(self) -> set[str]:
+        return self.redis.smembers('body_hashes')

     def index_body_hashes_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
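The migration is lazy: nothing is rewritten up front. Read and index paths probe the key type, discard legacy keys when they meet them, and force the affected captures to be re-indexed. A standalone sketch of that pattern, assuming a redis-py client with decode_responses=True and the key names from the diff:

    from redis import Redis

    r = Redis(decode_responses=True)

    def drop_legacy_hash_index(h: str) -> None:
        # Old schema: 'bh|{h}|captures' is a plain set of capture UUIDs.
        # New schema: 'body_hashes|{h}|captures' is a zset. Probing the key
        # type tells us which generation of the index we are looking at.
        if r.type(f'bh|{h}|captures') == 'set':
            if uuids := r.smembers(f'bh|{h}|captures'):
                pipe = r.pipeline()
                # Forget these captures were indexed so the indexer redoes them.
                pipe.srem('indexed_body_hashes', *uuids)
                # Drop the per-capture zsets and the capture set of the old layout.
                pipe.delete(*(f'bh|{h}|captures|{uuid}' for uuid in uuids))
                pipe.delete(f'bh|{h}|captures')
                pipe.execute()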
@@ -253,84 +253,74 @@ class Indexing():
             return
         self.redis.sadd('indexed_body_hashes', crawled_tree.uuid)
         self.logger.debug(f'Indexing body hashes for {crawled_tree.uuid} ... ')
-        cleaned_up_hashes: set[str] = set()
         pipeline = self.redis.pipeline()
-        is_reindex = False
+
+        # Add the body hashes key in internal indexes set
+        internal_index = f'capture_indexes|{crawled_tree.uuid}'
+        pipeline.sadd(internal_index, 'body_hashes')
+
+        already_indexed_global: set[str] = set()
         for urlnode in crawled_tree.root_hartree.url_tree.traverse():
             for h in urlnode.resources_hashes:
-                if h not in cleaned_up_hashes:
-                    # Delete the hash for that capture the first time we see it.
-                    if self.redis.exists(f'bh|{h}|captures|{crawled_tree.uuid}'):
-                        pipeline.delete(f'bh|{h}|captures|{crawled_tree.uuid}')
-                        cleaned_up_hashes.add(h)
-                        is_reindex = True
-                        self.logger.debug(f'reindexing body hashes for {crawled_tree.uuid} ... ')
-                # ZSet of all urlnode_UUIDs|full_url
-                pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
-                                 f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
-                if not is_reindex:
-                    pipeline.zincrby('body_hashes', 1, h)
-                    pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname)
-                    # set of all captures with this hash
-                    pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid)
+                self._reindex_ressources(h)
+                if h not in already_indexed_global:
+                    # The hash hasn't been indexed in that run yet
+                    already_indexed_global.add(h)
+                    pipeline.sadd(f'{internal_index}|body_hashes', h)  # Only used to delete index
+                    pipeline.sadd('body_hashes', h)
+                    pipeline.zadd(f'body_hashes|{h}|captures',
+                                  mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
+
+                # Add urlnode UUID in internal index
+                pipeline.sadd(f'{internal_index}|body_hashes|{h}', urlnode.uuid)
         pipeline.execute()
         self.logger.debug(f'done with body hashes for {crawled_tree.uuid}.')

-    def get_hash_uuids(self, body_hash: str) -> tuple[str, str, str]:
-        """Use that to get a reference allowing to fetch a resource from one of the capture."""
-        capture_uuid = str(self.redis.srandmember(f'bh|{body_hash}|captures'))
-        entry = self.redis.zrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, 1)[0]
-        urlnode_uuid, hostnode_uuid, url = entry.split('|', 2)
-        return capture_uuid, urlnode_uuid, hostnode_uuid
+    def get_captures_body_hash_count(self, h: str) -> int:
+        # NOTE: the old name was bh instead of body_hashes
+        if self.redis.type(f'bh|{h}|captures') == 'set':  # type: ignore[no-untyped-call]
+            # triggers the re-index soon.
+            self.redis.srem('indexed_body_hashes', *self.redis.smembers(f'bh|{h}|captures'))
+            return 0
+        return self.redis.zcard(f'body_hashes|{h}|captures')

-    def get_body_hash_captures(self, body_hash: str, filter_url: str | None=None,
-                               filter_capture_uuid: str | None=None,
-                               limit: int=20,
-                               prefered_uuids: set[str]=set()) -> tuple[int, list[tuple[str, str, str, bool, str]]]:
+    def get_hash_uuids(self, body_hash: str) -> tuple[str, str] | None:
+        """Use that to get a reference allowing to fetch a resource from one of the capture."""
+        if capture_uuids := self.redis.zrevrange(f'body_hashes|{body_hash}|captures', 0, 0, withscores=False):
+            capture_uuid = capture_uuids[0]
+            internal_index = f'capture_indexes|{capture_uuid}'
+            if urlnode_uuid := self.redis.srandmember(f'{internal_index}|body_hashes|{body_hash}'):
+                return str(capture_uuid), str(urlnode_uuid)
+        return None
+
+    def get_captures_body_hash(self, body_hash: str, most_recent_capture: datetime | None = None,
+                               oldest_capture: datetime | None = None) -> list[tuple[str, float]]:
         '''Get the captures matching the hash.

-        :param filter_url: URL of the hash we're searching for
-        :param filter_capture_uuid: UUID of the capture the hash was found in
-        :param limit: Max matching captures to return, -1 means unlimited.
-        :param prefered_uuids: UUID cached right now, so we don't rebuild trees.
+        :param body_hash: The hash to search for
+        :param most_recent_capture: Newest capture to consider (upper bound on the start time)
+        :param oldest_capture: Oldest capture to consider (defaults to 15 days before now)
         '''
-        to_return: list[tuple[str, str, str, bool, str]] = []
-        len_captures = self.redis.scard(f'bh|{body_hash}|captures')
-        unlimited = False
-        if limit == -1:
-            unlimited = True
-        for capture_uuid in self.redis.sscan_iter(f'bh|{body_hash}|captures'):
-            if capture_uuid == filter_capture_uuid:
-                # Used to skip hits in current capture
-                len_captures -= 1
-                continue
-            if prefered_uuids and capture_uuid not in prefered_uuids:
-                continue
-            if not unlimited:
-                limit -= 1
-            for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
-                url_uuid, hostnode_uuid, url = entry.split('|', 2)
-                hostname: str = urlsplit(url).hostname
-                if filter_url:
-                    to_return.append((capture_uuid, hostnode_uuid, hostname, url == filter_url, url))
-                else:
-                    to_return.append((capture_uuid, hostnode_uuid, hostname, False, url))
-            if not unlimited and limit <= 0:
-                break
-        return len_captures, to_return
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=15)).timestamp()
+        if self.redis.type(f'bh|{body_hash}|captures') == 'set':  # type: ignore[no-untyped-call]
+            # triggers the re-index soon.
+            self.redis.srem('indexed_body_hashes', *self.redis.smembers(f'bh|{body_hash}|captures'))
+            self.redis.delete(f'bh|{body_hash}|captures')
+            return []
+        return self.redis.zrevrangebyscore(f'body_hashes|{body_hash}|captures', max_score, min_score, withscores=True)

-    def get_body_hash_domains(self, body_hash: str) -> list[tuple[str, float]]:
-        return self.redis.zrevrange(f'bh|{body_hash}', 0, -1, withscores=True)
+    def get_capture_body_hash_nodes(self, capture_uuid: str, body_hash: str) -> set[str]:
+        if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|body_hashes|{body_hash}'):
+            return set(url_nodes)
+        return set()

-    def get_body_hash_urls(self, body_hash: str) -> dict[str, list[dict[str, str]]]:
-        all_captures: set[str] = self.redis.smembers(f'bh|{body_hash}|captures')
-        urls = defaultdict(list)
-        for capture_uuid in list(all_captures):
-            for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
-                url_uuid, hostnode_uuid, url = entry.split('|', 2)
-                urls[url].append({'capture': capture_uuid, 'hostnode': hostnode_uuid, 'urlnode': url_uuid})
-        return urls
+    def get_body_hash_urlnodes(self, body_hash: str) -> dict[str, set[str]]:
+        return {capture_uuid: self.redis.smembers(f'capture_indexes|{capture_uuid}|body_hashes|{body_hash}')
+                for capture_uuid, capture_ts in self.get_captures_body_hash(body_hash)}

     # ###### HTTP Headers Hashes ######
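The payoff of scoring capture UUIDs by start time is that "which captures contain this resource" becomes a score-range query, so results can be windowed and paginated instead of scanning a whole set. A minimal sketch of that read path (hypothetical helper name; the key names and the 15-day default follow the diff):

    from datetime import datetime, timedelta
    from redis import Redis

    r = Redis(decode_responses=True)

    def captures_for_hash(h: str, newest: datetime | None = None,
                          oldest: datetime | None = None) -> list[tuple[str, float]]:
        # Each member is a capture UUID scored by the capture start time,
        # so a time window is just a score range.
        max_score = newest.timestamp() if newest else '+Inf'
        # Mirror the default window from the diff: 15 days back.
        min_score = oldest.timestamp() if oldest else (datetime.now() - timedelta(days=15)).timestamp()
        return r.zrevrangebyscore(f'body_hashes|{h}|captures', max_score, min_score, withscores=True)

    # Next page: pass the timestamp of the last row as the new upper bound,
    # e.g. captures_for_hash(h, newest=datetime.fromtimestamp(last_ts - 1)).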
@@ -342,7 +332,7 @@ class Indexing():
         return self.redis.scard(f'hhhashes|{hhh}|captures')

     def get_http_headers_hashes_captures(self, hhh: str) -> list[tuple[str, str]]:
-        return [uuids.split('|') for uuids in self.redis.smembers(f'hhhashes|{hhh}|captures')]
+        return [uuids.split('|') for uuids in self.redis.smembers(f'hhhashes|{hhh}|captures')]  # type: ignore[misc]

     def index_http_headers_hashes_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_hhhashes', crawled_tree.uuid):

View File

@@ -1103,6 +1103,11 @@ class Lookyloo():
     def get_ressource(self, tree_uuid: str, /, urlnode_uuid: str, h: str | None) -> tuple[str, BytesIO, str] | None:
         '''Get a specific resource from a URL node. If a hash is also given, we want an embedded resource'''
+        # Break immediately if we have the hash of the empty file
+        if h == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e':
+            return ('empty', BytesIO(), 'inode/x-empty')
         try:
             url = self.get_urlnode_from_tree(tree_uuid, urlnode_uuid)
         except IndexError:
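The constant in the added early return is the sha512 of zero bytes, so empty responses are served without loading a tree; this is easy to check:

    import hashlib

    # sha512 of the empty input; matches the constant used above.
    assert hashlib.sha512(b'').hexdigest() == (
        'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce'
        '47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e')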

View File

@@ -342,55 +342,18 @@ def handle_pydandic_validation_exception(error: CaptureSettingsError) -> Respons

 # ##### Methods querying the indexes #####

-def _get_body_hash_investigator(body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]:
+def _get_body_hash_investigator(body_hash: str, /) -> list[tuple[str, str, datetime, str, str]]:
     '''Returns all the captures related to a hash (sha512), used in the web interface.'''
-    total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(body_hash, limit=-1)
+    _captures = get_indexing(flask_login.current_user).get_captures_body_hash(body_hash)
     captures = []
-    for capture_uuid, hostnode_uuid, hostname, _, url in details:
+    for capture_uuid, capture_ts in _captures:
         cache = lookyloo.capture_cache(capture_uuid)
         if not cache:
             continue
-        captures.append((cache.uuid, cache.title, cache.timestamp, hostnode_uuid, url))
-    domains = get_indexing(flask_login.current_user).get_body_hash_domains(body_hash)
-    return captures, domains
-
-
-def get_body_hash_full(body_hash: str, /) -> tuple[dict[str, list[dict[str, str]]], BytesIO]:
-    '''Returns a lot of information about the hash (sha512) and the hits in the instance.
-    Also contains the data (base64 encoded)'''
-    details = get_indexing(flask_login.current_user).get_body_hash_urls(body_hash)
-
-    # Break immediately if we have the hash of the empty file
-    if body_hash == 'cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e':
-        return details, BytesIO()
-
-    # get the body from the first entry in the details list
-    for _, entries in details.items():
-        if not entries:
-            continue
-        ct = lookyloo.get_crawled_tree(entries[0]['capture'])
-        try:
-            urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode'])
-        except Exception:
-            # Unable to find URLnode in the tree, it probably has been rebuild.
-            # TODO throw a log line or something
-            # self.logger.warning(f'Unable to find {entries[0]["urlnode"]} in entries[0]["capture"]')
-            # lookyloo._captures_index.remove_pickle(<capture UUID>)
-            continue
-
-        # From that point, we just try to get the content. Break as soon as we found one.
-        if urlnode.body_hash == body_hash:
-            # the hash we're looking for is the whole file
-            return details, urlnode.body
-        else:
-            # The hash is an embedded resource
-            for _, blobs in urlnode.embedded_ressources.items():
-                for h, b in blobs:
-                    if h == body_hash:
-                        return details, b
-
-    # TODO: Couldn't find the file anywhere. Maybe return a warning in the file?
-    return details, BytesIO()
+        for urlnode_uuid in get_indexing(flask_login.current_user).get_capture_body_hash_nodes(capture_uuid, body_hash):
+            urlnode = lookyloo.get_urlnode_from_tree(capture_uuid, urlnode_uuid)
+            captures.append((cache.uuid, cache.title, cache.timestamp, urlnode.hostnode_uuid, urlnode.name))
+    return captures

 def get_all_body_hashes(capture_uuid: str, /) -> dict[str, dict[str, URLNode | int]]:
@@ -400,8 +363,7 @@ def get_all_body_hashes(capture_uuid: str, /) -> dict[str, dict[str, URLNode | i
         if node.empty_response or node.body_hash in to_return:
             # If we have the same hash more than once, skip
             continue
-        total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(node.body_hash, limit=-1)
-        # Note for future: mayeb get url, capture title, something better than just the hash to show to the user
+        total_captures = get_indexing(flask_login.current_user).get_captures_body_hash_count(node.body_hash)
         to_return[node.body_hash] = {'node': node, 'total_captures': total_captures}
     return to_return
@@ -539,23 +501,28 @@ def get_hhh_investigator(hhh: str, /) -> tuple[list[tuple[str, str, str, str]],
         return [], []

-def hash_lookup(blob_hash: str, url: str, capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]:
+def hash_lookup(blob_hash: str, url: str, current_capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]:
     '''Search all the captures a specific hash was seen.
     If a URL is given, it splits the results if the hash is seen on the same URL or an other one.
     Capture UUID avoids duplicates on the same capture'''
     captures_list: dict[str, list[tuple[str, str, str, str, str]]] = {'same_url': [], 'different_url': []}
-    total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(blob_hash, url, filter_capture_uuid=capture_uuid, limit=-1,
-                                                                                            prefered_uuids=set(lookyloo._captures_index.keys()))
-    for h_capture_uuid, url_uuid, url_hostname, same_url, url in details:
-        cache = lookyloo.capture_cache(h_capture_uuid)
-        if cache and hasattr(cache, 'title'):
-            if same_url:
-                captures_list['same_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
+    _captures = get_indexing(flask_login.current_user).get_captures_body_hash(blob_hash)
+    for capture_uuid, capture_ts in _captures:
+        if capture_uuid == current_capture_uuid:
+            continue
+        cache = lookyloo.capture_cache(capture_uuid)
+        if not cache:
+            continue
+        for urlnode_uuid in get_indexing(flask_login.current_user).get_capture_body_hash_nodes(capture_uuid, blob_hash):
+            urlnode = lookyloo.get_urlnode_from_tree(capture_uuid, urlnode_uuid)
+            if url == urlnode.name:
+                captures_list['same_url'].append((capture_uuid, urlnode_uuid, cache.title, cache.timestamp.isoformat(), urlnode.hostname))
             else:
-                captures_list['different_url'].append((h_capture_uuid, url_uuid, cache.title, cache.timestamp.isoformat(), url_hostname))
+                captures_list['different_url'].append((capture_uuid, urlnode_uuid, cache.title, cache.timestamp.isoformat(), urlnode.hostname))
     # Sort by timestamp by default
     captures_list['same_url'].sort(key=lambda y: y[3])
     captures_list['different_url'].sort(key=lambda y: y[3])
+    total_captures = get_indexing(flask_login.current_user).get_captures_body_hash_count(blob_hash)
     return total_captures, captures_list
@@ -603,9 +570,8 @@ def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[Hos
         if not url.empty_response:
             # Index lookup
             # %%% Full body %%%
-            freq = get_indexing(flask_login.current_user).body_hash_fequency(url.body_hash)
-            to_append['body_hash_details'] = freq
-            if freq and 'hash_freq' in freq and freq['hash_freq'] and freq['hash_freq'] > 1:
+            if freq := get_indexing(flask_login.current_user).get_captures_body_hash_count(url.body_hash):
+                to_append['body_hash_details'] = {'hash_freq': freq}
                 to_append['body_hash_details']['other_captures'] = hash_lookup(url.body_hash, url.name, capture_uuid)

             # %%% Embedded ressources %%%
@@ -616,11 +582,9 @@ def get_hostnode_investigator(capture_uuid: str, /, node_uuid: str) -> tuple[Hos
                     if h in to_append['embedded_ressources']:
                         # Skip duplicates
                         continue
-                    freq_embedded = get_indexing(flask_login.current_user).body_hash_fequency(h)
-                    to_append['embedded_ressources'][h] = freq_embedded
-                    to_append['embedded_ressources'][h]['body_size'] = blob.getbuffer().nbytes
-                    to_append['embedded_ressources'][h]['type'] = mimetype
-                    if freq_embedded['hash_freq'] > 1:
+                    to_append['embedded_ressources'][h] = {'body_size': blob.getbuffer().nbytes, 'type': mimetype}
+                    if freq := get_indexing(flask_login.current_user).get_captures_body_hash_count(h):
+                        to_append['embedded_ressources'][h]['hash_freq'] = freq
                         to_append['embedded_ressources'][h]['other_captures'] = hash_lookup(h, url.name, capture_uuid)
                 for h in to_append['embedded_ressources'].keys():
                     known, legitimate = normalize_known_content(h, known_content, url)
@@ -1487,18 +1451,19 @@ def favicons_lookup() -> str:

 @app.route('/ressources', methods=['GET'])
 def ressources() -> str:
     ressources = []
-    for h, freq in get_indexing(flask_login.current_user).ressources:
-        domain_freq = get_indexing(flask_login.current_user).ressources_number_domains(h)
+    for h in get_indexing(flask_login.current_user).ressources:
+        freq = get_indexing(flask_login.current_user).get_captures_body_hash_count(h)
         context = lookyloo.context.find_known_content(h)
-        capture_uuid, url_uuid, hostnode_uuid = get_indexing(flask_login.current_user).get_hash_uuids(h)
-        try:
-            ressource = lookyloo.get_ressource(capture_uuid, url_uuid, h)
-        except MissingUUID:
-            pass
-        if ressource:
-            ressources.append((h, freq, domain_freq, context.get(h), capture_uuid, url_uuid, hostnode_uuid, ressource[0], ressource[2]))
-        else:
-            ressources.append((h, freq, domain_freq, context.get(h), capture_uuid, url_uuid, hostnode_uuid, 'unknown', 'unknown'))
+        # Only get the recent captures
+        for capture_uuid, capture_ts in get_indexing(flask_login.current_user).get_captures_body_hash(h):
+            url_nodes = get_indexing(flask_login.current_user).get_capture_body_hash_nodes(capture_uuid, h)
+            url_node = url_nodes.pop()
+            ressource = lookyloo.get_ressource(capture_uuid, url_node, h)
+            if not ressource:
+                continue
+            ressources.append((h, freq, context.get(h), capture_uuid, url_node, ressource[0], ressource[2]))
     return render_template('ressources.html', ressources=ressources)
@@ -1563,8 +1528,14 @@ def recapture(tree_uuid: str) -> str | Response | WerkzeugResponse:

 @app.route('/ressource_by_hash/<string:sha512>', methods=['GET'])
 @file_response  # type: ignore[misc]
 def ressource_by_hash(sha512: str) -> Response:
-    details, body = get_body_hash_full(sha512)
-    return send_file(body, as_attachment=True, download_name='ressource.bin')
+    if uuids := get_indexing(flask_login.current_user).get_hash_uuids(sha512):
+        # got UUIDs for this hash
+        capture_uuid, urlnode_uuid = uuids
+        if ressource := lookyloo.get_ressource(capture_uuid, urlnode_uuid, sha512):
+            filename, body, mimetype = ressource
+            return send_file(body, as_attachment=True, download_name=filename)
+    return send_file(BytesIO(f'Unable to find {sha512}'.encode()), as_attachment=True, download_name='Hash unknown.')

 # ################## Submit existing capture ##################
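For reference, the reworked endpoint streams back the first matching resource; a client-side sketch (the instance URL and output filename are placeholders):

    import requests

    # Hypothetical instance URL; the route itself comes from the diff above.
    instance = 'https://lookyloo.example'
    resource_hash = '<sha512 of the resource>'

    resp = requests.get(f'{instance}/ressource_by_hash/{resource_hash}')
    resp.raise_for_status()
    with open('ressource.bin', 'wb') as f:
        f.write(resp.content)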
@@ -1811,8 +1782,8 @@ def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:

 @app.route('/body_hashes/<string:body_hash>', methods=['GET'])
 def body_hash_details(body_hash: str) -> str:
     from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False
-    captures, domains = _get_body_hash_investigator(body_hash.strip())
-    return render_template('body_hash.html', body_hash=body_hash, domains=domains, captures=captures, from_popup=from_popup)
+    captures = _get_body_hash_investigator(body_hash.strip())
+    return render_template('body_hash.html', body_hash=body_hash, captures=captures, from_popup=from_popup)

 @app.route('/urls/<string:url>', methods=['GET'])
@@ -1976,7 +1947,6 @@ def add_context(tree_uuid: str, node_uuid: str) -> WerkzeugResponse | None:
     context_data = request.form
     ressource_hash: str = context_data['hash_to_contextualize']
-    hostnode_uuid: str = context_data['hostnode_uuid']
     callback_str: str = context_data['callback_str']
     legitimate: bool = True if context_data.get('legitimate') else False
     malicious: bool = True if context_data.get('malicious') else False
@@ -1998,6 +1968,7 @@ def add_context(tree_uuid: str, node_uuid: str) -> WerkzeugResponse | None:
     lookyloo.add_context(tree_uuid, urlnode_uuid=node_uuid, ressource_hash=ressource_hash,
                          legitimate=legitimate, malicious=malicious, details=details)
     if callback_str == 'hostnode_popup':
+        hostnode_uuid = lookyloo.get_urlnode_from_tree(tree_uuid, node_uuid).hostnode_uuid
         return redirect(url_for('hostnode_popup', tree_uuid=tree_uuid, node_uuid=hostnode_uuid))
     elif callback_str == 'ressources':
         return redirect(url_for('ressources'))

View File

@@ -324,14 +324,16 @@ class ModulesResponse(Resource):  # type: ignore[misc]
          params={'h': 'The hash (sha512)'})
 class HashInfo(Resource):  # type: ignore[misc]
     def get(self, h: str) -> dict[str, Any] | tuple[dict[str, Any], int]:
-        from . import get_body_hash_full
-
-        details, body = get_body_hash_full(h)
-        if not details:
-            return {'error': 'Unknown Hash.'}, 400
-        to_return: dict[str, Any] = {'response': {'hash': h, 'details': details,
-                                                  'body': base64.b64encode(body.getvalue()).decode()}}
-        return to_return
+        if uuids := get_indexing(flask_login.current_user).get_hash_uuids(h):
+            # got UUIDs for this hash
+            capture_uuid, urlnode_uuid = uuids
+            if ressource := lookyloo.get_ressource(capture_uuid, urlnode_uuid, h):
+                filename, body, mimetype = ressource
+                details = get_indexing(flask_login.current_user).get_body_hash_urlnodes(h)
+                return {'response': {'hash': h, 'details': details,
+                                     'body': base64.b64encode(body.getvalue()).decode()}}
+            return {'error': 'Unable to get ressource'}, 400
+        return {'error': 'Unknown Hash.'}, 400

 def get_url_occurrences(url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]:
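A client consuming the new HashInfo payload only needs to base64-decode the body; a sketch against the response shape visible in this hunk (the 'details' mapping comes from get_body_hash_urlnodes):

    import base64

    def decode_hash_info(payload: dict) -> bytes:
        # Expected shape (from the hunk above):
        # {'response': {'hash': <sha512>,
        #               'details': {<capture UUID>: [<urlnode UUIDs>, ...]},
        #               'body': <base64-encoded resource>}}
        if 'error' in payload:
            raise ValueError(payload['error'])
        return base64.b64decode(payload['response']['body'])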

View File

@@ -38,16 +38,10 @@
   <center>
     <h6>{{ body_hash }}</h6>
+    <h6>Only the most recent captures are listed below, this will change soon.</h6>
     <a href="{{ url_for('ressource_by_hash', sha512=body_hash) }}">Download</a>
   </center>

-  <script type="text/javascript" nonce="{{ csp_nonce() }}">
-    new DataTable('#freqHostTable', {
-      order: [[ 0, "desc" ]],
-      columnDefs: [{ width: '20%', targets: 0 },
-                   { width: '80%', targets: 1 }],
-    });
-  </script>
   <script type="text/javascript" nonce="{{ csp_nonce() }}">
     new DataTable('#bodyHashDetailsTable', {
       order: [[ 0, "desc" ]],
@@ -61,23 +55,7 @@
                    { width: '40%', targets: 2 }],
     });
   </script>
-  <table id="freqHostTable" class="table table-striped" style="width:100%">
-    <thead>
-      <tr>
-        <th>Frequency</th>
-        <th>Hostname</th>
-      </tr>
-    </thead>
-    <tbody>
-      {% for domain, freq in domains %}
-      <tr>
-        <td>{{ freq }}</td>
-        <td>{{ domain }}</td>
-      </tr>
-      {% endfor %}
-    </tbody>
-  </table>
-  <p>The same file was seen in these captures:</p>
+  <p>The same file was seen in these captures recently:</p>
   <table id="bodyHashDetailsTable" class="table table-striped" style="width:100%">
     <thead>
       <tr>

View File

@@ -256,7 +256,7 @@
               {% if url['body_hash_details'] and url['body_hash_details']['hash_freq'] %}
               <div>
                 This file can be found <b>{{ url['body_hash_details']['hash_freq'] }}</b> times
-                across all the captures on this lookyloo instance, in <b>{{ url['body_hash_details']['hash_domains_freq'] }}</b> unique domains.
+                across all the captures on this lookyloo instance.

                 {# other captures related with the same content #}
                 {% if 'other_captures' in url['body_hash_details'] %}
@@ -281,7 +281,8 @@
               {% endif %}

               {% if enable_context_by_users %}
                 </br>
-                {{ context_form(tree_uuid, url['url_object'].uuid, hostnode_uuid, url['url_object'].body_hash, 'hostnode_popup') }}
+                {{ context_form(tree_uuid, url['url_object'].uuid,
+                                url['url_object'].body_hash, 'hostnode_popup') }}
               {% endif %}

               {% if url['embedded_ressources'] %}
@@ -306,13 +307,13 @@
                   {% endif %}
                   <div>
                     This file {% if details['type'] %}(<b>{{ details['type'] }}</b>){% endif %} can be found <b>{{ details['hash_freq'] }}</b> times
-                    across all the captures on this lookyloo instance, in <b>{{ details['hash_domains_freq'] }}</b> unique domains.
+                    across all the captures on this lookyloo instance.

                     {{ get_ressource_button(tree_uuid, url['url_object'].uuid, hash,
                                             'Download the embedded ressource',
                                             details['type'] and details['type'].startswith('image')) }}
                     </br>
                     {% if enable_context_by_users %}
-                      {{ context_form(tree_uuid, url['url_object'].uuid, hostnode_uuid, hash, 'hostnode_popup') }}
+                      {{ context_form(tree_uuid, url['url_object'].uuid, hash, 'hostnode_popup') }}
                     {% endif %}
                     {% if 'other_captures' in details %}

View File

@@ -94,7 +94,7 @@
   </div>
 {% endmacro %}

-{% macro context_form(tree_uuid, urlnode_uuid, hostnode_uuid, hash, callback_str) %}
+{% macro context_form(tree_uuid, urlnode_uuid, hash, callback_str) %}
 <button class="btn btn-primary collapsed" type="button" data-bs-toggle="collapse" data-bs-target="#context_response_{{ urlnode_uuid }}" aria-expanded="false" aria-controls="collapseContextForm">
   <span class="if-collapsed">Add context</span>
   <span class="if-not-collapsed">Hide context form</span>
@@ -143,7 +143,6 @@
       </div>
     </div>
     <input type="hidden" id="hash_to_contextualize" name="hash_to_contextualize" value="{{ hash }}">
-    <input type="hidden" id="hostnode_uuid" name="hostnode_uuid" value="{{ hostnode_uuid }}">
     <input type="hidden" id="callback_str" name="callback_str" value="{{ callback_str }}">
     <button type="submit" class="btn btn-primary" id="btn-looking">Submit context</button>
   </form>
@@ -193,15 +192,15 @@
   {% set total_captures = details[0] %}
   {% set other_captures = details[1] %}
   {# Only show details if the hits are in an other capture #}
-  {% if total_captures > 0 %}
+  {% if total_captures > 1 %}
   <p>
-    The same file was seen in <b>{{ total_captures }}</b> other captures.
+    The same file was seen in <b>{{ total_captures - 1 }}</b> other captures.
     </br>
     <button class="btn btn-primary collapsed" type="button" data-bs-toggle="collapse"
             data-bs-target="#captureslist_{{ identifier_for_toggle }}"
             aria-expanded="false" aria-controls="collapseExample">
-      <span class="if-collapsed">Show other captures</span>
-      <span class="if-not-collapsed">Hide other captures</span>
+      <span class="if-collapsed">Show other recent captures</span>
+      <span class="if-not-collapsed">Hide other recent captures</span>
     </button>
   </p>
   {# Lists of other captures loading the same content... #}

View File

@@ -32,23 +32,21 @@
       <tr>
         <th>SHA 512</th>
         <th>Frequency</th>
-        <th>Number unique domains</th>
         <th>Context</th>
         <th>Mimetype</th>
         <th>Filename</th>
       </tr>
     </thead>
     <tbody>
-      {% for h, freq, number_domains, context, capture_uuid, urlnode_uuid, hostnode_uuid, filename, mimetype in ressources %}
+      {% for h, freq, context, capture_uuid, urlnode_uuid, filename, mimetype in ressources %}
       <tr>
         <td>
           <a href="{{ url_for('body_hash_details', body_hash=h) }}">{{ shorten_string(h, 10) }}</a></br>
           {{ get_ressource_button(capture_uuid, urlnode_uuid, h, 'Download sample', mimetype and mimetype.startswith('image')) }}
         </td>
         <td>{{ freq }}</td>
-        <td>{{ number_domains }}</td>
         <td> {{ context['type'] }} - {{ context['details'] }}</br>
-          {{ context_form(capture_uuid, urlnode_uuid, hostnode_uuid, h, 'ressources') }}
+          {{ context_form(capture_uuid, urlnode_uuid, h, 'ressources') }}
         </td>
         <td>{{ mimetype }}</td>
         <td>{{ shorten_string(filename, 10) }}</td>