diff --git a/bin/background_indexer.py b/bin/background_indexer.py
index 3e6a7ff..6cd0949 100755
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -38,7 +38,7 @@ class BackgroundIndexer(AbstractManager):
             # Don't need the cache in this class.
             self.lookyloo.clear_tree_cache()
 
-    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool], str], None, None]:
+    def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool, bool], str], None, None]:
         # NOTE: only get the non-archived captures for now.
         for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
             if not self.full_indexer:
@@ -88,6 +88,9 @@ class BackgroundIndexer(AbstractManager):
             if not indexed[5]:
                 self.logger.info(f'Indexing identifiers for {uuid_to_index}')
                 self.indexing.index_identifiers_capture(ct)
+            if not indexed[6]:
+                self.logger.info(f'Indexing hash types for {uuid_to_index}')
+                self.indexing.index_capture_hashes_types(ct)
             # NOTE: categories aren't taken in account here, should be fixed(?)
             #       see indexing.index_categories_capture(capture_uuid, categories)
             self.indexing.indexing_done()
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index ea2b141..b038081 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -14,11 +14,14 @@ from zipfile import ZipFile
 
 import mmh3
 
+from bs4 import BeautifulSoup
+from hashlib import sha256
+
 from har2tree import CrawledTree
 from redis import ConnectionPool, Redis
 from redis.connection import UnixDomainSocketConnection
 
-from .default import get_socket_path, get_config
+from .default import get_socket_path
 # from .helpers import get_public_suffix_list
@@ -66,9 +69,13 @@ class Indexing():
         p.srem('indexed_hhhashes', capture_uuid)
         p.srem('indexed_favicons', capture_uuid)
         p.srem('indexed_identifiers', capture_uuid)
+        for identifier_type in self.identifiers_types():
+            p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
+        for hash_type in self.captures_hashes_types():
+            p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
         p.execute()
 
-    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool]:
+    def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool, bool]:
         p = self.redis.pipeline()
         p.sismember('indexed_urls', capture_uuid)
         p.sismember('indexed_body_hashes', capture_uuid)
@@ -76,8 +83,12 @@ class Indexing():
         p.sismember('indexed_hhhashes', capture_uuid)
         p.sismember('indexed_favicons', capture_uuid)
         p.sismember('indexed_identifiers', capture_uuid)
-        # This call for sure returns a tuple of 6 booleans
-        return p.execute()  # type: ignore[return-value]
+        # We also need to check that the hash types are all indexed for this capture
+        hash_types_indexed = all(self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid) for hash_type in self.captures_hashes_types())
+        to_return: list[bool] = p.execute()
+        to_return.append(hash_types_indexed)
+        # This always returns a tuple of 7 booleans
+        return tuple(to_return)  # type: ignore[return-value]
 
     # ###### Cookies ######
 
@@ -367,6 +378,65 @@ class Indexing():
     def get_favicon(self, favicon_sha512: str) -> bytes | None:
         return self.redis_bytes.get(f'favicons|{favicon_sha512}')
 
+    # ###### Capture hashes ######
+
+    # This is where we define the indexing for the hashes generated for a whole capture (at most one hash per capture)
+    # certpl_html_structure_hash: concatenated list of all the tag names on the page - done on the rendered page
+
+    def _compute_certpl_html_structure_hash(self, html: str) -> str:
+        soup = BeautifulSoup(html, "lxml")
+        to_hash = "|".join(t.name for t in soup.findAll()).encode()
+        return sha256(to_hash).hexdigest()[:32]
+
+    def captures_hashes_types(self) -> set[str]:
+        return {'certpl_html_structure_hash', }
+        # return self.redis.smembers('capture_hash_types')
+
+    def captures_hashes(self, hash_type: str) -> list[tuple[str, float]]:
+        return self.redis.zrevrange(f'capture_hash_types|{hash_type}', 0, 200, withscores=True)
+
+    def hash_frequency(self, hash_type: str, h: str) -> float | None:
+        return self.redis.zscore(f'capture_hash_types|{hash_type}', h)
+
+    def hash_number_captures(self, hash_type: str, h: str) -> int:
+        return self.redis.scard(f'capture_hash_types|{hash_type}|{h}|captures')
+
+    def index_capture_hashes_types(self, crawled_tree: CrawledTree) -> None:
+        capture_uuid = crawled_tree.uuid
+        # NOTE: We will have multiple hash types for each capture, we want to make sure
+        #       to reindex all the captures if there is a new hash type, but only index the new
+        #       captures on the existing hash types
+        # hashes = ('certpl_html_structure_hash', )
+        for hash_type in self.captures_hashes_types():
+            if self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid):
+                # Do not reindex
+                return
+            self.redis.sadd(f'indexed_hash_type|{hash_type}', capture_uuid)
+
+            if hash_type == 'certpl_html_structure_hash':
+                # we must have a rendered HTML for this hash to be relevant.
+                if (not hasattr(crawled_tree.root_hartree.rendered_node, 'rendered_html')
+                        or not crawled_tree.root_hartree.rendered_node.rendered_html):
+                    continue
+                # we have a rendered HTML, compute the hash
+                hash_to_index = self._compute_certpl_html_structure_hash(crawled_tree.root_hartree.rendered_node.rendered_html)
+
+            if self.redis.sismember(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid):
+                # Already counted this specific hash for this capture
+                continue
+            self.logger.debug(f'Indexing hash {hash_type} for {capture_uuid} ...')
+            pipeline = self.redis.pipeline()
+            pipeline.hset(f'capture_hash_types|{capture_uuid}', hash_type, hash_to_index)
+            pipeline.sadd(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid)
+            pipeline.zincrby(f'capture_hash_types|{hash_type}', 1, hash_to_index)
+            pipeline.execute()
+
+    def get_hashes_types_capture(self, capture_uuid: str) -> dict[str, str]:
+        return self.redis.hgetall(f'capture_hash_types|{capture_uuid}')
+
+    def get_captures_hash_type(self, hash_type: str, h: str) -> set[str]:
+        return self.redis.smembers(f'capture_hash_types|{hash_type}|{h}|captures')
+
     # ###### identifiers ######
 
     def identifiers_types(self) -> set[str]:
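The comment above pins down the algorithm: the rendered DOM is reduced to its tag names in document order, joined with "|", hashed with sha256, and truncated to 32 hex characters, so two pages with the same markup skeleton but different text collide on the same value. A minimal standalone sketch of that transformation, mirroring _compute_certpl_html_structure_hash and assuming beautifulsoup4 and lxml are installed:

    from hashlib import sha256

    from bs4 import BeautifulSoup


    def certpl_html_structure_hash(html: str) -> str:
        # "lxml" matches the parser used in indexing.py; a different parser
        # may tokenize malformed markup differently and change the hash.
        soup = BeautifulSoup(html, "lxml")
        to_hash = "|".join(t.name for t in soup.findAll()).encode()
        return sha256(to_hash).hexdigest()[:32]


    # The tag-name sequence here is html|head|body|p|p: text content is
    # ignored, so rewording a cloned page does not change its hash.
    print(certpl_html_structure_hash("<html><head></head><body><p>hi</p><p>bye</p></body></html>"))

Truncating the digest to 32 hex characters keeps the Redis keys short while still leaving 128 bits of hash.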
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 06a0a16..0bd8158 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -421,6 +421,11 @@ def get_identifier_investigator(identifier_type: str, identifier: str) -> list[t
     return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
 
 
+def get_capture_hash_investigator(hash_type: str, h: str) -> list[tuple[str, str, str, datetime]]:
+    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_hash_type(hash_type=hash_type, h=h)])
+    return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
+
+
 def get_favicon_investigator(favicon_sha512: str,
                              /,
                              get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
@@ -1232,6 +1237,16 @@ def tree_favicons(tree_uuid: str) -> str:
     return render_template('tree_favicons.html', tree_uuid=tree_uuid, favicons=favicons)
 
 
+@app.route('/tree/<string:tree_uuid>/hashes_types', methods=['GET'])
+def tree_capture_hashes_types(tree_uuid: str) -> str:
+    to_return: list[tuple[int, str, str]] = []
+
+    for hash_type, h in get_indexing(flask_login.current_user).get_hashes_types_capture(tree_uuid).items():
+        nb_captures = get_indexing(flask_login.current_user).hash_number_captures(hash_type, h)
+        to_return.append((nb_captures, hash_type, h))
+    return render_template('tree_hashes_types.html', tree_uuid=tree_uuid, hashes=to_return)
+
+
 @app.route('/tree/<string:tree_uuid>/body_hashes', methods=['GET'])
 def tree_body_hashes(tree_uuid: str) -> str:
     body_hashes = get_all_body_hashes(tree_uuid)
@@ -1638,6 +1653,14 @@ def identifier_details(identifier_type: str, identifier: str) -> str:
                            captures=captures)
 
 
+@app.route('/capture_hash_details/<string:hash_type>/<string:h>', methods=['GET'])
+def capture_hash_details(hash_type: str, h: str) -> str:
+    captures = get_capture_hash_investigator(hash_type, h)
+    return render_template('hash_type_details.html', hash_type=hash_type,
+                           h=h,
+                           captures=captures)
+
+
 @app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
 @app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
 def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:
diff --git a/website/web/templates/hash_type_details.html b/website/web/templates/hash_type_details.html
new file mode 100644
index 0000000..885aec9
--- /dev/null
+++ b/website/web/templates/hash_type_details.html
@@ -0,0 +1,49 @@
+{% from "macros.html" import shorten_string %}
+
+<center>
+  <h4>{{hash_type}}: {{h}}</h4>
+</center>
+
+<table id="hashTypeDetailsTable" class="table table-striped" style="width:100%">
+  <thead>
+    <tr>
+      <th>Capture Time</th>
+      <th>Capture Title</th>
+      <th>Landing page</th>
+    </tr>
+  </thead>
+  <tbody>
+    {% for capture_uuid, title, landing_page, capture_time in captures %}
+    <tr>
+      <td>
+        {{capture_time}}
+      </td>
+      <td>
+        <a href="{{ url_for('tree', tree_uuid=capture_uuid) }}">
+          {{ title }}
+        </a>
+      </td>
+      <td>
+        {{ landing_page }}
+      </td>
+    </tr>
+    {% endfor %}
+  </tbody>
+</table>
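Taken together, the new Indexing methods define three Redis key shapes: capture_hash_types|{capture_uuid} is a hash mapping each hash type to its value for that capture, capture_hash_types|{hash_type} is a sorted set counting how many captures produced each value, and capture_hash_types|{hash_type}|{value}|captures is the set of capture UUIDs sharing that value. A hypothetical lookup session using only the methods added in this diff; it assumes a configured Lookyloo install where Indexing() can be instantiated with its defaults and reach its Redis socket, and 'some-capture-uuid' is a placeholder:

    from lookyloo.indexing import Indexing

    indexing = Indexing()
    for hash_type, h in indexing.get_hashes_types_capture('some-capture-uuid').items():
        # zscore on the sorted set: how many captures produced this exact value
        print(hash_type, h, indexing.hash_frequency(hash_type, h))
        # smembers on the |captures set: every capture UUID sharing it
        print(indexing.get_captures_hash_type(hash_type, h))

This is the same walk the web routes perform: tree_capture_hashes_types lists the hashes of one capture, and capture_hash_details pivots from one hash value to every capture that shares it.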
diff --git a/website/web/templates/tree.html b/website/web/templates/tree.html
index d586fc7..09748c6 100644
--- a/website/web/templates/tree.html
+++ b/website/web/templates/tree.html
@@ -112,6 +112,20 @@
        });
+
+       $('#captureHashesTypesModal').on('show.bs.modal', function(e) {
+           var button = $(e.relatedTarget);
+           var modal = $(this);
+           modal.find('.modal-body').load(button.data("remote"));
+       });
diff --git a/website/web/templates/tree_hashes_types.html b/website/web/templates/tree_hashes_types.html
new file mode 100644
--- /dev/null
+++ b/website/web/templates/tree_hashes_types.html
@@ -0,0 +1,24 @@
+<center><h6>Click on the hash to see the other captures it's been found in</h6></center>
+
+<table id="hashesTypesTable" class="table table-striped" style="width:100%">
+  <thead>
+    <tr>
+      <th>Number of captures</th>
+      <th>Hash</th>
+      <th>Hash type</th>
+    </tr>
+  </thead>
+  <tbody>
+    {% for number_captures, hash_type, hash in hashes %}
+    <tr>
+      <td>{{ number_captures }}</td>
+      <td>
+        <a href="{{ url_for('capture_hash_details', hash_type=hash_type, h=hash) }}">
+          {{ hash }}
+        </a>
+      </td>
+      <td>{{hash_type}}</td>
+    </tr>
+    {% endfor %}
+  </tbody>
+</table>
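Because tree.html only loads the modal body on demand, the whole chain (route, index lookup, rendered table) can be smoke-tested without a browser. A sketch with Flask's test client; hypothetical in that it assumes the app object defined in website/web/__init__.py is importable and that a capture with this UUID exists and has been indexed:

    from website.web import app

    with app.test_client() as client:
        resp = client.get('/tree/some-capture-uuid/hashes_types')
        assert resp.status_code == 200
        # The body is the rendered tree_hashes_types.html fragment that
        # tree.html injects into the hashes modal.
        print(resp.data.decode())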