chg: Rename certpl hash to domhash, rebuild accordingly.

pull/980/head
Raphaël Vinot 2024-10-28 14:45:26 +00:00
parent c3d9d897e9
commit 4717f3877c
3 changed files with 66 additions and 32 deletions


@@ -12,8 +12,6 @@ from zipfile import ZipFile
 import mmh3
-from bs4 import BeautifulSoup
-from hashlib import sha256
 from pathlib import Path
 from har2tree import CrawledTree
@@ -79,7 +77,10 @@ class Indexing():
         for identifier_type in self.identifiers_types():
             p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
         for hash_type in self.captures_hashes_types():
-            p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
+            if hash_type == 'certpl_html_structure_hash':
+                self._rename_certpl_hash_domhash()
+            else:
+                p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
         for internal_index in self.redis.smembers(f'capture_indexes|{capture_uuid}'):
             # NOTE: these ones need to be removed because the node UUIDs are recreated on tree rebuild
             # internal_index can be "tlds"
@@ -164,7 +165,7 @@ class Indexing():
         except (TreeNeedsRebuild, NoValidHarFile) as e:
             self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
         except Exception as e:
-            self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
+            self.logger.exception(f'Error during indexing for {uuid_to_index}: {e}')
         finally:
             self.indexing_done(uuid_to_index)
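Worth noting for anyone reading the logs: the standard library's exception() records the message at ERROR level and appends the current traceback, so unexpected indexing failures are no longer reduced to a one-line warning. A minimal illustration (standard library only):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('indexing')

    try:
        1 / 0
    except Exception as e:
        # exception() == error() + traceback; only meaningful inside an except block
        logger.exception(f'Error during indexing: {e}')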
@@ -665,45 +666,54 @@ class Indexing():
     # ###### Capture hashes ######

     # This is where we define the indexing for the hashes generated for a whole capture (at most one hash per capture)
-    # certpl_html_structure_hash: concatenated list of all the tag names on the page - done on the rendered page
+    # domhash (formerly known as certpl_html_structure_hash): concatenated list of all the tag names on the page - done on the rendered page

-    def _compute_certpl_html_structure_hash(self, html: str) -> str:
-        soup = BeautifulSoup(html, "lxml")
-        to_hash = "|".join(t.name for t in soup.findAll()).encode()
-        return sha256(to_hash).hexdigest()[:32]
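The removed helper is the whole algorithm: pipe-join the tag names of the rendered DOM in document order, SHA-256 the result, keep the first 32 hex characters. Per the renamed comment above, the same computation now lives in har2tree. A standalone sketch of the idea (find_all is the modern spelling of findAll):

    from hashlib import sha256

    from bs4 import BeautifulSoup


    def compute_domhash(html: str) -> str:
        # "html|body|p|..." -> sha256 -> first 32 hex chars
        soup = BeautifulSoup(html, "lxml")
        to_hash = "|".join(t.name for t in soup.find_all()).encode()
        return sha256(to_hash).hexdigest()[:32]


    # Same tag structure, same hash, regardless of text content or attributes.
    assert compute_domhash('<p>hi</p>') == compute_domhash('<p class="x">bye</p>')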
+    def _rename_certpl_hash_domhash(self) -> None:
+        # This is a one-shot call that gets rid of all the old certpl_html_structure_hash entries; they are replaced by domhash
+        if (not self.redis.exists('capture_hash_types|certpl_html_structure_hash')
+                and not self.redis.exists('indexed_hash_type|certpl_html_structure_hash')):
+            # Already cleaned up
+            return
+        pipeline = self.redis.pipeline()
+        domhashes = set()
+        for capture_uuid in self.redis.sscan_iter('indexed_hash_type|certpl_html_structure_hash'):
+            domhash = self.redis.hget(f'capture_hash_types|{capture_uuid}', 'certpl_html_structure_hash')
+            if domhash not in domhashes:
+                # delete the whole key containing all the uuids
+                pipeline.delete(f'capture_hash_types|certpl_html_structure_hash|{domhash}|captures')
+            domhashes.add(domhash)
+            pipeline.hdel(f'capture_hash_types|{capture_uuid}', 'certpl_html_structure_hash')
+        pipeline.delete('capture_hash_types|certpl_html_structure_hash')
+        pipeline.delete('indexed_hash_type|certpl_html_structure_hash')
+        pipeline.execute()
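Everything is queued on one pipeline, and the domhashes set deduplicates the per-hash capture keys so each capture_hash_types|certpl_html_structure_hash|<domhash>|captures key is deleted only once. After the first successful run both sentinel keys are gone and the early return makes every later call a no-op, which a quick redis-py check can confirm (assuming a default local instance):

    import redis

    r = redis.Redis(decode_responses=True)

    for key in ('capture_hash_types|certpl_html_structure_hash',
                'indexed_hash_type|certpl_html_structure_hash'):
        # exists() returns the number of matching keys: 0 once the migration ran
        print(key, '->', r.exists(key))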
     def captures_hashes_types(self) -> set[str]:
-        return {'certpl_html_structure_hash'}
+        return {'domhash'}
         # return self.redis.smembers('capture_hash_types')

-    def captures_hashes(self, hash_type: str) -> list[tuple[str, float]]:
-        return self.redis.zrevrange(f'capture_hash_types|{hash_type}', 0, 200, withscores=True)
-
-    def hash_frequency(self, hash_type: str, h: str) -> float | None:
-        return self.redis.zscore(f'capture_hash_types|{hash_type}', h)
-
-    def hash_number_captures(self, hash_type: str, h: str) -> int:
-        return self.redis.scard(f'capture_hash_types|{hash_type}|{h}|captures')
+    def captures_hashes(self, hash_type: str) -> set[str]:
+        return self.redis.smembers(f'capture_hash_types|{hash_type}')

     def index_capture_hashes_types(self, crawled_tree: CrawledTree) -> None:
         capture_uuid = crawled_tree.uuid
         # NOTE: We will have multiple hash types for each capture; we want to make sure
         # to reindex all the captures if there is a new hash type, but only index the new
         # captures on the existing hash types
         # hashes = ('certpl_html_structure_hash', )
         for hash_type in self.captures_hashes_types():
+            if hash_type == 'certpl_html_structure_hash':
+                self._rename_certpl_hash_domhash()
+                continue
             if self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid):
                 # Do not reindex
                 return
             self.redis.sadd(f'indexed_hash_type|{hash_type}', capture_uuid)

-            if hash_type == 'certpl_html_structure_hash':
-                # we must have a rendered HTML for this hash to be relevant.
-                if (not hasattr(crawled_tree.root_hartree.rendered_node, 'rendered_html')
-                        or not crawled_tree.root_hartree.rendered_node.rendered_html):
+            if hash_type == 'domhash':
+                # the hash is computed in har2tree, we just check if it exists.
+                if not hasattr(crawled_tree.root_hartree.rendered_node, 'domhash'):
                     continue
-                # we have a rendered HTML, compute the hash
-                hash_to_index = self._compute_certpl_html_structure_hash(crawled_tree.root_hartree.rendered_node.rendered_html)
+                hash_to_index = crawled_tree.root_hartree.rendered_node.domhash
             else:
                 self.logger.warning(f'Unknown hash type: {hash_type}')
                 continue
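The NOTE above is carried by the indexed_hash_type|{hash_type} guard set: a capture already in the set is skipped, while a newly introduced hash type starts with an empty set, so every existing capture is picked up on its next indexing pass. The check-then-add pair could even be compacted into a single round trip; a variant sketch of the same invariant (not what the diff does):

    import redis

    r = redis.Redis(decode_responses=True)

    def should_index(hash_type: str, capture_uuid: str) -> bool:
        # SADD returns 1 on first insertion, 0 on replays: each capture is
        # indexed exactly once, and a brand-new hash type (empty set) lets
        # every existing capture through again.
        return bool(r.sadd(f'indexed_hash_type|{hash_type}', capture_uuid))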
@@ -712,21 +722,42 @@ class Indexing():
                 self.logger.info(f'No hash to index for {hash_type} in {capture_uuid} ... ')
                 continue
-            if self.redis.sismember(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid):
+            if self.redis.zscore(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid) is not None:
                 # Already counted this specific identifier for this capture
                 continue
             self.logger.debug(f'Indexing hash {hash_type} for {capture_uuid} ... ')
             pipeline = self.redis.pipeline()
             pipeline.hset(f'capture_hash_types|{capture_uuid}', hash_type, hash_to_index)
-            pipeline.sadd(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid)
-            pipeline.zincrby(f'capture_hash_types|{hash_type}', 1, hash_to_index)
+            pipeline.sadd(f'capture_hash_types|{hash_type}', hash_to_index)
+            pipeline.zadd(f'capture_hash_types|{hash_type}|{hash_to_index}|captures',
+                          mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
             pipeline.execute()
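This hunk is the heart of the rebuild: the ...|captures key changes from a plain SET to a sorted set scored by the capture start time (hence zscore replacing sismember above), and the per-type key changes from a frequency ZSET to a simple SET of known hashes. The payoff is cheap time-window queries, roughly (hypothetical key and uuid values):

    from datetime import datetime, timedelta

    import redis

    r = redis.Redis(decode_responses=True)
    key = 'capture_hash_types|domhash|0123abcd|captures'  # hypothetical domhash

    # index time: the capture uuid is scored with its start time
    r.zadd(key, mapping={'capture-uuid-1': datetime.now().timestamp()})

    # query time: newest first, bounded to the last 5 days (the new default below)
    min_score = (datetime.now() - timedelta(days=5)).timestamp()
    print(r.zrevrangebyscore(key, '+Inf', min_score, withscores=True))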
     def get_hashes_types_capture(self, capture_uuid: str) -> dict[str, str]:
-        return self.redis.hgetall(f'capture_hash_types|{capture_uuid}')
+        to_return = self.redis.hgetall(f'capture_hash_types|{capture_uuid}')
+        if to_return.pop('certpl_html_structure_hash', None):
+            # This one should be removed
+            self._rename_certpl_hash_domhash()
+        return to_return

-    def get_captures_hash_type(self, hash_type: str, h: str) -> set[str]:
-        return self.redis.smembers(f'capture_hash_types|{hash_type}|{h}|captures')
+    def get_captures_hash_type(self, hash_type: str, h: str, most_recent_capture: datetime | None = None,
+                               oldest_capture: datetime | None = None) -> list[tuple[str, float]]:
+        """Get all the captures for a hash of a specific type, on a time interval starting from the most recent one.
+
+        :param hash_type: The type of hash
+        :param h: The hash
+        :param most_recent_capture: The capture time of the most recent capture to consider
+        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 5 days ago.
+        """
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=5)).timestamp()
+        return self.redis.zrevrangebyscore(f'capture_hash_types|{hash_type}|{h}|captures', max_score, min_score, withscores=True)
+
+    def get_captures_hash_type_count(self, hash_type: str, h: str) -> int:
+        if hash_type == 'certpl_html_structure_hash':
+            # that one should be removed
+            return 0
+        return self.redis.zcard(f'capture_hash_types|{hash_type}|{h}|captures')
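With the new signature the caller picks the window: no bounds yields the last five days, and explicit datetime bounds narrow or shift it. A usage sketch (indexing stands in for an Indexing instance; the hash value is made up):

    from datetime import datetime, timedelta

    # captures seen in the last 24 hours, most recent first
    for capture_uuid, start_ts in indexing.get_captures_hash_type(
            hash_type='domhash', h='0123abcd',
            most_recent_capture=datetime.now(),
            oldest_capture=datetime.now() - timedelta(days=1)):
        print(capture_uuid, datetime.fromtimestamp(start_ts))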
     # ###### identifiers ######


@@ -436,7 +436,9 @@ def get_identifier_investigator(identifier_type: str, identifier: str) -> list[t
 def get_capture_hash_investigator(hash_type: str, h: str) -> list[tuple[str, str, str, datetime]]:
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_hash_type(hash_type=hash_type, h=h)])
+    cached_captures = lookyloo.sorted_capture_cache(
+        [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_hash_type(hash_type=hash_type, h=h)],
+        cached_captures_only=True)
     return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
@@ -1321,7 +1323,7 @@ def tree_capture_hashes_types(tree_uuid: str) -> str:
     to_return: list[tuple[int, str, str]] = []
     for hash_type, h in get_indexing(flask_login.current_user).get_hashes_types_capture(tree_uuid).items():
-        nb_captures = get_indexing(flask_login.current_user).hash_number_captures(hash_type, h)
+        nb_captures = get_indexing(flask_login.current_user).get_captures_hash_type_count(hash_type, h)
         to_return.append((nb_captures, hash_type, h))
     return render_template('tree_hashes_types.html', tree_uuid=tree_uuid, hashes=to_return)


@@ -18,6 +18,7 @@
   <center>
     <h5>{{hash_type}}: {{h}}</h5>
+    <h6>Only the most recent captures are listed below, this will change soon.</h6>
   </center>
   <table id="hashTypeDetailsTable" class="table table-striped" style="width:100%">
     <thead>