chg: Rename certpl hash to domhash, rebuild accordingly.

pull/980/head
Raphaël Vinot 2024-10-28 14:45:26 +00:00
parent c3d9d897e9
commit 4717f3877c
3 changed files with 66 additions and 32 deletions


@@ -12,8 +12,6 @@ from zipfile import ZipFile
 import mmh3
-from bs4 import BeautifulSoup
-from hashlib import sha256
 from pathlib import Path
 from har2tree import CrawledTree
@@ -79,7 +77,10 @@ class Indexing():
         for identifier_type in self.identifiers_types():
             p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
         for hash_type in self.captures_hashes_types():
-            p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
+            if hash_type == 'certpl_html_structure_hash':
+                self._rename_certpl_hash_domhash()
+            else:
+                p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
         for internal_index in self.redis.smembers(f'capture_indexes|{capture_uuid}'):
             # NOTE: these ones need to be removed because the node UUIDs are recreated on tree rebuild
             # internal_index can be "tlds"
@@ -164,7 +165,7 @@ class Indexing():
         except (TreeNeedsRebuild, NoValidHarFile) as e:
             self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
         except Exception as e:
-            self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
+            self.logger.exception(f'Error during indexing for {uuid_to_index}: {e}')
         finally:
             self.indexing_done(uuid_to_index)
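Worth noting for anyone reading the logs: the standard library's exception() records the message at ERROR level and appends the current traceback, so unexpected indexing failures are no longer reduced to a one-line warning. A minimal illustration (standard library only):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger('indexing')

    try:
        1 / 0
    except Exception as e:
        # exception() == error() + traceback; only meaningful inside an except block
        logger.exception(f'Error during indexing: {e}')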
@@ -665,45 +666,54 @@ class Indexing():
     # ###### Capture hashes ######

     # This is where we define the indexing for the hashes generated for a whole capture (at most one hash per capture)
-    # certpl_html_structure_hash: concatenated list of all the tag names on the page - done on the rendered page
+    # domhash (formerly known as certpl_html_structure_hash): concatenated list of all the tag names on the page - done on the rendered page

-    def _compute_certpl_html_structure_hash(self, html: str) -> str:
-        soup = BeautifulSoup(html, "lxml")
-        to_hash = "|".join(t.name for t in soup.findAll()).encode()
-        return sha256(to_hash).hexdigest()[:32]
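The removed helper is the whole algorithm: pipe-join the tag names of the rendered DOM in document order, SHA-256 the result, keep the first 32 hex characters. Per the renamed comment above, the same computation now lives in har2tree. A standalone sketch of the idea (find_all is the modern spelling of findAll):

    from hashlib import sha256

    from bs4 import BeautifulSoup


    def compute_domhash(html: str) -> str:
        # "html|body|p|..." -> sha256 -> first 32 hex chars
        soup = BeautifulSoup(html, "lxml")
        to_hash = "|".join(t.name for t in soup.find_all()).encode()
        return sha256(to_hash).hexdigest()[:32]


    # Same tag structure, same hash, regardless of text content or attributes.
    assert compute_domhash('<p>hi</p>') == compute_domhash('<p class="x">bye</p>')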
+    def _rename_certpl_hash_domhash(self) -> None:
+        # This is a one-shot call that gets rid of all the old certpl_html_structure_hash entries; they are replaced by domhash
+        if (not self.redis.exists('capture_hash_types|certpl_html_structure_hash')
+                and not self.redis.exists('indexed_hash_type|certpl_html_structure_hash')):
+            # Already cleaned up
+            return
+        pipeline = self.redis.pipeline()
+        domhashes = set()
+        for capture_uuid in self.redis.sscan_iter('indexed_hash_type|certpl_html_structure_hash'):
+            domhash = self.redis.hget(f'capture_hash_types|{capture_uuid}', 'certpl_html_structure_hash')
+            if domhash not in domhashes:
+                # delete the whole key containing all the uuids
+                pipeline.delete(f'capture_hash_types|certpl_html_structure_hash|{domhash}|captures')
+            domhashes.add(domhash)
+            pipeline.hdel(f'capture_hash_types|{capture_uuid}', 'certpl_html_structure_hash')
+        pipeline.delete('capture_hash_types|certpl_html_structure_hash')
+        pipeline.delete('indexed_hash_type|certpl_html_structure_hash')
+        pipeline.execute()
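Everything is queued on one pipeline, and the domhashes set deduplicates the per-hash capture keys so each capture_hash_types|certpl_html_structure_hash|<domhash>|captures key is deleted only once. After the first successful run both sentinel keys are gone and the early return makes every later call a no-op, which a quick redis-py check can confirm (assuming a default local instance):

    import redis

    r = redis.Redis(decode_responses=True)

    for key in ('capture_hash_types|certpl_html_structure_hash',
                'indexed_hash_type|certpl_html_structure_hash'):
        # exists() returns the number of matching keys: 0 once the migration ran
        print(key, '->', r.exists(key))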
     def captures_hashes_types(self) -> set[str]:
-        return {'certpl_html_structure_hash'}
+        return {'domhash'}
         # return self.redis.smembers('capture_hash_types')

-    def captures_hashes(self, hash_type: str) -> list[tuple[str, float]]:
-        return self.redis.zrevrange(f'capture_hash_types|{hash_type}', 0, 200, withscores=True)
-
-    def hash_frequency(self, hash_type: str, h: str) -> float | None:
-        return self.redis.zscore(f'capture_hash_types|{hash_type}', h)
-
-    def hash_number_captures(self, hash_type: str, h: str) -> int:
-        return self.redis.scard(f'capture_hash_types|{hash_type}|{h}|captures')
+    def captures_hashes(self, hash_type: str) -> set[str]:
+        return self.redis.smembers(f'capture_hash_types|{hash_type}')

     def index_capture_hashes_types(self, crawled_tree: CrawledTree) -> None:
         capture_uuid = crawled_tree.uuid
         # NOTE: We will have multiple hash types for each capture; we want to make sure
         # to reindex all the captures if there is a new hash type, but only index the new
         # captures on the existing hash types
         # hashes = ('certpl_html_structure_hash', )
         for hash_type in self.captures_hashes_types():
+            if hash_type == 'certpl_html_structure_hash':
+                self._rename_certpl_hash_domhash()
+                continue
             if self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid):
                 # Do not reindex
                 return
             self.redis.sadd(f'indexed_hash_type|{hash_type}', capture_uuid)

-            if hash_type == 'certpl_html_structure_hash':
-                # we must have a rendered HTML for this hash to be relevant.
-                if (not hasattr(crawled_tree.root_hartree.rendered_node, 'rendered_html')
-                        or not crawled_tree.root_hartree.rendered_node.rendered_html):
+            if hash_type == 'domhash':
+                # the hash is computed in har2tree, we just check if it exists.
+                if not hasattr(crawled_tree.root_hartree.rendered_node, 'domhash'):
                     continue
-                # we have a rendered HTML, compute the hash
-                hash_to_index = self._compute_certpl_html_structure_hash(crawled_tree.root_hartree.rendered_node.rendered_html)
+                hash_to_index = crawled_tree.root_hartree.rendered_node.domhash
             else:
                 self.logger.warning(f'Unknown hash type: {hash_type}')
                 continue
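The NOTE above is carried by the indexed_hash_type|{hash_type} guard set: a capture already in the set is skipped, while a newly introduced hash type starts with an empty set, so every existing capture is picked up on its next indexing pass. The check-then-add pair could even be compacted into a single round trip; a variant sketch of the same invariant (not what the diff does):

    import redis

    r = redis.Redis(decode_responses=True)

    def should_index(hash_type: str, capture_uuid: str) -> bool:
        # SADD returns 1 on first insertion, 0 on replays: each capture is
        # indexed exactly once, and a brand-new hash type (empty set) lets
        # every existing capture through again.
        return bool(r.sadd(f'indexed_hash_type|{hash_type}', capture_uuid))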
@@ -712,21 +722,42 @@ class Indexing():
                 self.logger.info(f'No hash to index for {hash_type} in {capture_uuid} ... ')
                 continue
-            if self.redis.sismember(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid):
+            if self.redis.zscore(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid) is not None:
                 # Already counted this specific identifier for this capture
                 continue
             self.logger.debug(f'Indexing hash {hash_type} for {capture_uuid} ... ')
             pipeline = self.redis.pipeline()
             pipeline.hset(f'capture_hash_types|{capture_uuid}', hash_type, hash_to_index)
-            pipeline.sadd(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid)
-            pipeline.zincrby(f'capture_hash_types|{hash_type}', 1, hash_to_index)
+            pipeline.sadd(f'capture_hash_types|{hash_type}', hash_to_index)
+            pipeline.zadd(f'capture_hash_types|{hash_type}|{hash_to_index}|captures',
+                          mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
             pipeline.execute()
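This hunk is the heart of the rebuild: the ...|captures key changes from a plain SET to a sorted set scored by the capture start time (hence zscore replacing sismember above), and the per-type key changes from a frequency ZSET to a simple SET of known hashes. The payoff is cheap time-window queries, roughly (hypothetical key and uuid values):

    from datetime import datetime, timedelta

    import redis

    r = redis.Redis(decode_responses=True)
    key = 'capture_hash_types|domhash|0123abcd|captures'  # hypothetical domhash

    # index time: the capture uuid is scored with its start time
    r.zadd(key, mapping={'capture-uuid-1': datetime.now().timestamp()})

    # query time: newest first, bounded to the last 5 days (the new default below)
    min_score = (datetime.now() - timedelta(days=5)).timestamp()
    print(r.zrevrangebyscore(key, '+Inf', min_score, withscores=True))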
     def get_hashes_types_capture(self, capture_uuid: str) -> dict[str, str]:
-        return self.redis.hgetall(f'capture_hash_types|{capture_uuid}')
+        to_return = self.redis.hgetall(f'capture_hash_types|{capture_uuid}')
+        if to_return.pop('certpl_html_structure_hash', None):
+            # This one should be removed
+            self._rename_certpl_hash_domhash()
+        return to_return

-    def get_captures_hash_type(self, hash_type: str, h: str) -> set[str]:
-        return self.redis.smembers(f'capture_hash_types|{hash_type}|{h}|captures')
+    def get_captures_hash_type(self, hash_type: str, h: str, most_recent_capture: datetime | None = None,
+                               oldest_capture: datetime | None = None) -> list[tuple[str, float]]:
+        """Get all the captures for a hash of a specific type, on a time interval starting from the most recent one.
+
+        :param hash_type: The type of hash
+        :param h: The hash
+        :param most_recent_capture: The capture time of the most recent capture to consider
+        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 5 days ago.
+        """
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=5)).timestamp()
+        return self.redis.zrevrangebyscore(f'capture_hash_types|{hash_type}|{h}|captures', max_score, min_score, withscores=True)
+
+    def get_captures_hash_type_count(self, hash_type: str, h: str) -> int:
+        if hash_type == 'certpl_html_structure_hash':
+            # that one should be removed
+            return 0
+        return self.redis.zcard(f'capture_hash_types|{hash_type}|{h}|captures')
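With the new signature the caller picks the window: no bounds yields the last five days, and explicit datetime bounds narrow or shift it. A usage sketch (indexing stands in for an Indexing instance; the hash value is made up):

    from datetime import datetime, timedelta

    # captures seen in the last 24 hours, most recent first
    for capture_uuid, start_ts in indexing.get_captures_hash_type(
            hash_type='domhash', h='0123abcd',
            most_recent_capture=datetime.now(),
            oldest_capture=datetime.now() - timedelta(days=1)):
        print(capture_uuid, datetime.fromtimestamp(start_ts))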
     # ###### identifiers ######


@@ -436,7 +436,9 @@ def get_identifier_investigator(identifier_type: str, identifier: str) -> list[t
 def get_capture_hash_investigator(hash_type: str, h: str) -> list[tuple[str, str, str, datetime]]:
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_hash_type(hash_type=hash_type, h=h)])
+    cached_captures = lookyloo.sorted_capture_cache(
+        [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_hash_type(hash_type=hash_type, h=h)],
+        cached_captures_only=True)
     return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
@@ -1321,7 +1323,7 @@ def tree_capture_hashes_types(tree_uuid: str) -> str:
     to_return: list[tuple[int, str, str]] = []
     for hash_type, h in get_indexing(flask_login.current_user).get_hashes_types_capture(tree_uuid).items():
-        nb_captures = get_indexing(flask_login.current_user).hash_number_captures(hash_type, h)
+        nb_captures = get_indexing(flask_login.current_user).get_captures_hash_type_count(hash_type, h)
         to_return.append((nb_captures, hash_type, h))
     return render_template('tree_hashes_types.html', tree_uuid=tree_uuid, hashes=to_return)


@@ -18,6 +18,7 @@
   <center>
     <h5>{{hash_type}}: {{h}}</h5>
+    <h6>Only the most recent captures are listed below, this will change soon.</h6>
   </center>
   <table id="hashTypeDetailsTable" class="table table-striped" style="width:100%">
     <thead>