chg: Rename certpl hash to domhash, rebuild accordingly.

pull/980/head
Raphaël Vinot 2024-10-28 14:45:26 +00:00
parent c3d9d897e9
commit 4717f3877c
3 changed files with 66 additions and 32 deletions

lookyloo/indexing.py

@@ -12,8 +12,6 @@ from zipfile import ZipFile

 import mmh3

-from bs4 import BeautifulSoup
-from hashlib import sha256
 from pathlib import Path

 from har2tree import CrawledTree
@@ -79,7 +77,10 @@ class Indexing():
         for identifier_type in self.identifiers_types():
             p.srem(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
         for hash_type in self.captures_hashes_types():
-            p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
+            if hash_type == 'certpl_html_structure_hash':
+                self._rename_certpl_hash_domhash()
+            else:
+                p.srem(f'indexed_hash_type|{hash_type}', capture_uuid)
         for internal_index in self.redis.smembers(f'capture_indexes|{capture_uuid}'):
             # NOTE: these ones need to be removed because the node UUIDs are recreated on tree rebuild
             # internal_index can be "tlds"
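Worth noting: rather than shipping a separate migration script, the commit triggers the cleanup lazily, whenever any code path encounters the legacy certpl_html_structure_hash type. A minimal sketch of that idiom (hypothetical names, not Lookyloo's API), assuming the rename helper is idempotent:

from redis import Redis

LEGACY_KEYS = ('capture_hash_types|certpl_html_structure_hash',
               'indexed_hash_type|certpl_html_structure_hash')

def rename_legacy_hashes_if_needed(r: Redis) -> None:
    # Fast no-op once the legacy keys are gone, so this is safe to call
    # from every code path that can stumble on the old hash type.
    if not any(r.exists(k) for k in LEGACY_KEYS):
        return
    # ... move/delete the legacy entries, as _rename_certpl_hash_domhash does below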
@@ -164,7 +165,7 @@ class Indexing():
         except (TreeNeedsRebuild, NoValidHarFile) as e:
             self.logger.warning(f'Error loading the pickle for {uuid_to_index}: {e}')
         except Exception as e:
-            self.logger.warning(f'Error during indexing for {uuid_to_index}: {e}')
+            self.logger.exception(f'Error during indexing for {uuid_to_index}: {e}')
         finally:
             self.indexing_done(uuid_to_index)
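A small but deliberate change in this hunk: logger.exception logs at ERROR level and appends the full traceback of the active exception (it sets exc_info), whereas logger.warning only records the message. A standalone illustration, independent of Lookyloo:

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger('demo')

try:
    1 / 0
except Exception as e:
    # warning(): only the formatted message is logged
    logger.warning(f'message only, no traceback: {e}')
    # exception(): must be called from an exception handler; logs at
    # ERROR level and includes the traceback of the active exception
    logger.exception(f'message plus full traceback: {e}')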
@@ -665,45 +666,54 @@ class Indexing():
     # ###### Capture hashes ######

     # This is where we define the indexing for the hashes generated for a whole capture (at most one hash per capture)
-    # certpl_html_structure_hash: concatenated list of all the tag names on the page - done on the rendered page
+    # domhash (formerly known as certpl_html_structure_hash): concatenated list of all the tag names on the page - done on the rendered page

-    def _compute_certpl_html_structure_hash(self, html: str) -> str:
-        soup = BeautifulSoup(html, "lxml")
-        to_hash = "|".join(t.name for t in soup.findAll()).encode()
-        return sha256(to_hash).hexdigest()[:32]
+    def _rename_certpl_hash_domhash(self) -> None:
+        # This is a one shot call that gets rid of all the old certpl_html_structure_hash and they will be replaced by domhash
+        if (not self.redis.exists('capture_hash_types|certpl_html_structure_hash')
+                and not self.redis.exists('indexed_hash_type|certpl_html_structure_hash')):
+            # Already cleaned up
+            return
+        pipeline = self.redis.pipeline()
+        domhashes = set()
+        for capture_uuid in self.redis.sscan_iter('indexed_hash_type|certpl_html_structure_hash'):
+            domhash = self.redis.hget(f'capture_hash_types|{capture_uuid}', 'certpl_html_structure_hash')
+            if domhash not in domhashes:
+                # delete the whole key containing all the uuids
+                pipeline.delete(f'capture_hash_types|certpl_html_structure_hash|{domhash}|captures')
+            domhashes.add(domhash)
+            pipeline.hdel(f'capture_hash_types|{capture_uuid}', 'certpl_html_structure_hash')
+        pipeline.delete('capture_hash_types|certpl_html_structure_hash')
+        pipeline.delete('indexed_hash_type|certpl_html_structure_hash')
+        pipeline.execute()

     def captures_hashes_types(self) -> set[str]:
-        return {'certpl_html_structure_hash'}
+        return {'domhash'}
         # return self.redis.smembers('capture_hash_types')

-    def captures_hashes(self, hash_type: str) -> list[tuple[str, float]]:
-        return self.redis.zrevrange(f'capture_hash_types|{hash_type}', 0, 200, withscores=True)
-
-    def hash_frequency(self, hash_type: str, h: str) -> float | None:
-        return self.redis.zscore(f'capture_hash_types|{hash_type}', h)
-
-    def hash_number_captures(self, hash_type: str, h: str) -> int:
-        return self.redis.scard(f'capture_hash_types|{hash_type}|{h}|captures')
+    def captures_hashes(self, hash_type: str) -> set[str]:
+        return self.redis.smembers(f'capture_hash_types|{hash_type}')

     def index_capture_hashes_types(self, crawled_tree: CrawledTree) -> None:
         capture_uuid = crawled_tree.uuid
         # NOTE: We will have multiple hash types for each captures, we want to make sure
         #       to reindex all the captures if there is a new hash type but only index the new
         #       captures on the existing hash types
-        # hashes = ('certpl_html_structure_hash', )
         for hash_type in self.captures_hashes_types():
+            if hash_type == 'certpl_html_structure_hash':
+                self._rename_certpl_hash_domhash()
+                continue
             if self.redis.sismember(f'indexed_hash_type|{hash_type}', capture_uuid):
                 # Do not reindex
                 return
             self.redis.sadd(f'indexed_hash_type|{hash_type}', capture_uuid)

-            if hash_type == 'certpl_html_structure_hash':
-                # we must have a rendered HTML for this hash to be relevant.
-                if (not hasattr(crawled_tree.root_hartree.rendered_node, 'rendered_html')
-                        or not crawled_tree.root_hartree.rendered_node.rendered_html):
+            if hash_type == 'domhash':
+                # the hash is computed in har2tree, we just check if it exists.
+                if not hasattr(crawled_tree.root_hartree.rendered_node, 'domhash'):
                     continue
-                # we have a rendered HTML, compute the hash
-                hash_to_index = self._compute_certpl_html_structure_hash(crawled_tree.root_hartree.rendered_node.rendered_html)
+                hash_to_index = crawled_tree.root_hartree.rendered_node.domhash
             else:
                 self.logger.warning(f'Unknown hash type: {hash_type}')
                 continue
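The deleted helper documents what the hash actually is, which is useful context now that har2tree computes it and exposes it as rendered_node.domhash. A standalone sketch of the same computation, assuming har2tree keeps the algorithm of the removed _compute_certpl_html_structure_hash:

from hashlib import sha256

from bs4 import BeautifulSoup  # pip install beautifulsoup4 lxml

def compute_domhash(html: str) -> str:
    # Join every tag name in document order with '|' ...
    soup = BeautifulSoup(html, 'lxml')
    to_hash = '|'.join(t.name for t in soup.find_all()).encode()
    # ... and keep the first 32 hex characters of the SHA-256 digest.
    return sha256(to_hash).hexdigest()[:32]

# Two pages with the same DOM structure share a domhash, whatever the text says:
print(compute_domhash('<html><body><p>hello</p></body></html>'))
print(compute_domhash('<html><body><p>goodbye</p></body></html>'))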
@@ -712,21 +722,42 @@ class Indexing():
                 self.logger.info(f'No hash to index for {hash_type} in {capture_uuid} ... ')
                 continue
-            if self.redis.sismember(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid):
+            if self.redis.zscore(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid) is not None:
                 # Already counted this specific identifier for this capture
                 continue
             self.logger.debug(f'Indexing hash {hash_type} for {capture_uuid} ... ')
             pipeline = self.redis.pipeline()
             pipeline.hset(f'capture_hash_types|{capture_uuid}', hash_type, hash_to_index)
-            pipeline.sadd(f'capture_hash_types|{hash_type}|{hash_to_index}|captures', capture_uuid)
-            pipeline.zincrby(f'capture_hash_types|{hash_type}', 1, hash_to_index)
+            pipeline.sadd(f'capture_hash_types|{hash_type}', hash_to_index)
+            pipeline.zadd(f'capture_hash_types|{hash_type}|{hash_to_index}|captures',
+                          mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
             pipeline.execute()

     def get_hashes_types_capture(self, capture_uuid: str) -> dict[str, str]:
-        return self.redis.hgetall(f'capture_hash_types|{capture_uuid}')
+        to_return = self.redis.hgetall(f'capture_hash_types|{capture_uuid}')
+        if to_return.pop('certpl_html_structure_hash', None):
+            # This one should be removed
+            self._rename_certpl_hash_domhash()
+        return to_return

-    def get_captures_hash_type(self, hash_type: str, h: str) -> set[str]:
-        return self.redis.smembers(f'capture_hash_types|{hash_type}|{h}|captures')
+    def get_captures_hash_type(self, hash_type: str, h: str, most_recent_capture: datetime | None = None,
+                               oldest_capture: datetime | None = None) -> list[tuple[str, float]]:
+        """Get all the captures for a hash of a specific type, on a time interval starting from the most recent one.
+
+        :param hash_type: The type of hash
+        :param h: The hash
+        :param most_recent_capture: The capture time of the most recent capture to consider
+        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 5 days ago.
+        """
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=5)).timestamp()
+        return self.redis.zrevrangebyscore(f'capture_hash_types|{hash_type}|{h}|captures', max_score, min_score, withscores=True)
+
+    def get_captures_hash_type_count(self, hash_type: str, h: str) -> int:
+        if hash_type == 'certpl_html_structure_hash':
+            # that one should be removed
+            return 0
+        return self.redis.zcard(f'capture_hash_types|{hash_type}|{h}|captures')

     # ###### identifiers ######
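After this hunk, the per-hash-type schema is: capture_hash_types|{capture_uuid} (Redis hash: hash type to value), capture_hash_types|{hash_type} (set of known hash values), and capture_hash_types|{hash_type}|{hash}|captures (sorted set of capture UUIDs scored by capture start time). Scoring by timestamp is what turns the time-window lookup in get_captures_hash_type into a single ZREVRANGEBYSCORE. A minimal round-trip sketch with plain redis-py and made-up UUIDs, assuming a local Redis:

from datetime import datetime, timedelta

from redis import Redis

r = Redis(decode_responses=True)
now = datetime.now()

# Index two captures that share a domhash, scored by their start time.
for uuid, start in (('uuid-recent', now - timedelta(days=1)),
                    ('uuid-old', now - timedelta(days=10))):
    r.hset(f'capture_hash_types|{uuid}', 'domhash', 'abc123')
    r.sadd('capture_hash_types|domhash', 'abc123')
    r.zadd('capture_hash_types|domhash|abc123|captures',
           mapping={uuid: start.timestamp()})

# The default 5-day window only returns the recent capture.
min_score = (now - timedelta(days=5)).timestamp()
print(r.zrevrangebyscore('capture_hash_types|domhash|abc123|captures',
                         '+Inf', min_score, withscores=True))
# [('uuid-recent', <timestamp>)]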

website/web/__init__.py

@@ -436,7 +436,9 @@ def get_identifier_investigator(identifier_type: str, identifier: str) -> list[t

 def get_capture_hash_investigator(hash_type: str, h: str) -> list[tuple[str, str, str, datetime]]:
-    cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_hash_type(hash_type=hash_type, h=h)])
+    cached_captures = lookyloo.sorted_capture_cache(
+        [uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_hash_type(hash_type=hash_type, h=h)],
+        cached_captures_only=True)
     return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]

@@ -1321,7 +1323,7 @@ def tree_capture_hashes_types(tree_uuid: str) -> str:
     to_return: list[tuple[int, str, str]] = []
     for hash_type, h in get_indexing(flask_login.current_user).get_hashes_types_capture(tree_uuid).items():
-        nb_captures = get_indexing(flask_login.current_user).hash_number_captures(hash_type, h)
+        nb_captures = get_indexing(flask_login.current_user).get_captures_hash_type_count(hash_type, h)
         to_return.append((nb_captures, hash_type, h))
     return render_template('tree_hashes_types.html', tree_uuid=tree_uuid, hashes=to_return)
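The caller-side consequence of the new return type: get_captures_hash_type used to return a plain set of UUIDs and now returns (uuid, timestamp) tuples from the sorted set, which is why the comprehension above unpacks and drops the score. In isolation, with made-up values:

# Before: a set of capture UUIDs.
old_style = {'uuid-1', 'uuid-2'}
print([uuid for uuid in old_style])

# After: (uuid, score) tuples from ZREVRANGEBYSCORE, most recent first;
# callers keep the UUID and discard the timestamp score.
new_style = [('uuid-1', 1730115000.0), ('uuid-2', 1730028600.0)]
print([uuid for uuid, _ in new_style])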

website/web/templates/tree_hashes_types.html

@@ -18,6 +18,7 @@
   <center>
     <h5>{{hash_type}}: {{h}}</h5>
+    <h6>Only the most recent captures are listed below, this will change soon.</h6>
   </center>
   <table id="hashTypeDetailsTable" class="table table-striped" style="width:100%">
     <thead>