mirror of https://github.com/CIRCL/lookyloo
chg: Migrate HHHashes indexes to new format
parent 80ca393f1a
commit 9f9a24eb18
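In short: the HHHash (HTTP Headers Hash) index in the Indexing class moves to a new Redis layout. The global 'hhhashes' key becomes a plain set (it was a sorted set scored by frequency), 'hhhashes|{hhh}|captures' becomes a sorted set of capture UUIDs scored by the capture timestamp (it was a set of 'capture_uuid|urlnode_uuid' strings), and the URL nodes matching a hash within a capture are now kept in 'capture_indexes|{capture_uuid}|hhhashes|{hhh}'. The web route and template drop the frequency column and keep only the number of captures. The snippet below is not part of the commit; it is a minimal sketch of the new key layout using a plain redis-py client, where the connection settings and the example hash are assumptions:

    import redis

    r = redis.Redis(decode_responses=True)  # assumed connection; adjust to your setup

    hhh = '1be3aaf146b8a3172996a0b750895213'  # placeholder HHHash, for illustration only

    # All known HHHashes (new format: a plain set, no frequency score)
    all_hhhashes = r.smembers('hhhashes')

    # Captures that served this HHHash, newest first (new format: a zset scored by capture timestamp)
    for capture_uuid, start_time in r.zrevrangebyscore(f'hhhashes|{hhh}|captures', '+Inf', '-Inf', withscores=True):
        # URL nodes of that capture matching the HHHash (new per-capture internal index)
        urlnode_uuids = r.smembers(f'capture_indexes|{capture_uuid}|hhhashes|{hhh}')
        print(capture_uuid, start_time, urlnode_uuids)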
@@ -144,7 +144,7 @@ class Indexing():
                 self.index_cookies_capture(ct)
             if not indexed[3]:
                 self.logger.info(f'Indexing HH Hashes for {uuid_to_index}')
-                self.index_http_headers_hashes_capture(ct)
+                self.index_hhhashes_capture(ct)
             if not indexed[4]:
                 self.logger.info(f'Indexing favicons for {uuid_to_index}')
                 self.index_favicons_capture(uuid_to_index, directory)
@@ -324,48 +324,74 @@ class Indexing():
 
     # ###### HTTP Headers Hashes ######
 
+    def _reindex_hhhashes(self, hhh: str) -> None:
+        # We changed the format of the indexes, so we need to make sure they're re-triggered.
+        pipeline = self.redis.pipeline()
+        if self.redis.type(f'hhhashes|{hhh}|captures') == 'set':  # type: ignore[no-untyped-call]
+            pipeline.srem('indexed_hhhashes', *self.redis.smembers(f'hhhashes|{hhh}|captures'))
+            pipeline.delete(f'hhhashes|{hhh}|captures')
+        if self.redis.type('hhhashes') == 'zset':  # type: ignore[no-untyped-call]
+            pipeline.delete('hhhashes')
+        pipeline.execute()
+
     @property
-    def http_headers_hashes(self) -> list[tuple[str, float]]:
-        return self.redis.zrevrange('hhhashes', 0, -1, withscores=True)
+    def http_headers_hashes(self) -> set[str]:
+        return self.redis.smembers('hhhashes')
 
-    def http_headers_hashes_number_captures(self, hhh: str) -> int:
-        return self.redis.scard(f'hhhashes|{hhh}|captures')
-
-    def get_http_headers_hashes_captures(self, hhh: str) -> list[tuple[str, str]]:
-        return [uuids.split('|') for uuids in self.redis.smembers(f'hhhashes|{hhh}|captures')]  # type: ignore[misc]
-
-    def index_http_headers_hashes_capture(self, crawled_tree: CrawledTree) -> None:
+    def index_hhhashes_capture(self, crawled_tree: CrawledTree) -> None:
         if self.redis.sismember('indexed_hhhashes', crawled_tree.uuid):
             # Do not reindex
             return
         self.redis.sadd('indexed_hhhashes', crawled_tree.uuid)
-        self.logger.debug(f'Indexing http headers hashes for {crawled_tree.uuid} ... ')
+        self.logger.debug(f'Indexing HHHashes for {crawled_tree.uuid} ... ')
 
         pipeline = self.redis.pipeline()
-        already_loaded: set[str] = set()
-        already_cleaned_up: set[str] = set()
-        is_reindex = False
+        # Add the tlds key in internal indexes set
+        internal_index = f'capture_indexes|{crawled_tree.uuid}'
+        pipeline.sadd(internal_index, 'hhhashes')
+
+        already_indexed_global: set[str] = set()
         for urlnode in crawled_tree.root_hartree.url_tree.traverse():
             if 'hhhash' not in urlnode.features:
                 continue
-            if urlnode.hhhash in already_loaded:
-                # Only add HTTP header Hash once / capture
-                continue
-            already_loaded.add(urlnode.hhhash)
-            if urlnode.hhhash not in already_cleaned_up:
-                # We only run this srem once per name for a capture,
-                # before adding it for the first time
-                to_remove = [key for key in self.redis.sscan_iter(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|*')]
-                if to_remove:
-                    pipeline.srem(f'hhhashes|{urlnode.hhhash}|captures', * to_remove)
-                    is_reindex = True
-                    self.logger.debug(f'reindexing http headers hashes for {crawled_tree.uuid} ... ')
-                already_cleaned_up.add(urlnode.hhhash)
-            pipeline.sadd(f'hhhashes|{urlnode.hhhash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
-            if not is_reindex:
-                pipeline.zincrby('hhhashes', 1, urlnode.hhhash)
+            self._reindex_hhhashes(urlnode.hhhash)
+            if urlnode.hhhash not in already_indexed_global:
+                # HHH hasn't been indexed in that run yet
+                already_indexed_global.add(urlnode.hhhash)
+                pipeline.sadd(f'{internal_index}|hhhashes', urlnode.hhhash)  # Only used to delete index
+                pipeline.sadd('hhhashes', urlnode.hhhash)
+                pipeline.zadd(f'hhhashes|{urlnode.hhhash}|captures',
+                              mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
+
+            # Add hostnode UUID in internal index
+            pipeline.sadd(f'{internal_index}|hhhashes|{urlnode.hhhash}', urlnode.uuid)
+
         pipeline.execute()
-        self.logger.debug(f'done with http headers hashes for {crawled_tree.uuid}.')
+        self.logger.debug(f'done with HHHashes for {crawled_tree.uuid}.')
 
+    def get_captures_hhhash(self, hhh: str, most_recent_capture: datetime | None = None,
+                            oldest_capture: datetime | None= None) -> list[tuple[str, float]]:
+        """Get all the captures for a specific HTTP Header Hash, on a time interval starting from the most recent one.
+
+        :param hhh: The HTTP Header Hash
+        :param most_recent_capture: The capture time of the most recent capture to consider
+        :param oldest_capture: The capture time of the oldest capture to consider, defaults to 15 days ago.
+        """
+        max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
+        min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=15)).timestamp()
+        if self.redis.type(f'hhhashes|{hhh}|captures') == 'set':  # type: ignore[no-untyped-call]
+            # triggers the re-index soon.
+            self.redis.srem('indexed_urls', *self.redis.smembers(f'hhhashes|{hhh}|captures'))
+            return []
+        return self.redis.zrevrangebyscore(f'hhhashes|{hhh}|captures', max_score, min_score, withscores=True)
+
+    def get_captures_hhhash_count(self, hhh: str) -> int:
+        return self.redis.zcard(f'hhhashes|{hhh}|captures')
+
+    def get_capture_hhhash_nodes(self, capture_uuid: str, hhh: str) -> set[str]:
+        if url_nodes := self.redis.smembers(f'capture_indexes|{capture_uuid}|hhhashes|{hhh}'):
+            return set(url_nodes)
+        return set()
+
     # ###### URLs and Domains ######
 
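A minimal usage sketch of the accessors added above; the 'indexing' instance, the hash value, and the time window are assumptions, not part of the diff. Callers now get (capture UUID, timestamp) pairs back from the sorted set and resolve the matching URL nodes per capture:

    from datetime import datetime, timedelta

    # 'indexing' is assumed to be an instance of the Indexing class shown above.
    hhh = 'deadbeefdeadbeefdeadbeefdeadbeef'  # placeholder HHHash

    # (capture_uuid, timestamp) tuples for the last 30 days, most recent first.
    captures = indexing.get_captures_hhhash(hhh, oldest_capture=datetime.now() - timedelta(days=30))

    for capture_uuid, timestamp in captures:
        # URL node UUIDs within that capture that responded with this HHHash.
        urlnode_uuids = indexing.get_capture_hhhash_nodes(capture_uuid, hhh)

The next hunks update the web interface helper and route to consume these new accessors.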
@@ -482,23 +482,20 @@ def get_favicon_investigator(favicon_sha512: str,
 
 def get_hhh_investigator(hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
     '''Returns all the captures related to a cookie name entry, used in the web interface.'''
-    all_captures = dict(get_indexing(flask_login.current_user).get_http_headers_hashes_captures(hhh))
-    if cached_captures := lookyloo.sorted_capture_cache([entry for entry in all_captures]):
-        captures = []
-        for cache in cached_captures:
-            try:
-                urlnode = lookyloo.get_urlnode_from_tree(cache.uuid, all_captures[cache.uuid])
-            except Exception:
-                # NOTE: print a logline
-                # logger.warning(f'Cache for {cache.uuid} needs a rebuild: {e}.')
-                lookyloo._captures_index.remove_pickle(cache.uuid)
-                continue
-            captures.append((cache.uuid, urlnode.hostnode_uuid, urlnode.name, cache.title))
-        # get the headers and format them as they were in the response
-        urlnode = lookyloo.get_urlnode_from_tree(cached_captures[0].uuid, all_captures[cached_captures[0].uuid])
-        headers = [(header["name"], header["value"]) for header in urlnode.response['headers']]
-        return captures, headers
-    return [], []
+    _captures = get_indexing(flask_login.current_user).get_captures_hhhash(hhh)
+    captures = []
+    headers: list[tuple[str, str]] = []
+    for capture_uuid, capture_ts in _captures:
+        cache = lookyloo.capture_cache(capture_uuid)
+        if not cache:
+            continue
+        for urlnode_uuid in get_indexing(flask_login.current_user).get_capture_hhhash_nodes(capture_uuid, hhh):
+            urlnode = lookyloo.get_urlnode_from_tree(capture_uuid, urlnode_uuid)
+            captures.append((cache.uuid, urlnode.hostnode_uuid, urlnode.name, cache.title))
+            if not headers:
+                # Just do that once.
+                headers = [(header["name"], header["value"]) for header in urlnode.response['headers']]
+    return captures, headers
 
 
 def hash_lookup(blob_hash: str, url: str, current_capture_uuid: str) -> tuple[int, dict[str, list[tuple[str, str, str, str, str]]]]:
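The rewritten helper keeps the same return shapes as before; a hypothetical call, with a placeholder hash value, looks like:

    # Hypothetical call from within the web app (assumes the Flask app / login context);
    # 'a1b2c3d4e5f6' is a placeholder HHHash.
    captures, headers = get_hhh_investigator('a1b2c3d4e5f6')
    # captures -> [(capture_uuid, hostnode_uuid, urlnode_name, capture_title), ...]
    # headers  -> [(header_name, header_value), ...] taken from the first matching response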
@@ -1430,8 +1427,9 @@ def cookies_lookup() -> str:
 
 @app.route('/hhhashes', methods=['GET'])
 def hhhashes_lookup() -> str:
-    hhhashes = [(hhh, freq, get_indexing(flask_login.current_user).http_headers_hashes_number_captures(hhh))
-                for hhh, freq in get_indexing(flask_login.current_user).http_headers_hashes]
+    hhhashes = []
+    for hhh in get_indexing(flask_login.current_user).http_headers_hashes:
+        hhhashes.append((hhh, get_indexing(flask_login.current_user).get_captures_hhhash_count(hhh)))
     return render_template('hhhashes.html', hhhashes=hhhashes)
 
 
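After this change the template receives (HHHash, number of captures) pairs, the count now coming from get_captures_hhhash_count (a ZCARD on the per-hash sorted set) rather than the old frequency score. Illustrative, made-up values only:

    hhhashes = [
        ('1be3aaf146b8a3172996a0b750895213', 12),  # (HHHash, number of captures), made-up values
        ('566806d88c46d9a8cdf35d2e72356be5', 3),
    ]
    # render_template('hhhashes.html', hhhashes=hhhashes)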
@@ -21,17 +21,15 @@
     <thead>
       <tr>
         <th>HH Hash</th>
-        <th style="width:10%">Frequency</th>
         <th style="width:10%">Number of captures</th>
       </tr>
     </thead>
     <tbody>
-    {% for hhh, freq, number_captures in hhhashes %}
+    {% for hhh, number_captures in hhhashes %}
       <tr>
         <td >
           <a href="{{ url_for('hhh_detail', hhh=hhh) }}">{{ hhh }}</a>
         </td>
-        <td>{{ freq }}</td>
         <td>{{ number_captures }}</td>
       </tr>
     {% endfor %}