chg: remove refs to probabilistic favicons index

It was neither probabilistic, nor used.
pull/981/head
Raphaël Vinot 2024-10-31 15:18:39 +01:00
parent 7eece6b98f
commit ed16939790
4 changed files with 8 additions and 105 deletions

View File

@ -2,15 +2,10 @@
from __future__ import annotations
import base64
import hashlib
import logging
from io import BytesIO
from datetime import datetime, timedelta
from zipfile import ZipFile
import mmh3
from pathlib import Path
@ -845,56 +840,6 @@ class Indexing():
def get_captures_identifier_count(self, identifier_type: str, identifier: str) -> int:
    '''Return the cardinality of the sorted set of captures stored for this identifier.'''
    captures_key = f'identifiers|{identifier_type}|{identifier}|captures'
    return self.redis.zcard(captures_key)
# ###### favicons probabilistic hashes ######
def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None:
return self.redis.zscore(f'favicons|{algorithm}', phash)
def index_favicons_probabilistic(self, capture_uuid: str, favicons: BytesIO, algorithm: str) -> None:
    '''Index every favicon of a capture under its probabilistic hash.

    :param capture_uuid: UUID of the capture the favicons belong to.
    :param favicons: Zip archive (in memory) holding the favicons; only non-empty
                     entries whose name ends with `.ico` are indexed.
    :param algorithm: Name of the hashing algorithm, only `mmh3-shodan` is supported.
    :raises NotImplementedError: If the algorithm is not supported.
    '''
    # FIXME: this method isn't used anymore
    if algorithm != 'mmh3-shodan':
        # Fail before touching redis: marking the capture as indexed and raising
        # afterwards would make every later call skip it silently.
        raise NotImplementedError(f'Unknown algorithm: {algorithm}')

    indexed_key = f'indexed_favicons_probabilistic|{algorithm}'
    if self.redis.sismember(indexed_key, capture_uuid):
        # Do not reindex
        return
    self.redis.sadd(indexed_key, capture_uuid)

    pipeline = self.redis.pipeline()
    with ZipFile(favicons, 'r') as myzip:
        for name in myzip.namelist():
            if not name.endswith('.ico'):
                continue
            favicon = myzip.read(name)
            if not favicon:
                # Empty file, ignore.
                continue
            sha = hashlib.sha512(favicon).hexdigest()
            # Shodan uses a weird technique:
            # 1. encodes the image to base64, with newlines every 76 characters (as per RFC 2045)
            # 2. hashes the base64 string with mmh3
            b64 = base64.encodebytes(favicon)
            h = str(mmh3.hash(b64))
            pipeline.zincrby(f'favicons|{algorithm}', 1, h)
            # All captures with this hash for this algorithm
            pipeline.sadd(f'favicons|{algorithm}|{h}|captures', capture_uuid)
            # All favicons (sha512) with this hash for this algorithm
            pipeline.sadd(f'favicons|{algorithm}|{h}|favicons', sha)
            # reverse lookup to get probabilistic hashes related to a specific favicon
            pipeline.sadd(f'favicons|{algorithm}|{sha}', h)
    pipeline.execute()
def get_hashes_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
    '''Return the members of the set of favicon sha512 stored under this probabilistic hash for this algorithm.'''
    favicons_key = f'favicons|{algorithm}|{phash}|favicons'
    return self.redis.smembers(favicons_key)
def get_probabilistic_hashes_favicon(self, algorithm: str, favicon_sha512: str) -> set[str]:
    '''Return the members of the set of probabilistic hashes stored under this favicon SHA512 for this algorithm.'''
    reverse_key = f'favicons|{algorithm}|{favicon_sha512}'
    return self.redis.smembers(reverse_key)
def get_captures_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
    '''Return the members of the set of captures stored under this probabilistic hash for this algorithm.'''
    captures_key = f'favicons|{algorithm}|{phash}|captures'
    return self.redis.smembers(captures_key)
# ###### Categories ######
@property

View File

@ -443,10 +443,8 @@ def get_capture_hash_investigator(hash_type: str, h: str) -> list[tuple[str, str
def get_favicon_investigator(favicon_sha512: str,
/,
get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
tuple[str, str, str],
dict[str, dict[str, dict[str, tuple[str, str]]]]]:
/) -> tuple[list[tuple[str, str, str, datetime]],
tuple[str, str, str]]:
'''Returns all the captures related to a favicon (identified by its SHA512), used in the web interface.'''
cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_favicon(favicon_sha512)])
captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
@ -460,30 +458,7 @@ def get_favicon_investigator(favicon_sha512: str,
b64_favicon = ''
mmh3_shodan = ''
# For now, there is only one probabilistic hash algo for favicons, keeping it simple
probabilistic_hash_algos = ['mmh3-shodan']
probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {}
if get_probabilistic:
for algo in probabilistic_hash_algos:
probabilistic_favicons[algo] = {}
for mm3hash in get_indexing(flask_login.current_user).get_probabilistic_hashes_favicon(algo, favicon_sha512):
probabilistic_favicons[algo][mm3hash] = {}
for sha512 in get_indexing(flask_login.current_user).get_hashes_favicon_probablistic(algo, mm3hash):
if sha512 == favicon_sha512:
# Skip entry if it is the same as the favicon we are investigating
continue
favicon = get_indexing(flask_login.current_user).get_favicon(sha512)
if favicon:
mimetype = from_string(favicon, mime=True)
b64_favicon = base64.b64encode(favicon).decode()
probabilistic_favicons[algo][mm3hash][sha512] = (mimetype, b64_favicon)
if not probabilistic_favicons[algo][mm3hash]:
# remove entry if it has no favicon
probabilistic_favicons[algo].pop(mm3hash)
if not probabilistic_favicons[algo]:
# remove entry if it has no hash
probabilistic_favicons.pop(algo)
return captures, (mimetype, b64_favicon, mmh3_shodan), probabilistic_favicons
return captures, (mimetype, b64_favicon, mmh3_shodan)
def get_hhh_investigator(hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
@ -1787,14 +1762,12 @@ def capture_hash_details(hash_type: str, h: str) -> str:
@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
@app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:
_get_prob = bool(get_probabilistic)
captures, favicon, probabilistic_favicons = get_favicon_investigator(favicon_sha512.strip(), get_probabilistic=_get_prob)
def favicon_detail(favicon_sha512: str) -> str:
captures, favicon = get_favicon_investigator(favicon_sha512.strip())
mimetype, b64_favicon, mmh3_shodan = favicon
return render_template('favicon_details.html', favicon_sha512=favicon_sha512,
captures=captures, mimetype=mimetype, b64_favicon=b64_favicon, mmh3_shodan=mmh3_shodan,
probabilistic_favicons=probabilistic_favicons)
captures=captures, mimetype=mimetype, b64_favicon=b64_favicon,
mmh3_shodan=mmh3_shodan)
@app.route('/body_hashes/<string:body_hash>', methods=['GET'])

View File

@ -48,18 +48,3 @@
{% endfor %}
</tbody>
</table>
{# Other favicons sharing a probabilistic hash with the displayed one, grouped by algorithm, then by hash. #}
{% for probabilistic_hash_algo, entries in probabilistic_favicons.items() %}
<h3>Probabilistic Favicon Hashes ({{ probabilistic_hash_algo }})</h3>
  {% for mm3h, favicons in entries.items() %}
  <h4>MM3 Hash: {{ mm3h }}</h4>
    {# Each value is the (mimetype, base64 content) pair of the related favicon: render that one,
       not the top-level mimetype / b64_favicon of the favicon being investigated. #}
    {% for sha512, favicon in favicons.items() %}
    <a href="#faviconDetailsProbabilisticHashModal" data-remote="{{ url_for('favicon_detail', favicon_sha512=sha512, get_probabilistic=0) }}"
       data-bs-toggle="modal" data-bs-target="#faviconDetailsProbabilisticHashModal" role="button">
      <img src="data:{{ favicon[0] }};base64,{{ favicon[1] }}" style="width:32px;height:32px;"
           title="Click to see other captures with the same favicon"/>
    </a>
    <br>
    {% endfor %}
  {% endfor %}
{% endfor %}

View File

@ -32,7 +32,7 @@ if (downloadFavicons) {
<tr>
<td>{{ number_captures }}</td>
<td>
<a href="#faviconDetailsModal" data-remote="{{ url_for('favicon_detail', favicon_sha512=favicon_sha512, get_probabilistic=0) }}"
<a href="#faviconDetailsModal" data-remote="{{ url_for('favicon_detail', favicon_sha512=favicon_sha512) }}"
data-bs-toggle="modal" data-bs-target="#faviconDetailsModal" role="button">
<img src="data:{{mimetype}};base64,{{ b64_favicon }}" style="width:32px;height:32px;"
title="Click to see other captures with the same favicon"/>