mirror of https://github.com/CIRCL/lookyloo
chg: Migrate favicons to new paginated index
parent
ed16939790
commit
ab719bb191
|
@ -147,7 +147,7 @@ class Indexing():
|
||||||
self.index_hhhashes_capture(ct)
|
self.index_hhhashes_capture(ct)
|
||||||
if not indexed[4]:
|
if not indexed[4]:
|
||||||
self.logger.info(f'Indexing favicons for {uuid_to_index}')
|
self.logger.info(f'Indexing favicons for {uuid_to_index}')
|
||||||
self.index_favicons_capture(uuid_to_index, directory)
|
self.index_favicons_capture(ct, directory)
|
||||||
if not indexed[5]:
|
if not indexed[5]:
|
||||||
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
|
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
|
||||||
self.index_identifiers_capture(ct)
|
self.index_identifiers_capture(ct)
|
||||||
|
@ -618,22 +618,27 @@ class Indexing():
|
||||||
|
|
||||||
# ###### favicons ######
|
# ###### favicons ######
|
||||||
|
|
||||||
|
def _reindex_favicons(self, favicon_sha512: str) -> None:
|
||||||
|
# We changed the format of the indexes, so we need to make sure they're re-triggered.
|
||||||
|
pipeline = self.redis.pipeline()
|
||||||
|
if self.redis.type(f'favicons|{favicon_sha512}|captures') == 'set': # type: ignore[no-untyped-call]
|
||||||
|
pipeline.srem('indexed_favicons', *self.redis.smembers(f'favicons|{favicon_sha512}|captures'))
|
||||||
|
pipeline.delete(f'favicons|{favicon_sha512}|captures')
|
||||||
|
if self.redis.type('favicons') == 'zset': # type: ignore[no-untyped-call]
|
||||||
|
pipeline.delete('favicons')
|
||||||
|
pipeline.execute()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def favicons(self) -> list[tuple[str, float]]:
|
def favicons(self) -> set[str]:
|
||||||
return self.redis.zrevrange('favicons', 0, 200, withscores=True)
|
return self.redis.smembers('favicons')
|
||||||
|
|
||||||
def favicon_frequency(self, favicon_sha512: str) -> float | None:
|
def index_favicons_capture(self, crawled_tree: CrawledTree, capture_dir: Path) -> None:
|
||||||
return self.redis.zscore('favicons', favicon_sha512)
|
if self.redis.sismember('indexed_favicons', crawled_tree.uuid):
|
||||||
|
|
||||||
def favicon_number_captures(self, favicon_sha512: str) -> int:
|
|
||||||
return self.redis.scard(f'favicons|{favicon_sha512}|captures')
|
|
||||||
|
|
||||||
def index_favicons_capture(self, capture_uuid: str, capture_dir: Path) -> None:
|
|
||||||
if self.redis.sismember('indexed_favicons', capture_uuid):
|
|
||||||
# Do not reindex
|
# Do not reindex
|
||||||
return
|
return
|
||||||
self.redis.sadd('indexed_favicons', capture_uuid)
|
self.redis.sadd('indexed_favicons', crawled_tree.uuid)
|
||||||
self.logger.debug(f'Indexing favicons for {capture_uuid} ... ')
|
self.logger.debug(f'Indexing favicons for {crawled_tree.uuid} ... ')
|
||||||
|
internal_index = f'capture_indexes|{crawled_tree.uuid}'
|
||||||
pipeline = self.redis.pipeline()
|
pipeline = self.redis.pipeline()
|
||||||
for favicon_path in sorted(list(capture_dir.glob('*.potential_favicons.ico'))):
|
for favicon_path in sorted(list(capture_dir.glob('*.potential_favicons.ico'))):
|
||||||
with favicon_path.open('rb') as f:
|
with favicon_path.open('rb') as f:
|
||||||
|
@ -642,16 +647,35 @@ class Indexing():
|
||||||
# Empty file, ignore.
|
# Empty file, ignore.
|
||||||
continue
|
continue
|
||||||
sha = hashlib.sha512(favicon).hexdigest()
|
sha = hashlib.sha512(favicon).hexdigest()
|
||||||
if not self.redis.sismember('favicons|{sha}|captures', capture_uuid):
|
self._reindex_favicons(sha)
|
||||||
# Do not count the same favicon more than once for the same capture
|
pipeline.sadd(f'{internal_index}|favicons', sha) # Only used to delete index
|
||||||
pipeline.zincrby('favicons', 1, sha)
|
pipeline.zadd(f'favicons|{sha}|captures',
|
||||||
pipeline.sadd(f'favicons|{sha}|captures', capture_uuid)
|
mapping={crawled_tree.uuid: crawled_tree.start_time.timestamp()})
|
||||||
# There is no easi access to the favicons unless we store them in redis
|
if not self.redis.sismember('favicon', sha):
|
||||||
|
pipeline.sadd('favicons', sha)
|
||||||
|
# There is no easy access to the favicons unless we store them in redis
|
||||||
pipeline.set(f'favicons|{sha}', favicon)
|
pipeline.set(f'favicons|{sha}', favicon)
|
||||||
pipeline.execute()
|
pipeline.execute()
|
||||||
|
|
||||||
def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
|
def get_captures_favicon(self, favicon_sha512: str, most_recent_capture: datetime | None=None,
|
||||||
return self.redis.smembers(f'favicons|{favicon_sha512}|captures')
|
oldest_capture: datetime | None = None) -> list[tuple[str, float]]:
|
||||||
|
"""Get all the captures for a specific favicon, on a time interval starting from the most recent one.
|
||||||
|
|
||||||
|
:param favicon_sha512: The favicon hash
|
||||||
|
:param most_recent_capture: The capture time of the most recent capture to consider
|
||||||
|
:param oldest_capture: The capture time of the oldest capture to consider, defaults to 15 days ago.
|
||||||
|
"""
|
||||||
|
max_score: str | float = most_recent_capture.timestamp() if most_recent_capture else '+Inf'
|
||||||
|
min_score: str | float = oldest_capture.timestamp() if oldest_capture else (datetime.now() - timedelta(days=15)).timestamp()
|
||||||
|
return self.redis.zrevrangebyscore(f'favicons|{favicon_sha512}|captures', max_score, min_score, withscores=True)
|
||||||
|
|
||||||
|
def get_captures_favicon_count(self, favicon_sha512: str) -> int:
|
||||||
|
if self.redis.type(f'favicons|{favicon_sha512}|captures') == 'set': # type: ignore[no-untyped-call]
|
||||||
|
# triggers the re-index soon.
|
||||||
|
self.redis.srem('indexed_favicons', *self.redis.smembers(f'favicons|{favicon_sha512}|captures'))
|
||||||
|
self.redis.delete(f'favicons|{favicon_sha512}|captures')
|
||||||
|
return 0
|
||||||
|
return self.redis.zcard(f'favicons|{favicon_sha512}|captures')
|
||||||
|
|
||||||
def get_favicon(self, favicon_sha512: str) -> bytes | None:
|
def get_favicon(self, favicon_sha512: str) -> bytes | None:
|
||||||
return self.redis_bytes.get(f'favicons|{favicon_sha512}')
|
return self.redis_bytes.get(f'favicons|{favicon_sha512}')
|
||||||
|
|
|
@ -446,7 +446,7 @@ def get_favicon_investigator(favicon_sha512: str,
|
||||||
/) -> tuple[list[tuple[str, str, str, datetime]],
|
/) -> tuple[list[tuple[str, str, str, datetime]],
|
||||||
tuple[str, str, str]]:
|
tuple[str, str, str]]:
|
||||||
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
||||||
cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_favicon(favicon_sha512)])
|
cached_captures = lookyloo.sorted_capture_cache([uuid for uuid, _ in get_indexing(flask_login.current_user).get_captures_favicon(favicon_sha512)])
|
||||||
captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
|
captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
|
||||||
favicon = get_indexing(flask_login.current_user).get_favicon(favicon_sha512)
|
favicon = get_indexing(flask_login.current_user).get_favicon(favicon_sha512)
|
||||||
if favicon:
|
if favicon:
|
||||||
|
@ -1285,11 +1285,10 @@ def tree_favicons(tree_uuid: str) -> str:
|
||||||
continue
|
continue
|
||||||
mimetype = from_string(favicon, mime=True)
|
mimetype = from_string(favicon, mime=True)
|
||||||
favicon_sha512 = hashlib.sha512(favicon).hexdigest()
|
favicon_sha512 = hashlib.sha512(favicon).hexdigest()
|
||||||
frequency = get_indexing(flask_login.current_user).favicon_frequency(favicon_sha512)
|
number_captures = get_indexing(flask_login.current_user).get_captures_favicon_count(favicon_sha512)
|
||||||
number_captures = get_indexing(flask_login.current_user).favicon_number_captures(favicon_sha512)
|
|
||||||
b64_favicon = base64.b64encode(favicon).decode()
|
b64_favicon = base64.b64encode(favicon).decode()
|
||||||
mmh3_shodan = lookyloo.compute_mmh3_shodan(favicon)
|
mmh3_shodan = lookyloo.compute_mmh3_shodan(favicon)
|
||||||
favicons.append((favicon_sha512, frequency, number_captures, mimetype, b64_favicon, mmh3_shodan))
|
favicons.append((favicon_sha512, number_captures, mimetype, b64_favicon, mmh3_shodan))
|
||||||
return render_template('tree_favicons.html', tree_uuid=tree_uuid, favicons=favicons)
|
return render_template('tree_favicons.html', tree_uuid=tree_uuid, favicons=favicons)
|
||||||
|
|
||||||
|
|
||||||
|
@ -1431,13 +1430,13 @@ def hhhashes_lookup() -> str:
|
||||||
@app.route('/favicons', methods=['GET'])
|
@app.route('/favicons', methods=['GET'])
|
||||||
def favicons_lookup() -> str:
|
def favicons_lookup() -> str:
|
||||||
favicons = []
|
favicons = []
|
||||||
for sha512, freq in get_indexing(flask_login.current_user).favicons:
|
for sha512 in get_indexing(flask_login.current_user).favicons:
|
||||||
favicon = get_indexing(flask_login.current_user).get_favicon(sha512)
|
favicon = get_indexing(flask_login.current_user).get_favicon(sha512)
|
||||||
if not favicon:
|
if not favicon:
|
||||||
continue
|
continue
|
||||||
favicon_b64 = base64.b64encode(favicon).decode()
|
favicon_b64 = base64.b64encode(favicon).decode()
|
||||||
nb_captures = get_indexing(flask_login.current_user).favicon_number_captures(sha512)
|
nb_captures = get_indexing(flask_login.current_user).get_captures_favicon_count(sha512)
|
||||||
favicons.append((sha512, freq, nb_captures, favicon_b64))
|
favicons.append((sha512, nb_captures, favicon_b64))
|
||||||
return render_template('favicons.html', favicons=favicons)
|
return render_template('favicons.html', favicons=favicons)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -21,19 +21,17 @@
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
<th>Favicon</th>
|
<th>Favicon</th>
|
||||||
<th style="width:10%">Frequency</th>
|
|
||||||
<th style="width:10%">Number of captures</th>
|
<th style="width:10%">Number of captures</th>
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{% for favicon_sha512, freq, number_captures, b64_favicon in favicons %}
|
{% for favicon_sha512, number_captures, b64_favicon in favicons %}
|
||||||
<tr>
|
<tr>
|
||||||
<td >
|
<td >
|
||||||
<a href="{{ url_for('favicon_detail', favicon_sha512=favicon_sha512) }}">
|
<a href="{{ url_for('favicon_detail', favicon_sha512=favicon_sha512) }}">
|
||||||
<img src="data:image/ico;base64,{{ b64_favicon }}" style="width:32px;height:32px;"/>
|
<img src="data:image/ico;base64,{{ b64_favicon }}" style="width:32px;height:32px;"/>
|
||||||
</a>
|
</a>
|
||||||
</td>
|
</td>
|
||||||
<td>{{ freq }}</td>
|
|
||||||
<td>{{ number_captures }}</td>
|
<td>{{ number_captures }}</td>
|
||||||
</tr>
|
</tr>
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
|
|
|
@ -28,7 +28,7 @@ if (downloadFavicons) {
|
||||||
</tr>
|
</tr>
|
||||||
</thead>
|
</thead>
|
||||||
<tbody>
|
<tbody>
|
||||||
{% for favicon_sha512, freq, number_captures, mimetype, b64_favicon, mmh3_shodan in favicons %}
|
{% for favicon_sha512, number_captures, mimetype, b64_favicon, mmh3_shodan in favicons %}
|
||||||
<tr>
|
<tr>
|
||||||
<td>{{ number_captures }}</td>
|
<td>{{ number_captures }}</td>
|
||||||
<td>
|
<td>
|
||||||
|
|
Loading…
Reference in New Issue