Index the favicons of each capture and expose them in the web interface.

  bin/background_indexer.py  index favicons for captures not yet flagged in 'indexed_favicons'
  lookyloo/indexing.py       favicon indexes, plus a redis pool that returns raw bytes
  lookyloo/lookyloo.py       get_favicon_investigator() for the web interface
  website/web/__init__.py    /favicons and /favicon_details/<sha512> routes

diff --git a/bin/background_indexer.py b/bin/background_indexer.py
index 058baaef..d6e6aaed 100755
--- a/bin/background_indexer.py
+++ b/bin/background_indexer.py
@@ -136,6 +136,7 @@ class BackgroundIndexer(AbstractManager):
             p.sismember('indexed_body_hashes', cache.uuid)
             p.sismember('indexed_cookies', cache.uuid)
             p.sismember('indexed_hhhashes', cache.uuid)
+            p.sismember('indexed_favicons', cache.uuid)
             indexed = p.execute()
             if all(indexed):
                 continue
@@ -158,6 +159,10 @@ class BackgroundIndexer(AbstractManager):
             if not indexed[3]:
                 self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
                 self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
+            if not indexed[4]:
+                self.logger.info(f'Indexing favicons for {cache.uuid}')
+                favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
+                self.lookyloo.indexing.index_favicons_capture(cache.uuid, favicons)
             # NOTE: categories aren't taken in account here, should be fixed(?)
             # see indexing.index_categories_capture(capture_uuid, categories)
         index_redis.delete('ongoing_indexing')
diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py
index 5a281768..59a02399 100644
--- a/lookyloo/indexing.py
+++ b/lookyloo/indexing.py
@@ -5,9 +5,11 @@ from __future__ import annotations
 import hashlib
 import logging
 # import re
+from io import BytesIO
 from collections import defaultdict
 from typing import Iterable
 from urllib.parse import urlsplit
+from zipfile import ZipFile
 
 from har2tree import CrawledTree
 from redis import ConnectionPool, Redis
@@ -22,12 +24,19 @@ class Indexing():
     def __init__(self) -> None:
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(get_config('generic', 'loglevel'))
+        # No decode_responses on this pool: it is used to read binary values (favicons) back as bytes.
+        self.redis_pool_bytes: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
+                                                               path=get_socket_path('indexing'))
         self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
                                                          path=get_socket_path('indexing'), decode_responses=True)
 
     def clear_indexes(self) -> None:
         self.redis.flushdb()
 
+    @property
+    def redis_bytes(self) -> Redis:  # type: ignore[type-arg]
+        return Redis(connection_pool=self.redis_pool_bytes)
+
     @property
     def redis(self) -> Redis:  # type: ignore[type-arg]
         return Redis(connection_pool=self.redis_pool)
@@ -325,6 +334,47 @@ class Indexing():
     def get_captures_hostname(self, hostname: str) -> set[str]:
         return self.redis.smembers(f'hostnames|{hostname}|captures')
 
+    # ###### favicons ######
+
+    @property
+    def favicons(self) -> list[tuple[str, float]]:
+        '''Favicon SHA512 hashes with their score, highest score first (ranks 0 to 200 of the 'favicons' sorted set).'''
+        return self.redis.zrevrange('favicons', 0, 200, withscores=True)
+
+    def favicon_number_captures(self, favicon_sha512: str) -> int:
+        '''Number of capture UUIDs stored in the set of captures for this favicon hash.'''
+        return self.redis.scard(f'favicons|{favicon_sha512}|captures')
+
+    def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
+        '''Index every non-empty '.ico' entry of the zip archive for this capture, unless the capture is already flagged as indexed.'''
+        if self.redis.sismember('indexed_favicons', capture_uuid):
+            # Do not reindex
+            return
+        self.redis.sadd('indexed_favicons', capture_uuid)
+        pipeline = self.redis.pipeline()
+        with ZipFile(favicons, 'r') as myzip:
+            for name in myzip.namelist():
+                if not name.endswith('.ico'):
+                    continue
+                favicon = myzip.read(name)
+                if not favicon:
+                    # Empty file, ignore.
+                    continue
+                sha = hashlib.sha512(favicon).hexdigest()
+                pipeline.zincrby('favicons', 1, sha)
+                pipeline.sadd(f'favicons|{sha}|captures', capture_uuid)
+                # There is no easy access to the favicons unless we store them in redis
+                pipeline.set(f'favicons|{sha}', favicon)
+        pipeline.execute()
+
+    def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
+        '''UUIDs of the captures stored in the set of captures for this favicon hash.'''
+        return self.redis.smembers(f'favicons|{favicon_sha512}|captures')
+
+    def get_favicon(self, favicon_sha512: str) -> bytes | None:
+        '''Raw favicon stored under this hash, read through the non-decoding connection; None if the key does not exist.'''
+        return self.redis_bytes.get(f'favicons|{favicon_sha512}')
+
     # ###### Categories ######
 
     @property
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 8519cd26..1971249b 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -1046,6 +1046,13 @@ class Lookyloo():
                    for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
         return captures, domains
 
+    def get_favicon_investigator(self, favicon_sha512: str, /) -> tuple[list[tuple[str, str]], bytes | None]:
+        '''Returns all the captures related to a favicon (UUID and title) and the favicon itself, used in the web interface.'''
+        cached_captures = self.sorted_capture_cache(list(self.indexing.get_captures_favicon(favicon_sha512)))
+        captures = [(cache.uuid, cache.title) for cache in cached_captures]
+        favicon = self.indexing.get_favicon(favicon_sha512)
+        return captures, favicon
+
     def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
         '''Returns all the captures related to a cookie name entry, used in the web interface.'''
         all_captures = dict(self.indexing.get_http_headers_hashes_captures(hhh))
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 94fb8ed8..8dda2eca 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -945,6 +945,19 @@ def hhhashes_lookup() -> str:
     return render_template('hhhashes.html', hhhashes=hhhashes)
 
 
+@app.route('/favicons', methods=['GET'])
+def favicons_lookup() -> str:
+    favicons = []
+    for sha512, freq in lookyloo.indexing.favicons:
+        favicon = lookyloo.indexing.get_favicon(sha512)
+        if not favicon:
+            continue
+        favicon_b64 = base64.b64encode(favicon).decode()
+        nb_captures = lookyloo.indexing.favicon_number_captures(sha512)
+        favicons.append((sha512, freq, nb_captures, favicon_b64))
+    return render_template('favicons.html', favicons=favicons)
+
+
 @app.route('/ressources', methods=['GET'])
 def ressources() -> str:
     ressources = []
@@ -1206,6 +1219,17 @@ def hhh_detail(hhh: str) -> str:
     return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers)
 
 
+@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
+def favicon_detail(favicon_sha512: str) -> str:
+    captures, favicon = lookyloo.get_favicon_investigator(favicon_sha512.strip())
+    if favicon:
+        b64_favicon = base64.b64encode(favicon).decode()
+    else:
+        b64_favicon = ''
+    return render_template('favicon_details.html', favicon_sha512=favicon_sha512,
+                           captures=captures, b64_favicon=b64_favicon)
+
+
 @app.route('/body_hashes/<string:body_hash>', methods=['GET'])
 def body_hash_details(body_hash: str) -> str:
     from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False