new: Add favicons in indexer

pull/887/head
Raphaël Vinot 2024-02-19 16:15:52 +01:00
parent a795d08456
commit 4153138644
4 changed files with 80 additions and 0 deletions

View File

@ -136,6 +136,7 @@ class BackgroundIndexer(AbstractManager):
p.sismember('indexed_body_hashes', cache.uuid) p.sismember('indexed_body_hashes', cache.uuid)
p.sismember('indexed_cookies', cache.uuid) p.sismember('indexed_cookies', cache.uuid)
p.sismember('indexed_hhhashes', cache.uuid) p.sismember('indexed_hhhashes', cache.uuid)
p.sismember('indexed_favicons', cache.uuid)
indexed = p.execute() indexed = p.execute()
if all(indexed): if all(indexed):
continue continue
@ -158,6 +159,10 @@ class BackgroundIndexer(AbstractManager):
if not indexed[3]: if not indexed[3]:
self.logger.info(f'Indexing HH Hashes for {cache.uuid}') self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
self.lookyloo.indexing.index_http_headers_hashes_capture(ct) self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
if not indexed[4]:
self.logger.info(f'Indexing favicons for {cache.uuid}')
favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
self.lookyloo.indexing.index_favicons_capture(cache.uuid, favicons)
# NOTE: categories aren't taken in account here, should be fixed(?) # NOTE: categories aren't taken in account here, should be fixed(?)
# see indexing.index_categories_capture(capture_uuid, categories) # see indexing.index_categories_capture(capture_uuid, categories)
index_redis.delete('ongoing_indexing') index_redis.delete('ongoing_indexing')

View File

@ -5,9 +5,11 @@ from __future__ import annotations
import hashlib import hashlib
import logging import logging
# import re # import re
from io import BytesIO
from collections import defaultdict from collections import defaultdict
from typing import Iterable from typing import Iterable
from urllib.parse import urlsplit from urllib.parse import urlsplit
from zipfile import ZipFile
from har2tree import CrawledTree from har2tree import CrawledTree
from redis import ConnectionPool, Redis from redis import ConnectionPool, Redis
@ -22,12 +24,18 @@ class Indexing():
def __init__(self) -> None: def __init__(self) -> None:
self.logger = logging.getLogger(f'{self.__class__.__name__}') self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(get_config('generic', 'loglevel')) self.logger.setLevel(get_config('generic', 'loglevel'))
self.redis_pool_bytes: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'))
self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection, self.redis_pool: ConnectionPool = ConnectionPool(connection_class=UnixDomainSocketConnection,
path=get_socket_path('indexing'), decode_responses=True) path=get_socket_path('indexing'), decode_responses=True)
def clear_indexes(self) -> None: def clear_indexes(self) -> None:
self.redis.flushdb() self.redis.flushdb()
@property
def redis_bytes(self) -> Redis: # type: ignore[type-arg]
    """Redis client bound to ``self.redis_pool_bytes``.

    That pool is created without ``decode_responses``, so this client is the
    one to use for values that must come back as raw ``bytes`` (e.g. stored
    favicon content).
    """
    raw_client = Redis(connection_pool=self.redis_pool_bytes)
    return raw_client
@property @property
def redis(self) -> Redis: # type: ignore[type-arg] def redis(self) -> Redis: # type: ignore[type-arg]
return Redis(connection_pool=self.redis_pool) return Redis(connection_pool=self.redis_pool)
@ -325,6 +333,42 @@ class Indexing():
def get_captures_hostname(self, hostname: str) -> set[str]: def get_captures_hostname(self, hostname: str) -> set[str]:
return self.redis.smembers(f'hostnames|{hostname}|captures') return self.redis.smembers(f'hostnames|{hostname}|captures')
# ###### favicons ######
@property
def favicons(self) -> list[tuple[str, float]]:
    """Favicon digests with their scores, highest score first.

    Reads ranks 0 through 200 (both inclusive) of the ``favicons`` sorted set.
    """
    first_rank, last_rank = 0, 200
    return self.redis.zrevrange('favicons', first_rank, last_rank, withscores=True)
def favicon_number_captures(self, favicon_sha512: str) -> int:
    """Return the cardinality of the set of captures recorded for this favicon digest."""
    captures_key = f'favicons|{favicon_sha512}|captures'
    return self.redis.scard(captures_key)
def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
    """Index every non-empty ``.ico`` file found in a capture's favicon archive.

    :param capture_uuid: UUID of the capture the favicons belong to.
    :param favicons: In-memory ZIP archive containing the favicons of the capture.

    For each distinct favicon (identified by the SHA-512 of its content), the
    score of that digest in the ``favicons`` sorted set is incremented by one,
    the capture is added to ``favicons|<sha512>|captures`` and the raw content
    is stored under ``favicons|<sha512>``. Nothing is done when the capture is
    already a member of ``indexed_favicons``.
    """
    if self.redis.sismember('indexed_favicons', capture_uuid):
        # Do not reindex
        return
    # NOTE: the capture is flagged as indexed before the archive is read.
    self.redis.sadd('indexed_favicons', capture_uuid)
    pipeline = self.redis.pipeline()
    # Digests already queued for this capture: the same icon stored under
    # several names in the archive must only be counted once per capture.
    seen_digests: set[str] = set()
    with ZipFile(favicons, 'r') as myzip:
        for name in myzip.namelist():
            if not name.endswith('.ico'):
                continue
            favicon = myzip.read(name)
            if not favicon:
                # Empty file, ignore.
                continue
            sha = hashlib.sha512(favicon).hexdigest()
            if sha in seen_digests:
                continue
            seen_digests.add(sha)
            pipeline.zincrby('favicons', 1, sha)
            pipeline.sadd(f'favicons|{sha}|captures', capture_uuid)
            # There is no easy access to the favicons unless we store them in redis
            pipeline.set(f'favicons|{sha}', favicon)
    pipeline.execute()
def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
    """Return the members of the captures set recorded for this favicon digest."""
    captures_key = f'favicons|{favicon_sha512}|captures'
    return self.redis.smembers(captures_key)
def get_favicon(self, favicon_sha512: str) -> bytes | None:
    """Fetch the value stored under ``favicons|<sha512>`` through the bytes client.

    Returns whatever the ``GET`` yields, i.e. ``None`` when the key is absent.
    """
    storage_key = f'favicons|{favicon_sha512}'
    return self.redis_bytes.get(storage_key)
# ###### Categories ###### # ###### Categories ######
@property @property

View File

@ -1046,6 +1046,13 @@ class Lookyloo():
for domain, freq in self.indexing.get_cookie_domains(cookie_name)] for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
return captures, domains return captures, domains
def get_favicon_investigator(self, favicon_sha512: str, /) -> tuple[list[tuple[str, str]], bytes | None]:
    '''Returns all the captures related to a favicon (identified by its SHA512) and the favicon itself, used in the web interface.

    The first element is a list of (capture UUID, capture title) in the order given by sorted_capture_cache,
    the second is the favicon as returned by the indexing (None if it has none for this hash).
    '''
    capture_uuids = list(self.indexing.get_captures_favicon(favicon_sha512))
    cached_captures = self.sorted_capture_cache(capture_uuids)
    captures = [(cache.uuid, cache.title) for cache in cached_captures]
    favicon = self.indexing.get_favicon(favicon_sha512)
    return captures, favicon
def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]: def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
'''Returns all the captures related to a cookie name entry, used in the web interface.''' '''Returns all the captures related to a cookie name entry, used in the web interface.'''
all_captures = dict(self.indexing.get_http_headers_hashes_captures(hhh)) all_captures = dict(self.indexing.get_http_headers_hashes_captures(hhh))

View File

@ -945,6 +945,19 @@ def hhhashes_lookup() -> str:
return render_template('hhhashes.html', hhhashes=hhhashes) return render_template('hhhashes.html', hhhashes=hhhashes)
@app.route('/favicons', methods=['GET'])
def favicons_lookup() -> str:
    """Render ``favicons.html`` with one row per indexed favicon whose content is available.

    Each row is ``(sha512, frequency, number of captures, base64-encoded favicon)``;
    entries for which the indexing returns no favicon content are left out.
    """
    indexing = lookyloo.indexing
    rows = []
    for sha512, freq in indexing.favicons:
        raw_icon = indexing.get_favicon(sha512)
        if raw_icon:
            encoded_icon = base64.b64encode(raw_icon).decode()
            rows.append((sha512, freq, indexing.favicon_number_captures(sha512), encoded_icon))
    return render_template('favicons.html', favicons=rows)
@app.route('/ressources', methods=['GET']) @app.route('/ressources', methods=['GET'])
def ressources() -> str: def ressources() -> str:
ressources = [] ressources = []
@ -1206,6 +1219,17 @@ def hhh_detail(hhh: str) -> str:
return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers) return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers)
@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
def favicon_detail(favicon_sha512: str) -> str:
    """Render ``favicon_details.html`` for one favicon digest.

    The digest is stripped of surrounding whitespace for the lookup; the
    favicon is handed to the template base64-encoded, or as an empty string
    when the lookup returned no content.
    """
    captures, favicon = lookyloo.get_favicon_investigator(favicon_sha512.strip())
    b64_favicon = base64.b64encode(favicon).decode() if favicon else ''
    return render_template(
        'favicon_details.html',
        favicon_sha512=favicon_sha512,
        captures=captures,
        b64_favicon=b64_favicon,
    )
@app.route('/body_hashes/<string:body_hash>', methods=['GET']) @app.route('/body_hashes/<string:body_hash>', methods=['GET'])
def body_hash_details(body_hash: str) -> str: def body_hash_details(body_hash: str) -> str:
from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False