mirror of https://github.com/CIRCL/lookyloo
new: Add favicons in indexer
parent
a795d08456
commit
4153138644
|
@ -136,6 +136,7 @@ class BackgroundIndexer(AbstractManager):
|
||||||
p.sismember('indexed_body_hashes', cache.uuid)
|
p.sismember('indexed_body_hashes', cache.uuid)
|
||||||
p.sismember('indexed_cookies', cache.uuid)
|
p.sismember('indexed_cookies', cache.uuid)
|
||||||
p.sismember('indexed_hhhashes', cache.uuid)
|
p.sismember('indexed_hhhashes', cache.uuid)
|
||||||
|
p.sismember('indexed_favicons', cache.uuid)
|
||||||
indexed = p.execute()
|
indexed = p.execute()
|
||||||
if all(indexed):
|
if all(indexed):
|
||||||
continue
|
continue
|
||||||
|
@ -158,6 +159,10 @@ class BackgroundIndexer(AbstractManager):
|
||||||
if not indexed[3]:
|
if not indexed[3]:
|
||||||
self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
|
self.logger.info(f'Indexing HH Hashes for {cache.uuid}')
|
||||||
self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
|
self.lookyloo.indexing.index_http_headers_hashes_capture(ct)
|
||||||
|
if not indexed[4]:
|
||||||
|
self.logger.info(f'Indexing favicons for {cache.uuid}')
|
||||||
|
favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
|
||||||
|
self.lookyloo.indexing.index_favicons_capture(cache.uuid, favicons)
|
||||||
# NOTE: categories aren't taken into account here, should be fixed(?)
|
# NOTE: categories aren't taken into account here, should be fixed(?)
|
||||||
# see indexing.index_categories_capture(capture_uuid, categories)
|
# see indexing.index_categories_capture(capture_uuid, categories)
|
||||||
index_redis.delete('ongoing_indexing')
|
index_redis.delete('ongoing_indexing')
|
||||||
|
|
|
@ -5,9 +5,11 @@ from __future__ import annotations
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
# import re
|
# import re
|
||||||
|
from io import BytesIO
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
from urllib.parse import urlsplit
|
from urllib.parse import urlsplit
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
from har2tree import CrawledTree
|
from har2tree import CrawledTree
|
||||||
from redis import ConnectionPool, Redis
|
from redis import ConnectionPool, Redis
|
||||||
|
@ -22,12 +24,18 @@ class Indexing():
|
||||||
def __init__(self) -> None:
    """Set up the class logger and the two connection pools for the indexing Redis.

    Two pools are created on the same 'indexing' socket: one without
    ``decode_responses`` and one with ``decode_responses=True``.
    """
    logger = logging.getLogger(f'{self.__class__.__name__}')
    logger.setLevel(get_config('generic', 'loglevel'))
    self.logger = logger

    # Pool without response decoding.
    self.redis_pool_bytes: ConnectionPool = ConnectionPool(
        connection_class=UnixDomainSocketConnection,
        path=get_socket_path('indexing'))
    # Pool with response decoding enabled.
    self.redis_pool: ConnectionPool = ConnectionPool(
        connection_class=UnixDomainSocketConnection,
        path=get_socket_path('indexing'),
        decode_responses=True)
|
||||||
|
|
||||||
def clear_indexes(self) -> None:
    """Flush the indexing database by calling ``flushdb`` on the Redis client."""
    client = self.redis
    client.flushdb()
|
||||||
|
|
||||||
|
@property
def redis_bytes(self) -> Redis:  # type: ignore[type-arg]
    """Return a Redis client bound to ``self.redis_pool_bytes``."""
    client = Redis(connection_pool=self.redis_pool_bytes)
    return client
|
||||||
|
|
||||||
@property
def redis(self) -> Redis:  # type: ignore[type-arg]
    """Return a Redis client bound to ``self.redis_pool``."""
    client = Redis(connection_pool=self.redis_pool)
    return client
|
||||||
|
@ -325,6 +333,42 @@ class Indexing():
|
||||||
def get_captures_hostname(self, hostname: str) -> set[str]:
    """Return the members of the ``hostnames|<hostname>|captures`` set."""
    key = f'hostnames|{hostname}|captures'
    return self.redis.smembers(key)
|
||||||
|
|
||||||
|
# ###### favicons ######
|
||||||
|
|
||||||
|
@property
def favicons(self) -> list[tuple[str, float]]:
    """Return the highest-scored entries of the ``favicons`` sorted set.

    Entries are fetched in descending score order for ranks 0 to 200,
    each paired with its score.
    """
    top_ranked = self.redis.zrevrange('favicons', 0, 200, withscores=True)
    return top_ranked
|
||||||
|
|
||||||
|
def favicon_number_captures(self, favicon_sha512: str) -> int:
    """Return the cardinality of the ``favicons|<sha512>|captures`` set."""
    captures_key = f'favicons|{favicon_sha512}|captures'
    return self.redis.scard(captures_key)
|
||||||
|
|
||||||
|
def index_favicons_capture(self, capture_uuid: str, favicons: BytesIO) -> None:
    """Index every favicon found in a capture's favicon archive.

    :param capture_uuid: UUID of the capture the favicons belong to.
    :param favicons: Zip archive (in memory) holding the favicon files.

    For each distinct, non-empty ``.ico`` entry of the archive, the SHA-512
    of its content is used to:

    * increment its score in the ``favicons`` sorted set,
    * add the capture to ``favicons|<sha512>|captures``,
    * store the raw content under ``favicons|<sha512>``.

    A capture already present in ``indexed_favicons`` is left untouched.
    Note that the capture is flagged as indexed before the archive is read.
    """
    if self.redis.sismember('indexed_favicons', capture_uuid):
        # Do not reindex
        return
    self.redis.sadd('indexed_favicons', capture_uuid)

    pipeline = self.redis.pipeline()
    # The same favicon may be stored under several names in one archive;
    # count it only once per capture so the score in 'favicons' stays in
    # line with the size of the 'favicons|<sha512>|captures' set.
    seen_hashes: set[str] = set()
    with ZipFile(favicons, 'r') as myzip:
        for name in myzip.namelist():
            if not name.endswith('.ico'):
                continue
            favicon = myzip.read(name)
            if not favicon:
                # Empty file, ignore.
                continue
            sha = hashlib.sha512(favicon).hexdigest()
            if sha in seen_hashes:
                # Already indexed for this capture.
                continue
            seen_hashes.add(sha)
            pipeline.zincrby('favicons', 1, sha)
            pipeline.sadd(f'favicons|{sha}|captures', capture_uuid)
            # There is no easy access to the favicons unless we store them in redis
            pipeline.set(f'favicons|{sha}', favicon)
    pipeline.execute()
|
||||||
|
|
||||||
|
def get_captures_favicon(self, favicon_sha512: str) -> set[str]:
    """Return the members of the ``favicons|<sha512>|captures`` set."""
    captures_key = f'favicons|{favicon_sha512}|captures'
    return self.redis.smembers(captures_key)
|
||||||
|
|
||||||
|
def get_favicon(self, favicon_sha512: str) -> bytes | None:
    """Return the value stored under ``favicons|<sha512>``, read through ``redis_bytes``."""
    content_key = f'favicons|{favicon_sha512}'
    return self.redis_bytes.get(content_key)
|
||||||
|
|
||||||
# ###### Categories ######
|
# ###### Categories ######
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -1046,6 +1046,13 @@ class Lookyloo():
|
||||||
for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
|
for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
|
||||||
return captures, domains
|
return captures, domains
|
||||||
|
|
||||||
|
def get_favicon_investigator(self, favicon_sha512: str, /) -> tuple[list[tuple[str, str]], bytes | None]:
|
||||||
|
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
||||||
|
cached_captures = self.sorted_capture_cache([uuid for uuid in self.indexing.get_captures_favicon(favicon_sha512)])
|
||||||
|
captures = [(cache.uuid, cache.title) for cache in cached_captures]
|
||||||
|
favicon = self.indexing.get_favicon(favicon_sha512)
|
||||||
|
return captures, favicon
|
||||||
|
|
||||||
def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
|
def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
|
||||||
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
'''Returns all the captures related to a cookie name entry, used in the web interface.'''
|
||||||
all_captures = dict(self.indexing.get_http_headers_hashes_captures(hhh))
|
all_captures = dict(self.indexing.get_http_headers_hashes_captures(hhh))
|
||||||
|
|
|
@ -945,6 +945,19 @@ def hhhashes_lookup() -> str:
|
||||||
return render_template('hhhashes.html', hhhashes=hhhashes)
|
return render_template('hhhashes.html', hhhashes=hhhashes)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/favicons', methods=['GET'])
def favicons_lookup() -> str:
    """Render the favicons overview page.

    For every ``(sha512, frequency)`` pair exposed by the indexer, the stored
    favicon is fetched; entries without stored content are left out. Each row
    handed to the template is ``(sha512, frequency, number of captures,
    base64-encoded favicon)``.
    """
    rows = []
    for sha512, freq in lookyloo.indexing.favicons:
        raw_favicon = lookyloo.indexing.get_favicon(sha512)
        if raw_favicon:
            encoded = base64.b64encode(raw_favicon).decode()
            capture_count = lookyloo.indexing.favicon_number_captures(sha512)
            rows.append((sha512, freq, capture_count, encoded))
    return render_template('favicons.html', favicons=rows)
|
||||||
|
|
||||||
|
|
||||||
@app.route('/ressources', methods=['GET'])
|
@app.route('/ressources', methods=['GET'])
|
||||||
def ressources() -> str:
|
def ressources() -> str:
|
||||||
ressources = []
|
ressources = []
|
||||||
|
@ -1206,6 +1219,17 @@ def hhh_detail(hhh: str) -> str:
|
||||||
return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers)
|
return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
def favicon_detail(favicon_sha512: str) -> str:
    """Render the detail page of one favicon.

    The template receives the captures related to the favicon and its
    content encoded in base64 (an empty string when no content is found).
    """
    # The lookup uses the stripped digest; the template gets the value as received.
    captures, favicon = lookyloo.get_favicon_investigator(favicon_sha512.strip())
    b64_favicon = base64.b64encode(favicon).decode() if favicon else ''
    return render_template('favicon_details.html',
                           favicon_sha512=favicon_sha512,
                           captures=captures,
                           b64_favicon=b64_favicon)
|
||||||
|
|
||||||
|
|
||||||
@app.route('/body_hashes/<string:body_hash>', methods=['GET'])
|
@app.route('/body_hashes/<string:body_hash>', methods=['GET'])
|
||||||
def body_hash_details(body_hash: str) -> str:
|
def body_hash_details(body_hash: str) -> str:
|
||||||
from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False
|
from_popup = True if (request.args.get('from_popup') and request.args.get('from_popup') == 'True') else False
|
||||||
|
|
Loading…
Reference in New Issue