new: Shodan MM3H indexing

pull/888/head
Raphaël Vinot 2024-02-26 17:07:23 +01:00
parent 7e25747d82
commit decf887b63
9 changed files with 234 additions and 69 deletions

View File

@ -33,6 +33,7 @@ class BackgroundIndexer(AbstractManager):
all_done = self._build_missing_pickles() all_done = self._build_missing_pickles()
if all_done: if all_done:
self._check_indexes() self._check_indexes()
self._check_probabilistic_indexes()
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name) self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
def _build_missing_pickles(self) -> bool: def _build_missing_pickles(self) -> bool:
@ -168,6 +169,33 @@ class BackgroundIndexer(AbstractManager):
index_redis.delete('ongoing_indexing') index_redis.delete('ongoing_indexing')
self.logger.info('... done.') self.logger.info('... done.')
def _check_probabilistic_indexes(self) -> None:
index_redis = self.lookyloo.indexing.redis
can_index = index_redis.set('ongoing_probalistic_indexing', 1, ex=3600, nx=True)
if not can_index:
# There is no reason to run this method in multiple scripts.
self.logger.info('Probalistic indexing already ongoing in another process.')
return None
self.logger.info('Check probabilistic indexes...')
algorithms = ['mmh3-shodan']
for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
if self.lookyloo.is_public_instance and cache.no_index:
# Capture unindexed
continue
p = index_redis.pipeline()
for algorithm in algorithms:
p.sismember(f'indexed_favicons_probabilistic|{algorithm}', cache.uuid)
indexed = p.execute()
if all(indexed):
continue
for i, algorithm in enumerate(algorithms):
if not indexed[i]:
self.logger.info(f'Probabilistic indexing favicons for {cache.uuid} with {algorithm}')
favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
self.lookyloo.indexing.index_favicons_probabilistic(cache.uuid, favicons, algorithm)
index_redis.delete('ongoing_probalistic_indexing')
self.logger.info('... done.')
def main() -> None: def main() -> None:
i = BackgroundIndexer() i = BackgroundIndexer()

View File

@ -99,7 +99,7 @@ class Processing(AbstractManager):
to_requeue.append(uuid) to_requeue.append(uuid)
for uuid in to_requeue: for uuid in to_requeue:
if self.lookyloo.redis.zscore('to_capture', uuid) is None if self.lookyloo.redis.zscore('to_capture', uuid) is None:
# The capture has been captured in the meantime. # The capture has been captured in the meantime.
continue continue
self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.') self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.')

View File

@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
import base64
import hashlib import hashlib
import logging import logging
# import re # import re
@ -11,6 +12,8 @@ from typing import Iterable
from urllib.parse import urlsplit from urllib.parse import urlsplit
from zipfile import ZipFile from zipfile import ZipFile
import mmh3
from har2tree import CrawledTree from har2tree import CrawledTree
from redis import ConnectionPool, Redis from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection from redis.connection import UnixDomainSocketConnection
@ -372,6 +375,55 @@ class Indexing():
def get_favicon(self, favicon_sha512: str) -> bytes | None: def get_favicon(self, favicon_sha512: str) -> bytes | None:
return self.redis_bytes.get(f'favicons|{favicon_sha512}') return self.redis_bytes.get(f'favicons|{favicon_sha512}')
# ###### favicons probabilistic hashes ######
def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None:
return self.redis.zscore(f'favicons|{algorithm}', phash)
def index_favicons_probabilistic(self, capture_uuid: str, favicons: BytesIO, algorithm: str) -> None:
if self.redis.sismember(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid):
# Do not reindex
return
self.redis.sadd(f'indexed_favicons_probabilistic|{algorithm}', capture_uuid)
pipeline = self.redis.pipeline()
with ZipFile(favicons, 'r') as myzip:
for name in myzip.namelist():
if not name.endswith('.ico'):
continue
favicon = myzip.read(name)
if not favicon:
# Empty file, ignore.
continue
sha = hashlib.sha512(favicon).hexdigest()
if algorithm == 'mmh3-shodan':
# Shodan uses a weird technique:
# 1. encodes the image to base64, with newlines every 76 characters (as per RFC 2045)
# 2. hashes the base64 string with mmh3
b64 = base64.encodebytes(favicon)
h = str(mmh3.hash(b64))
else:
raise NotImplementedError(f'Unknown algorithm: {algorithm}')
pipeline.zincrby(f'favicons|{algorithm}', 1, h)
# All captures with this hash for this algorithm
pipeline.sadd(f'favicons|{algorithm}|{h}|captures', capture_uuid)
# All hashes with this hash for this algorithm
pipeline.sadd(f'favicons|{algorithm}|{h}|favicons', sha)
# reverse lookup to get probabilistic hashes related to a specific favicon
pipeline.sadd(f'favicons|{algorithm}|{sha}', h)
pipeline.execute()
def get_hashes_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
'''All the favicon sha512 for this probabilistic hash for this algorithm'''
return self.redis.smembers(f'favicons|{algorithm}|{phash}|favicons')
def get_probabilistic_hashes_favicon(self, algorithm: str, favicon_sha512: str) -> set[str]:
'''All the probabilistic hashes for this favicon SHA512 for this algorithm'''''
return self.redis.smembers(f'favicons|{algorithm}|{favicon_sha512}')
def get_captures_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
'''All the captures with this probabilistic hash for this algorithm'''
return self.redis.smembers(f'favicons|{algorithm}|{phash}|captures')
# ###### Categories ###### # ###### Categories ######
@property @property

View File

@ -34,6 +34,7 @@ from lacuscore import (LacusCore,
CaptureSettings as CaptureSettingsCore) CaptureSettings as CaptureSettingsCore)
from PIL import Image, UnidentifiedImageError from PIL import Image, UnidentifiedImageError
from playwrightcapture import get_devices from playwrightcapture import get_devices
from puremagic import from_string # type: ignore[import-untyped]
from pylacus import (PyLacus, from pylacus import (PyLacus,
CaptureStatus as CaptureStatusPy CaptureStatus as CaptureStatusPy
# CaptureResponse as CaptureResponsePy, # CaptureResponse as CaptureResponsePy,
@ -1055,12 +1056,46 @@ class Lookyloo():
for domain, freq in self.indexing.get_cookie_domains(cookie_name)] for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
return captures, domains return captures, domains
def get_favicon_investigator(self, favicon_sha512: str, /) -> tuple[list[tuple[str, str, str, datetime]], bytes | None]: def get_favicon_investigator(self, favicon_sha512: str,
/,
get_probabilistic=True) -> tuple[list[tuple[str, str, str, datetime]],
tuple[str, str],
dict[str, dict[str, dict[str, tuple[str, str]]]]]:
'''Returns all the captures related to a cookie name entry, used in the web interface.''' '''Returns all the captures related to a cookie name entry, used in the web interface.'''
cached_captures = self.sorted_capture_cache([uuid for uuid in self.indexing.get_captures_favicon(favicon_sha512)]) cached_captures = self.sorted_capture_cache([uuid for uuid in self.indexing.get_captures_favicon(favicon_sha512)])
captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures] captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
favicon = self.indexing.get_favicon(favicon_sha512) favicon = self.indexing.get_favicon(favicon_sha512)
return captures, favicon if favicon:
mimetype = from_string(favicon, mime=True)
b64_favicon = base64.b64encode(favicon).decode()
else:
mimetype = ''
b64_favicon = ''
# For now, there is only one probabilistic hash algo for favicons, keeping it simple
probabilistic_hash_algos = ['mmh3-shodan']
probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {}
if get_probabilistic:
for algo in probabilistic_hash_algos:
probabilistic_favicons[algo] = {}
for mm3hash in self.indexing.get_probabilistic_hashes_favicon(algo, favicon_sha512):
probabilistic_favicons[algo][mm3hash] = {}
for sha512 in self.indexing.get_hashes_favicon_probablistic(algo, mm3hash):
if sha512 == favicon_sha512:
# Skip entry if it is the same as the favicon we are investigating
continue
favicon = self.indexing.get_favicon(sha512)
if favicon:
mimetype = from_string(favicon, mime=True)
b64_favicon = base64.b64encode(favicon).decode()
probabilistic_favicons[algo][mm3hash][sha512] = (mimetype, b64_favicon)
if not probabilistic_favicons[algo][mm3hash]:
# remove entry if it has no favicon
probabilistic_favicons[algo].pop(mm3hash)
if not probabilistic_favicons[algo]:
# remove entry if it has no hash
probabilistic_favicons.pop(algo)
return captures, (mimetype, b64_favicon), probabilistic_favicons
def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]: def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
'''Returns all the captures related to a cookie name entry, used in the web interface.''' '''Returns all the captures related to a cookie name entry, used in the web interface.'''

112
poetry.lock generated
View File

@ -565,43 +565,43 @@ files = [
[[package]] [[package]]
name = "cryptography" name = "cryptography"
version = "42.0.4" version = "42.0.5"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
optional = false optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"}, {file = "cryptography-42.0.5-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16"},
{file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"}, {file = "cryptography-42.0.5-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec"},
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"}, {file = "cryptography-42.0.5-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb"},
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"}, {file = "cryptography-42.0.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4"},
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"}, {file = "cryptography-42.0.5-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278"},
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"}, {file = "cryptography-42.0.5-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7"},
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"}, {file = "cryptography-42.0.5-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee"},
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"}, {file = "cryptography-42.0.5-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1"},
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"}, {file = "cryptography-42.0.5-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d"},
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"}, {file = "cryptography-42.0.5-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da"},
{file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"}, {file = "cryptography-42.0.5-cp37-abi3-win32.whl", hash = "sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74"},
{file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"}, {file = "cryptography-42.0.5-cp37-abi3-win_amd64.whl", hash = "sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940"},
{file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"}, {file = "cryptography-42.0.5-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8"},
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"}, {file = "cryptography-42.0.5-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1"},
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"}, {file = "cryptography-42.0.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e"},
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"}, {file = "cryptography-42.0.5-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc"},
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"}, {file = "cryptography-42.0.5-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a"},
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"}, {file = "cryptography-42.0.5-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7"},
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"}, {file = "cryptography-42.0.5-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922"},
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"}, {file = "cryptography-42.0.5-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc"},
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"}, {file = "cryptography-42.0.5-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30"},
{file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"}, {file = "cryptography-42.0.5-cp39-abi3-win32.whl", hash = "sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413"},
{file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"}, {file = "cryptography-42.0.5-cp39-abi3-win_amd64.whl", hash = "sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400"},
{file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"}, {file = "cryptography-42.0.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8"},
{file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"}, {file = "cryptography-42.0.5-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2"},
{file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"}, {file = "cryptography-42.0.5-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c"},
{file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"}, {file = "cryptography-42.0.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576"},
{file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"}, {file = "cryptography-42.0.5-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6"},
{file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"}, {file = "cryptography-42.0.5-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e"},
{file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"}, {file = "cryptography-42.0.5-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac"},
{file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"}, {file = "cryptography-42.0.5-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd"},
{file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"}, {file = "cryptography-42.0.5.tar.gz", hash = "sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1"},
] ]
[package.dependencies] [package.dependencies]
@ -1237,13 +1237,13 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs
[[package]] [[package]]
name = "importlib-resources" name = "importlib-resources"
version = "6.1.1" version = "6.1.2"
description = "Read resources from Python packages" description = "Read resources from Python packages"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "importlib_resources-6.1.1-py3-none-any.whl", hash = "sha256:e8bf90d8213b486f428c9c39714b920041cb02c184686a3dee24905aaa8105d6"}, {file = "importlib_resources-6.1.2-py3-none-any.whl", hash = "sha256:9a0a862501dc38b68adebc82970140c9e4209fc99601782925178f8386339938"},
{file = "importlib_resources-6.1.1.tar.gz", hash = "sha256:3893a00122eafde6894c59914446a512f728a0c1a45f9bb9b63721b6bacf0b4a"}, {file = "importlib_resources-6.1.2.tar.gz", hash = "sha256:308abf8474e2dba5f867d279237cd4076482c3de7104a40b41426370e891549b"},
] ]
[package.dependencies] [package.dependencies]
@ -1251,7 +1251,7 @@ zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""}
[package.extras] [package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"]
testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff", "zipp (>=3.17)"] testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "zipp (>=3.17)"]
[[package]] [[package]]
name = "ipython" name = "ipython"
@ -2286,13 +2286,13 @@ test = ["pytest"]
[[package]] [[package]]
name = "playwrightcapture" name = "playwrightcapture"
version = "1.23.4" version = "1.23.7"
description = "A simple library to capture websites using playwright" description = "A simple library to capture websites using playwright"
optional = false optional = false
python-versions = ">=3.8,<4.0" python-versions = ">=3.8,<4.0"
files = [ files = [
{file = "playwrightcapture-1.23.4-py3-none-any.whl", hash = "sha256:925117509adf3bbb48ba03eb87ec5b1230ae68f228156066ca3a4c7aee3744c0"}, {file = "playwrightcapture-1.23.7-py3-none-any.whl", hash = "sha256:26fba31f1412ceddf4595a26d635516694ddfed46b13f7c9117fd371c53d7525"},
{file = "playwrightcapture-1.23.4.tar.gz", hash = "sha256:7dd2fc11f7047fc4efd369ba3e52e0879ed1fca0255efa2c427dada6e6a08cf7"}, {file = "playwrightcapture-1.23.7.tar.gz", hash = "sha256:e317b777c40214dc3f2215ac5fdcb0470d221ce0ceff31bfdcbfb2e317cfa096"},
] ]
[package.dependencies] [package.dependencies]
@ -2300,10 +2300,11 @@ beautifulsoup4 = {version = ">=4.12.3,<5.0.0", extras = ["charset-normalizer", "
dateparser = ">=1.2.0,<2.0.0" dateparser = ">=1.2.0,<2.0.0"
playwright = ">=1.41.2,<2.0.0" playwright = ">=1.41.2,<2.0.0"
playwright-stealth = ">=1.0.6,<2.0.0" playwright-stealth = ">=1.0.6,<2.0.0"
puremagic = ">=1.20,<2.0"
pydub = {version = ">=0.25.1,<0.26.0", optional = true, markers = "extra == \"recaptcha\""} pydub = {version = ">=0.25.1,<0.26.0", optional = true, markers = "extra == \"recaptcha\""}
pytz = {version = ">=2024.1,<2025.0", markers = "python_version < \"3.9\""} pytz = {version = ">=2024.1,<2025.0", markers = "python_version < \"3.9\""}
requests = {version = ">=2.31.0,<3.0.0", extras = ["socks"], optional = true, markers = "extra == \"recaptcha\""} requests = {version = ">=2.31.0,<3.0.0", extras = ["socks"], optional = true, markers = "extra == \"recaptcha\""}
setuptools = ">=69.1.0,<70.0.0" setuptools = ">=69.1.1,<70.0.0"
SpeechRecognition = {version = ">=3.10.1,<4.0.0", optional = true, markers = "extra == \"recaptcha\""} SpeechRecognition = {version = ">=3.10.1,<4.0.0", optional = true, markers = "extra == \"recaptcha\""}
tzdata = ">=2024.1,<2025.0" tzdata = ">=2024.1,<2025.0"
w3lib = ">=2.1.2,<3.0.0" w3lib = ">=2.1.2,<3.0.0"
@ -2365,6 +2366,17 @@ files = [
[package.extras] [package.extras]
tests = ["pytest"] tests = ["pytest"]
[[package]]
name = "puremagic"
version = "1.21"
description = "Pure python implementation of magic file detection"
optional = false
python-versions = "*"
files = [
{file = "puremagic-1.21-py3-none-any.whl", hash = "sha256:8fe85c05800fe1eacdd5aa943b9e7fdbee66bc41a17aacf80efd6c668c63df45"},
{file = "puremagic-1.21.tar.gz", hash = "sha256:31ef09b37a6ad2f7f2b09b5bd6b8c4a07187a01af4025f5f1368889bdfc6d779"},
]
[[package]] [[package]]
name = "pycparser" name = "pycparser"
version = "2.21" version = "2.21"
@ -3084,19 +3096,19 @@ boto3 = ["aiobotocore[boto3] (>=2.5.4,<3.0.0)"]
[[package]] [[package]]
name = "setuptools" name = "setuptools"
version = "69.1.0" version = "69.1.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages" description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "setuptools-69.1.0-py3-none-any.whl", hash = "sha256:c054629b81b946d63a9c6e732bc8b2513a7c3ea645f11d0139a2191d735c60c6"}, {file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"},
{file = "setuptools-69.1.0.tar.gz", hash = "sha256:850894c4195f09c4ed30dba56213bf7c3f21d86ed6bdaafb5df5972593bfc401"}, {file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"},
] ]
[package.extras] [package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]] [[package]]
name = "six" name = "six"
@ -3353,13 +3365,13 @@ files = [
[[package]] [[package]]
name = "typing-extensions" name = "typing-extensions"
version = "4.9.0" version = "4.10.0"
description = "Backported and Experimental Type Hints for Python 3.8+" description = "Backported and Experimental Type Hints for Python 3.8+"
optional = false optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"}, {file = "typing_extensions-4.10.0-py3-none-any.whl", hash = "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475"},
{file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"}, {file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"},
] ]
[[package]] [[package]]
@ -3722,4 +3734,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<3.13" python-versions = ">=3.8.1,<3.13"
content-hash = "729283436c4c78f2f0114df11aabb366542ee1f23799bd3c91c814dbc3d9eabd" content-hash = "bde26fa1f6022d48b4976b949fbd87eaf483d7e174624d319ea1c2f1f7f430d7"

View File

@ -80,8 +80,8 @@ urllib3 = [
{version = "^2.0.7", python = ">=3.10"} {version = "^2.0.7", python = ">=3.10"}
] ]
pypdns = "^2.2.2" pypdns = "^2.2.2"
python-magic = "^0.4.27"
mmh3 = "^4.1.0" mmh3 = "^4.1.0"
puremagic = "^1.21"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
mypy = "^1.8.0" mypy = "^1.8.0"

View File

@ -15,7 +15,6 @@ import sys
import time import time
import filetype # type: ignore[import-untyped] import filetype # type: ignore[import-untyped]
import magic
from datetime import date, datetime, timedelta, timezone from datetime import date, datetime, timedelta, timezone
from importlib.metadata import version from importlib.metadata import version
@ -32,6 +31,7 @@ from flask_bootstrap import Bootstrap5 # type: ignore[import-untyped]
from flask_cors import CORS # type: ignore[import-untyped] from flask_cors import CORS # type: ignore[import-untyped]
from flask_restx import Api # type: ignore[import-untyped] from flask_restx import Api # type: ignore[import-untyped]
from lacuscore import CaptureStatus from lacuscore import CaptureStatus
from puremagic import from_string # type: ignore[import-untyped]
from pymisp import MISPEvent, MISPServerError # type: ignore[attr-defined] from pymisp import MISPEvent, MISPServerError # type: ignore[attr-defined]
from werkzeug.security import check_password_hash from werkzeug.security import check_password_hash
from werkzeug.wrappers.response import Response as WerkzeugResponse from werkzeug.wrappers.response import Response as WerkzeugResponse
@ -840,7 +840,6 @@ def mark_as_legitimate(tree_uuid: str) -> Response:
def tree_favicons(tree_uuid: str) -> str: def tree_favicons(tree_uuid: str) -> str:
favicons = [] favicons = []
favicons_zip = lookyloo.get_potential_favicons(tree_uuid, all_favicons=True, for_datauri=False) favicons_zip = lookyloo.get_potential_favicons(tree_uuid, all_favicons=True, for_datauri=False)
f = magic.Magic(mime=True)
with ZipFile(favicons_zip, 'r') as myzip: with ZipFile(favicons_zip, 'r') as myzip:
for name in myzip.namelist(): for name in myzip.namelist():
if not name.endswith('.ico'): if not name.endswith('.ico'):
@ -848,7 +847,7 @@ def tree_favicons(tree_uuid: str) -> str:
favicon = myzip.read(name) favicon = myzip.read(name)
if not favicon: if not favicon:
continue continue
mimetype = f.from_buffer(favicon) mimetype = from_string(favicon, mime=True)
favicon_sha512 = hashlib.sha512(favicon).hexdigest() favicon_sha512 = hashlib.sha512(favicon).hexdigest()
frequency = lookyloo.indexing.favicon_frequency(favicon_sha512) frequency = lookyloo.indexing.favicon_frequency(favicon_sha512)
number_captures = lookyloo.indexing.favicon_number_captures(favicon_sha512) number_captures = lookyloo.indexing.favicon_number_captures(favicon_sha512)
@ -1251,17 +1250,14 @@ def hhh_detail(hhh: str) -> str:
@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET']) @app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
def favicon_detail(favicon_sha512: str) -> str: @app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
captures, favicon = lookyloo.get_favicon_investigator(favicon_sha512.strip()) def favicon_detail(favicon_sha512: str, get_probabilistic: int=1) -> str:
if favicon: _get_prob = bool(get_probabilistic)
f = magic.Magic(mime=True) captures, favicon, probabilistic_favicons = lookyloo.get_favicon_investigator(favicon_sha512.strip(), get_probabilistic=_get_prob)
mimetype = f.from_buffer(favicon) mimetype, b64_favicon = favicon
b64_favicon = base64.b64encode(favicon).decode()
else:
b64_favicon = ''
mimetype = ''
return render_template('favicon_details.html', favicon_sha512=favicon_sha512, return render_template('favicon_details.html', favicon_sha512=favicon_sha512,
captures=captures, mimetype=mimetype, b64_favicon=b64_favicon) captures=captures, mimetype=mimetype, b64_favicon=b64_favicon,
probabilistic_favicons=probabilistic_favicons)
@app.route('/body_hashes/<string:body_hash>', methods=['GET']) @app.route('/body_hashes/<string:body_hash>', methods=['GET'])

View File

@ -1,7 +1,7 @@
{% from "macros.html" import shorten_string %} {% from "macros.html" import shorten_string %}
<script type="text/javascript"> <script type="text/javascript">
new DataTable('#faviconDetailsTable', { new DataTable('#faviconDetailsTable_{{favicon_sha512}}', {
columnDefs: [{ width: '30%', targets: 0 }, columnDefs: [{ width: '30%', targets: 0 },
{ width: '30%', targets: 1 }, { width: '30%', targets: 1 },
{ width: '50%', targets: 2 }], { width: '50%', targets: 2 }],
@ -11,7 +11,7 @@
<center> <center>
<img src="data:{{mimetype}};base64,{{ b64_favicon }}" style="width:64px;height:64px;"/> <img src="data:{{mimetype}};base64,{{ b64_favicon }}" style="width:64px;height:64px;"/>
</center> </center>
<table id="faviconDetailsTable" class="table table-striped" style="width:100%"> <table id="faviconDetailsTable_{{favicon_sha512}}" class="table table-striped" style="width:100%">
<thead> <thead>
<tr> <tr>
<th>Capture Time</th> <th>Capture Time</th>
@ -39,3 +39,18 @@
{% endfor %} {% endfor %}
</tbody> </tbody>
</table> </table>
{%for probabilistic_hash_algo, entries in probabilistic_favicons.items() %}
<h3>Probabilistic Favicon Hashes ({{ probabilistic_hash_algo }})</h3>
{% for mm3h, favicons in entries.items() %}
<h4>MM3 Hash: {{ mm3h }}</h4>
{% for sha512, favicon in favicons.items() %}
<a href="#faviconDetailsProbabilisticHashModal" data-remote="{{ url_for('favicon_detail', favicon_sha512=sha512, get_probabilistic=0) }}"
data-bs-toggle="modal" data-bs-target="#faviconDetailsProbabilisticHashModal" role="button">
<img src="data:{{mimetype}};base64,{{ b64_favicon }}" style="width:32px;height:32px;"
title="Click to see other captures with the same favicon"/>
</a>
<br>
{% endfor %}
{% endfor %}
{% endfor %}

View File

@ -98,6 +98,13 @@
}); });
</script> </script>
<script> <script>
$('#faviconDetailsProbabilisticHashModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);
modal.find('.modal-body').load(button.data("remote"));
});
</script>
<script>
$('#bodyHashesModal').on('show.bs.modal', function(e) { $('#bodyHashesModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget); var button = $(e.relatedTarget);
var modal = $(this); var modal = $(this);
@ -572,6 +579,26 @@
</div> </div>
</div> </div>
<div class="modal fade" id="faviconDetailsProbabilisticHashModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="faviconDetailsProbabilisticHashModalLabel">Other occurrences of the favicon from a probabilistic hash</h5>
<button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
... loading favicon details from probabilistic hash ...
</div>
<div class="modal-footer">
<a class="btn btn-primary" href="#faviconsModal"
data-remote="{{ url_for('tree_favicons', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#faviconsModal" role="button">Back to capture's favicons</a>
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
<div class="modal fade" id="bodyHashesModal" tabindex="-1" role="dialog"> <div class="modal fade" id="bodyHashesModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document"> <div class="modal-dialog modal-xl" role="document">
<div class="modal-content"> <div class="modal-content">