new: Shodan MMH3 indexing

pull/888/head
Raphaël Vinot 2024-02-26 17:07:23 +01:00
parent 7e25747d82
commit decf887b63
9 changed files with 234 additions and 69 deletions

View File

@ -33,6 +33,7 @@ class BackgroundIndexer(AbstractManager):
all_done = self._build_missing_pickles()
if all_done:
self._check_indexes()
self._check_probabilistic_indexes()
self.lookyloo.update_tree_cache_info(os.getpid(), self.script_name)
def _build_missing_pickles(self) -> bool:
@ -168,6 +169,33 @@ class BackgroundIndexer(AbstractManager):
index_redis.delete('ongoing_indexing')
self.logger.info('... done.')
def _check_probabilistic_indexes(self) -> None:
    '''Make sure the favicons of every capture are indexed with each probabilistic hash algorithm.

    A Redis key set with NX and a one hour expiry is used as a lock, so only
    one script runs this at a time. The key is removed in a ``finally`` clause:
    without it, any exception raised while indexing would keep the lock in
    place until it expires and block every other run in the meantime.
    '''
    index_redis = self.lookyloo.indexing.redis
    can_index = index_redis.set('ongoing_probalistic_indexing', 1, ex=3600, nx=True)
    if not can_index:
        # There is no reason to run this method in multiple scripts.
        self.logger.info('Probalistic indexing already ongoing in another process.')
        return None
    try:
        self.logger.info('Check probabilistic indexes...')
        algorithms = ['mmh3-shodan']
        for cache in self.lookyloo.sorted_capture_cache(cached_captures_only=False):
            if self.lookyloo.is_public_instance and cache.no_index:
                # Capture unindexed
                continue
            # One round trip to learn, per algorithm, whether this capture is already indexed.
            p = index_redis.pipeline()
            for algorithm in algorithms:
                p.sismember(f'indexed_favicons_probabilistic|{algorithm}', cache.uuid)
            indexed = p.execute()
            if all(indexed):
                continue
            for algorithm, already_indexed in zip(algorithms, indexed):
                if already_indexed:
                    continue
                self.logger.info(f'Probabilistic indexing favicons for {cache.uuid} with {algorithm}')
                favicons = self.lookyloo.get_potential_favicons(cache.uuid, all_favicons=True, for_datauri=False)
                self.lookyloo.indexing.index_favicons_probabilistic(cache.uuid, favicons, algorithm)
    finally:
        # Always release the lock, even when indexing a capture failed.
        index_redis.delete('ongoing_probalistic_indexing')
    self.logger.info('... done.')
def main() -> None:
i = BackgroundIndexer()

View File

@ -99,7 +99,7 @@ class Processing(AbstractManager):
to_requeue.append(uuid)
for uuid in to_requeue:
if self.lookyloo.redis.zscore('to_capture', uuid) is None
if self.lookyloo.redis.zscore('to_capture', uuid) is None:
# The capture has been captured in the meantime.
continue
self.logger.info(f'Found a non-queued capture ({uuid}), retrying now.')

View File

@ -2,6 +2,7 @@
from __future__ import annotations
import base64
import hashlib
import logging
# import re
@ -11,6 +12,8 @@ from typing import Iterable
from urllib.parse import urlsplit
from zipfile import ZipFile
import mmh3
from har2tree import CrawledTree
from redis import ConnectionPool, Redis
from redis.connection import UnixDomainSocketConnection
@ -372,6 +375,55 @@ class Indexing():
def get_favicon(self, favicon_sha512: str) -> bytes | None:
return self.redis_bytes.get(f'favicons|{favicon_sha512}')
# ###### favicons probabilistic hashes ######
def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None:
return self.redis.zscore(f'favicons|{algorithm}', phash)
def index_favicons_probabilistic(self, capture_uuid: str, favicons: BytesIO, algorithm: str) -> None:
    '''Index every favicon of a capture under its probabilistic hash.

    :param capture_uuid: UUID of the capture the favicons belong to.
    :param favicons: Zip archive of the favicons; only non-empty ``.ico`` entries are indexed.
    :param algorithm: Name of the probabilistic hash algorithm. Only ``mmh3-shodan`` is implemented.
    :raises NotImplementedError: if the algorithm is unknown. This is checked
        before anything is written, so an unsupported algorithm never leaves
        the capture flagged as indexed.
    '''
    indexed_key = f'indexed_favicons_probabilistic|{algorithm}'
    if self.redis.sismember(indexed_key, capture_uuid):
        # Do not reindex
        return
    if algorithm != 'mmh3-shodan':
        raise NotImplementedError(f'Unknown algorithm: {algorithm}')
    self.redis.sadd(indexed_key, capture_uuid)
    pipeline = self.redis.pipeline()
    with ZipFile(favicons, 'r') as myzip:
        for name in myzip.namelist():
            if not name.endswith('.ico'):
                continue
            favicon = myzip.read(name)
            if not favicon:
                # Empty file, ignore.
                continue
            sha = hashlib.sha512(favicon).hexdigest()
            # Shodan uses a weird technique:
            # 1. encodes the image to base64, with newlines every 76 characters (as per RFC 2045)
            # 2. hashes the base64 string with mmh3
            b64 = base64.encodebytes(favicon)
            h = str(mmh3.hash(b64))
            pipeline.zincrby(f'favicons|{algorithm}', 1, h)
            # All captures with this hash for this algorithm
            pipeline.sadd(f'favicons|{algorithm}|{h}|captures', capture_uuid)
            # All favicon SHA512 with this hash for this algorithm
            pipeline.sadd(f'favicons|{algorithm}|{h}|favicons', sha)
            # reverse lookup to get probabilistic hashes related to a specific favicon
            pipeline.sadd(f'favicons|{algorithm}|{sha}', h)
    pipeline.execute()
def get_hashes_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
    '''Members of the ``favicons|<algorithm>|<phash>|favicons`` set: the favicon SHA512 sharing this probabilistic hash.'''
    favicons_key = f'favicons|{algorithm}|{phash}|favicons'
    return self.redis.smembers(favicons_key)
def get_probabilistic_hashes_favicon(self, algorithm: str, favicon_sha512: str) -> set[str]:
    '''Members of the ``favicons|<algorithm>|<sha512>`` set: the probabilistic hashes recorded for this favicon.'''
    reverse_key = f'favicons|{algorithm}|{favicon_sha512}'
    return self.redis.smembers(reverse_key)
def get_captures_favicon_probablistic(self, algorithm: str, phash: str) -> set[str]:
    '''Members of the ``favicons|<algorithm>|<phash>|captures`` set: the captures having this probabilistic hash.'''
    captures_key = f'favicons|{algorithm}|{phash}|captures'
    return self.redis.smembers(captures_key)
# ###### Categories ######
@property

View File

@ -34,6 +34,7 @@ from lacuscore import (LacusCore,
CaptureSettings as CaptureSettingsCore)
from PIL import Image, UnidentifiedImageError
from playwrightcapture import get_devices
from puremagic import from_string # type: ignore[import-untyped]
from pylacus import (PyLacus,
CaptureStatus as CaptureStatusPy
# CaptureResponse as CaptureResponsePy,
@ -1055,12 +1056,46 @@ class Lookyloo():
for domain, freq in self.indexing.get_cookie_domains(cookie_name)]
return captures, domains
def get_favicon_investigator(self, favicon_sha512: str,
                             /,
                             get_probabilistic: bool=True) -> tuple[list[tuple[str, str, str, datetime]],
                                                                    tuple[str, str],
                                                                    dict[str, dict[str, dict[str, tuple[str, str]]]]]:
    '''Returns all the captures related to a favicon (SHA512), used in the web interface.

    :param favicon_sha512: SHA512 of the favicon to investigate.
    :param get_probabilistic: when true, also look up the other favicons sharing
        a probabilistic hash with this one.
    :return: a 3-tuple made of
        * the captures as ``(uuid, title, last redirect, timestamp)``,
        * ``(mimetype, base64 content)`` of the investigated favicon (two empty
          strings when it cannot be retrieved),
        * ``{algorithm: {probabilistic hash: {sha512: (mimetype, base64 content)}}}``
          for the related favicons; hashes and algorithms without any related
          favicon are left out.
    '''
    cached_captures = self.sorted_capture_cache(list(self.indexing.get_captures_favicon(favicon_sha512)))
    captures = [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures]
    favicon = self.indexing.get_favicon(favicon_sha512)
    if favicon:
        mimetype = from_string(favicon, mime=True)
        b64_favicon = base64.b64encode(favicon).decode()
    else:
        mimetype = ''
        b64_favicon = ''

    # For now, there is only one probabilistic hash algo for favicons, keeping it simple
    probabilistic_hash_algos = ['mmh3-shodan']
    probabilistic_favicons: dict[str, dict[str, dict[str, tuple[str, str]]]] = {}
    if get_probabilistic:
        for algo in probabilistic_hash_algos:
            favicons_by_hash: dict[str, dict[str, tuple[str, str]]] = {}
            for mm3hash in self.indexing.get_probabilistic_hashes_favicon(algo, favicon_sha512):
                related: dict[str, tuple[str, str]] = {}
                for sha512 in self.indexing.get_hashes_favicon_probablistic(algo, mm3hash):
                    if sha512 == favicon_sha512:
                        # Skip entry if it is the same as the favicon we are investigating
                        continue
                    # Dedicated names: reusing favicon/mimetype/b64_favicon here
                    # would overwrite the values returned for the investigated favicon.
                    related_favicon = self.indexing.get_favicon(sha512)
                    if not related_favicon:
                        continue
                    related[sha512] = (from_string(related_favicon, mime=True),
                                       base64.b64encode(related_favicon).decode())
                if related:
                    # Only keep hashes that have at least one other favicon
                    favicons_by_hash[mm3hash] = related
            if favicons_by_hash:
                # Only keep algorithms that have at least one hash
                probabilistic_favicons[algo] = favicons_by_hash
    return captures, (mimetype, b64_favicon), probabilistic_favicons
def get_hhh_investigator(self, hhh: str, /) -> tuple[list[tuple[str, str, str, str]], list[tuple[str, str]]]:
'''Returns all the captures related to a cookie name entry, used in the web interface.'''

112
poetry.lock generated
View File

@ -565,43 +565,43 @@ files = [
[[package]]
name = "cryptography"
version = "42.0.4"
version = "42.0.5"
description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
optional = false
python-versions = ">=3.7"
files = [
{file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ffc73996c4fca3d2b6c1c8c12bfd3ad00def8621da24f547626bf06441400449"},
{file = "cryptography-42.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:db4b65b02f59035037fde0998974d84244a64c3265bdef32a827ab9b63d61b18"},
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dad9c385ba8ee025bb0d856714f71d7840020fe176ae0229de618f14dae7a6e2"},
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69b22ab6506a3fe483d67d1ed878e1602bdd5912a134e6202c1ec672233241c1"},
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e09469a2cec88fb7b078e16d4adec594414397e8879a4341c6ace96013463d5b"},
{file = "cryptography-42.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:3e970a2119507d0b104f0a8e281521ad28fc26f2820687b3436b8c9a5fcf20d1"},
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:e53dc41cda40b248ebc40b83b31516487f7db95ab8ceac1f042626bc43a2f992"},
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c3a5cbc620e1e17009f30dd34cb0d85c987afd21c41a74352d1719be33380885"},
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:6bfadd884e7280df24d26f2186e4e07556a05d37393b0f220a840b083dc6a824"},
{file = "cryptography-42.0.4-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:01911714117642a3f1792c7f376db572aadadbafcd8d75bb527166009c9f1d1b"},
{file = "cryptography-42.0.4-cp37-abi3-win32.whl", hash = "sha256:fb0cef872d8193e487fc6bdb08559c3aa41b659a7d9be48b2e10747f47863925"},
{file = "cryptography-42.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c1f25b252d2c87088abc8bbc4f1ecbf7c919e05508a7e8628e6875c40bc70923"},
{file = "cryptography-42.0.4-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:15a1fb843c48b4a604663fa30af60818cd28f895572386e5f9b8a665874c26e7"},
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1327f280c824ff7885bdeef8578f74690e9079267c1c8bd7dc5cc5aa065ae52"},
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ffb03d419edcab93b4b19c22ee80c007fb2d708429cecebf1dd3258956a563a"},
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:1df6fcbf60560d2113b5ed90f072dc0b108d64750d4cbd46a21ec882c7aefce9"},
{file = "cryptography-42.0.4-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:44a64043f743485925d3bcac548d05df0f9bb445c5fcca6681889c7c3ab12764"},
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:3c6048f217533d89f2f8f4f0fe3044bf0b2090453b7b73d0b77db47b80af8dff"},
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6d0fbe73728c44ca3a241eff9aefe6496ab2656d6e7a4ea2459865f2e8613257"},
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:887623fe0d70f48ab3f5e4dbf234986b1329a64c066d719432d0698522749929"},
{file = "cryptography-42.0.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ce8613beaffc7c14f091497346ef117c1798c202b01153a8cc7b8e2ebaaf41c0"},
{file = "cryptography-42.0.4-cp39-abi3-win32.whl", hash = "sha256:810bcf151caefc03e51a3d61e53335cd5c7316c0a105cc695f0959f2c638b129"},
{file = "cryptography-42.0.4-cp39-abi3-win_amd64.whl", hash = "sha256:a0298bdc6e98ca21382afe914c642620370ce0470a01e1bef6dd9b5354c36854"},
{file = "cryptography-42.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5f8907fcf57392cd917892ae83708761c6ff3c37a8e835d7246ff0ad251d9298"},
{file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:12d341bd42cdb7d4937b0cabbdf2a94f949413ac4504904d0cdbdce4a22cbf88"},
{file = "cryptography-42.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1cdcdbd117681c88d717437ada72bdd5be9de117f96e3f4d50dab3f59fd9ab20"},
{file = "cryptography-42.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0e89f7b84f421c56e7ff69f11c441ebda73b8a8e6488d322ef71746224c20fce"},
{file = "cryptography-42.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f1e85a178384bf19e36779d91ff35c7617c885da487d689b05c1366f9933ad74"},
{file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d2a27aca5597c8a71abbe10209184e1a8e91c1fd470b5070a2ea60cafec35bcd"},
{file = "cryptography-42.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4e36685cb634af55e0677d435d425043967ac2f3790ec652b2b88ad03b85c27b"},
{file = "cryptography-42.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:f47be41843200f7faec0683ad751e5ef11b9a56a220d57f300376cd8aba81660"},
{file = "cryptography-42.0.4.tar.gz", hash = "sha256:831a4b37accef30cccd34fcb916a5d7b5be3cbbe27268a02832c3e450aea39cb"},
{file = "cryptography-42.0.5-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:a30596bae9403a342c978fb47d9b0ee277699fa53bbafad14706af51fe543d16"},
{file = "cryptography-42.0.5-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:b7ffe927ee6531c78f81aa17e684e2ff617daeba7f189f911065b2ea2d526dec"},
{file = "cryptography-42.0.5-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2424ff4c4ac7f6b8177b53c17ed5d8fa74ae5955656867f5a8affaca36a27abb"},
{file = "cryptography-42.0.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:329906dcc7b20ff3cad13c069a78124ed8247adcac44b10bea1130e36caae0b4"},
{file = "cryptography-42.0.5-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:b03c2ae5d2f0fc05f9a2c0c997e1bc18c8229f392234e8a0194f202169ccd278"},
{file = "cryptography-42.0.5-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:f8837fe1d6ac4a8052a9a8ddab256bc006242696f03368a4009be7ee3075cdb7"},
{file = "cryptography-42.0.5-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:0270572b8bd2c833c3981724b8ee9747b3ec96f699a9665470018594301439ee"},
{file = "cryptography-42.0.5-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:b8cac287fafc4ad485b8a9b67d0ee80c66bf3574f655d3b97ef2e1082360faf1"},
{file = "cryptography-42.0.5-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:16a48c23a62a2f4a285699dba2e4ff2d1cff3115b9df052cdd976a18856d8e3d"},
{file = "cryptography-42.0.5-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:2bce03af1ce5a5567ab89bd90d11e7bbdff56b8af3acbbec1faded8f44cb06da"},
{file = "cryptography-42.0.5-cp37-abi3-win32.whl", hash = "sha256:b6cd2203306b63e41acdf39aa93b86fb566049aeb6dc489b70e34bcd07adca74"},
{file = "cryptography-42.0.5-cp37-abi3-win_amd64.whl", hash = "sha256:98d8dc6d012b82287f2c3d26ce1d2dd130ec200c8679b6213b3c73c08b2b7940"},
{file = "cryptography-42.0.5-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:5e6275c09d2badf57aea3afa80d975444f4be8d3bc58f7f80d2a484c6f9485c8"},
{file = "cryptography-42.0.5-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e4985a790f921508f36f81831817cbc03b102d643b5fcb81cd33df3fa291a1a1"},
{file = "cryptography-42.0.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cde5f38e614f55e28d831754e8a3bacf9ace5d1566235e39d91b35502d6936e"},
{file = "cryptography-42.0.5-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:7367d7b2eca6513681127ebad53b2582911d1736dc2ffc19f2c3ae49997496bc"},
{file = "cryptography-42.0.5-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:cd2030f6650c089aeb304cf093f3244d34745ce0cfcc39f20c6fbfe030102e2a"},
{file = "cryptography-42.0.5-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:a2913c5375154b6ef2e91c10b5720ea6e21007412f6437504ffea2109b5a33d7"},
{file = "cryptography-42.0.5-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:c41fb5e6a5fe9ebcd58ca3abfeb51dffb5d83d6775405305bfa8715b76521922"},
{file = "cryptography-42.0.5-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:3eaafe47ec0d0ffcc9349e1708be2aaea4c6dd4978d76bf6eb0cb2c13636c6fc"},
{file = "cryptography-42.0.5-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1b95b98b0d2af784078fa69f637135e3c317091b615cd0905f8b8a087e86fa30"},
{file = "cryptography-42.0.5-cp39-abi3-win32.whl", hash = "sha256:1f71c10d1e88467126f0efd484bd44bca5e14c664ec2ede64c32f20875c0d413"},
{file = "cryptography-42.0.5-cp39-abi3-win_amd64.whl", hash = "sha256:a011a644f6d7d03736214d38832e030d8268bcff4a41f728e6030325fea3e400"},
{file = "cryptography-42.0.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:9481ffe3cf013b71b2428b905c4f7a9a4f76ec03065b05ff499bb5682a8d9ad8"},
{file = "cryptography-42.0.5-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:ba334e6e4b1d92442b75ddacc615c5476d4ad55cc29b15d590cc6b86efa487e2"},
{file = "cryptography-42.0.5-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ba3e4a42397c25b7ff88cdec6e2a16c2be18720f317506ee25210f6d31925f9c"},
{file = "cryptography-42.0.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:111a0d8553afcf8eb02a4fea6ca4f59d48ddb34497aa8706a6cf536f1a5ec576"},
{file = "cryptography-42.0.5-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:cd65d75953847815962c84a4654a84850b2bb4aed3f26fadcc1c13892e1e29f6"},
{file = "cryptography-42.0.5-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e807b3188f9eb0eaa7bbb579b462c5ace579f1cedb28107ce8b48a9f7ad3679e"},
{file = "cryptography-42.0.5-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f12764b8fffc7a123f641d7d049d382b73f96a34117e0b637b80643169cec8ac"},
{file = "cryptography-42.0.5-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:37dd623507659e08be98eec89323469e8c7b4c1407c85112634ae3dbdb926fdd"},
{file = "cryptography-42.0.5.tar.gz", hash = "sha256:6fe07eec95dfd477eb9530aef5bead34fec819b3aaf6c5bd6d20565da607bfe1"},
]
[package.dependencies]
@ -1237,13 +1237,13 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs
[[package]]
name = "importlib-resources"
version = "6.1.1"
version = "6.1.2"
description = "Read resources from Python packages"
optional = false
python-versions = ">=3.8"
files = [
{file = "importlib_resources-6.1.1-py3-none-any.whl", hash = "sha256:e8bf90d8213b486f428c9c39714b920041cb02c184686a3dee24905aaa8105d6"},
{file = "importlib_resources-6.1.1.tar.gz", hash = "sha256:3893a00122eafde6894c59914446a512f728a0c1a45f9bb9b63721b6bacf0b4a"},
{file = "importlib_resources-6.1.2-py3-none-any.whl", hash = "sha256:9a0a862501dc38b68adebc82970140c9e4209fc99601782925178f8386339938"},
{file = "importlib_resources-6.1.2.tar.gz", hash = "sha256:308abf8474e2dba5f867d279237cd4076482c3de7104a40b41426370e891549b"},
]
[package.dependencies]
@ -1251,7 +1251,7 @@ zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""}
[package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"]
testing = ["pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-ruff", "zipp (>=3.17)"]
testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy", "pytest-ruff (>=0.2.1)", "zipp (>=3.17)"]
[[package]]
name = "ipython"
@ -2286,13 +2286,13 @@ test = ["pytest"]
[[package]]
name = "playwrightcapture"
version = "1.23.4"
version = "1.23.7"
description = "A simple library to capture websites using playwright"
optional = false
python-versions = ">=3.8,<4.0"
files = [
{file = "playwrightcapture-1.23.4-py3-none-any.whl", hash = "sha256:925117509adf3bbb48ba03eb87ec5b1230ae68f228156066ca3a4c7aee3744c0"},
{file = "playwrightcapture-1.23.4.tar.gz", hash = "sha256:7dd2fc11f7047fc4efd369ba3e52e0879ed1fca0255efa2c427dada6e6a08cf7"},
{file = "playwrightcapture-1.23.7-py3-none-any.whl", hash = "sha256:26fba31f1412ceddf4595a26d635516694ddfed46b13f7c9117fd371c53d7525"},
{file = "playwrightcapture-1.23.7.tar.gz", hash = "sha256:e317b777c40214dc3f2215ac5fdcb0470d221ce0ceff31bfdcbfb2e317cfa096"},
]
[package.dependencies]
@ -2300,10 +2300,11 @@ beautifulsoup4 = {version = ">=4.12.3,<5.0.0", extras = ["charset-normalizer", "
dateparser = ">=1.2.0,<2.0.0"
playwright = ">=1.41.2,<2.0.0"
playwright-stealth = ">=1.0.6,<2.0.0"
puremagic = ">=1.20,<2.0"
pydub = {version = ">=0.25.1,<0.26.0", optional = true, markers = "extra == \"recaptcha\""}
pytz = {version = ">=2024.1,<2025.0", markers = "python_version < \"3.9\""}
requests = {version = ">=2.31.0,<3.0.0", extras = ["socks"], optional = true, markers = "extra == \"recaptcha\""}
setuptools = ">=69.1.0,<70.0.0"
setuptools = ">=69.1.1,<70.0.0"
SpeechRecognition = {version = ">=3.10.1,<4.0.0", optional = true, markers = "extra == \"recaptcha\""}
tzdata = ">=2024.1,<2025.0"
w3lib = ">=2.1.2,<3.0.0"
@ -2365,6 +2366,17 @@ files = [
[package.extras]
tests = ["pytest"]
[[package]]
name = "puremagic"
version = "1.21"
description = "Pure python implementation of magic file detection"
optional = false
python-versions = "*"
files = [
{file = "puremagic-1.21-py3-none-any.whl", hash = "sha256:8fe85c05800fe1eacdd5aa943b9e7fdbee66bc41a17aacf80efd6c668c63df45"},
{file = "puremagic-1.21.tar.gz", hash = "sha256:31ef09b37a6ad2f7f2b09b5bd6b8c4a07187a01af4025f5f1368889bdfc6d779"},
]
[[package]]
name = "pycparser"
version = "2.21"
@ -3084,19 +3096,19 @@ boto3 = ["aiobotocore[boto3] (>=2.5.4,<3.0.0)"]
[[package]]
name = "setuptools"
version = "69.1.0"
version = "69.1.1"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.8"
files = [
{file = "setuptools-69.1.0-py3-none-any.whl", hash = "sha256:c054629b81b946d63a9c6e732bc8b2513a7c3ea645f11d0139a2191d735c60c6"},
{file = "setuptools-69.1.0.tar.gz", hash = "sha256:850894c4195f09c4ed30dba56213bf7c3f21d86ed6bdaafb5df5972593bfc401"},
{file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"},
{file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"},
]
[package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.1)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
name = "six"
@ -3353,13 +3365,13 @@ files = [
[[package]]
name = "typing-extensions"
version = "4.9.0"
version = "4.10.0"
description = "Backported and Experimental Type Hints for Python 3.8+"
optional = false
python-versions = ">=3.8"
files = [
{file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"},
{file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"},
{file = "typing_extensions-4.10.0-py3-none-any.whl", hash = "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475"},
{file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"},
]
[[package]]
@ -3722,4 +3734,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<3.13"
content-hash = "729283436c4c78f2f0114df11aabb366542ee1f23799bd3c91c814dbc3d9eabd"
content-hash = "bde26fa1f6022d48b4976b949fbd87eaf483d7e174624d319ea1c2f1f7f430d7"

View File

@ -80,8 +80,8 @@ urllib3 = [
{version = "^2.0.7", python = ">=3.10"}
]
pypdns = "^2.2.2"
python-magic = "^0.4.27"
mmh3 = "^4.1.0"
puremagic = "^1.21"
[tool.poetry.group.dev.dependencies]
mypy = "^1.8.0"

View File

@ -15,7 +15,6 @@ import sys
import time
import filetype # type: ignore[import-untyped]
import magic
from datetime import date, datetime, timedelta, timezone
from importlib.metadata import version
@ -32,6 +31,7 @@ from flask_bootstrap import Bootstrap5 # type: ignore[import-untyped]
from flask_cors import CORS # type: ignore[import-untyped]
from flask_restx import Api # type: ignore[import-untyped]
from lacuscore import CaptureStatus
from puremagic import from_string # type: ignore[import-untyped]
from pymisp import MISPEvent, MISPServerError # type: ignore[attr-defined]
from werkzeug.security import check_password_hash
from werkzeug.wrappers.response import Response as WerkzeugResponse
@ -840,7 +840,6 @@ def mark_as_legitimate(tree_uuid: str) -> Response:
def tree_favicons(tree_uuid: str) -> str:
favicons = []
favicons_zip = lookyloo.get_potential_favicons(tree_uuid, all_favicons=True, for_datauri=False)
f = magic.Magic(mime=True)
with ZipFile(favicons_zip, 'r') as myzip:
for name in myzip.namelist():
if not name.endswith('.ico'):
@ -848,7 +847,7 @@ def tree_favicons(tree_uuid: str) -> str:
favicon = myzip.read(name)
if not favicon:
continue
mimetype = f.from_buffer(favicon)
mimetype = from_string(favicon, mime=True)
favicon_sha512 = hashlib.sha512(favicon).hexdigest()
frequency = lookyloo.indexing.favicon_frequency(favicon_sha512)
number_captures = lookyloo.indexing.favicon_number_captures(favicon_sha512)
@ -1251,17 +1250,14 @@ def hhh_detail(hhh: str) -> str:
@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
@app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
def favicon_detail(favicon_sha512: str, get_probabilistic: int=1) -> str:
    '''Render the details of a favicon: the captures it appears in and, unless
    ``get_probabilistic`` is 0, the other favicons sharing a probabilistic hash.'''
    # Strip once and use the same value everywhere: the template also builds
    # an element id and a selector from the hash.
    favicon_sha512 = favicon_sha512.strip()
    _get_prob = bool(get_probabilistic)
    captures, favicon, probabilistic_favicons = lookyloo.get_favicon_investigator(favicon_sha512, get_probabilistic=_get_prob)
    mimetype, b64_favicon = favicon
    return render_template('favicon_details.html', favicon_sha512=favicon_sha512,
                           captures=captures, mimetype=mimetype, b64_favicon=b64_favicon,
                           probabilistic_favicons=probabilistic_favicons)
@app.route('/body_hashes/<string:body_hash>', methods=['GET'])

View File

@ -1,7 +1,7 @@
{% from "macros.html" import shorten_string %}
<script type="text/javascript">
new DataTable('#faviconDetailsTable', {
new DataTable('#faviconDetailsTable_{{favicon_sha512}}', {
columnDefs: [{ width: '30%', targets: 0 },
{ width: '30%', targets: 1 },
{ width: '50%', targets: 2 }],
@ -11,7 +11,7 @@
<center>
<img src="data:{{mimetype}};base64,{{ b64_favicon }}" style="width:64px;height:64px;"/>
</center>
<table id="faviconDetailsTable" class="table table-striped" style="width:100%">
<table id="faviconDetailsTable_{{favicon_sha512}}" class="table table-striped" style="width:100%">
<thead>
<tr>
<th>Capture Time</th>
@ -39,3 +39,18 @@
{% endfor %}
</tbody>
</table>
{# Other favicons sharing a probabilistic hash with the one displayed above. #}
{% for probabilistic_hash_algo, entries in probabilistic_favicons.items() %}
<h3>Probabilistic Favicon Hashes ({{ probabilistic_hash_algo }})</h3>
  {% for mm3h, favicons in entries.items() %}
  <h4>MM3 Hash: {{ mm3h }}</h4>
    {# favicon is a (mimetype, base64 content) pair: render that image, not the one from the outer context #}
    {% for sha512, favicon in favicons.items() %}
    <a href="#faviconDetailsProbabilisticHashModal" data-remote="{{ url_for('favicon_detail', favicon_sha512=sha512, get_probabilistic=0) }}"
       data-bs-toggle="modal" data-bs-target="#faviconDetailsProbabilisticHashModal" role="button">
      <img src="data:{{ favicon[0] }};base64,{{ favicon[1] }}" style="width:32px;height:32px;"
           title="Click to see other captures with the same favicon"/>
    </a>
    <br>
    {% endfor %}
  {% endfor %}
{% endfor %}

View File

@ -98,6 +98,13 @@
});
</script>
<script>
// When the probabilistic-hash favicon modal is about to be shown, load the URL
// held in the triggering element's data-remote attribute into the modal body.
$('#faviconDetailsProbabilisticHashModal').on('show.bs.modal', function(event) {
  var remoteUrl = $(event.relatedTarget).data("remote");
  $(this).find('.modal-body').load(remoteUrl);
});
</script>
<script>
$('#bodyHashesModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);
@ -572,6 +579,26 @@
</div>
</div>
<div class="modal fade" id="faviconDetailsProbabilisticHashModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="faviconDetailsProbabilisticHashModalLabel">Other occurrences of the favicon from a probabilistic hash</h5>
<button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
... loading favicon details from probabilistic hash ...
</div>
<div class="modal-footer">
<a class="btn btn-primary" href="#faviconsModal"
data-remote="{{ url_for('tree_favicons', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#faviconsModal" role="button">Back to capture's favicons</a>
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
<div class="modal fade" id="bodyHashesModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">