diff --git a/bin/background_indexer.py b/bin/background_indexer.py index df9f440..3e6a7ff 100755 --- a/bin/background_indexer.py +++ b/bin/background_indexer.py @@ -38,7 +38,7 @@ class BackgroundIndexer(AbstractManager): # Don't need the cache in this class. self.lookyloo.clear_tree_cache() - def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool], str], None, None]: + def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool], str], None, None]: # NOTE: only get the non-archived captures for now. for uuid, directory in self.redis.hscan_iter('lookup_dirs'): if not self.full_indexer: @@ -85,6 +85,9 @@ class BackgroundIndexer(AbstractManager): self.logger.info(f'Indexing favicons for {uuid_to_index}') favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False) self.indexing.index_favicons_capture(uuid_to_index, favicons) + if not indexed[5]: + self.logger.info(f'Indexing identifiers for {uuid_to_index}') + self.indexing.index_identifiers_capture(ct) # NOTE: categories aren't taken in account here, should be fixed(?) # see indexing.index_categories_capture(capture_uuid, categories) self.indexing.indexing_done() diff --git a/lookyloo/indexing.py b/lookyloo/indexing.py index e335d12..ea2b141 100644 --- a/lookyloo/indexing.py +++ b/lookyloo/indexing.py @@ -65,16 +65,18 @@ class Indexing(): p.srem('indexed_cookies', capture_uuid) p.srem('indexed_hhhashes', capture_uuid) p.srem('indexed_favicons', capture_uuid) + p.srem('indexed_identifiers', capture_uuid) p.execute() - def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool]: + def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool]: p = self.redis.pipeline() p.sismember('indexed_urls', capture_uuid) p.sismember('indexed_body_hashes', capture_uuid) p.sismember('indexed_cookies', capture_uuid) p.sismember('indexed_hhhashes', capture_uuid) p.sismember('indexed_favicons', capture_uuid) - # This call for sure returns a tuple of 5 booleans + p.sismember('indexed_identifiers', capture_uuid) + # This call for sure returns a tuple of 6 booleans return p.execute() # type: ignore[return-value] # ###### Cookies ###### @@ -365,6 +367,57 @@ class Indexing(): def get_favicon(self, favicon_sha512: str) -> bytes | None: return self.redis_bytes.get(f'favicons|{favicon_sha512}') + # ###### identifiers ###### + + def identifiers_types(self) -> set[str]: + return self.redis.smembers('identifiers_types') + + def identifiers(self, identifier_type: str) -> list[tuple[str, float]]: + return self.redis.zrevrange(f'identifiers|{identifier_type}', 0, 200, withscores=True) + + def identifier_frequency(self, identifier_type: str, identifier: str) -> float | None: + return self.redis.zscore(f'identifiers|{identifier_type}', identifier) + + def identifier_number_captures(self, identifier_type: str, identifier: str) -> int: + return self.redis.scard(f'identifiers|{identifier_type}|{identifier}|captures') + + def index_identifiers_capture(self, crawled_tree: CrawledTree) -> None: + capture_uuid = crawled_tree.uuid + if self.redis.sismember('indexed_identifiers', capture_uuid): + # Do not reindex + return + self.redis.sadd('indexed_identifiers', capture_uuid) + if (not hasattr(crawled_tree.root_hartree.rendered_node, 'identifiers') + or not crawled_tree.root_hartree.rendered_node.identifiers): + return + pipeline = self.redis.pipeline() + # We have multiple identifiers types, this is the difference with the other indexes + for identifier_type, id_values in crawled_tree.root_hartree.rendered_node.identifiers.items(): + pipeline.sadd('identifiers_types', identifier_type) # no-op if already there + if self.redis.sismember(f'indexed_identifiers|{identifier_type}|captures', capture_uuid): + # Do not reindex the same identifier type for the same capture + continue + pipeline.sadd(f'indexed_identifiers|{identifier_type}|captures', capture_uuid) + self.logger.debug(f'Indexing identifiers {identifier_type} for {capture_uuid} ... ') + for identifier in id_values: + if self.redis.sismember(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid): + # Already counted this specific identifier for this capture + continue + pipeline.sadd(f'identifiers|{capture_uuid}', identifier_type) + pipeline.sadd(f'identifiers|{capture_uuid}|{identifier_type}', identifier) + pipeline.sadd(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid) + pipeline.zincrby(f'identifiers|{identifier_type}', 1, identifier) + pipeline.execute() + + def get_identifiers_capture(self, capture_uuid: str) -> dict[str, set[str]]: + to_return = {} + for identifier_type in self.redis.smembers(f'identifiers|{capture_uuid}'): + to_return[identifier_type] = self.redis.smembers(f'identifiers|{capture_uuid}|{identifier_type}') + return to_return + + def get_captures_identifier(self, identifier_type: str, identifier: str) -> set[str]: + return self.redis.smembers(f'identifiers|{identifier_type}|{identifier}|captures') + # ###### favicons probabilistic hashes ###### def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None: diff --git a/poetry.lock b/poetry.lock index d359261..18d5440 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aiobotocore" @@ -1063,13 +1063,13 @@ tornado = ["tornado (>=0.2)"] [[package]] name = "har2tree" -version = "1.23.0" +version = "1.23.1" description = "HTTP Archive (HAR) to ETE Toolkit generator" optional = false python-versions = ">=3.8,<3.13" files = [ - {file = "har2tree-1.23.0-py3-none-any.whl", hash = "sha256:ccb3bb575192350c76724a6bd1024775f4bf1059f498c0b6a5ee18c07d57a3d9"}, - {file = "har2tree-1.23.0.tar.gz", hash = "sha256:eb67792a24b19351a04e897ac69b542a51facdc569bc176842cb5db2ba00d7e1"}, + {file = "har2tree-1.23.1-py3-none-any.whl", hash = "sha256:a524ae46dd6c748ddc743b50e2b29b19d27d22ee80d765e1a5cf890bf1793c11"}, + {file = "har2tree-1.23.1.tar.gz", hash = "sha256:70a77f0bd2293e7fedfa03254fc67ce31258fefe3b471acc067ce8d84aeb1838"}, ] [package.dependencies] @@ -1237,13 +1237,13 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs [[package]] name = "importlib-resources" -version = "6.1.3" +version = "6.3.0" description = "Read resources from Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "importlib_resources-6.1.3-py3-none-any.whl", hash = "sha256:4c0269e3580fe2634d364b39b38b961540a7738c02cb984e98add8b4221d793d"}, - {file = "importlib_resources-6.1.3.tar.gz", hash = "sha256:56fb4525197b78544a3354ea27793952ab93f935bb4bf746b846bb1015020f2b"}, + {file = "importlib_resources-6.3.0-py3-none-any.whl", hash = "sha256:783407aa1cd05550e3aa123e8f7cfaebee35ffa9cb0242919e2d1e4172222705"}, + {file = "importlib_resources-6.3.0.tar.gz", hash = "sha256:166072a97e86917a9025876f34286f549b9caf1d10b35a1b372bffa1600c6569"}, ] [package.dependencies] @@ -2339,13 +2339,13 @@ files = [ [[package]] name = "publicsuffixlist" -version = "0.10.0.20240305" +version = "0.10.0.20240312" description = "publicsuffixlist implement" optional = false python-versions = ">=2.6" files = [ - {file = "publicsuffixlist-0.10.0.20240305-py2.py3-none-any.whl", hash = "sha256:f6869119f8781501c0c625e59b4b65eb60e2ed5185cfd6c142c792f74ac47c21"}, - {file = "publicsuffixlist-0.10.0.20240305.tar.gz", hash = "sha256:6e79ea73b0278ce1b102f3ad6815f2a5b683864da9948ba0b0eab3180c419f7f"}, + {file = "publicsuffixlist-0.10.0.20240312-py2.py3-none-any.whl", hash = "sha256:47fd7724b8a7c8d8732d4f5380019f74acd557a406c2a485540d1a4aae6cb359"}, + {file = "publicsuffixlist-0.10.0.20240312.tar.gz", hash = "sha256:02912f3e084fad67e2463365fd431544921d9db3f51f6d43cea7169c86b1f188"}, ] [package.extras] @@ -3097,18 +3097,18 @@ boto3 = ["aiobotocore[boto3] (>=2.5.4,<3.0.0)"] [[package]] name = "setuptools" -version = "69.1.1" +version = "69.2.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"}, - {file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"}, + {file = "setuptools-69.2.0-py3-none-any.whl", hash = "sha256:c21c49fb1042386df081cb5d86759792ab89efca84cf114889191cd09aacc80c"}, + {file = "setuptools-69.2.0.tar.gz", hash = "sha256:0ff4183f8f42cd8fa3acea16c45205521a4ef28f73c6391d8a25e92893134f2e"}, ] [package.extras] docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] [[package]] @@ -3203,18 +3203,18 @@ files = [ [[package]] name = "traitlets" -version = "5.14.1" +version = "5.14.2" description = "Traitlets Python configuration system" optional = false python-versions = ">=3.8" files = [ - {file = "traitlets-5.14.1-py3-none-any.whl", hash = "sha256:2e5a030e6eff91737c643231bfcf04a65b0132078dad75e4936700b213652e74"}, - {file = "traitlets-5.14.1.tar.gz", hash = "sha256:8585105b371a04b8316a43d5ce29c098575c2e477850b62b848b964f1444527e"}, + {file = "traitlets-5.14.2-py3-none-any.whl", hash = "sha256:fcdf85684a772ddeba87db2f398ce00b40ff550d1528c03c14dbf6a02003cd80"}, + {file = "traitlets-5.14.2.tar.gz", hash = "sha256:8cdd83c040dab7d1dee822678e5f5d100b514f7b72b01615b26fc5718916fdf9"}, ] [package.extras] docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] -test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<7.5)", "pytest-mock", "pytest-mypy-testing"] +test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.1)", "pytest-mock", "pytest-mypy-testing"] [[package]] name = "types-beautifulsoup4" @@ -3719,20 +3719,20 @@ multidict = ">=4.0" [[package]] name = "zipp" -version = "3.17.0" +version = "3.18.0" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false python-versions = ">=3.8" files = [ - {file = "zipp-3.17.0-py3-none-any.whl", hash = "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31"}, - {file = "zipp-3.17.0.tar.gz", hash = "sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0"}, + {file = "zipp-3.18.0-py3-none-any.whl", hash = "sha256:c1bb803ed69d2cce2373152797064f7e79bc43f0a3748eb494096a867e0ebf79"}, + {file = "zipp-3.18.0.tar.gz", hash = "sha256:df8d042b02765029a09b157efd8e820451045890acc30f8e37dd2f94a060221f"}, ] [package.extras] -docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<3.13" -content-hash = "b84e96dc2eb790eacdc9036c77e85ff38c35993ef6932c8d6ab766afc384fb5d" +content-hash = "f9671474ae24270b7146bde6ea3241d73b31ec78a0ef42d45b7e13b4526b6ece" diff --git a/pyproject.toml b/pyproject.toml index aef8c68..e50a08c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ pyhashlookup = "^1.2.2" lief = "^0.14" ua-parser = "^0.18.0" Flask-Login = "^0.6.3" -har2tree = "^1.23.0" +har2tree = "^1.23.1" passivetotal = "^2.5.9" werkzeug = "^3.0.1" filetype = "^1.2.0" diff --git a/website/web/__init__.py b/website/web/__init__.py index 57c2a3d..10b989c 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -416,6 +416,11 @@ def get_cookie_name_investigator(cookie_name: str, /) -> tuple[list[tuple[str, s return captures, domains +def get_identifier_investigator(identifier_type: str, identifier: str) -> list[tuple[str, str, str, datetime]]: + cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_identifier(identifier_type=identifier_type, identifier=identifier)]) + return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures] + + def get_favicon_investigator(favicon_sha512: str, /, get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]], @@ -1187,6 +1192,17 @@ def mark_as_legitimate(tree_uuid: str) -> Response: return jsonify({'message': 'Legitimate entry added.'}) +@app.route('/tree//identifiers', methods=['GET']) +def tree_identifiers(tree_uuid: str) -> str: + to_return: list[tuple[int, str, str]] = [] + + for id_type, identifiers in get_indexing(flask_login.current_user).get_identifiers_capture(tree_uuid).items(): + for identifier in identifiers: + nb_captures = get_indexing(flask_login.current_user).identifier_number_captures(id_type, identifier) + to_return.append((nb_captures, id_type, identifier)) + return render_template('tree_identifiers.html', tree_uuid=tree_uuid, identifiers=to_return) + + @app.route('/tree//favicons', methods=['GET']) def tree_favicons(tree_uuid: str) -> str: favicons = [] @@ -1605,6 +1621,14 @@ def hhh_detail(hhh: str) -> str: return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers) +@app.route('/identifier_details//', methods=['GET']) +def identifier_details(identifier_type: str, identifier: str) -> str: + captures = get_identifier_investigator(identifier_type, identifier) + return render_template('identifier_details.html', identifier_type=identifier_type, + identifier=identifier, + captures=captures) + + @app.route('/favicon_details/', methods=['GET']) @app.route('/favicon_details//', methods=['GET']) def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str: diff --git a/website/web/templates/tree.html b/website/web/templates/tree.html index 8bbb7bb..c5e290f 100644 --- a/website/web/templates/tree.html +++ b/website/web/templates/tree.html @@ -84,6 +84,20 @@ }); + +