new: Index and views for identifiers

pull/898/head
Raphaël Vinot 2024-03-14 00:56:28 +01:00
parent 54ef3bf54f
commit 0f4ef013c9
6 changed files with 163 additions and 28 deletions

View File

@ -38,7 +38,7 @@ class BackgroundIndexer(AbstractManager):
# Don't need the cache in this class.
self.lookyloo.clear_tree_cache()
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool], str], None, None]:
def _to_index_no_cache(self) -> Generator[tuple[tuple[bool, bool, bool, bool, bool, bool], str], None, None]:
# NOTE: only get the non-archived captures for now.
for uuid, directory in self.redis.hscan_iter('lookup_dirs'):
if not self.full_indexer:
@ -85,6 +85,9 @@ class BackgroundIndexer(AbstractManager):
self.logger.info(f'Indexing favicons for {uuid_to_index}')
favicons = self.lookyloo.get_potential_favicons(uuid_to_index, all_favicons=True, for_datauri=False)
self.indexing.index_favicons_capture(uuid_to_index, favicons)
if not indexed[5]:
self.logger.info(f'Indexing identifiers for {uuid_to_index}')
self.indexing.index_identifiers_capture(ct)
# NOTE: categories aren't taken into account here, should be fixed(?)
# see indexing.index_categories_capture(capture_uuid, categories)
self.indexing.indexing_done()

View File

@ -65,16 +65,18 @@ class Indexing():
p.srem('indexed_cookies', capture_uuid)
p.srem('indexed_hhhashes', capture_uuid)
p.srem('indexed_favicons', capture_uuid)
p.srem('indexed_identifiers', capture_uuid)
p.execute()
# Report, for a single capture, which of the per-capture indexes already contain it.
# One SISMEMBER per 'indexed_*' set is queued on a single pipeline, so the values come
# back in the order the calls are queued below:
# urls, body hashes, cookies, hhhashes, favicons, identifiers.
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool]:
def capture_indexed(self, capture_uuid: str) -> tuple[bool, bool, bool, bool, bool, bool]:
p = self.redis.pipeline()
p.sismember('indexed_urls', capture_uuid)
p.sismember('indexed_body_hashes', capture_uuid)
p.sismember('indexed_cookies', capture_uuid)
p.sismember('indexed_hhhashes', capture_uuid)
p.sismember('indexed_favicons', capture_uuid)
# This call for sure returns a tuple of 5 booleans
p.sismember('indexed_identifiers', capture_uuid)
# This call for sure returns a tuple of 6 booleans
# NOTE(review): the type checker is silenced here, presumably because the pipeline's
# execute() is not typed as a fixed-size tuple - confirm callers only index/iterate the result.
return p.execute() # type: ignore[return-value]
# ###### Cookies ######
@ -365,6 +367,57 @@ class Indexing():
def get_favicon(self, favicon_sha512: str) -> bytes | None:
return self.redis_bytes.get(f'favicons|{favicon_sha512}')
# ###### identifiers ######
def identifiers_types(self) -> set[str]:
    """Return every member of the 'identifiers_types' Redis set."""
    known_types = self.redis.smembers('identifiers_types')
    return known_types
def identifiers(self, identifier_type: str) -> list[tuple[str, float]]:
    """Return the top of the ranking stored for one identifier type.

    Reads the sorted set 'identifiers|<identifier_type>' in reverse order,
    from rank 0 to rank 200, and returns each member together with its score.
    """
    ranking_key = f'identifiers|{identifier_type}'
    top_entries = self.redis.zrevrange(ranking_key, 0, 200, withscores=True)
    return top_entries
def identifier_frequency(self, identifier_type: str, identifier: str) -> float | None:
return self.redis.zscore(f'identifiers|{identifier_type}', identifier)
def identifier_number_captures(self, identifier_type: str, identifier: str) -> int:
    """Return the size of the set of captures recorded for one identifier.

    The set is stored under 'identifiers|<identifier_type>|<identifier>|captures'.
    """
    captures_key = f'identifiers|{identifier_type}|{identifier}|captures'
    nb_captures = self.redis.scard(captures_key)
    return nb_captures
def index_identifiers_capture(self, crawled_tree: CrawledTree) -> None:
    """Index the identifiers attached to the rendered node of a capture.

    The capture is flagged in 'indexed_identifiers' up front and is never
    re-indexed once flagged. For every (identifier type, identifier) pair the
    following keys are updated through a single pipeline:

    * 'identifiers_types': set of all known identifier types
    * 'indexed_identifiers|<type>|captures': captures already indexed for that type
    * 'identifiers|<capture_uuid>': identifier types present in the capture
    * 'identifiers|<capture_uuid>|<type>': identifiers of that type in the capture
    * 'identifiers|<type>|<identifier>|captures': captures containing the identifier
    * 'identifiers|<type>': sorted set, incremented by 1 per newly indexed capture

    :param crawled_tree: tree of the capture; its ``uuid`` and
        ``root_hartree.rendered_node.identifiers`` are read.
    """
    capture_uuid = crawled_tree.uuid
    if self.redis.sismember('indexed_identifiers', capture_uuid):
        # Do not reindex
        return
    self.redis.sadd('indexed_identifiers', capture_uuid)

    # The attribute may be missing or empty: in both cases there is nothing to index.
    identifiers = getattr(crawled_tree.root_hartree.rendered_node, 'identifiers', None)
    if not identifiers:
        return

    pipeline = self.redis.pipeline()
    # We have multiple identifiers types, this is the difference with the other indexes
    for identifier_type, id_values in identifiers.items():
        pipeline.sadd('identifiers_types', identifier_type)  # no-op if already there
        if self.redis.sismember(f'indexed_identifiers|{identifier_type}|captures', capture_uuid):
            # Do not reindex the same identifier type for the same capture
            continue
        pipeline.sadd(f'indexed_identifiers|{identifier_type}|captures', capture_uuid)
        self.logger.debug(f'Indexing identifiers {identifier_type} for {capture_uuid} ... ')
        # The membership check below reads Redis *before* the pipeline runs, so it
        # cannot see a value queued earlier in this same loop. De-duplicate here
        # (keeping first-seen order) so a repeated value is only counted once.
        for identifier in dict.fromkeys(id_values):
            if self.redis.sismember(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid):
                # Already counted this specific identifier for this capture
                continue
            pipeline.sadd(f'identifiers|{capture_uuid}', identifier_type)
            pipeline.sadd(f'identifiers|{capture_uuid}|{identifier_type}', identifier)
            pipeline.sadd(f'identifiers|{identifier_type}|{identifier}|captures', capture_uuid)
            pipeline.zincrby(f'identifiers|{identifier_type}', 1, identifier)
    pipeline.execute()
def get_identifiers_capture(self, capture_uuid: str) -> dict[str, set[str]]:
    """Return the identifiers stored for one capture, grouped by identifier type.

    The types come from the set 'identifiers|<capture_uuid>'; the values of each
    type come from 'identifiers|<capture_uuid>|<type>'.
    """
    types_key = f'identifiers|{capture_uuid}'
    return {id_type: self.redis.smembers(f'identifiers|{capture_uuid}|{id_type}')
            for id_type in self.redis.smembers(types_key)}
def get_captures_identifier(self, identifier_type: str, identifier: str) -> set[str]:
    """Return the members of 'identifiers|<identifier_type>|<identifier>|captures'."""
    captures_key = f'identifiers|{identifier_type}|{identifier}|captures'
    capture_uuids = self.redis.smembers(captures_key)
    return capture_uuids
# ###### favicons probabilistic hashes ######
def favicon_probabilistic_frequency(self, algorithm: str, phash: str) -> float | None:

48
poetry.lock generated
View File

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
[[package]]
name = "aiobotocore"
@ -1063,13 +1063,13 @@ tornado = ["tornado (>=0.2)"]
[[package]]
name = "har2tree"
version = "1.23.0"
version = "1.23.1"
description = "HTTP Archive (HAR) to ETE Toolkit generator"
optional = false
python-versions = ">=3.8,<3.13"
files = [
{file = "har2tree-1.23.0-py3-none-any.whl", hash = "sha256:ccb3bb575192350c76724a6bd1024775f4bf1059f498c0b6a5ee18c07d57a3d9"},
{file = "har2tree-1.23.0.tar.gz", hash = "sha256:eb67792a24b19351a04e897ac69b542a51facdc569bc176842cb5db2ba00d7e1"},
{file = "har2tree-1.23.1-py3-none-any.whl", hash = "sha256:a524ae46dd6c748ddc743b50e2b29b19d27d22ee80d765e1a5cf890bf1793c11"},
{file = "har2tree-1.23.1.tar.gz", hash = "sha256:70a77f0bd2293e7fedfa03254fc67ce31258fefe3b471acc067ce8d84aeb1838"},
]
[package.dependencies]
@ -1237,13 +1237,13 @@ testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs
[[package]]
name = "importlib-resources"
version = "6.1.3"
version = "6.3.0"
description = "Read resources from Python packages"
optional = false
python-versions = ">=3.8"
files = [
{file = "importlib_resources-6.1.3-py3-none-any.whl", hash = "sha256:4c0269e3580fe2634d364b39b38b961540a7738c02cb984e98add8b4221d793d"},
{file = "importlib_resources-6.1.3.tar.gz", hash = "sha256:56fb4525197b78544a3354ea27793952ab93f935bb4bf746b846bb1015020f2b"},
{file = "importlib_resources-6.3.0-py3-none-any.whl", hash = "sha256:783407aa1cd05550e3aa123e8f7cfaebee35ffa9cb0242919e2d1e4172222705"},
{file = "importlib_resources-6.3.0.tar.gz", hash = "sha256:166072a97e86917a9025876f34286f549b9caf1d10b35a1b372bffa1600c6569"},
]
[package.dependencies]
@ -2339,13 +2339,13 @@ files = [
[[package]]
name = "publicsuffixlist"
version = "0.10.0.20240305"
version = "0.10.0.20240312"
description = "publicsuffixlist implement"
optional = false
python-versions = ">=2.6"
files = [
{file = "publicsuffixlist-0.10.0.20240305-py2.py3-none-any.whl", hash = "sha256:f6869119f8781501c0c625e59b4b65eb60e2ed5185cfd6c142c792f74ac47c21"},
{file = "publicsuffixlist-0.10.0.20240305.tar.gz", hash = "sha256:6e79ea73b0278ce1b102f3ad6815f2a5b683864da9948ba0b0eab3180c419f7f"},
{file = "publicsuffixlist-0.10.0.20240312-py2.py3-none-any.whl", hash = "sha256:47fd7724b8a7c8d8732d4f5380019f74acd557a406c2a485540d1a4aae6cb359"},
{file = "publicsuffixlist-0.10.0.20240312.tar.gz", hash = "sha256:02912f3e084fad67e2463365fd431544921d9db3f51f6d43cea7169c86b1f188"},
]
[package.extras]
@ -3097,18 +3097,18 @@ boto3 = ["aiobotocore[boto3] (>=2.5.4,<3.0.0)"]
[[package]]
name = "setuptools"
version = "69.1.1"
version = "69.2.0"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.8"
files = [
{file = "setuptools-69.1.1-py3-none-any.whl", hash = "sha256:02fa291a0471b3a18b2b2481ed902af520c69e8ae0919c13da936542754b4c56"},
{file = "setuptools-69.1.1.tar.gz", hash = "sha256:5c0806c7d9af348e6dd3777b4f4dbb42c7ad85b190104837488eab9a7c945cf8"},
{file = "setuptools-69.2.0-py3-none-any.whl", hash = "sha256:c21c49fb1042386df081cb5d86759792ab89efca84cf114889191cd09aacc80c"},
{file = "setuptools-69.2.0.tar.gz", hash = "sha256:0ff4183f8f42cd8fa3acea16c45205521a4ef28f73c6391d8a25e92893134f2e"},
]
[package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing = ["build[virtualenv]", "filelock (>=3.4.0)", "importlib-metadata", "ini2toml[lite] (>=0.9)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mypy (==1.9)", "packaging (>=23.2)", "pip (>=19.1)", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-home (>=0.5)", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-ruff (>=0.2.1)", "pytest-timeout", "pytest-xdist (>=3)", "tomli", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"]
testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "packaging (>=23.2)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"]
[[package]]
@ -3203,18 +3203,18 @@ files = [
[[package]]
name = "traitlets"
version = "5.14.1"
version = "5.14.2"
description = "Traitlets Python configuration system"
optional = false
python-versions = ">=3.8"
files = [
{file = "traitlets-5.14.1-py3-none-any.whl", hash = "sha256:2e5a030e6eff91737c643231bfcf04a65b0132078dad75e4936700b213652e74"},
{file = "traitlets-5.14.1.tar.gz", hash = "sha256:8585105b371a04b8316a43d5ce29c098575c2e477850b62b848b964f1444527e"},
{file = "traitlets-5.14.2-py3-none-any.whl", hash = "sha256:fcdf85684a772ddeba87db2f398ce00b40ff550d1528c03c14dbf6a02003cd80"},
{file = "traitlets-5.14.2.tar.gz", hash = "sha256:8cdd83c040dab7d1dee822678e5f5d100b514f7b72b01615b26fc5718916fdf9"},
]
[package.extras]
docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"]
test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<7.5)", "pytest-mock", "pytest-mypy-testing"]
test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.1)", "pytest-mock", "pytest-mypy-testing"]
[[package]]
name = "types-beautifulsoup4"
@ -3719,20 +3719,20 @@ multidict = ">=4.0"
[[package]]
name = "zipp"
version = "3.17.0"
version = "3.18.0"
description = "Backport of pathlib-compatible object wrapper for zip files"
optional = false
python-versions = ">=3.8"
files = [
{file = "zipp-3.17.0-py3-none-any.whl", hash = "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31"},
{file = "zipp-3.17.0.tar.gz", hash = "sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0"},
{file = "zipp-3.18.0-py3-none-any.whl", hash = "sha256:c1bb803ed69d2cce2373152797064f7e79bc43f0a3748eb494096a867e0ebf79"},
{file = "zipp-3.18.0.tar.gz", hash = "sha256:df8d042b02765029a09b157efd8e820451045890acc30f8e37dd2f94a060221f"},
]
[package.extras]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"]
testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"]
docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<3.13"
content-hash = "b84e96dc2eb790eacdc9036c77e85ff38c35993ef6932c8d6ab766afc384fb5d"
content-hash = "f9671474ae24270b7146bde6ea3241d73b31ec78a0ef42d45b7e13b4526b6ece"

View File

@ -62,7 +62,7 @@ pyhashlookup = "^1.2.2"
lief = "^0.14"
ua-parser = "^0.18.0"
Flask-Login = "^0.6.3"
har2tree = "^1.23.0"
har2tree = "^1.23.1"
passivetotal = "^2.5.9"
werkzeug = "^3.0.1"
filetype = "^1.2.0"

View File

@ -416,6 +416,11 @@ def get_cookie_name_investigator(cookie_name: str, /) -> tuple[list[tuple[str, s
return captures, domains
def get_identifier_investigator(identifier_type: str, identifier: str) -> list[tuple[str, str, str, datetime]]:
    """List the cached captures in which a given identifier was indexed.

    Each entry is (capture uuid, title, last entry of the redirects list, timestamp),
    in the order returned by ``lookyloo.sorted_capture_cache``.
    """
    indexing = get_indexing(flask_login.current_user)
    capture_uuids = list(indexing.get_captures_identifier(identifier_type=identifier_type, identifier=identifier))
    summaries = []
    for cache in lookyloo.sorted_capture_cache(capture_uuids):
        summaries.append((cache.uuid, cache.title, cache.redirects[-1], cache.timestamp))
    return summaries
def get_favicon_investigator(favicon_sha512: str,
/,
get_probabilistic: bool=False) -> tuple[list[tuple[str, str, str, datetime]],
@ -1187,6 +1192,17 @@ def mark_as_legitimate(tree_uuid: str) -> Response:
return jsonify({'message': 'Legitimate entry added.'})
@app.route('/tree/<string:tree_uuid>/identifiers', methods=['GET'])
def tree_identifiers(tree_uuid: str) -> str:
    """Render the identifiers indexed for one capture.

    The template receives a list of (number of captures sharing the identifier,
    identifier type, identifier) tuples.
    """
    # Resolve the index for the current user once instead of on every identifier.
    indexing = get_indexing(flask_login.current_user)
    to_return: list[tuple[int, str, str]] = []
    for id_type, identifiers in indexing.get_identifiers_capture(tree_uuid).items():
        for identifier in identifiers:
            nb_captures = indexing.identifier_number_captures(id_type, identifier)
            to_return.append((nb_captures, id_type, identifier))
    return render_template('tree_identifiers.html', tree_uuid=tree_uuid, identifiers=to_return)
@app.route('/tree/<string:tree_uuid>/favicons', methods=['GET'])
def tree_favicons(tree_uuid: str) -> str:
favicons = []
@ -1605,6 +1621,14 @@ def hhh_detail(hhh: str) -> str:
return render_template('hhh_details.html', hhh=hhh, captures=captures, headers=headers)
@app.route('/identifier_details/<string:identifier_type>/<string:identifier>', methods=['GET'])
def identifier_details(identifier_type: str, identifier: str) -> str:
    """Render the list of captures in which the given identifier was found."""
    return render_template(
        'identifier_details.html',
        identifier_type=identifier_type,
        identifier=identifier,
        captures=get_identifier_investigator(identifier_type, identifier),
    )
@app.route('/favicon_details/<string:favicon_sha512>', methods=['GET'])
@app.route('/favicon_details/<string:favicon_sha512>/<int:get_probabilistic>', methods=['GET'])
def favicon_detail(favicon_sha512: str, get_probabilistic: int=0) -> str:

View File

@ -84,6 +84,20 @@
});
</script>
<script>
// When the identifiers modal is about to be shown, load the URL stored in the
// triggering element's data-remote attribute into the modal body.
$('#identifiersModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);
modal.find('.modal-body').load(button.data("remote"));
});
</script>
<script>
// When the identifier details modal is about to be shown, load the URL stored in the
// triggering element's data-remote attribute into the modal body.
$('#identifierDetailsModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);
modal.find('.modal-body').load(button.data("remote"));
});
</script>
<script>
$('#faviconsModal').on('show.bs.modal', function(e) {
var button = $(e.relatedTarget);
var modal = $(this);
@ -320,6 +334,10 @@
<a href="#faviconsModal" data-remote="{{ url_for('tree_favicons', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#faviconsModal" role="button">Favicons Capture</a>
</li>
<li class="list-group-item">
<a href="#identifiersModal" data-remote="{{ url_for('tree_identifiers', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#identifiersModal" role="button">Identifiers Capture</a>
</li>
</ul>
</div>
{% if current_user.is_authenticated %}
@ -542,6 +560,43 @@
</div>
</div>
<div class="modal fade" id="identifiersModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="identifiersModalLabel">Identifiers found on the rendered page</h5>
<button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
... loading identifiers ...
</div>
<div class="modal-footer">
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
<div class="modal fade" id="identifierDetailsModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="identifierDetailsModalLabel">Other occurrences of the identifier</h5>
<button type="button" class="btn btn-close" data-bs-dismiss="modal" aria-label="Close"></button>
</div>
<div class="modal-body">
... loading identifier details ...
</div>
<div class="modal-footer">
<a class="btn btn-primary" href="#identifiersModal"
data-remote="{{ url_for('tree_identifiers', tree_uuid=tree_uuid) }}"
data-bs-toggle="modal" data-bs-target="#identifiersModal" role="button">Back to capture's identifiers</a>
<button type="button" class="btn btn-secondary" data-bs-dismiss="modal">Close</button>
</div>
</div>
</div>
</div>
<div class="modal fade" id="faviconsModal" tabindex="-1" role="dialog">
<div class="modal-dialog modal-xl" role="document">
<div class="modal-content">