From 6bb26c43589685e2a10079fce73806537678d513 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Fri, 19 Jun 2020 00:25:24 +0200
Subject: [PATCH] new: Initial support for body hashes index

---
 lookyloo/lookyloo.py                      | 67 ++++++++++++++++++++---
 website/web/__init__.py                   |  8 +++
 website/web/templates/body_hash.html      | 53 ++++++++++++++++++
 website/web/templates/hostname_popup.html | 10 ++++
 4 files changed, 131 insertions(+), 7 deletions(-)
 create mode 100644 website/web/templates/body_hash.html

diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 5fd6dd4c..4027da00 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -38,6 +38,15 @@ class Indexing():
         self.lookyloo = Lookyloo()
         self.redis: Redis = Redis(unix_socket_path=get_socket_path('indexing'), decode_responses=True)
 
+    def clear_indexes(self):
+        self.redis.flushdb()
+
+    def index_all(self):
+        self.index_cookies()
+        self.index_body_hashes()
+
+    # ###### Cookies ######
+
     @property
     def cookies_names(self) -> List[Tuple[str, float]]:
         return self.redis.zrevrange('cookies_names', 0, -1, withscores=True)
@@ -60,14 +69,8 @@ class Indexing():
     def get_cookies_names_captures(self, cookie_name: str) -> List[Tuple[str, str]]:
         return [uuids.split('|') for uuids in self.redis.smembers(f'cn|{cookie_name}|captures')]
 
-    def clear_indexes(self):
-        self.redis.flushdb()
-
-    def index_all(self):
-        self.index_cookies()
-
     def index_cookies_capture(self, capture_dir: Path) -> None:
-        print(f'Processing {capture_dir}')
+        print(f'Index cookies {capture_dir}')
         try:
             crawled_tree = self.lookyloo.get_crawled_tree(capture_dir)
         except Exception as e:
@@ -118,6 +121,49 @@ class Indexing():
         self.redis.delete('aggregate_cn_domains')
         return {'domains': aggregate_domains_cn, 'cookies': aggregate_cn_domains}
 
+    # ###### Body hashes ######
+
+    def body_hash_frequency(self, body_hash: str) -> Dict[str, float]:
+        return {'hash_freq': self.redis.zscore('body_hashes', body_hash),
+                'hash_domains_freq': self.redis.zcard(f'bh|{body_hash}')}
+
+    def index_body_hashes_capture(self, capture_dir: Path) -> None:
+        print(f'Index body hashes {capture_dir}')
+        try:
+            crawled_tree = self.lookyloo.get_crawled_tree(capture_dir)
+        except Exception as e:
+            print(e)
+            return
+
+        if self.redis.sismember('indexed_body_hashes', crawled_tree.uuid):
+            # Do not reindex
+            return
+        self.redis.sadd('indexed_body_hashes', crawled_tree.uuid)
+
+        pipeline = self.redis.pipeline()
+        already_loaded: Set[str] = set()
+        for urlnode in crawled_tree.root_hartree.url_tree.traverse():
+            if not urlnode.empty_response:
+                if urlnode.body_hash in already_loaded:
+                    # Same hash multiple times in the same capture, skip
+                    continue
+                already_loaded.add(urlnode.body_hash)
+                pipeline.zincrby('body_hashes', 1, urlnode.body_hash)
+                pipeline.zincrby(f'bh|{urlnode.body_hash}', 1, urlnode.hostname)
+                pipeline.sadd(f'bh|{urlnode.body_hash}|captures', f'{crawled_tree.uuid}|{urlnode.uuid}')
+
+        pipeline.execute()
+
+    def index_body_hashes(self) -> None:
+        for capture_dir in self.lookyloo.capture_dirs:
+            self.index_body_hashes_capture(capture_dir)
+
+    def get_body_hash_captures(self, body_hash: str) -> List[Tuple[str, str]]:
+        return [uuids.split('|') for uuids in self.redis.smembers(f'bh|{body_hash}|captures')]
+
+    def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]:
+        return self.redis.zrevrange(f'bh|{body_hash}', 0, -1, withscores=True)
+
 
 class Lookyloo():
 
@@ -691,6 +737,13 @@ class Lookyloo():
                 to_append['url_path_short'] = to_append['url_path']
 
             if not url.empty_response:
+                # Index lookup
+                # NOTE: We probably don't want to leave it there.
+                indexing = Indexing()
+                freq = indexing.body_hash_frequency(url.body_hash)
+                if freq['hash_freq'] > 1:
+                    to_append['body_hash_frequency'] = freq
+
                 # Optional: SaneJS information
                 if url.body_hash in sanejs_lookups:
                     if sanejs_lookups[url.body_hash]:

diff --git a/website/web/__init__.py b/website/web/__init__.py
index bf5fca02..d186a3d8 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -436,6 +436,14 @@ def cookies_name_detail(cookie_name: str):
                for domain, freq in i.get_cookie_domains(cookie_name)]
     return render_template('cookie_name.html', cookie_name=cookie_name, domains=domains, captures=captures)
 
+
+@app.route('/body_hashes/<string:body_hash>', methods=['GET'])
+def body_hash_details(body_hash: str):
+    i = Indexing()
+    captures = [i.get_capture_cache(capture) for capture, url in i.get_body_hash_captures(body_hash)]
+    domains = i.get_body_hash_domains(body_hash)
+    return render_template('body_hash.html', body_hash=body_hash, domains=domains, captures=captures)
+
 
 # Query API

diff --git a/website/web/templates/body_hash.html b/website/web/templates/body_hash.html
new file mode 100644
index 00000000..0e1ecba6
--- /dev/null
+++ b/website/web/templates/body_hash.html
@@ -0,0 +1,53 @@
+{% extends "main.html" %}
+
+{% from 'bootstrap/utils.html' import render_messages %}
+
+{% block title %}{{ body_hash }}{% endblock %}
+
+{% block scripts %}
+{{ super() }}
+
+
+
+{% endblock %}
+
+{% block styles %}
+{{ super() }}
+
+{% endblock %}
+
+
+{% block content %}

{{ body_hash }}

+
+ + + + + + + + + {% for domain, freq in domains %} + + + + + {% endfor %} + +
Hostname Frequency
+ {{ domain }} + {{ freq }}
+
+

The same file was seen in these captures:

+
+{% endblock %}

diff --git a/website/web/templates/hostname_popup.html b/website/web/templates/hostname_popup.html
index c3a3adf7..cff83e58 100644
--- a/website/web/templates/hostname_popup.html
+++ b/website/web/templates/hostname_popup.html
@@ -103,6 +103,16 @@
+          {% if url['body_hash_frequency'] %}
+
+            This file can be found {{ url['body_hash_frequency']['hash_freq'] }} times
+            across all the captures on this lookyloo instance, in {{ url['body_hash_frequency']['hash_domains_freq'] }} unique domains.
+ + Show details + +
+ {% endif %} {% if url['sane_js'] %}
{% if url['sane_js'] is string %}
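
For reference, a minimal standalone sketch of the Redis layout this patch builds: a sorted set body_hashes holding the global frequency of each body hash, one sorted set bh|<hash> counting the hostnames that served that hash, and a set bh|<hash>|captures mapping a hash back to capture/URL-node UUID pairs. The key names mirror Indexing.index_body_hashes_capture() and Indexing.body_hash_frequency(); the connection parameters, the index_capture() helper and the sample data are made up for illustration, whereas Lookyloo itself goes through the 'indexing' unix socket and real crawled trees.

# Sketch only, assuming a local Redis on the default port; not the Lookyloo code itself.
from typing import Dict, List, Tuple

from redis import Redis

r = Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True)


def index_capture(capture_uuid: str, resources: List[Tuple[str, str, str]]) -> None:
    """resources: (body_hash, hostname, urlnode_uuid) triplets seen in one capture."""
    if r.sismember('indexed_body_hashes', capture_uuid):
        return  # same "do not reindex" guard as the patch
    r.sadd('indexed_body_hashes', capture_uuid)
    pipeline = r.pipeline()
    already_loaded = set()
    for body_hash, hostname, urlnode_uuid in resources:
        if body_hash in already_loaded:
            continue  # a hash is only counted once per capture
        already_loaded.add(body_hash)
        pipeline.zincrby('body_hashes', 1, body_hash)      # global frequency of the hash
        pipeline.zincrby(f'bh|{body_hash}', 1, hostname)   # per-hostname frequency
        pipeline.sadd(f'bh|{body_hash}|captures', f'{capture_uuid}|{urlnode_uuid}')
    pipeline.execute()


def body_hash_frequency(body_hash: str) -> Dict[str, float]:
    return {'hash_freq': r.zscore('body_hashes', body_hash),
            'hash_domains_freq': r.zcard(f'bh|{body_hash}')}


# Hypothetical usage: the same body hash served by two different hostnames.
index_capture('capture-1', [('deadbeef', 'cdn.example.com', 'node-a')])
index_capture('capture-2', [('deadbeef', 'static.example.org', 'node-b')])
print(body_hash_frequency('deadbeef'))
# {'hash_freq': 2.0, 'hash_domains_freq': 2}
print(r.zrevrange('bh|deadbeef', 0, -1, withscores=True))
print([entry.split('|') for entry in r.smembers('bh|deadbeef|captures')])

The zincrby/zcard pair is what lets the hostname popup report both how often a hash was seen overall and across how many distinct hostnames, without re-walking the stored captures.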
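
The read side added in website/web/__init__.py follows the same pattern as the existing cookies views. Below is a minimal, standalone Flask sketch of that wiring; the inline template and the canned FAKE_* values are placeholders for illustration, whereas the real route uses Indexing, get_capture_cache() and the body_hash.html template shipped in this patch.

# Standalone sketch of the read path (hypothetical data, inline template).
# The real view lives in website/web/__init__.py and renders body_hash.html.
from flask import Flask, render_template_string

app = Flask(__name__)

# Stand-ins for Indexing.get_body_hash_domains() / get_body_hash_captures().
FAKE_DOMAINS = [('cdn.example.com', 2.0), ('static.example.org', 1.0)]
FAKE_CAPTURES = [('capture-1', 'node-a'), ('capture-2', 'node-b')]

TEMPLATE = '''
<h4>{{ body_hash }}</h4>
<ul>
  {% for domain, freq in domains %}<li>{{ domain }}: {{ freq }}</li>{% endfor %}
</ul>
<p>The same file was seen in these captures:</p>
<ul>
  {% for capture_uuid, urlnode_uuid in captures %}<li>{{ capture_uuid }} ({{ urlnode_uuid }})</li>{% endfor %}
</ul>
'''


@app.route('/body_hashes/<string:body_hash>', methods=['GET'])
def body_hash_details(body_hash: str):
    # In Lookyloo these two lists come out of the Redis index built above.
    return render_template_string(TEMPLATE, body_hash=body_hash,
                                  domains=FAKE_DOMAINS, captures=FAKE_CAPTURES)


if __name__ == '__main__':
    app.run(debug=True)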