From 29c78d34858134222a93712963f120f0607a90bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Sat, 20 Jun 2020 02:09:45 +0200 Subject: [PATCH] chg: Cleanup and improve index rendering --- bin/rebuild_caches.py | 4 +- lookyloo/lookyloo.py | 69 +++++++++++------------ website/web/__init__.py | 9 +-- website/web/templates/body_hash.html | 4 +- website/web/templates/cookie_name.html | 4 +- website/web/templates/hostname_popup.html | 42 +++++++++++--- 6 files changed, 76 insertions(+), 56 deletions(-) diff --git a/bin/rebuild_caches.py b/bin/rebuild_caches.py index db5df912..027481e2 100755 --- a/bin/rebuild_caches.py +++ b/bin/rebuild_caches.py @@ -22,4 +22,6 @@ if __name__ == '__main__': indexing = Indexing() indexing.clear_indexes() - indexing.index_all() + for capture_dir in lookyloo.capture_dirs: + indexing.index_cookies_capture(capture_dir) + indexing.index_body_hashes_capture(capture_dir) diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index d2905e6f..6176765d 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -35,22 +35,11 @@ from .modules import VirusTotal, SaneJavaScript, PhishingInitiative class Indexing(): def __init__(self) -> None: - self.lookyloo = Lookyloo() self.redis: Redis = Redis(unix_socket_path=get_socket_path('indexing'), decode_responses=True) def clear_indexes(self): self.redis.flushdb() - def index_all(self): - self.index_cookies() - self.index_body_hashes() - - def get_capture_cache(self, capture_uuid: str) -> Optional[Dict[str, Any]]: - capture_dir = self.lookyloo.lookup_capture_dir(capture_uuid) - if capture_dir: - return self.lookyloo.capture_cache(capture_dir) - return {} - # ###### Cookies ###### @property @@ -72,7 +61,7 @@ class Indexing(): def index_cookies_capture(self, capture_dir: Path) -> None: print(f'Index cookies {capture_dir}') try: - crawled_tree = self.lookyloo.get_crawled_tree(capture_dir) + crawled_tree = Lookyloo.get_crawled_tree(capture_dir) except Exception as e: print(e) return @@ -101,10 +90,6 @@ class Indexing(): pipeline.sadd(domain, name) pipeline.execute() - def index_cookies(self) -> None: - for capture_dir in self.lookyloo.capture_dirs: - self.index_cookies_capture(capture_dir) - def aggregate_domain_cookies(self): psl = publicsuffix2.PublicSuffixList() pipeline = self.redis.pipeline() @@ -130,7 +115,7 @@ class Indexing(): def index_body_hashes_capture(self, capture_dir: Path) -> None: print(f'Index body hashes {capture_dir}') try: - crawled_tree = self.lookyloo.get_crawled_tree(capture_dir) + crawled_tree = Lookyloo.get_crawled_tree(capture_dir) except Exception as e: print(e) return @@ -152,20 +137,13 @@ class Indexing(): pipeline.execute() - def index_body_hashes(self) -> None: - for capture_dir in self.lookyloo.capture_dirs: - self.index_body_hashes_capture(capture_dir) - - def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None) -> List[str]: - if not filter_url: - return self.redis.smembers(f'bh|{body_hash}|captures') - # We only want the captures if the hash match on a different URL + def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None) -> List[Tuple[str, str, str]]: to_return = [] for capture_uuid in self.redis.smembers(f'bh|{body_hash}|captures'): for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1): url_uuid, url = entry.split('|', 1) - if url != filter_url: - to_return.append(capture_uuid) + if filter_url is None or url != filter_url: + to_return.append((capture_uuid, url_uuid, urlsplit(url).hostname)) return to_return def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]: @@ -283,12 +261,6 @@ class Lookyloo(): ct = self.get_crawled_tree(capture_uuid) return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta - def remove_pickle(self, capture_uuid: str) -> None: - capture_dir = self.lookup_capture_dir(capture_uuid) - if not capture_dir: - raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache') - remove_pickle_tree(capture_dir) - def rebuild_cache(self) -> None: self.redis.flushdb() self._init_existing_dumps() @@ -708,6 +680,27 @@ class Lookyloo(): self._set_capture_cache(dirpath) return perma_uuid + def get_body_hash_investigator(self, body_hash: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, float]]]: + indexing = Indexing() + captures = [] + for capture_uuid, url_uuid, url_hostname in indexing.get_body_hash_captures(body_hash): + cache = self.get_capture_cache(capture_uuid) + if cache: + captures.append((capture_uuid, cache['title'])) + domains = indexing.get_body_hash_domains(body_hash) + return captures, domains + + def get_cookie_name_investigator(self, cookie_name: str): + indexing = Indexing() + captures = [] + for capture_uuid, url_uuid in indexing.get_cookies_names_captures(cookie_name): + cache = self.get_capture_cache(capture_uuid) + if cache: + captures.append((capture_uuid, cache['title'])) + domains = [(domain, freq, indexing.cookies_names_domains_values(cookie_name, domain)) + for domain, freq in indexing.get_cookie_domains(cookie_name)] + return captures, domains + def get_hostnode_investigator(self, capture_uuid: str, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]: capture_dir = self.lookup_capture_dir(capture_uuid) if not capture_dir: @@ -750,8 +743,14 @@ class Lookyloo(): freq = indexing.body_hash_fequency(url.body_hash) if freq['hash_freq'] > 1: to_append['body_hash_details'] = freq - to_append['body_hash_details']['other_captures'] = [indexing.get_capture_cache(capture) - for capture in indexing.get_body_hash_captures(url.body_hash, url.name)] + + captures_list: List[Tuple[str, str, str]] = [] + for capture_uuid, url_uuid, url_hostname in indexing.get_body_hash_captures(url.body_hash, url.name): + cache = self.get_capture_cache(capture_uuid) + if cache: + captures_list.append((capture_uuid, cache['title'], url_hostname)) + + to_append['body_hash_details']['other_captures'] = captures_list # Optional: SaneJS information if url.body_hash in sanejs_lookups: diff --git a/website/web/__init__.py b/website/web/__init__.py index e685e8ff..28ac6017 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -430,18 +430,13 @@ def cookies_lookup(): @app.route('/cookies/', methods=['GET']) def cookies_name_detail(cookie_name: str): - i = Indexing() - captures = [i.get_capture_cache(capture) for capture, url in i.get_cookies_names_captures(cookie_name)] - domains = [(domain, freq, i.cookies_names_domains_values(cookie_name, domain)) - for domain, freq in i.get_cookie_domains(cookie_name)] + captures, domains = lookyloo.get_cookie_name_investigator(cookie_name) return render_template('cookie_name.html', cookie_name=cookie_name, domains=domains, captures=captures) @app.route('/body_hashes/', methods=['GET']) def body_hash_details(body_hash: str): - i = Indexing() - captures = [i.get_capture_cache(capture) for capture in i.get_body_hash_captures(body_hash)] - domains = i.get_body_hash_domains(body_hash) + captures, domains = lookyloo.get_body_hash_investigator(body_hash) return render_template('body_hash.html', body_hash=body_hash, domains=domains, captures=captures) # Query API diff --git a/website/web/templates/body_hash.html b/website/web/templates/body_hash.html index 7ada9220..78a1a6de 100644 --- a/website/web/templates/body_hash.html +++ b/website/web/templates/body_hash.html @@ -46,8 +46,8 @@

The same file was seen in these captures:

{% endblock %} diff --git a/website/web/templates/cookie_name.html b/website/web/templates/cookie_name.html index d01b1ec1..6fd14910 100644 --- a/website/web/templates/cookie_name.html +++ b/website/web/templates/cookie_name.html @@ -54,8 +54,8 @@

A cookie with that name was seen in these captures:

{% endblock %} diff --git a/website/web/templates/hostname_popup.html b/website/web/templates/hostname_popup.html index 6fc15cc6..1ed29a23 100644 --- a/website/web/templates/hostname_popup.html +++ b/website/web/templates/hostname_popup.html @@ -103,22 +103,47 @@ + {% if url['sane_js'] %} +
+ {% if url['sane_js'] is string %} + {{ url['sane_js'] }} + {% else %} + This file is known as part of {{ url['sane_js'][0] }} + version {{ url['sane_js'][1] }}: {{ url['sane_js'][2] }}. + {% if url['sane_js'][3] > 1%} + It is also present in {{ url['sane_js'][3] -1 }} other libraries. + {%endif%} + {%endif%} +
+ {% endif %} + {% if url['body_hash_details'] %}
This file can be found {{ url['body_hash_details']['hash_freq'] }} times across all the captures on this lookyloo instance, in {{ url['body_hash_details']['hash_domains_freq'] }} unique domains.
{% if url['body_hash_details']['other_captures'] %} -

The same file was seen in these captures:

-
    - {% for capture in url['body_hash_details']['other_captures'] %} -
  • {{ capture['title'] }}
  • +

    + The same file was seen in {{ url['body_hash_details']['other_captures']|length }} other captures. + +

    +
    +
    +
      + {% for capture_uuid, title, hostname in url['body_hash_details']['other_captures'] %} +
    • {{ title }} - {{ hostname }}
    • {% endfor %} -
    +
+
+ + {% else %} +

All the captures are loading it from the same URL.

{% endif %} - - Show details - +

+ Show more information about this response body. +

{% endif %} {% if url['sane_js'] %} @@ -135,7 +160,6 @@
{% endif %} - {% if url['cookies_received'] %}

Cookies