mirror of https://github.com/CIRCL/lookyloo
new: Hash lookup method
parent
c6c4da981c
commit
bdc0488e38
|
@ -3,6 +3,7 @@
|
|||
|
||||
from urllib.parse import urlsplit
|
||||
from typing import List, Tuple, Set, Dict, Optional
|
||||
from collections import defaultdict
|
||||
|
||||
from redis import Redis
|
||||
from har2tree import CrawledTree
|
||||
|
@ -104,8 +105,8 @@ class Indexing():
|
|||
# set of all captures with this hash
|
||||
pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid)
|
||||
# ZSet of all urlnode_UUIDs|full_url
|
||||
pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1, f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
|
||||
|
||||
pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
|
||||
f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
|
||||
pipeline.execute()
|
||||
|
||||
def get_hash_uuids(self, body_hash: str) -> Tuple[str, str, str]:
|
||||
|
@ -136,3 +137,12 @@ class Indexing():
|
|||
|
||||
def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]:
    """Return every domain this body hash was seen on, with its hit count.

    Reads the sorted set 'bh|{body_hash}' (domain -> occurrence count) and
    returns (domain, count) pairs, highest count first.
    """
    key = f'bh|{body_hash}'
    return self.redis.zrevrange(key, 0, -1, withscores=True)
|
||||
|
||||
def get_body_hash_urls(self, body_hash: str) -> Dict[str, List[Dict[str, str]]]:
    """Return every URL this body hash was seen at, grouped by URL.

    For each capture in the set 'bh|{body_hash}|captures', reads the sorted set
    'bh|{body_hash}|captures|{capture_uuid}' and builds a mapping:
        full URL -> [{'capture': <capture uuid>,
                      'hostnode': <hostnode uuid>,
                      'urlnode': <urlnode uuid>}, ...]
    Returns an empty mapping when the hash is unknown.
    """
    all_captures: Set[str] = self.redis.smembers(f'bh|{body_hash}|captures')  # type: ignore
    urls: Dict[str, List[Dict[str, str]]] = defaultdict(list)
    # No need to materialize the set: we only iterate it once.
    for capture_uuid in all_captures:
        for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
            # Entry format is 'urlnode_uuid|hostnode_uuid|url'; the URL itself
            # may contain '|', so cap the split at two.
            url_uuid, hostnode_uuid, url = entry.split('|', 2)
            urls[url].append({'capture': capture_uuid, 'hostnode': hostnode_uuid, 'urlnode': url_uuid})
    return urls
|
||||
|
|
|
@ -657,6 +657,32 @@ class Lookyloo():
|
|||
domains = self.indexing.get_body_hash_domains(body_hash)
|
||||
return captures, domains
|
||||
|
||||
def get_body_hash_full(self, body_hash: str) -> Tuple[Dict[str, List[Dict[str, str]]], BytesIO]:
    """Return all the URLs matching a body hash, and the body content itself.

    Looks up the hash in the index, then loads the first capture referencing it
    to extract the actual bytes. The content is either the full body of the
    URL node, or one of its embedded resources when the hash matches a
    sub-resource instead of the whole response.

    Raises MissingUUID when the capture directory or its pickled tree is gone.
    Returns an empty BytesIO when the hash is unknown.
    """
    details = self.indexing.get_body_hash_urls(body_hash)
    body_content = BytesIO()
    # Get the body from the first entry in the details list: the content is
    # the same everywhere the hash was seen, so one capture is enough.
    for url, entries in details.items():
        capture_dir = self.lookup_capture_dir(entries[0]['capture'])
        if not capture_dir:
            raise MissingUUID(f"Unable to find {entries[0]['capture']}")

        ct = load_pickle_tree(capture_dir)
        if not ct:
            raise MissingUUID(f'Unable to find {capture_dir}')
        urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode'])
        if urlnode.body_hash == body_hash:
            # the hash we're looking for is the whole file
            body_content = urlnode.body
        else:
            # The hash is an embedded resource: search the node's embedded
            # ressources ({mimetype: [(hash, blob), ...]}) for the match.
            # BUG FIX: was 'urlnode.body_hash.embedded_ressources' — body_hash
            # is a str and has no such attribute; the mapping is on the node.
            for mimetype, blobs in urlnode.embedded_ressources.items():
                for h, b in blobs:
                    if h == body_hash:
                        body_content = b
                        break
        break
    return details, body_content
|
||||
|
||||
def get_cookie_name_investigator(self, cookie_name: str):
|
||||
captures = []
|
||||
for capture_uuid, url_uuid in self.indexing.get_cookies_names_captures(cookie_name):
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import base64
|
||||
from zipfile import ZipFile, ZIP_DEFLATED
|
||||
from io import BytesIO
|
||||
import os
|
||||
|
@ -599,3 +600,13 @@ def json_redirects(tree_uuid: str):
|
|||
to_return['response']['redirects'] = cache['redirects']
|
||||
|
||||
return jsonify(to_return)
|
||||
|
||||
|
||||
@app.route('/json/hash_info/<h>', methods=['GET'])
def json_hash_info(h: str):
    """JSON endpoint: return the details and base64-encoded body for a hash.

    Responds with {'error': ...} when the hash is not in the index.
    """
    details, body = lookyloo.get_body_hash_full(h)
    if not details:
        # Use jsonify for the error path too: consistent with the success
        # path, and bare-dict returns only work on Flask >= 1.1.
        return jsonify({'error': 'Unknown Hash.'})
    to_return: Dict[str, Any] = {'response': {'hash': h, 'details': details,
                                              'body': base64.b64encode(body.getvalue()).decode()}}
    return jsonify(to_return)
|
||||
|
|
Loading…
Reference in New Issue