mirror of https://github.com/CIRCL/lookyloo
new: Hash lookup method
parent
c6c4da981c
commit
bdc0488e38
|
@ -3,6 +3,7 @@
|
|||
|
||||
from urllib.parse import urlsplit
|
||||
from typing import List, Tuple, Set, Dict, Optional
|
||||
from collections import defaultdict
|
||||
|
||||
from redis import Redis
|
||||
from har2tree import CrawledTree
|
||||
|
@ -104,8 +105,8 @@ class Indexing():
|
|||
# set of all captures with this hash
|
||||
pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid)
|
||||
# ZSet of all urlnode_UUIDs|full_url
|
||||
pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1, f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
|
||||
|
||||
pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1,
|
||||
f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}')
|
||||
pipeline.execute()
|
||||
|
||||
def get_hash_uuids(self, body_hash: str) -> Tuple[str, str, str]:
|
||||
|
@ -136,3 +137,12 @@ class Indexing():
|
|||
|
||||
def get_body_hash_domains(self, body_hash: str) -> List[Tuple[str, float]]:
    """Return every domain this body hash was seen on, with its hit count.

    Reads the sorted set 'bh|{body_hash}' (domain -> occurrence count) and
    returns (domain, count) pairs, highest count first.
    """
    key = f'bh|{body_hash}'
    return self.redis.zrevrange(key, 0, -1, withscores=True)
|
||||
|
||||
def get_body_hash_urls(self, body_hash: str) -> Dict[str, List[Dict[str, str]]]:
    """Return every URL this body hash was seen at, grouped by URL.

    For each capture in the set 'bh|{body_hash}|captures', reads the sorted set
    'bh|{body_hash}|captures|{capture_uuid}' and builds a mapping:
        full URL -> [{'capture': <capture uuid>,
                      'hostnode': <hostnode uuid>,
                      'urlnode': <urlnode uuid>}, ...]
    Returns an empty mapping when the hash is unknown.
    """
    all_captures: Set[str] = self.redis.smembers(f'bh|{body_hash}|captures')  # type: ignore
    urls: Dict[str, List[Dict[str, str]]] = defaultdict(list)
    # No need to materialize the set: we only iterate it once.
    for capture_uuid in all_captures:
        for entry in self.redis.zrevrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, -1):
            # Entry format is 'urlnode_uuid|hostnode_uuid|url'; the URL itself
            # may contain '|', so cap the split at two.
            url_uuid, hostnode_uuid, url = entry.split('|', 2)
            urls[url].append({'capture': capture_uuid, 'hostnode': hostnode_uuid, 'urlnode': url_uuid})
    return urls
|
||||
|
|
|
@ -657,6 +657,32 @@ class Lookyloo():
|
|||
domains = self.indexing.get_body_hash_domains(body_hash)
|
||||
return captures, domains
|
||||
|
||||
def get_body_hash_full(self, body_hash: str) -> Tuple[Dict[str, List[Dict[str, str]]], BytesIO]:
    """Return all the URLs matching a body hash, and the body content itself.

    Looks up the hash in the index, then loads the first capture referencing it
    to extract the actual bytes. The content is either the full body of the
    URL node, or one of its embedded resources when the hash matches a
    sub-resource instead of the whole response.

    Raises MissingUUID when the capture directory or its pickled tree is gone.
    Returns an empty BytesIO when the hash is unknown.
    """
    details = self.indexing.get_body_hash_urls(body_hash)
    body_content = BytesIO()
    # Get the body from the first entry in the details list: the content is
    # the same everywhere the hash was seen, so one capture is enough.
    for url, entries in details.items():
        capture_dir = self.lookup_capture_dir(entries[0]['capture'])
        if not capture_dir:
            raise MissingUUID(f"Unable to find {entries[0]['capture']}")

        ct = load_pickle_tree(capture_dir)
        if not ct:
            raise MissingUUID(f'Unable to find {capture_dir}')
        urlnode = ct.root_hartree.get_url_node_by_uuid(entries[0]['urlnode'])
        if urlnode.body_hash == body_hash:
            # the hash we're looking for is the whole file
            body_content = urlnode.body
        else:
            # The hash is an embedded resource: search the node's embedded
            # ressources ({mimetype: [(hash, blob), ...]}) for the match.
            # BUG FIX: was 'urlnode.body_hash.embedded_ressources' — body_hash
            # is a str and has no such attribute; the mapping is on the node.
            for mimetype, blobs in urlnode.embedded_ressources.items():
                for h, b in blobs:
                    if h == body_hash:
                        body_content = b
                        break
        break
    return details, body_content
|
||||
|
||||
def get_cookie_name_investigator(self, cookie_name: str):
|
||||
captures = []
|
||||
for capture_uuid, url_uuid in self.indexing.get_cookies_names_captures(cookie_name):
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import base64
|
||||
from zipfile import ZipFile, ZIP_DEFLATED
|
||||
from io import BytesIO
|
||||
import os
|
||||
|
@ -599,3 +600,13 @@ def json_redirects(tree_uuid: str):
|
|||
to_return['response']['redirects'] = cache['redirects']
|
||||
|
||||
return jsonify(to_return)
|
||||
|
||||
|
||||
@app.route('/json/hash_info/<h>', methods=['GET'])
def json_hash_info(h: str):
    """JSON endpoint: return the details and base64-encoded body for a hash.

    Responds with {'error': ...} when the hash is not in the index.
    """
    details, body = lookyloo.get_body_hash_full(h)
    if not details:
        # Use jsonify for the error path too: consistent with the success
        # path, and bare-dict returns only work on Flask >= 1.1.
        return jsonify({'error': 'Unknown Hash.'})
    to_return: Dict[str, Any] = {'response': {'hash': h, 'details': details,
                                              'body': base64.b64encode(body.getvalue()).decode()}}
    return jsonify(to_return)
|
||||
|
|
Loading…
Reference in New Issue