diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index afff09a1..33fe41bc 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -107,6 +107,13 @@ class Indexing(): # ###### Body hashes ###### + @property + def ressources(self) -> List[Tuple[str, float]]: + return self.redis.zrevrange('body_hashes', 0, 200, withscores=True) + + def ressources_number_domains(self, h: str) -> int: + return self.redis.zcard(f'bh|{h}') + def body_hash_fequency(self, body_hash: str) -> Dict[str, float]: return {'hash_freq': self.redis.zscore('body_hashes', body_hash), 'hash_domains_freq': self.redis.zcard(f'bh|{body_hash}')} @@ -119,25 +126,22 @@ class Indexing(): pipeline = self.redis.pipeline() for urlnode in crawled_tree.root_hartree.url_tree.traverse(): - if urlnode.empty_response: - continue - pipeline.zincrby('body_hashes', 1, urlnode.body_hash) - pipeline.zincrby(f'bh|{urlnode.body_hash}', 1, urlnode.hostname) - # set of all captures with this hash - pipeline.sadd(f'bh|{urlnode.body_hash}|captures', crawled_tree.uuid) - # ZSet of all urlnode_UUIDs|full_url - pipeline.zincrby(f'bh|{urlnode.body_hash}|captures|{crawled_tree.uuid}', 1, f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}') - if hasattr(urlnode, 'embedded_ressources') and urlnode.embedded_ressources: - for mimetype, blobs in urlnode.embedded_ressources.items(): - for h, body in blobs: - pipeline.zincrby('body_hashes', 1, h) - pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname) - pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid) - pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1, - f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}') + for h in urlnode.resources_hashes: + pipeline.zincrby('body_hashes', 1, h) + pipeline.zincrby(f'bh|{h}', 1, urlnode.hostname) + # set of all captures with this hash + pipeline.sadd(f'bh|{h}|captures', crawled_tree.uuid) + # ZSet of all urlnode_UUIDs|full_url + pipeline.zincrby(f'bh|{h}|captures|{crawled_tree.uuid}', 1, f'{urlnode.uuid}|{urlnode.hostnode_uuid}|{urlnode.name}') pipeline.execute() + def get_hash_uuids(self, body_hash: str) -> Tuple[str, str]: + capture_uuid = self.redis.srandmember(f'bh|{body_hash}|captures') + entry = self.redis.zrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, 1)[0] + urlnode_uuid, hostnode_uuid, url = entry.split('|', 2) + return capture_uuid, urlnode_uuid + def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None, limit: int=20) -> Tuple[int, List[Tuple[str, str, str, bool]]]: to_return: List[Tuple[str, str, str, bool]] = [] @@ -208,9 +212,12 @@ class Context(): p.sadd(f'bh|{h}|legitimate', *details['hostnames']) p.execute() - def find_known_content(self, har2tree_container: Union[CrawledTree, HostNode, URLNode]) -> Dict[str, Any]: + def find_known_content(self, har2tree_container: Union[CrawledTree, HostNode, URLNode, str]) -> Dict[str, Any]: """Return a dictionary of content resources found in the local known_content database, or in SaneJS (if enabled)""" - to_lookup: Set[str] = self._get_resources_hashes(har2tree_container) + if isinstance(har2tree_container, str): + to_lookup: Set[str] = {har2tree_container, } + else: + to_lookup: Set[str] = self._get_resources_hashes(har2tree_container) known_content_table: Dict[str, Any] = {} if not to_lookup: return known_content_table @@ -1093,6 +1100,22 @@ class Lookyloo(): return known, legitimate + def get_ressource(self, tree_uuid: str, urlnode_uuid: str, h: Optional[str]) -> Optional[Tuple[str, BytesIO]]: + url = self.get_urlnode_from_tree(tree_uuid, urlnode_uuid) + if url.empty_response: + return None + if not h or h == url.body_hash: + # we want the body + return url.filename if url.filename else 'file.bin', url.body + + # We want an embedded ressource + if h not in url.resources_hashes: + return None + for mimetype, blobs in url.embedded_ressources.items(): + for ressource_h, blob in blobs: + if ressource_h == h: + return 'embedded_ressource.bin', blob + def get_hostnode_investigator(self, capture_uuid: str, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]: capture_dir = self.lookup_capture_dir(capture_uuid) if not capture_dir: diff --git a/website/web/__init__.py b/website/web/__init__.py index 6dc7d3dd..ec4967e0 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -248,42 +248,20 @@ def urlnode_post_request(tree_uuid: str, node_uuid: str): as_attachment=True, attachment_filename='posted_data.txt') -@app.route('/tree//url//embedded_ressource', methods=['POST']) -def get_embedded_ressource(tree_uuid: str, node_uuid: str): - url = lookyloo.get_urlnode_from_tree(tree_uuid, node_uuid) - h_request = request.form.get('ressource_hash') - for mimetype, blobs in url.embedded_ressources.items(): - for h, blob in blobs: - if h == h_request: - to_return = BytesIO() - with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile: - zfile.writestr('file.bin', blob.getvalue()) - to_return.seek(0) - return send_file(to_return, mimetype='application/zip', - as_attachment=True, attachment_filename='file.zip') - - -@app.route('/tree//url/', methods=['GET']) -def urlnode_details(tree_uuid: str, node_uuid: str): - urlnode = lookyloo.get_urlnode_from_tree(tree_uuid, node_uuid) +@app.route('/tree//url//ressource', methods=['POST', 'GET']) +def get_ressource(tree_uuid: str, node_uuid: str): + if request.method == 'POST': + h_request = request.form.get('ressource_hash') + else: + h_request = None + ressource = lookyloo.get_ressource(tree_uuid, node_uuid, h_request) to_return = BytesIO() - got_content = False - if hasattr(urlnode, 'body'): - body_content = urlnode.body.getvalue() - if body_content: - got_content = True - if hasattr(urlnode, 'json') and urlnode.json: - try: - loaded = json.loads(body_content) - body_content = json.dumps(loaded, indent=2).encode() - except Exception: - # Not json, but junk - pass - with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile: - zfile.writestr(urlnode.filename, body_content) - if not got_content: - with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile: - zfile.writestr('file.txt', b'Response body empty') + with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile: + if ressource: + filename, r = ressource + zfile.writestr(filename, r.getvalue()) + else: + zfile.writestr('file.txt', b'Unknown Hash') to_return.seek(0) return send_file(to_return, mimetype='application/zip', as_attachment=True, attachment_filename='file.zip') @@ -490,6 +468,18 @@ def cookies_lookup(): return render_template('cookies.html', cookies_names=cookies_names) +@app.route('/ressources', methods=['GET']) +def ressources(): + i = Indexing() + ressources = [] + for h, freq in i.ressources: + domain_freq = i.ressources_number_domains(h) + context = lookyloo.context.find_known_content(h) + capture_uuid, url_uuid = i.get_hash_uuids(h) + ressources.append((h, freq, domain_freq, context.get(h), capture_uuid, url_uuid)) + return render_template('ressources.html', ressources=ressources) + + @app.route('/cookies/', methods=['GET']) def cookies_name_detail(cookie_name: str): captures, domains = lookyloo.get_cookie_name_investigator(cookie_name) diff --git a/website/web/templates/hostname_popup.html b/website/web/templates/hostname_popup.html index 7eafcf91..991f897c 100644 --- a/website/web/templates/hostname_popup.html +++ b/website/web/templates/hostname_popup.html @@ -6,6 +6,7 @@ {% from "macros.html" import popup_icons %} {% from "macros.html" import shorten_string %} {% from "macros.html" import other_captures_table %} +{% from "macros.html" import get_ressource_button %} {% block title %}Details for {{ hostname }} {% endblock %} @@ -220,9 +221,7 @@
This file ({{ details['type'] }}) can be found {{ details['hash_freq'] }} times across all the captures on this lookyloo instance, in {{ details['hash_domains_freq'] }} unique domains. -
- -
+ {{ get_ressource_button(tree_uuid, url['url_object'].uuid, hash, 'Download the embedded ressource') }}
{% if 'other_captures' in details %} diff --git a/website/web/templates/macros.html b/website/web/templates/macros.html index d3c5ab63..cc8f97ac 100644 --- a/website/web/templates/macros.html +++ b/website/web/templates/macros.html @@ -12,6 +12,12 @@
{% endmacro %} +{% macro get_ressource_button(capture_uuid, urlnode_uuid, hash, text) %} +
+ +
+{% endmacro %} + {% macro ressource_legitimacy_details(details, ressource_size) %} {% if details and details[0] == False %} @@ -140,7 +146,7 @@ Body size: {{ sizeof_fmt(ressource_size) }} {{ key }} {% elif key in ["js", "exe", "css", "font", "html", "json", "image", "video", "unknown_mimetype", "text", "unset_mimetype", "octet-stream", "livestream"] and not urlnode.empty_response %} - + {{ key }} {% elif key == "redirect" %}