From 3a99f1a63f36055174ce1f92246f7608a651488f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Tue, 14 May 2024 18:52:26 +0200 Subject: [PATCH] new: find related captures by hostname and URL --- .gitignore | 29 ++++- website/web/__init__.py | 107 +++++------------- website/web/genericapi.py | 55 +++++++++- website/web/helpers.py | 21 +++- website/web/templates/body_hash.html | 4 +- website/web/templates/hostname.html | 38 +------ website/web/templates/tree.html | 65 ++++++++++- website/web/templates/tree_body_hashes.html | 2 +- website/web/templates/tree_hostnames.html | 64 +++++++++++ website/web/templates/tree_urls.html | 38 +++++++ website/web/templates/url.html | 115 ++++++++------------ 11 files changed, 342 insertions(+), 196 deletions(-) create mode 100644 website/web/templates/tree_hostnames.html create mode 100644 website/web/templates/tree_urls.html diff --git a/.gitignore b/.gitignore index e652d893..3b2b9881 100644 --- a/.gitignore +++ b/.gitignore @@ -112,11 +112,14 @@ FileSaver.js d3.v5.min.js d3.v5.js -cache.pid -dump.rdb +*.pid +*.rdb +*log* +full_index/db # Local config files config/*.json +config/users/*.json config/*.json.bkp config/takedown_filters.ini @@ -126,3 +129,25 @@ known_content_user/ user_agents/ .DS_Store + +.idea + +archived_captures +discarded_captures +removed_captures + +website/web/static/d3.min.js +website/web/static/datatables.min.css +website/web/static/datatables.min.js +website/web/static/jquery.min.js + +# Modules +circl_pypdns +eupi +own_user_agents +phishtank +riskiq +sanejs +urlhaus +urlscan +vt_url diff --git a/website/web/__init__.py b/website/web/__init__.py index d78f8b4a..910f912b 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -39,8 +39,7 @@ from pymisp import MISPEvent, MISPServerError # type: ignore[attr-defined] from werkzeug.security import check_password_hash from werkzeug.wrappers.response import Response as WerkzeugResponse -from lookyloo import Lookyloo, 
CaptureSettings, Indexing -from lookyloo.capturecache import CaptureCache +from lookyloo import Lookyloo, CaptureSettings from lookyloo.default import get_config from lookyloo.exceptions import MissingUUID, NoValidHarFile from lookyloo.helpers import get_taxonomies, UserAgents, load_cookies @@ -54,7 +53,7 @@ else: from .genericapi import api as generic_api from .helpers import (User, build_users_table, get_secret_key, load_user_from_request, src_request_ip, sri_load, - get_lookyloo_instance) + get_lookyloo_instance, get_indexing) from .proxied import ReverseProxied logging.config.dictConfig(get_config('logging')) @@ -270,23 +269,6 @@ def file_response(func): # type: ignore[no-untyped-def] # ##### Methods querying the indexes ##### -@functools.cache -def get_indexing(user: User | None) -> Indexing: - '''Depending if we're logged in or not, we (can) get different indexes: - if index_everything is enabled, we have an index in kvrocks that contains all - the indexes for all the captures. - It is only accessible to the admin user. 
- ''' - if not get_config('generic', 'index_everything'): - return Indexing() - - if not user or not user.is_authenticated: - # No user or anonymous - return Indexing() - # Logged in user - return Indexing(full_index=True) - - def _get_body_hash_investigator(body_hash: str, /) -> tuple[list[tuple[str, str, datetime, str, str]], list[tuple[str, float]]]: '''Returns all the captures related to a hash (sha512), used in the web interface.''' total_captures, details = get_indexing(flask_login.current_user).get_body_hash_captures(body_hash, limit=-1) @@ -365,70 +347,33 @@ def get_all_hostnames(capture_uuid: str, /) -> dict[str, dict[str, int | list[UR return to_return -def get_latest_url_capture(url: str, /) -> CaptureCache | None: - '''Get the most recent capture with this URL''' - captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_url(url)) - if captures: - return captures[0] - return None - - -def get_url_occurrences(url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]: - '''Get the most recent captures and URL nodes where the URL has been seen.''' - captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_url(url), cached_captures_only=cached_captures_only) - - to_return: list[dict[str, Any]] = [] - for capture in captures[:limit]: - ct = lookyloo.get_crawled_tree(capture.uuid) - to_append: dict[str, str | dict[str, Any]] = {'capture_uuid': capture.uuid, - 'start_timestamp': capture.timestamp.isoformat(), - 'title': capture.title} - urlnodes: dict[str, dict[str, str]] = {} - for urlnode in ct.root_hartree.url_tree.search_nodes(name=url): - urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(), - 'hostnode_uuid': urlnode.hostnode_uuid} - if hasattr(urlnode, 'body_hash'): - urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash - to_append['urlnodes'] = urlnodes - to_return.append(to_append) - return to_return - - -def 
get_hostname_occurrences(hostname: str, /, with_urls_occurrences: bool=False, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]: - '''Get the most recent captures and URL nodes where the hostname has been seen.''' - captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_hostname(hostname), cached_captures_only=cached_captures_only) - - to_return: list[dict[str, Any]] = [] - for capture in captures[:limit]: - ct = lookyloo.get_crawled_tree(capture.uuid) - to_append: dict[str, str | list[Any] | dict[str, Any]] = { - 'capture_uuid': capture.uuid, - 'start_timestamp': capture.timestamp.isoformat(), - 'title': capture.title} - hostnodes: list[str] = [] - if with_urls_occurrences: - urlnodes: dict[str, dict[str, str]] = {} - for hostnode in ct.root_hartree.hostname_tree.search_nodes(name=hostname): - hostnodes.append(hostnode.uuid) - if with_urls_occurrences: - for urlnode in hostnode.urls: - urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(), - 'url': urlnode.name, - 'hostnode_uuid': urlnode.hostnode_uuid} - if hasattr(urlnode, 'body_hash'): - urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash - to_append['hostnodes'] = hostnodes - if with_urls_occurrences: - to_append['urlnodes'] = urlnodes - to_return.append(to_append) +def get_all_urls(capture_uuid: str, /) -> dict[str, dict[str, int | list[URLNode] | str]]: + ct = lookyloo.get_crawled_tree(capture_uuid) + to_return: dict[str, dict[str, list[URLNode] | int | str]] = defaultdict() + for node in ct.root_hartree.url_tree.traverse(): + if not node.name: + continue + captures = get_indexing(flask_login.current_user).get_captures_url(node.name) + # Note for future: maybe get url, capture title, something better than just the hash to show to the user + if node.name not in to_return:  # entries are keyed by URL (node.name); create each one only once so 'nodes' accumulates every node for that URL + to_return[node.name] = {'total_captures': len(captures), 'nodes': [], + 'quoted_url': quote_plus(node.name)} + to_return[node.name]['nodes'].append(node) # 
type: ignore[union-attr] return to_return def get_hostname_investigator(hostname: str) -> list[tuple[str, str, str, datetime]]: + '''Returns all the captures loading content from that hostname, used in the web interface.''' cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_hostname(hostname=hostname)]) return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures] +def get_url_investigator(url: str) -> list[tuple[str, str, str, datetime]]: + '''Returns all the captures loading content from that url, used in the web interface.''' + cached_captures = lookyloo.sorted_capture_cache([uuid for uuid in get_indexing(flask_login.current_user).get_captures_url(url=url)]) + return [(cache.uuid, cache.title, cache.redirects[-1], cache.timestamp) for cache in cached_captures] + + def get_cookie_name_investigator(cookie_name: str, /) -> tuple[list[tuple[str, str]], list[tuple[str, float, list[tuple[str, float]]]]]: '''Returns all the captures related to a cookie name entry, used in the web interface.''' cached_captures = lookyloo.sorted_capture_cache([entry[0] for entry in get_indexing(flask_login.current_user).get_cookies_names_captures(cookie_name)]) @@ -1282,6 +1227,12 @@ def tree_hostnames(tree_uuid: str) -> str: return render_template('tree_hostnames.html', tree_uuid=tree_uuid, hostnames=hostnames) +@app.route('/tree//urls', methods=['GET']) +def tree_urls(tree_uuid: str) -> str: + urls = get_all_urls(tree_uuid) + return render_template('tree_urls.html', tree_uuid=tree_uuid, urls=urls) + + @app.route('/tree//pandora', methods=['GET', 'POST']) def pandora_submit(tree_uuid: str) -> dict[str, Any] | Response: node_uuid = None @@ -1743,8 +1694,8 @@ def body_hash_details(body_hash: str) -> str: @app.route('/urls/', methods=['GET']) def url_details(url: str) -> str: url = unquote_plus(url).strip() - hits = get_url_occurrences(url, limit=50) - return 
render_template('url.html', url=url, hits=hits) + captures = get_url_investigator(url) + return render_template('url.html', url=url, captures=captures) @app.route('/hostnames/', methods=['GET']) diff --git a/website/web/genericapi.py b/website/web/genericapi.py index 9e51bf7e..56332bd3 100644 --- a/website/web/genericapi.py +++ b/website/web/genericapi.py @@ -21,7 +21,7 @@ from lookyloo import CaptureSettings, Lookyloo from lookyloo.comparator import Comparator from lookyloo.exceptions import MissingUUID, NoValidHarFile -from .helpers import build_users_table, load_user_from_request, src_request_ip, get_lookyloo_instance +from .helpers import build_users_table, load_user_from_request, src_request_ip, get_lookyloo_instance, get_indexing api = Namespace('GenericAPI', description='Generic Lookyloo API', path='/') @@ -305,6 +305,27 @@ class HashInfo(Resource): # type: ignore[misc] return to_return +def get_url_occurrences(url: str, /, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]: + '''Get the most recent captures and URL nodes where the URL has been seen.''' + captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_url(url), cached_captures_only=cached_captures_only) + + to_return: list[dict[str, Any]] = [] + for capture in captures[:limit]: + ct = lookyloo.get_crawled_tree(capture.uuid) + to_append: dict[str, str | dict[str, Any]] = {'capture_uuid': capture.uuid, + 'start_timestamp': capture.timestamp.isoformat(), + 'title': capture.title} + urlnodes: dict[str, dict[str, str]] = {} + for urlnode in ct.root_hartree.url_tree.search_nodes(name=url): + urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(), + 'hostnode_uuid': urlnode.hostnode_uuid} + if hasattr(urlnode, 'body_hash'): + urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash + to_append['urlnodes'] = urlnodes + to_return.append(to_append) + return to_return + + url_info_fields = api.model('URLInfoFields', { 'url': 
fields.String(description="The URL to search", required=True), 'limit': fields.Integer(description="The maximal amount of captures to return", example=20), @@ -318,12 +339,41 @@ class URLInfo(Resource): # type: ignore[misc] @api.doc(body=url_info_fields) # type: ignore[misc] def post(self) -> list[dict[str, Any]]: - from . import get_url_occurrences to_query: dict[str, Any] = request.get_json(force=True) occurrences = get_url_occurrences(to_query.pop('url'), **to_query) return occurrences +def get_hostname_occurrences(hostname: str, /, with_urls_occurrences: bool=False, limit: int=20, cached_captures_only: bool=True) -> list[dict[str, Any]]: + '''Get the most recent captures and URL nodes where the hostname has been seen.''' + captures = lookyloo.sorted_capture_cache(get_indexing(flask_login.current_user).get_captures_hostname(hostname), cached_captures_only=cached_captures_only) + + to_return: list[dict[str, Any]] = [] + for capture in captures[:limit]: + ct = lookyloo.get_crawled_tree(capture.uuid) + to_append: dict[str, str | list[Any] | dict[str, Any]] = { + 'capture_uuid': capture.uuid, + 'start_timestamp': capture.timestamp.isoformat(), + 'title': capture.title} + hostnodes: list[str] = [] + if with_urls_occurrences: + urlnodes: dict[str, dict[str, str]] = {} + for hostnode in ct.root_hartree.hostname_tree.search_nodes(name=hostname): + hostnodes.append(hostnode.uuid) + if with_urls_occurrences: + for urlnode in hostnode.urls: + urlnodes[urlnode.uuid] = {'start_time': urlnode.start_time.isoformat(), + 'url': urlnode.name, + 'hostnode_uuid': urlnode.hostnode_uuid} + if hasattr(urlnode, 'body_hash'): + urlnodes[urlnode.uuid]['hash'] = urlnode.body_hash + to_append['hostnodes'] = hostnodes + if with_urls_occurrences: + to_append['urlnodes'] = urlnodes + to_return.append(to_append) + return to_return + + hostname_info_fields = api.model('HostnameInfoFields', { 'hostname': fields.String(description="The hostname to search", required=True), 'limit': 
fields.Integer(description="The maximal amount of captures to return", example=20), @@ -337,7 +387,6 @@ class HostnameInfo(Resource): # type: ignore[misc] @api.doc(body=hostname_info_fields) # type: ignore[misc] def post(self) -> list[dict[str, Any]]: - from . import get_hostname_occurrences to_query: dict[str, Any] = request.get_json(force=True) return get_hostname_occurrences(to_query.pop('hostname'), **to_query) diff --git a/website/web/helpers.py b/website/web/helpers.py index 64648952..6d15aa3d 100644 --- a/website/web/helpers.py +++ b/website/web/helpers.py @@ -6,14 +6,14 @@ import hashlib import json import os import re -from functools import lru_cache +from functools import lru_cache, cache from pathlib import Path import flask_login # type: ignore[import-untyped] from flask import Request from werkzeug.security import generate_password_hash -from lookyloo import Lookyloo +from lookyloo import Lookyloo, Indexing from lookyloo.default import get_config, get_homedir __global_lookyloo_instance = None @@ -113,3 +113,20 @@ def get_secret_key() -> bytes: def sri_load() -> dict[str, dict[str, str]]: with (get_homedir() / 'website' / 'web' / 'sri.txt').open() as f: return json.load(f) + + +@cache +def get_indexing(user: User | None) -> Indexing: + '''Depending if we're logged in or not, we (can) get different indexes: + if index_everything is enabled, we have an index in kvrocks that contains all + the indexes for all the captures. + It is only accessible to the admin user. 
+ ''' + if not get_config('generic', 'index_everything'): + return Indexing() + + if not user or not user.is_authenticated: + # No user or anonymous + return Indexing() + # Logged in user + return Indexing(full_index=True) diff --git a/website/web/templates/body_hash.html b/website/web/templates/body_hash.html index 0fe7bb04..2a6a6872 100644 --- a/website/web/templates/body_hash.html +++ b/website/web/templates/body_hash.html @@ -77,8 +77,8 @@ - - + + diff --git a/website/web/templates/hostname.html b/website/web/templates/hostname.html index 701d829d..0b7b4191 100644 --- a/website/web/templates/hostname.html +++ b/website/web/templates/hostname.html @@ -1,42 +1,7 @@ {% from "macros.html" import shorten_string %} -{% if from_popup %} -{% extends "main.html" %} - -{% from 'bootstrap5/utils.html' import render_messages %} - -{% block title %}{{ url }}{% endblock %} - -{% block scripts %} -{{ super() }} - - - -{% endblock %} -{%endif%} - {% block content %} -{% if from_popup %} - -{%endif%} -

{{ hostname }}

@@ -50,7 +15,8 @@ return date.getFullYear() + '-' + (date.getMonth() + 1).toString().padStart(2, "0") + '-' + date.getDate().toString().padStart(2, "0") + ' ' + date.toTimeString(); } }, - { width: '80%', targets: 1 }], + { width: '40%', targets: 1 }, + { width: '40%', targets: 2 }], }); diff --git a/website/web/templates/tree.html b/website/web/templates/tree.html index f57bcf8a..56c46e72 100644 --- a/website/web/templates/tree.html +++ b/website/web/templates/tree.html @@ -161,6 +161,20 @@ }); + + + +
TimestampTitleCapture TimeCapture Title URL
+ + + + + + + + + {% for hostname, info in hostnames.items() %} + + + + + + {% endfor %} + +
Number of capturesHostnameURLs
{{ info['total_captures'] }} + + + {{hostname}} + + + +

+ +

+
+
+ +
    + {% for node in info['nodes'] %} +
  • +

    {{ node.name }}

    + Show on tree +
  • + {% endfor %} +
      + +
+
+ +
diff --git a/website/web/templates/tree_urls.html b/website/web/templates/tree_urls.html new file mode 100644 index 00000000..ac3e5c84 --- /dev/null +++ b/website/web/templates/tree_urls.html @@ -0,0 +1,38 @@ +{% from "macros.html" import popup_icons_response %} + + + + + + + + + + + + {% for url, info in urls.items() %} + + + + + {% endfor %} + +
Number of capturesURL
{{ info['total_captures'] }} + + + {{url}} + + +
diff --git a/website/web/templates/url.html b/website/web/templates/url.html index e60b4ce6..393feb2b 100644 --- a/website/web/templates/url.html +++ b/website/web/templates/url.html @@ -1,70 +1,51 @@ -{% extends "main.html" %} - -{% from 'bootstrap5/utils.html' import render_messages %} - -{% block title %}{{ url }}{% endblock %} - -{% block scripts %} -{{ super() }} - - - - -{% endblock %} +{% from "macros.html" import shorten_string %} {% block content %} -
-

{{ url }}

- -
-
- - - - - - - - - {% for hit in hits %} - - - - - {% endfor %} - -
Start timestampCaptures
- {{ hit['start_timestamp'] }} - {{ hit['title'] }} -
- Nodes: - -
-
-

The same file was seen in these captures:

-
    - {% for capture_uuid, title in captures %} -
  • {{ title }}
  • - {% endfor %} -
+ +
+

{{ url }}

+
+ + + + + + + + + + + + + {% for capture_uuid, title, landing_page, capture_time in captures %} + + + + + + {% endfor %} + +
Capture TimeCapture TitleLanding page
+ {{capture_time}} + + + {{ title }} + + + + {{ landing_page }} + +
{% endblock %}