From 2802cfd46ca82946f1c023cb3dde40c18acf6d89 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Wed, 28 Oct 2020 18:49:15 +0100
Subject: [PATCH] new: Add captures categorization

---
 config/generic.json.sample                    |  1 +
 lookyloo/helpers.py                           |  7 ++
 lookyloo/lookyloo.py                          | 50 +++++++++++++-
 website/web/__init__.py                       | 37 ++++++++++-
 website/web/templates/categories_capture.html | 21 ++++++
 website/web/templates/macros.html             | 62 +++++++++++++++++
 website/web/templates/tree.html               | 66 ++++++++++++++++---
 7 files changed, 233 insertions(+), 11 deletions(-)
 create mode 100644 website/web/templates/categories_capture.html

diff --git a/config/generic.json.sample b/config/generic.json.sample
index 046df5b..3ab747e 100644
--- a/config/generic.json.sample
+++ b/config/generic.json.sample
@@ -17,6 +17,7 @@
   "use_user_agents_users": false,
   "enable_default_blur_screenshot": false,
   "enable_context_by_users": false,
+  "enable_categorization": false,
   "enable_mail_notification": false,
   "email": {
     "from": "Lookyloo ",
diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index 089c4d5..053dad8 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -19,6 +19,8 @@ from redis import Redis
 from redis.exceptions import ConnectionError
 from publicsuffix2 import PublicSuffixList, fetch  # type: ignore
 from bs4 import BeautifulSoup  # type: ignore
+from pytaxonomies import Taxonomies
+
 try:
     import cloudscraper  # type: ignore
     HAS_CF = True
@@ -55,6 +57,11 @@ def get_resources_hashes(har2tree_container: Union[CrawledTree, HostNode, URLNod
     return all_ressources_hashes
 
 
+@lru_cache(64)
+def get_taxonomies() -> Taxonomies:  # lru_cache on a zero-arg function: every caller gets the same Taxonomies instance
+    return Taxonomies()
+
+
 @lru_cache(64)
 def get_public_suffix_list():
     """Initialize Public Suffix List"""
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 444b457..393c61a 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -30,7 +30,7 @@ from werkzeug.useragents import UserAgent
 from .exceptions import NoValidHarFile, MissingUUID
 from .helpers import
(get_homedir, get_socket_path, load_cookies, get_config,
                       safe_create_dir, get_email_template, load_pickle_tree,
-                      remove_pickle_tree, get_resources_hashes)
+                      remove_pickle_tree, get_resources_hashes, get_taxonomies)
 from .modules import VirusTotal, SaneJavaScript, PhishingInitiative
 from .context import Context
 from .indexing import Indexing
@@ -43,6 +43,7 @@ class Lookyloo():
         self.logger.setLevel(get_config('generic', 'loglevel'))
         self.indexing = Indexing()
         self.is_public_instance = get_config('generic', 'public_instance')
+        self.taxonomies = get_taxonomies()
 
         self.redis: Redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
         self.scrape_dir: Path = get_homedir() / 'scraped'
@@ -269,6 +270,59 @@ class Lookyloo():
             return {}
         return ct.root_hartree.stats
 
+    def categories_capture(self, capture_uuid: str):
+        """Return {machinetag: self.taxonomies.revert_machinetag(machinetag)} for the capture. Raises MissingUUID if the capture is unknown."""
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        # get existing categories if possible
+        if (capture_dir / 'categories').exists():
+            with (capture_dir / 'categories').open() as f:
+                current_categories = [line.strip() for line in f.readlines()]
+        else:
+            current_categories = []
+        return {e: self.taxonomies.revert_machinetag(e) for e in current_categories}
+
+    def categorize_capture(self, capture_uuid: str, category: str):
+        """Add a machinetag to the capture's 'categories' file. Does nothing when 'enable_categorization' is off; raises MissingUUID if the capture is unknown."""
+        if not get_config('generic', 'enable_categorization'):
+            return
+        # Make sure the category is mappable to a taxonomy.
+        self.taxonomies.revert_machinetag(category)
+
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        # get existing categories if possible
+        if (capture_dir / 'categories').exists():
+            with (capture_dir / 'categories').open() as f:
+                current_categories = set(line.strip() for line in f.readlines())
+        else:
+            current_categories = set()
+        current_categories.add(category)
+        with (capture_dir / 'categories').open('w') as f:
+            # sorted(): a set has no stable order, keep the file content deterministic.
+            f.writelines(f'{t}\n' for t in sorted(current_categories))
+
+    def uncategorize_capture(self, capture_uuid: str, category: str):
+        """Remove a machinetag from the capture's 'categories' file. Does nothing when 'enable_categorization' is off; raises MissingUUID if the capture is unknown."""
+        if not get_config('generic', 'enable_categorization'):
+            return
+        capture_dir = self.lookup_capture_dir(capture_uuid)
+        if not capture_dir:
+            raise MissingUUID(f'Unable to find UUID {capture_uuid} in the cache')
+        # get existing categories if possible
+        if (capture_dir / 'categories').exists():
+            with (capture_dir / 'categories').open() as f:
+                current_categories = set(line.strip() for line in f.readlines())
+        else:
+            current_categories = set()
+        # discard() instead of remove(): a category the capture doesn't have (or an empty one) must not raise KeyError.
+        current_categories.discard(category)
+        with (capture_dir / 'categories').open('w') as f:
+            # sorted(): a set has no stable order, keep the file content deterministic.
+            f.writelines(f'{t}\n' for t in sorted(current_categories))
+
     def trigger_modules(self, capture_uuid: str, force: bool=False) -> None:
         capture_dir = self.lookup_capture_dir(capture_uuid)
         if not capture_dir:
diff --git a/website/web/__init__.py b/website/web/__init__.py
index d1da9e4..4e4b5ae 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -14,7 +14,7 @@ from flask import Flask, render_template, request, send_file, redirect, url_for,
 from flask_bootstrap import Bootstrap  # type: ignore
 from flask_httpauth import HTTPDigestAuth  # type: ignore
 
-from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents, get_config
+from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents, get_config, get_taxonomies
 from lookyloo.lookyloo import Lookyloo, Indexing
 from
lookyloo.exceptions import NoValidHarFile, MissingUUID
 from .proxied import ReverseProxied
@@ -172,6 +172,39 @@ def trigger_modules(tree_uuid: str, force: int):
     return redirect(url_for('modules', tree_uuid=tree_uuid))
 
 
+@app.route('/tree/<string:tree_uuid>/categories_capture/', defaults={'query': ''})
+@app.route('/tree/<string:tree_uuid>/categories_capture/<string:query>', methods=['GET'])
+def categories_capture(tree_uuid: str, query: str):
+    """Render the capture's current categories and, when a query is given, the taxonomy entries matching it."""
+    current_categories = lookyloo.categories_capture(tree_uuid)
+    matching_categories = None
+    if query:
+        matching_categories = {}
+        t = get_taxonomies()
+        entries = t.search(query)
+        if entries:
+            matching_categories = {e: t.revert_machinetag(e) for e in entries}
+    return render_template('categories_capture.html', tree_uuid=tree_uuid,
+                           current_categories=current_categories,
+                           matching_categories=matching_categories)
+
+
+@app.route('/tree/<string:tree_uuid>/uncategorize/', defaults={'category': ''})
+@app.route('/tree/<string:tree_uuid>/uncategorize/<string:category>', methods=['GET'])
+def uncategorize_capture(tree_uuid: str, category: str):
+    """Remove a category from the capture and return a JSON confirmation."""
+    lookyloo.uncategorize_capture(tree_uuid, category)
+    return jsonify({'response': f'{category} successfully removed from {tree_uuid}'})
+
+
+@app.route('/tree/<string:tree_uuid>/categorize/', defaults={'category': ''})
+@app.route('/tree/<string:tree_uuid>/categorize/<string:category>', methods=['GET'])
+def categorize_capture(tree_uuid: str, category: str):
+    """Add a category to the capture and return a JSON confirmation."""
+    lookyloo.categorize_capture(tree_uuid, category)
+    return jsonify({'response': f'{category} successfully added to {tree_uuid}'})
+
+
 @app.route('/tree/<string:tree_uuid>/stats', methods=['GET'])
 def stats(tree_uuid: str):
     stats = lookyloo.get_statistics(tree_uuid)
@@ -311,11 +344,16 @@ def tree(tree_uuid: str, urlnode_uuid: Optional[str]=None):
         enable_context_by_users = True
     else:
         enable_context_by_users = False
+    if get_config('generic', 'enable_categorization'):
+        enable_categorization = True
+    else:
+        enable_categorization = False
     tree_json, start_time, user_agent, root_url, meta = lookyloo.load_tree(tree_uuid)
     return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                            user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
                            meta=meta, enable_mail_notification=enable_mail_notification,
                            enable_context_by_users=enable_context_by_users,
+                           enable_categorization=enable_categorization,
                            blur_screenshot=blur_screenshot, urlnode_uuid=urlnode_uuid,
                            has_redirects=True if cache['redirects'] else False)
 
diff --git a/website/web/templates/categories_capture.html b/website/web/templates/categories_capture.html
new file mode 100644
index 0000000..220b831
--- /dev/null
+++ b/website/web/templates/categories_capture.html
@@ -0,0 +1,21 @@
+{% from "macros.html" import taxonomy_table %}
+
+
+ +{% if current_categories %} +

Current categories for the capture

+ {{ taxonomy_table(tree_uuid, current_categories, 0) }} +{% else %} +

The capture isn't categorized yet

+{% endif%} + +{% if matching_categories is none %} +

+{% elif matching_categories %} +

Categories matching your query

+ {{ taxonomy_table(tree_uuid, matching_categories, 1) }} +{% else %} +

No categories matching your query

+{% endif%} + +
diff --git a/website/web/templates/macros.html b/website/web/templates/macros.html index ee582e5..a8ec6d9 100644 --- a/website/web/templates/macros.html +++ b/website/web/templates/macros.html @@ -1,3 +1,65 @@ +{% macro taxonomy_table(tree_uuid, categories_info, add_category) %} +
+ + + + + + + {% if add_category %} + + {% else %} + + {% endif %} + + + + {% for mt, val in categories_info.items() %} + + + + + + + {% endfor %} + +
NameDescriptionMachinetagClick to add categoryClick to remove category
{{ val[0].name }} + {% if val|length == 3 %} + {% if val[2].description %} + {{ val[2].description }} + {% elif val[2].expanded %} + {{ val[2].expanded }} + {%endif%} + {% elif val[1].description %} + {{ val[1].description }} + {% else %} + {{ val[1].predicate }} + {%endif%} + {{ mt }} + +
+
+ +{% endmacro %} + + {% macro known_content_details(details) %}
{% if details is string %} diff --git a/website/web/templates/tree.html b/website/web/templates/tree.html index 518fc95..292218a 100644 --- a/website/web/templates/tree.html +++ b/website/web/templates/tree.html @@ -9,13 +9,35 @@ {{ super() }} + + + + - + {% if urlnode_uuid %}