From 9f4c77d5d260ca03a627d8626886da72c1d36c3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Thu, 3 Sep 2020 16:31:45 +0200
Subject: [PATCH] chg: Cleanups, allow adding context from the ressources page
---
.gitignore | 3 ++
bin/async_scrape.py | 3 --
known_content_user/.keepdir | 0
lookyloo/lookyloo.py | 66 ++++++++++++-----------
website/web/__init__.py | 10 ++--
website/web/templates/hostname_popup.html | 45 ++--------------
website/web/templates/macros.html | 45 ++++++++++++++++
7 files changed, 92 insertions(+), 80 deletions(-)
create mode 100644 known_content_user/.keepdir
diff --git a/.gitignore b/.gitignore
index 7aeb948..c3e1436 100644
--- a/.gitignore
+++ b/.gitignore
@@ -118,3 +118,6 @@ dump.rdb
# Local config files
config/*.json
config/*.json.bkp
+
+# User-defined known content
+known_content_user/
diff --git a/bin/async_scrape.py b/bin/async_scrape.py
index 52530e5..9bf1098 100755
--- a/bin/async_scrape.py
+++ b/bin/async_scrape.py
@@ -11,9 +11,6 @@ from lookyloo.lookyloo import Lookyloo
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
level=logging.INFO, datefmt='%I:%M:%S')
-# Set it to True if your instance is publicly available so users aren't able to scan your internal network
-only_global_lookups = False
-
class AsyncScraper(AbstractManager):
diff --git a/known_content_user/.keepdir b/known_content_user/.keepdir
new file mode 100644
index 0000000..e69de29
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 33fe41b..9a2d29a 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -136,11 +136,11 @@ class Indexing():
pipeline.execute()
- def get_hash_uuids(self, body_hash: str) -> Tuple[str, str]:
+ def get_hash_uuids(self, body_hash: str) -> Tuple[str, str, str]:
capture_uuid = self.redis.srandmember(f'bh|{body_hash}|captures')
entry = self.redis.zrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, 1)[0]
urlnode_uuid, hostnode_uuid, url = entry.split('|', 2)
- return capture_uuid, urlnode_uuid
+ return capture_uuid, urlnode_uuid, hostnode_uuid
def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None,
limit: int=20) -> Tuple[int, List[Tuple[str, str, str, bool]]]:
@@ -185,39 +185,40 @@ class Context():
return all_ressources_hashes
def _cache_known_content(self) -> None:
- p = self.redis.pipeline()
- for filename, file_content in load_known_content().items():
- if filename == 'generic':
- # 1px images, files with spaces, empty => non-relevant stuff
- for k, type_content in file_content.items():
- p.hmset('known_content', {h: type_content['description'] for h in type_content['entries']})
- elif filename == 'malicious':
- # User defined as malicious
- for h, details in file_content.items():
- p.sadd('bh|malicious', h)
- if 'target' in details and details['target']:
- p.sadd(f'{h}|target', *details['target'])
- if 'tag' in details and details['tag']:
- p.sadd(f'{h}|tag', *details['tag'])
- elif filename == 'legitimate':
- # User defined as legitimate
- for h, details in file_content.items():
- if 'domain' in details and details['domain']:
- p.sadd(f'bh|{h}|legitimate', *details['domain'])
- elif 'description' in details:
- p.hset('known_content', h, details['description'])
- else:
- # Full captures marked as legitimate
- for h, details in file_content.items():
- p.sadd(f'bh|{h}|legitimate', *details['hostnames'])
- p.execute()
+ for dirname in ['known_content', 'known_content_user']:
+ for filename, file_content in load_known_content(dirname).items():
+ p = self.redis.pipeline()
+ if filename == 'generic':
+ # 1px images, files with spaces, empty => non-relevant stuff
+ for k, type_content in file_content.items():
+ p.hmset('known_content', {h: type_content['description'] for h in type_content['entries']})
+ elif filename == 'malicious':
+ # User defined as malicious
+ for h, details in file_content.items():
+ p.sadd('bh|malicious', h)
+ if 'target' in details and details['target']:
+ p.sadd(f'{h}|target', *details['target'])
+ if 'tag' in details and details['tag']:
+ p.sadd(f'{h}|tag', *details['tag'])
+ elif filename == 'legitimate':
+ # User defined as legitimate
+ for h, details in file_content.items():
+ if 'domain' in details and details['domain']:
+ p.sadd(f'bh|{h}|legitimate', *details['domain'])
+ elif 'description' in details:
+ p.hset('known_content', h, details['description'])
+ else:
+ # Full captures marked as legitimate
+ for h, details in file_content.items():
+ p.sadd(f'bh|{h}|legitimate', *details['hostnames'])
+ p.execute()
def find_known_content(self, har2tree_container: Union[CrawledTree, HostNode, URLNode, str]) -> Dict[str, Any]:
"""Return a dictionary of content resources found in the local known_content database, or in SaneJS (if enabled)"""
if isinstance(har2tree_container, str):
to_lookup: Set[str] = {har2tree_container, }
else:
- to_lookup: Set[str] = self._get_resources_hashes(har2tree_container)
+ to_lookup = self._get_resources_hashes(har2tree_container)
known_content_table: Dict[str, Any] = {}
if not to_lookup:
return known_content_table
@@ -268,7 +269,7 @@ class Context():
def store_known_legitimate_tree(self, tree: CrawledTree):
known_content = self.find_known_content(tree)
- capture_file: Path = get_homedir() / 'known_content' / f'{urlsplit(tree.root_url).hostname}.json'
+ capture_file: Path = get_homedir() / 'known_content_user' / f'{urlsplit(tree.root_url).hostname}.json'
if capture_file.exists():
with open(capture_file) as f:
to_store = json.load(f)
@@ -368,7 +369,7 @@ class Context():
self.redis.sadd(f'bh|{body_hash}|legitimate', legitimate_hostname)
def store_known_malicious_ressource(self, ressource_hash: str, details: Dict[str, str]):
- known_malicious_ressource_file = get_homedir() / 'known_content' / 'malicious.json'
+ known_malicious_ressource_file = get_homedir() / 'known_content_user' / 'malicious.json'
if known_malicious_ressource_file.exists():
with open(known_malicious_ressource_file) as f:
to_store = json.load(f)
@@ -400,7 +401,7 @@ class Context():
p.execute()
def store_known_legitimate_ressource(self, ressource_hash: str, details: Dict[str, str]):
- known_legitimate_ressource_file = get_homedir() / 'known_content' / 'legitimate.json'
+ known_legitimate_ressource_file = get_homedir() / 'known_content_user' / 'legitimate.json'
if known_legitimate_ressource_file.exists():
with open(known_legitimate_ressource_file) as f:
to_store = json.load(f)
@@ -1115,6 +1116,7 @@ class Lookyloo():
for ressource_h, blob in blobs:
if ressource_h == h:
return 'embedded_ressource.bin', blob
+ return None
def get_hostnode_investigator(self, capture_uuid: str, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]:
capture_dir = self.lookup_capture_dir(capture_uuid)
diff --git a/website/web/__init__.py b/website/web/__init__.py
index ec4967e..2168691 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -475,8 +475,8 @@ def ressources():
for h, freq in i.ressources:
domain_freq = i.ressources_number_domains(h)
context = lookyloo.context.find_known_content(h)
- capture_uuid, url_uuid = i.get_hash_uuids(h)
- ressources.append((h, freq, domain_freq, context.get(h), capture_uuid, url_uuid))
+ capture_uuid, url_uuid, hostnode_uuid = i.get_hash_uuids(h)
+ ressources.append((h, freq, domain_freq, context.get(h), capture_uuid, url_uuid, hostnode_uuid))
return render_template('ressources.html', ressources=ressources)
@@ -508,6 +508,7 @@ def add_context(tree_uuid: str, urlnode_uuid: str):
context_data = request.form
ressource_hash: str = context_data.get('hash_to_contextualize') # type: ignore
hostnode_uuid: str = context_data.get('hostnode_uuid') # type: ignore
+ callback_str: str = context_data.get('callback_str') # type: ignore
legitimate: bool = True if context_data.get('legitimate') else False
malicious: bool = True if context_data.get('malicious') else False
details: Dict[str, Dict] = {'malicious': {}, 'legitimate': {}}
@@ -526,7 +527,10 @@ def add_context(tree_uuid: str, urlnode_uuid: str):
legitimate_details['description'] = context_data['legitimate_description']
details['legitimate'] = legitimate_details
lookyloo.add_context(tree_uuid, urlnode_uuid, ressource_hash, legitimate, malicious, details)
- return redirect(url_for('hostnode_popup', tree_uuid=tree_uuid, node_uuid=hostnode_uuid))
+ if callback_str == 'hostnode_popup':
+ return redirect(url_for('hostnode_popup', tree_uuid=tree_uuid, node_uuid=hostnode_uuid))
+ elif callback_str == 'ressources':
+ return redirect(url_for('ressources'))
# Query API
diff --git a/website/web/templates/hostname_popup.html b/website/web/templates/hostname_popup.html
index 991f897..ee92c52 100644
--- a/website/web/templates/hostname_popup.html
+++ b/website/web/templates/hostname_popup.html
@@ -7,6 +7,7 @@
{% from "macros.html" import shorten_string %}
{% from "macros.html" import other_captures_table %}
{% from "macros.html" import get_ressource_button %}
+{% from "macros.html" import context_form %}
{% block title %}Details for {{ hostname }} {% endblock %}
@@ -159,48 +160,8 @@
{% endif %}
- {% if enable_context_by_users%}
-
-
+ {% if enable_context_by_users %}
+ {{ context_form(tree_uuid, url['url_object'].uuid, hostnode_uuid, url['url_object'].body_hash, 'hostnode_popup') }}
{% endif %}
{% if url['embedded_ressources'] %}
diff --git a/website/web/templates/macros.html b/website/web/templates/macros.html
index cc8f97a..986b57a 100644
--- a/website/web/templates/macros.html
+++ b/website/web/templates/macros.html
@@ -12,6 +12,51 @@
{% endmacro %}
+{% macro context_form(tree_uuid, urlnode_uuid, hostnode_uuid, hash, callback_str) %}
+
+
+{% endmacro %}
+
{% macro get_ressource_button(capture_uuid, urlnode_uuid, hash, text) %}