diff --git a/.gitignore b/.gitignore index 7aeb948..c3e1436 100644 --- a/.gitignore +++ b/.gitignore @@ -118,3 +118,6 @@ dump.rdb # Local config files config/*.json config/*.json.bkp + +# user defined known content +known_content_user/ diff --git a/bin/async_scrape.py b/bin/async_scrape.py index 52530e5..9bf1098 100755 --- a/bin/async_scrape.py +++ b/bin/async_scrape.py @@ -11,9 +11,6 @@ from lookyloo.lookyloo import Lookyloo logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S') -# Set it to True if your instance is publicly available so users aren't able to scan your internal network -only_global_lookups = False - class AsyncScraper(AbstractManager): diff --git a/known_content_user/.keepdir b/known_content_user/.keepdir new file mode 100644 index 0000000..e69de29 diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py index 33fe41b..9a2d29a 100644 --- a/lookyloo/lookyloo.py +++ b/lookyloo/lookyloo.py @@ -136,11 +136,11 @@ class Indexing(): pipeline.execute() - def get_hash_uuids(self, body_hash: str) -> Tuple[str, str]: + def get_hash_uuids(self, body_hash: str) -> Tuple[str, str, str]: capture_uuid = self.redis.srandmember(f'bh|{body_hash}|captures') entry = self.redis.zrange(f'bh|{body_hash}|captures|{capture_uuid}', 0, 1)[0] urlnode_uuid, hostnode_uuid, url = entry.split('|', 2) - return capture_uuid, urlnode_uuid + return capture_uuid, urlnode_uuid, hostnode_uuid def get_body_hash_captures(self, body_hash: str, filter_url: Optional[str]=None, limit: int=20) -> Tuple[int, List[Tuple[str, str, str, bool]]]: @@ -185,39 +185,40 @@ class Context(): return all_ressources_hashes def _cache_known_content(self) -> None: - p = self.redis.pipeline() - for filename, file_content in load_known_content().items(): - if filename == 'generic': - # 1px images, files with spaces, empty => non-relevant stuff - for k, type_content in file_content.items(): - p.hmset('known_content', {h: type_content['description'] for h in type_content['entries']}) - elif filename == 'malicious': - # User defined as malicious - for h, details in file_content.items(): - p.sadd('bh|malicious', h) - if 'target' in details and details['target']: - p.sadd(f'{h}|target', *details['target']) - if 'tag' in details and details['tag']: - p.sadd(f'{h}|tag', *details['tag']) - elif filename == 'legitimate': - # User defined as legitimate - for h, details in file_content.items(): - if 'domain' in details and details['domain']: - p.sadd(f'bh|{h}|legitimate', *details['domain']) - elif 'description' in details: - p.hset('known_content', h, details['description']) - else: - # Full captures marked as legitimate - for h, details in file_content.items(): - p.sadd(f'bh|{h}|legitimate', *details['hostnames']) - p.execute() + for dirname in ['known_content', 'known_content_user']: + for filename, file_content in load_known_content(dirname).items(): + p = self.redis.pipeline() + if filename == 'generic': + # 1px images, files with spaces, empty => non-relevant stuff + for k, type_content in file_content.items(): + p.hmset('known_content', {h: type_content['description'] for h in type_content['entries']}) + elif filename == 'malicious': + # User defined as malicious + for h, details in file_content.items(): + p.sadd('bh|malicious', h) + if 'target' in details and details['target']: + p.sadd(f'{h}|target', *details['target']) + if 'tag' in details and details['tag']: + p.sadd(f'{h}|tag', *details['tag']) + elif filename == 'legitimate': + # User defined as legitimate + for h, details in file_content.items(): + if 'domain' in details and details['domain']: + p.sadd(f'bh|{h}|legitimate', *details['domain']) + elif 'description' in details: + p.hset('known_content', h, details['description']) + else: + # Full captures marked as legitimate + for h, details in file_content.items(): + p.sadd(f'bh|{h}|legitimate', *details['hostnames']) + p.execute() def find_known_content(self, har2tree_container: Union[CrawledTree, HostNode, URLNode, str]) -> Dict[str, Any]: """Return a dictionary of content resources found in the local known_content database, or in SaneJS (if enabled)""" if isinstance(har2tree_container, str): to_lookup: Set[str] = {har2tree_container, } else: - to_lookup: Set[str] = self._get_resources_hashes(har2tree_container) + to_lookup = self._get_resources_hashes(har2tree_container) known_content_table: Dict[str, Any] = {} if not to_lookup: return known_content_table @@ -268,7 +269,7 @@ class Context(): def store_known_legitimate_tree(self, tree: CrawledTree): known_content = self.find_known_content(tree) - capture_file: Path = get_homedir() / 'known_content' / f'{urlsplit(tree.root_url).hostname}.json' + capture_file: Path = get_homedir() / 'known_content_user' / f'{urlsplit(tree.root_url).hostname}.json' if capture_file.exists(): with open(capture_file) as f: to_store = json.load(f) @@ -368,7 +369,7 @@ class Context(): self.redis.sadd(f'bh|{body_hash}|legitimate', legitimate_hostname) def store_known_malicious_ressource(self, ressource_hash: str, details: Dict[str, str]): - known_malicious_ressource_file = get_homedir() / 'known_content' / 'malicious.json' + known_malicious_ressource_file = get_homedir() / 'known_content_user' / 'malicious.json' if known_malicious_ressource_file.exists(): with open(known_malicious_ressource_file) as f: to_store = json.load(f) @@ -400,7 +401,7 @@ class Context(): p.execute() def store_known_legitimate_ressource(self, ressource_hash: str, details: Dict[str, str]): - known_legitimate_ressource_file = get_homedir() / 'known_content' / 'legitimate.json' + known_legitimate_ressource_file = get_homedir() / 'known_content_user' / 'legitimate.json' if known_legitimate_ressource_file.exists(): with open(known_legitimate_ressource_file) as f: to_store = json.load(f) @@ -1115,6 +1116,7 @@ class Lookyloo(): for ressource_h, blob in blobs: if ressource_h == h: return 'embedded_ressource.bin', blob + return None def get_hostnode_investigator(self, capture_uuid: str, node_uuid: str) -> Tuple[HostNode, List[Dict[str, Any]]]: capture_dir = self.lookup_capture_dir(capture_uuid) diff --git a/website/web/__init__.py b/website/web/__init__.py index ec4967e..2168691 100644 --- a/website/web/__init__.py +++ b/website/web/__init__.py @@ -475,8 +475,8 @@ def ressources(): for h, freq in i.ressources: domain_freq = i.ressources_number_domains(h) context = lookyloo.context.find_known_content(h) - capture_uuid, url_uuid = i.get_hash_uuids(h) - ressources.append((h, freq, domain_freq, context.get(h), capture_uuid, url_uuid)) + capture_uuid, url_uuid, hostnode_uuid = i.get_hash_uuids(h) + ressources.append((h, freq, domain_freq, context.get(h), capture_uuid, url_uuid, hostnode_uuid)) return render_template('ressources.html', ressources=ressources) @@ -508,6 +508,7 @@ def add_context(tree_uuid: str, urlnode_uuid: str): context_data = request.form ressource_hash: str = context_data.get('hash_to_contextualize') # type: ignore hostnode_uuid: str = context_data.get('hostnode_uuid') # type: ignore + callback_str: str = context_data.get('callback_str') # type: ignore legitimate: bool = True if context_data.get('legitimate') else False malicious: bool = True if context_data.get('malicious') else False details: Dict[str, Dict] = {'malicious': {}, 'legitimate': {}} @@ -526,7 +527,10 @@ def add_context(tree_uuid: str, urlnode_uuid: str): legitimate_details['description'] = context_data['legitimate_description'] details['legitimate'] = legitimate_details lookyloo.add_context(tree_uuid, urlnode_uuid, ressource_hash, legitimate, malicious, details) - return redirect(url_for('hostnode_popup', tree_uuid=tree_uuid, node_uuid=hostnode_uuid)) + if callback_str == 'hostnode_popup': + return redirect(url_for('hostnode_popup', tree_uuid=tree_uuid, node_uuid=hostnode_uuid)) + elif callback_str == 'ressources': + return redirect(url_for('ressources')) # Query API diff --git a/website/web/templates/hostname_popup.html b/website/web/templates/hostname_popup.html index 991f897..ee92c52 100644 --- a/website/web/templates/hostname_popup.html +++ b/website/web/templates/hostname_popup.html @@ -7,6 +7,7 @@ {% from "macros.html" import shorten_string %} {% from "macros.html" import other_captures_table %} {% from "macros.html" import get_ressource_button %} +{% from "macros.html" import context_form %} {% block title %}Details for {{ hostname }} {% endblock %} @@ -159,48 +160,8 @@

{% endif %} - {% if enable_context_by_users%} - -
-
-
-
-
- - -
-
-
- - -
-
- - -
-
-
- - -
-
-
- - -
-
- - -
- - - -
-
-
+ {% if enable_context_by_users %} + {{ context_form(tree_uuid, url['url_object'].uuid, hostnode_uuid, url['url_object'].body_hash, 'hostnode_popup') }} {% endif %} {% if url['embedded_ressources'] %} diff --git a/website/web/templates/macros.html b/website/web/templates/macros.html index cc8f97a..986b57a 100644 --- a/website/web/templates/macros.html +++ b/website/web/templates/macros.html @@ -12,6 +12,51 @@ {% endmacro %} +{% macro context_form(tree_uuid, urlnode_uuid, hostnode_uuid, hash, callback_str) %} + +
+
+
+
+
+ + +
+
+
+ + +
+
+ + +
+
+
+ + +
+
+
+ + +
+
+ + +
+ + + + +
+
+
+{% endmacro %} + {% macro get_ressource_button(capture_uuid, urlnode_uuid, hash, text) %}