From 2a5546128631b231960ae38f84ac03416176eae1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Fri, 19 Mar 2021 17:51:25 +0100
Subject: [PATCH] new: Capture an URL on the rendered page, keep the session (WiP)

---
 lookyloo/lookyloo.py                     | 16 ++++++++
 website/web/__init__.py                  | 28 +++++++++++++-
 website/web/templates/bulk_captures.html | 52 ++++++++++++++++++++++++
 website/web/templates/tree.html          | 30 ++++++++++++++
 website/web/templates/urls_rendered.html | 12 ++++++
 5 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 website/web/templates/bulk_captures.html
 create mode 100644 website/web/templates/urls_rendered.html

diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 8438b377..caa85da6 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -547,6 +547,8 @@ class Lookyloo():
             if isinstance(value, bool):
                 # Yes, empty string because that's False.
                 query[key] = 1 if value else ''
+            if isinstance(value, list):
+                query[key] = json.dumps(value)
         p.hmset(perma_uuid, query)  # type: ignore
         p.sadd('to_capture', perma_uuid)
         p.execute()
@@ -560,6 +562,8 @@ class Lookyloo():
         to_capture: Dict[str, Union[str, int, float]] = self.redis.hgetall(uuid)
         self.redis.delete(uuid)
         to_capture['perma_uuid'] = uuid
+        if 'cookies' in to_capture:
+            to_capture['cookies_pseudofile'] = to_capture.pop('cookies')
         if self.capture(**to_capture):  # type: ignore
             self.logger.info(f'Processed {to_capture["url"]}')
             return True
@@ -692,6 +696,11 @@ class Lookyloo():
         '''Get all the files related to this capture.'''
         return self._get_raw(capture_uuid)

+    def get_urls_rendered_page(self, capture_uuid: str):
+        ct = self.get_crawled_tree(capture_uuid)
+        return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
+                      - set(ct.root_hartree.all_url_requests.keys()))
+
     def capture(self, url: str, cookies_pseudofile: Optional[Union[BufferedIOBase, str]]=None,
                 depth: int=1, listing: bool=True, user_agent: Optional[str]=None,
                 referer: str='', perma_uuid: Optional[str]=None, os: Optional[str]=None,
@@ -819,6 +828,13 @@ class Lookyloo():
                 break
         return details, body_content

+    def get_latest_url_capture(self, url: str) -> Optional[CaptureCache]:
+        '''Get the most recent capture with this URL.'''
+        captures = self.sorted_capture_cache(self.indexing.get_captures_url(url))
+        if captures:
+            return captures[0]
+        return None
+
     def get_url_occurrences(self, url: str, limit: int=20) -> List[Dict]:
         '''Get the most recent captures and URL nodes where the URL has been seen.'''
         captures = self.sorted_capture_cache(self.indexing.get_captures_url(url))
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 490cdc95..aa5f3b10 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -22,7 +22,7 @@ from werkzeug.security import generate_password_hash, check_password_hash

 from pymisp import MISPEvent

-from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents, get_config, get_taxonomies
+from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents, get_config, get_taxonomies, load_cookies
 from lookyloo.lookyloo import Lookyloo, Indexing
 from lookyloo.exceptions import NoValidHarFile, MissingUUID
 from .proxied import ReverseProxied
@@ -404,6 +404,30 @@ def export(tree_uuid: str):
                      as_attachment=True, attachment_filename='capture.zip')


+@app.route('/tree/<string:tree_uuid>/urls_rendered_page', methods=['GET'])
+def urls_rendered_page(tree_uuid: str):
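+    # URLs present in the rendered page but never requested during the capture;
+    # they can be selected and sent to /bulk_captures/ to be re-captured while
+    # keeping the session cookies of this capture.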
+    urls = lookyloo.get_urls_rendered_page(tree_uuid)
+    return render_template('urls_rendered.html', base_tree_uuid=tree_uuid, urls=urls)
+
+
+@app.route('/bulk_captures/<base_tree_uuid>', methods=['POST'])
+def bulk_captures(base_tree_uuid: str):
+    selected_urls = request.form.getlist('url')
+    urls = lookyloo.get_urls_rendered_page(base_tree_uuid)
+    ct = lookyloo.get_crawled_tree(base_tree_uuid)
+    bulk_captures = []
+    # The form posts the (1-based) positions of the selected URLs, hence the "- 1".
+    for url in [urls[int(selected_id) - 1] for selected_id in selected_urls]:
+        cookies = load_cookies(lookyloo.get_cookies(base_tree_uuid))
+        capture = {'url': url,
+                   'cookies': cookies,
+                   'referer': ct.root_url
+                   }
+        new_capture_uuid = lookyloo.enqueue_capture(capture)
+        bulk_captures.append((new_capture_uuid, url))
+
+    return render_template('bulk_captures.html', uuid=base_tree_uuid, bulk_captures=bulk_captures)
+
+
 @app.route('/tree/<string:tree_uuid>/hide', methods=['GET'])
 @flask_login.login_required
 def hide_capture(tree_uuid: str):
@@ -717,6 +741,8 @@ def urlnode_response_cookies(tree_uuid: str, node_uuid: str):

 @app.route('/tree/<string:tree_uuid>/url/<string:node_uuid>/urls_in_rendered_content', methods=['GET'])
 def urlnode_urls_in_rendered_content(tree_uuid: str, node_uuid: str):
+    # Note: we could simplify this with lookyloo.get_urls_rendered_page, but if at some
+    # point we have multiple pages rendered in one tree, it would be a problem.
     ct = lookyloo.get_crawled_tree(tree_uuid)
     urlnode = ct.root_hartree.get_url_node_by_uuid(node_uuid)
     if not urlnode.rendered_html:
diff --git a/website/web/templates/bulk_captures.html b/website/web/templates/bulk_captures.html
new file mode 100644
index 00000000..1e5d22ac
--- /dev/null
+++ b/website/web/templates/bulk_captures.html
@@ -0,0 +1,52 @@
+{% extends "main.html" %}
+
+{% from 'bootstrap/utils.html' import render_messages %}
+
+{% block title %}Captures{% endblock %}
+
+{% block scripts %}
+{{ super() }}
+{% endblock %}
+
+{% block styles %}
+{{ super() }}
+{% endblock %}
+
+
+{% block content %}
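+<!-- Each selected URL is re-captured with the cookies of the original capture,
+     so the session is preserved in the new captures. -->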
+<div class="container">
+  <h4>Ongoing captures</h4>
+  <p>
+    The captures below are queued; it will take a few minutes before the links work.
+  </p>
+  <table class="table">
+    <thead>
+      <tr>
+        <th>URL</th>
+        <th>Link</th>
+      </tr>
+    </thead>
+    <tbody>
+      {% for uuid, captured_url in bulk_captures %}
+      <tr>
+        <td>{{ captured_url }}</td>
+        <td><a href="{{ url_for('tree', tree_uuid=uuid) }}">Show capture</a></td>
+      </tr>
+      {% endfor %}
+    </tbody>
+  </table>
+</div>
+{% endblock %}
diff --git a/website/web/templates/tree.html b/website/web/templates/tree.html
index a53f12a9..4076f95f 100644
--- a/website/web/templates/tree.html
+++ b/website/web/templates/tree.html
@@ -72,6 +72,13 @@
         modal.find('.modal-body').load(button.data("remote"));
       });
+