From 0a9003f58e0f2aeee25f0405f331dd60c6d86a0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Thu, 8 Dec 2022 11:57:45 +0100
Subject: [PATCH] chg: Use cache whenever possible

---
 lookyloo/lookyloo.py    |  2 +-
 website/web/__init__.py | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index 8d4e2ce..06ace52 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -702,7 +702,7 @@ class Lookyloo():
         '''Get all the files related to this capture.'''
         return self._get_raw(capture_uuid)
 
-    def get_urls_rendered_page(self, capture_uuid: str, /):
+    def get_urls_rendered_page(self, capture_uuid: str, /) -> List[str]:
         ct = self.get_crawled_tree(capture_uuid)
         return sorted(set(ct.root_hartree.rendered_node.urls_in_rendered_page)
                       - set(ct.root_hartree.all_url_requests.keys()))
diff --git a/website/web/__init__.py b/website/web/__init__.py
index dd9e5ce..caa8fc9 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -578,15 +578,17 @@ def bulk_captures(base_tree_uuid: str):
     user = src_request_ip(request)
     selected_urls = request.form.getlist('url')
     urls = lookyloo.get_urls_rendered_page(base_tree_uuid)
-    ct = lookyloo.get_crawled_tree(base_tree_uuid)
     cache = lookyloo.capture_cache(base_tree_uuid)
+    if not cache:
+        flash(f'Unable to find capture {base_tree_uuid} in cache.', 'error')
+        return redirect(url_for('tree', tree_uuid=base_tree_uuid))
     cookies = load_cookies(lookyloo.get_cookies(base_tree_uuid))
     bulk_captures = []
     for url in [urls[int(selected_id) - 1] for selected_id in selected_urls]:
         capture = {'url': url,
                    'cookies': cookies,
-                   'referer': ct.redirects[-1] if ct.redirects else ct.root_url,
-                   'user_agent': ct.user_agent,
+                   'referer': cache.redirects[-1] if cache.redirects else cache.url,
+                   'user_agent': cache.user_agent,
                    'parent': base_tree_uuid,
                    'listing': False if cache and cache.no_index else True
                    }
@@ -1092,11 +1094,11 @@ def urlnode_response_cookies(tree_uuid: str, node_uuid: str):
 def urlnode_urls_in_rendered_content(tree_uuid: str, node_uuid: str):
     # Note: we could simplify it with lookyloo.get_urls_rendered_page, but if at somepoint,
     # we have multiple page rendered on one tree, it will be a problem.
-    ct = lookyloo.get_crawled_tree(tree_uuid)
-    urlnode = ct.root_hartree.get_url_node_by_uuid(node_uuid)
-    if not urlnode.rendered_html:
+    urlnode = lookyloo.get_urlnode_from_tree(tree_uuid, node_uuid)
+    if not hasattr(urlnode, 'rendered_html') or not urlnode.rendered_html:
         return
 
+    ct = lookyloo.get_crawled_tree(tree_uuid)
     not_loaded_urls = sorted(set(urlnode.urls_in_rendered_page)
                              - set(ct.root_hartree.all_url_requests.keys()))
     to_return = StringIO()
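
The get_urls_rendered_page() helper annotated above answers the question
"which URLs appear in the rendered page but were never fetched during the
capture?" with a plain set difference over the HAR tree. A minimal
standalone sketch of that logic, with hypothetical sample data standing in
for the real ct.root_hartree contents:

    from typing import List

    def urls_never_loaded(urls_in_rendered_page: List[str],
                          all_url_requests: List[str]) -> List[str]:
        # URLs referenced by the rendered DOM minus URLs actually requested,
        # sorted for a stable listing (same shape as the annotated helper).
        return sorted(set(urls_in_rendered_page) - set(all_url_requests))

    # Hypothetical sample data.
    rendered = ['https://a.example/x', 'https://b.example/y']
    requested = ['https://a.example/x']
    print(urls_never_loaded(rendered, requested))  # ['https://b.example/y']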
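
The common thread in the website changes is serving request metadata from
the capture cache instead of deserializing the whole crawled tree on every
request. A minimal sketch of the guard-then-use pattern from the patched
bulk_captures() view, assuming a CaptureCache object exposing the url,
redirects, user_agent and no_index attributes read in the diff;
build_bulk_capture() itself is a hypothetical helper, not part of Lookyloo:

    from typing import List, Optional

    class CaptureCache:
        # Stand-in for Lookyloo's cached capture metadata; the attribute
        # names are the ones the diff reads.
        def __init__(self, url: str, redirects: List[str],
                     user_agent: str, no_index: bool) -> None:
            self.url = url
            self.redirects = redirects
            self.user_agent = user_agent
            self.no_index = no_index

    def build_bulk_capture(cache: Optional[CaptureCache], target_url: str,
                           parent_uuid: str) -> Optional[dict]:
        # Build one bulk-capture request from cached metadata alone; None
        # mirrors the early flash-and-redirect in the patched view.
        if not cache:
            return None
        return {'url': target_url,
                # Last hop of the redirect chain if any, else the landing URL.
                'referer': cache.redirects[-1] if cache.redirects else cache.url,
                'user_agent': cache.user_agent,
                'parent': parent_uuid,
                # Equivalent to the diff's expression once cache is truthy.
                'listing': not cache.no_index}

    cache = CaptureCache('https://example.com', [], 'Mozilla/5.0', no_index=False)
    print(build_bulk_capture(cache, 'https://example.com/page', 'some-uuid'))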