From 949ad58667e8f9fa604994be58dc80600028fb28 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Thu, 26 Mar 2020 01:56:24 +0100
Subject: [PATCH] chg: Refactoring of the redirects, rename report -> capture

---
 config/.keepdir                  |   0
 lookyloo/helpers.py              |   6 +-
 lookyloo/lookyloo.py             | 118 +++++++++++++++++++------------
 poetry.lock                      |   6 +-
 website/web/__init__.py          |  39 ++++++----
 website/web/templates/index.html |   8 ++-
 6 files changed, 108 insertions(+), 69 deletions(-)
 create mode 100644 config/.keepdir

diff --git a/config/.keepdir b/config/.keepdir
new file mode 100644
index 0000000..e69de29
diff --git a/lookyloo/helpers.py b/lookyloo/helpers.py
index 83d027e..f643678 100644
--- a/lookyloo/helpers.py
+++ b/lookyloo/helpers.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
-from typing import List, Optional
+from typing import List, Optional, Dict, Union, Any
 from io import BufferedIOBase
 from pathlib import Path
 from .exceptions import MissingEnv, CreateDirectoryException
@@ -116,7 +116,7 @@ def update_user_agents():
     try:
         s = cloudscraper.create_scraper()
         r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
-    except Exception as e:
+    except Exception:
         traceback.print_exc()
         return
     soup = BeautifulSoup(r.text, 'html.parser')
@@ -165,7 +165,7 @@ def load_cookies(cookie_pseudofile: Optional[BufferedIOBase]=None) -> List[dict]
                 'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
                 'domain': u,
                 'value': cookie['Content raw']
-            }
+                }
             to_return.append(to_add)
     except Exception as e:
         print(f'Unable to load the cookie file: {e}')
diff --git a/lookyloo/lookyloo.py b/lookyloo/lookyloo.py
index d900b69..e9a1f2b 100644
--- a/lookyloo/lookyloo.py
+++ b/lookyloo/lookyloo.py
@@ -24,7 +24,7 @@
 from .helpers import get_homedir, get_socket_path, load_cookies
 from .exceptions import NoValidHarFile
 from redis import Redis

-from typing import Union, Dict, List, Tuple, Optional
+from typing import Union, Dict, List, Tuple, Optional, Any

 import logging
@@ -60,76 +60,93 @@ class Lookyloo():
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(loglevel)

-    def _set_report_cache(self, report_dir: Path) -> None:
-        if self.redis.exists(str(report_dir)):
+    def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:
+        if force or not self.redis.exists(str(capture_dir)):
+            # (re)build cache
+            pass
+        else:
             return
-        with (report_dir / 'uuid').open() as f:
+
+        with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
-        har_files = sorted(report_dir.glob('*.har'))
+        har_files = sorted(capture_dir.glob('*.har'))
         error_cache: Dict[str, str] = {}
-        if (report_dir / 'error.txt').exists():
+        if (capture_dir / 'error.txt').exists():
             # Something went wrong
-            with (Path(report_dir) / 'error.txt').open() as _error:
-                error_cache['error'] = f'Capture in {report_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
+            with (Path(capture_dir) / 'error.txt').open() as _error:
+                error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
         elif not har_files:
-            error_cache['error'] = f'No har files in {report_dir}'
+            error_cache['error'] = f'No har files in {capture_dir}'
         if error_cache:
             self.logger.warning(error_cache['error'])
-            self.redis.hmset(str(report_dir), error_cache)
-            self.redis.hset('lookup_dirs', uuid, str(report_dir))
+            self.redis.hmset(str(capture_dir), error_cache)
+            self.redis.hset('lookup_dirs', uuid, str(capture_dir))
             return

         har = HarFile(har_files[0])
+
+        redirects = har.initial_redirects
+        incomplete_redirects = False
+        if redirects and har.need_tree_redirects:
+            # load tree from disk, get redirects
+            ct = self._load_pickle(capture_dir / 'tree.pickle')
+            if ct:
+                redirects = ct.redirects
+            else:
+                # Pickle not available
+                incomplete_redirects = True
+
         cache: Dict[str, Union[str, int]] = {'uuid': uuid,
                                              'title': har.initial_title,
                                              'timestamp': har.initial_start_time,
                                              'url': har.first_url,
-                                             'redirects': json.dumps(har.initial_redirects)}
-        if (report_dir / 'no_index').exists():  # If the folders claims anonymity
+                                             'redirects': json.dumps(redirects),
+                                             'incomplete_redirects': 1 if incomplete_redirects else 0}
+        if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
             cache['no_index'] = 1
-        if uuid and not self.redis.exists(str(report_dir)):
-            self.redis.hmset(str(report_dir), cache)
-            self.redis.hset('lookup_dirs', uuid, str(report_dir))
-
-    def report_cache(self, report_dir: Union[str, Path]) -> Optional[Dict[str, Union[str, int]]]:
-        if isinstance(report_dir, Path):
-            report_dir = str(report_dir)
-        cached = self.redis.hgetall(report_dir)
+        self.redis.hmset(str(capture_dir), cache)
+        self.redis.hset('lookup_dirs', uuid, str(capture_dir))
+
+    def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:
+        if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
+            # try to rebuild the cache
+            self._set_capture_cache(capture_dir, force=True)
+        cached = self.redis.hgetall(str(capture_dir))
         if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
             cached['redirects'] = json.loads(cached['redirects'])
             return cached
         elif 'error' in cached:
             return cached
         else:
-            self.logger.warning(f'Cache ({report_dir}) is invalid: {json.dumps(cached, indent=2)}')
+            self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
             return None

     def _init_existing_dumps(self) -> None:
-        for report_dir in self.report_dirs:
-            if report_dir.exists():
-                self._set_report_cache(report_dir)
+        for capture_dir in self.capture_dirs:
+            if capture_dir.exists():
+                self._set_capture_cache(capture_dir)
         self.redis.set('cache_loaded', 1)

     @property
-    def report_dirs(self) -> List[Path]:
-        for report_dir in self.scrape_dir.iterdir():
-            if report_dir.is_dir() and not report_dir.iterdir():
+    def capture_dirs(self) -> List[Path]:
+        for capture_dir in self.scrape_dir.iterdir():
+            if capture_dir.is_dir() and not capture_dir.iterdir():
                 # Cleanup self.scrape_dir of failed runs.
-                report_dir.rmdir()
-            if not (report_dir / 'uuid').exists():
+                capture_dir.rmdir()
+            if not (capture_dir / 'uuid').exists():
                 # Create uuid if missing
-                with (report_dir / 'uuid').open('w') as f:
+                with (capture_dir / 'uuid').open('w') as f:
                     f.write(str(uuid4()))
         return sorted(self.scrape_dir.iterdir(), reverse=True)

-    def lookup_report_dir(self, uuid) -> Union[Path, None]:
-        report_dir = self.redis.hget('lookup_dirs', uuid)
-        if report_dir:
-            return Path(report_dir)
+    def lookup_capture_dir(self, uuid) -> Union[Path, None]:
+        capture_dir = self.redis.hget('lookup_dirs', uuid)
+        if capture_dir:
+            return Path(capture_dir)
         return None

     def enqueue_scrape(self, query: dict) -> str:
@@ -152,18 +169,27 @@ class Lookyloo():
             return True
         return False

-    def load_tree(self, report_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
-        har_files = sorted(report_dir.glob('*.har'))
+    def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
+        if pickle_file.exists():
+            with pickle_file.open('rb') as _p:
+                return pickle.load(_p)
+        return None
+
+    def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
+        har_files = sorted(capture_dir.glob('*.har'))
+        pickle_file = capture_dir / 'tree.pickle'
         try:
             meta = {}
-            if (report_dir / 'meta').exists():
-                with open((report_dir / 'meta'), 'r') as f:
+            if (capture_dir / 'meta').exists():
+                # NOTE: Legacy, the meta file should be present
+                with open((capture_dir / 'meta'), 'r') as f:
                     meta = json.load(f)
-            ct = CrawledTree(har_files)
-            temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
-            pickle.dump(ct, temp)
-            temp.close()
-            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
+            ct = self._load_pickle(pickle_file)
+            if not ct:
+                ct = CrawledTree(har_files)
+                with pickle_file.open('wb') as _p:
+                    pickle.dump(ct, _p)
+            return pickle_file.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
         except Har2TreeError as e:
             raise NoValidHarFile(e.message)
@@ -172,8 +198,8 @@
             if time.time() - tmpfile.stat().st_atime > 36000:
                 tmpfile.unlink()

-    def load_image(self, report_dir: Path) -> BytesIO:
-        with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
+    def load_image(self, capture_dir: Path) -> BytesIO:
+        with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:
             return BytesIO(f.read())

     def sane_js_query(self, sha512: str) -> Dict:
@@ -254,5 +280,5 @@ class Lookyloo():
         with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
             json.dump(cookies, _cookies)

-        self._set_report_cache(dirpath)
+        self._set_capture_cache(dirpath)
         return perma_uuid
diff --git a/poetry.lock b/poetry.lock
index d1ca31f..947d9a2 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -251,7 +251,7 @@
 publicsuffix2 = "^2.20191221"
 six = "^1.14.0"
 [package.source]
-reference = "7656d60b3a200285205f991f6646d9a1e366c7cf"
+reference = "02c7e19229c33a62bfabbc3a1981f0401d7b3a71"
 type = "git"
 url = "https://github.com/viper-framework/har2tree.git"
 [[package]]
@@ -701,7 +701,7 @@
 scrapy = "^1.8.0"
 scrapy-splash = "^0.7.2"
 [package.source]
-reference = "383fafad20a111e02fa53dc639b4cc53c7b8456c"
+reference = "bf18e5e1c88c9263b90a69348e0020ceccf2aa12"
 type = "git"
 url = "https://github.com/viper-framework/ScrapySplashWrapper.git"
 [[package]]
@@ -795,7 +795,7 @@
 python-versions = "*"
 version = "1.4.1"
 [[package]]
-category = "dev"
+category = "main"
 description = "Backported and Experimental Type Hints for Python 3.5+"
 name = "typing-extensions"
 optional = false
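A note on the cache flow implemented above: _set_capture_cache() now records whether the redirect chain could be fully resolved from the HAR alone. When har.need_tree_redirects is true (a property from the har2tree bump above) and no tree.pickle exists yet, the Redis entry is written with incomplete_redirects = 1, and capture_cache() forces a rebuild on the next read, once the tree has been pickled. A minimal standalone sketch of that flow, not the Lookyloo class itself; the socket path and the extract_redirects() helper are made up for illustration (Lookyloo connects through get_socket_path()):

    import json
    from pathlib import Path
    from typing import List, Tuple

    from redis import Redis

    redis = Redis(unix_socket_path='cache.sock', decode_responses=True)  # assumed connection

    def extract_redirects(capture_dir: Path) -> Tuple[List[str], bool]:
        # Hypothetical stand-in for the HAR/pickle logic in _set_capture_cache().
        if (capture_dir / 'tree.pickle').exists():
            return ['http://example.com/', 'https://www.example.com/'], True  # dummy chain
        return [], False  # chain unknown until the tree is built

    def set_capture_cache(capture_dir: Path, force: bool = False) -> None:
        if not force and redis.exists(str(capture_dir)):
            return
        redirects, complete = extract_redirects(capture_dir)
        redis.hmset(str(capture_dir), {'redirects': json.dumps(redirects),
                                       'incomplete_redirects': 0 if complete else 1})

    def capture_cache(capture_dir: Path) -> dict:
        # A capture flagged incomplete is retried on read: the pickle may have
        # appeared since the entry was written (e.g. after /cache_tree/<tree_uuid>).
        if redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
            set_capture_cache(capture_dir, force=True)
        return redis.hgetall(str(capture_dir))

In the web frontend below, this is what makes the index's "Unable to find the redirects, click here to build the tree" link work: hitting /cache_tree/<tree_uuid> builds and pickles the tree, and the next capture_cache() read repairs the entry.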
diff --git a/website/web/__init__.py b/website/web/__init__.py
index 5b3d61e..505f0bd 100644
--- a/website/web/__init__.py
+++ b/website/web/__init__.py
@@ -47,9 +47,9 @@ lookyloo: Lookyloo = Lookyloo(splash_url=splash_url, loglevel=loglevel, only_glo


 # keep
-def load_tree(report_dir: Path) -> Tuple[dict, str, str, str, dict]:
+def load_tree(capture_dir: Path) -> Tuple[dict, str, str, str, dict]:
     session.clear()
-    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir)
+    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(capture_dir)
     session["tree"] = temp_file_name
     return tree_json, tree_time, tree_ua, tree_root_url, meta

@@ -137,20 +137,20 @@ def urlnode_details(node_uuid):

 @app.route('/tree/<tree_uuid>/image', methods=['GET'])
 def image(tree_uuid):
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         return Response('Not available.', mimetype='text/text')
-    to_return = lookyloo.load_image(report_dir)
+    to_return = lookyloo.load_image(capture_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')


 @app.route('/redirects/<tree_uuid>', methods=['GET'])
 def redirects(tree_uuid):
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         return Response('Not available.', mimetype='text/text')
-    cache = lookyloo.report_cache(report_dir)
+    cache = lookyloo.capture_cache(capture_dir)
     if not cache['redirects']:
         return Response('No redirects.', mimetype='text/text')
     to_return = BytesIO('\n'.join(cache['redirects']).encode())
@@ -158,23 +158,31 @@ def redirects(tree_uuid):
                      as_attachment=True, attachment_filename='redirects.txt')


+@app.route('/cache_tree/<tree_uuid>', methods=['GET'])
+def cache_tree(tree_uuid):
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if capture_dir:
+        lookyloo.load_tree(capture_dir)
+    return redirect(url_for('index'))
+
+
 @app.route('/tree/<tree_uuid>', methods=['GET'])
 def tree(tree_uuid):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         flash(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.', 'error')
         return redirect(url_for('index'))
-    cache = lookyloo.report_cache(report_dir)
+    cache = lookyloo.capture_cache(capture_dir)
     if 'error' in cache:
         flash(cache['error'], 'error')
         return redirect(url_for('index'))
     try:
-        tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir)
+        tree_json, start_time, user_agent, root_url, meta = load_tree(capture_dir)
         return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                                user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid, meta=meta)
@@ -190,12 +198,13 @@ def index():
     lookyloo.cleanup_old_tmpfiles()
     update_user_agents()
     titles = []
-    for report_dir in lookyloo.report_dirs:
-        cached = lookyloo.report_cache(report_dir)
+    for capture_dir in lookyloo.capture_dirs:
+        cached = lookyloo.capture_cache(capture_dir)
         if not cached or 'no_index' in cached or 'error' in cached:
             continue
         date, time = cached['timestamp'].split('T')
         time, _ = time.split('.', 1)
-        titles.append((cached['uuid'], cached['title'], date, time, cached['url'], cached['redirects']))
+        titles.append((cached['uuid'], cached['title'], date, time, cached['url'],
+                       cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
     titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
     return render_template('index.html', titles=titles)
diff --git a/website/web/templates/index.html b/website/web/templates/index.html
index 3fa57d4..d722792 100644
--- a/website/web/templates/index.html
+++ b/website/web/templates/index.html
@@ -28,7 +28,7 @@
       </tr>
     </thead>
     <tbody>
-      {% for uuid, page_title, date, time, url, redirects in titles %}
+      {% for uuid, page_title, date, time, url, redirects, incomplete_redirects in titles %}
       <tr>
        <td><a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></td>
       <td>
@@ -46,7 +46,11 @@
           {%endif%}

         {% endfor %}
-        <a href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
+        {% if incomplete_redirects %}
+        <a href="{{ url_for('cache_tree', tree_uuid=uuid) }}">Unable to find the redirects, click here to build the tree</a>
+        {%else%}
+        <a href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
+        {%endif%}
         {% else%}
         No redirect
         {%endif%}
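Postscript on the tree cache: load_tree() no longer dumps the CrawledTree into a throwaway NamedTemporaryFile; it pickles the tree next to the HAR files as tree.pickle and reuses it on later calls, which is what lets the new /cache_tree/<tree_uuid> endpoint and the index page recover full redirect chains cheaply. Roughly, the memoisation looks like this (standalone sketch; DummyTree is a stand-in for har2tree's CrawledTree):

    import pickle
    from pathlib import Path
    from typing import List, Optional

    class DummyTree:
        # Placeholder for har2tree.CrawledTree, only here to make the sketch runnable.
        def __init__(self, har_files: List[Path]):
            self.har_files = har_files
            self.redirects: List[str] = []

    def load_pickle(pickle_file: Path) -> Optional[DummyTree]:
        if pickle_file.exists():
            with pickle_file.open('rb') as _p:
                return pickle.load(_p)
        return None

    def load_tree(capture_dir: Path) -> DummyTree:
        pickle_file = capture_dir / 'tree.pickle'
        ct = load_pickle(pickle_file)
        if not ct:
            # First access: build the tree once and persist it next to the capture,
            # so later cache rebuilds can read the redirect chain from disk.
            ct = DummyTree(sorted(capture_dir.glob('*.har')))
            with pickle_file.open('wb') as _p:
                pickle.dump(ct, _p)
        return ct

Note that the pickle lives in the capture directory itself, so it persists across runs instead of being aged out with the temporary files by cleanup_old_tmpfiles().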