chg: Refactoring of the redirects, rename report -> capture

pull/79/head
Raphaël Vinot 2020-03-26 01:56:24 +01:00
parent f26a02ef86
commit 949ad58667
6 changed files with 108 additions and 69 deletions

config/.keepdir (new, empty file)


@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
-from typing import List, Optional
+from typing import List, Optional, Dict, Union, Any
 from io import BufferedIOBase
 from pathlib import Path
 from .exceptions import MissingEnv, CreateDirectoryException
@@ -116,7 +116,7 @@ def update_user_agents():
     try:
         s = cloudscraper.create_scraper()
         r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
-    except Exception as e:
+    except Exception:
         traceback.print_exc()
         return
     soup = BeautifulSoup(r.text, 'html.parser')
@@ -165,7 +165,7 @@ def load_cookies(cookie_pseudofile: Optional[BufferedIOBase]=None) -> List[dict]
                       'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
                       'domain': u,
                       'value': cookie['Content raw']
                       }
            to_return.append(to_add)
    except Exception as e:
        print(f'Unable to load the cookie file: {e}')
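
The load_cookies hunk above only shows the tail of the cookie dict built from an exported cookie file. As a rough sketch of the visible fields (the helper name and anything not shown in the hunk are assumptions, not part of this commit):

from datetime import datetime, timedelta

def cookie_entry(domain: str, raw_value: str) -> dict:
    # Mirrors only the fields visible in the hunk: a 10-day expiry in ISO 8601
    # with a trailing 'Z', the raw domain, and the raw cookie content.
    return {
        'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
        'domain': domain,
        'value': raw_value,
    }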


@@ -24,7 +24,7 @@ from .helpers import get_homedir, get_socket_path, load_cookies
 from .exceptions import NoValidHarFile
 from redis import Redis
-from typing import Union, Dict, List, Tuple, Optional
+from typing import Union, Dict, List, Tuple, Optional, Any
 import logging
@@ -60,76 +60,93 @@ class Lookyloo():
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(loglevel)
 
-    def _set_report_cache(self, report_dir: Path) -> None:
-        if self.redis.exists(str(report_dir)):
+    def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:
+        if force or not self.redis.exists(str(capture_dir)):
+            # (re)build cache
+            pass
+        else:
             return
-        with (report_dir / 'uuid').open() as f:
+        with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
-        har_files = sorted(report_dir.glob('*.har'))
+        har_files = sorted(capture_dir.glob('*.har'))
         error_cache: Dict[str, str] = {}
-        if (report_dir / 'error.txt').exists():
+        if (capture_dir / 'error.txt').exists():
             # Something went wrong
-            with (Path(report_dir) / 'error.txt').open() as _error:
-                error_cache['error'] = f'Capture in {report_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
+            with (Path(capture_dir) / 'error.txt').open() as _error:
+                error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
         elif not har_files:
-            error_cache['error'] = f'No har files in {report_dir}'
+            error_cache['error'] = f'No har files in {capture_dir}'
         if error_cache:
             self.logger.warning(error_cache['error'])
-            self.redis.hmset(str(report_dir), error_cache)
-            self.redis.hset('lookup_dirs', uuid, str(report_dir))
+            self.redis.hmset(str(capture_dir), error_cache)
+            self.redis.hset('lookup_dirs', uuid, str(capture_dir))
             return
         har = HarFile(har_files[0])
+        redirects = har.initial_redirects
+        incomplete_redirects = False
+        if redirects and har.need_tree_redirects:
+            # load tree from disk, get redirects
+            ct = self._load_pickle(capture_dir / 'tree.pickle')
+            if ct:
+                redirects = ct.redirects
+            else:
+                # Pickle not available
+                incomplete_redirects = True
         cache: Dict[str, Union[str, int]] = {'uuid': uuid,
                                              'title': har.initial_title,
                                              'timestamp': har.initial_start_time,
                                              'url': har.first_url,
-                                             'redirects': json.dumps(har.initial_redirects)}
-        if (report_dir / 'no_index').exists():  # If the folders claims anonymity
+                                             'redirects': json.dumps(redirects),
+                                             'incomplete_redirects': 1 if incomplete_redirects else 0}
+        if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
             cache['no_index'] = 1
-        if uuid and not self.redis.exists(str(report_dir)):
-            self.redis.hmset(str(report_dir), cache)
-            self.redis.hset('lookup_dirs', uuid, str(report_dir))
+        self.redis.hmset(str(capture_dir), cache)
+        self.redis.hset('lookup_dirs', uuid, str(capture_dir))
 
-    def report_cache(self, report_dir: Union[str, Path]) -> Optional[Dict[str, Union[str, int]]]:
-        if isinstance(report_dir, Path):
-            report_dir = str(report_dir)
-        cached = self.redis.hgetall(report_dir)
+    def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:
+        if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
+            # try to rebuild the cache
+            self._set_capture_cache(capture_dir, force=True)
+        cached = self.redis.hgetall(str(capture_dir))
         if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
             cached['redirects'] = json.loads(cached['redirects'])
             return cached
         elif 'error' in cached:
             return cached
         else:
-            self.logger.warning(f'Cache ({report_dir}) is invalid: {json.dumps(cached, indent=2)}')
+            self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
             return None
 
     def _init_existing_dumps(self) -> None:
-        for report_dir in self.report_dirs:
-            if report_dir.exists():
-                self._set_report_cache(report_dir)
+        for capture_dir in self.capture_dirs:
+            if capture_dir.exists():
+                self._set_capture_cache(capture_dir)
         self.redis.set('cache_loaded', 1)
 
     @property
-    def report_dirs(self) -> List[Path]:
-        for report_dir in self.scrape_dir.iterdir():
-            if report_dir.is_dir() and not report_dir.iterdir():
+    def capture_dirs(self) -> List[Path]:
+        for capture_dir in self.scrape_dir.iterdir():
+            if capture_dir.is_dir() and not capture_dir.iterdir():
                 # Cleanup self.scrape_dir of failed runs.
-                report_dir.rmdir()
-            if not (report_dir / 'uuid').exists():
+                capture_dir.rmdir()
+            if not (capture_dir / 'uuid').exists():
                 # Create uuid if missing
-                with (report_dir / 'uuid').open('w') as f:
+                with (capture_dir / 'uuid').open('w') as f:
                     f.write(str(uuid4()))
         return sorted(self.scrape_dir.iterdir(), reverse=True)
 
-    def lookup_report_dir(self, uuid) -> Union[Path, None]:
-        report_dir = self.redis.hget('lookup_dirs', uuid)
-        if report_dir:
-            return Path(report_dir)
+    def lookup_capture_dir(self, uuid) -> Union[Path, None]:
+        capture_dir = self.redis.hget('lookup_dirs', uuid)
+        if capture_dir:
+            return Path(capture_dir)
         return None
 
     def enqueue_scrape(self, query: dict) -> str:
@@ -152,18 +169,27 @@ class Lookyloo():
             return True
         return False
 
-    def load_tree(self, report_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
-        har_files = sorted(report_dir.glob('*.har'))
+    def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
+        if pickle_file.exists():
+            with pickle_file.open('rb') as _p:
+                return pickle.load(_p)
+        return None
+
+    def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
+        har_files = sorted(capture_dir.glob('*.har'))
+        pickle_file = capture_dir / 'tree.pickle'
         try:
             meta = {}
-            if (report_dir / 'meta').exists():
-                with open((report_dir / 'meta'), 'r') as f:
+            if (capture_dir / 'meta').exists():
+                # NOTE: Legacy, the meta file should be present
+                with open((capture_dir / 'meta'), 'r') as f:
                     meta = json.load(f)
-            ct = CrawledTree(har_files)
-            temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
-            pickle.dump(ct, temp)
-            temp.close()
-            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
+            ct = self._load_pickle(pickle_file)
+            if not ct:
+                ct = CrawledTree(har_files)
+                with pickle_file.open('wb') as _p:
+                    pickle.dump(ct, _p)
+            return pickle_file.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
         except Har2TreeError as e:
             raise NoValidHarFile(e.message)
@@ -172,8 +198,8 @@ class Lookyloo():
             if time.time() - tmpfile.stat().st_atime > 36000:
                 tmpfile.unlink()
 
-    def load_image(self, report_dir: Path) -> BytesIO:
-        with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
+    def load_image(self, capture_dir: Path) -> BytesIO:
+        with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:
             return BytesIO(f.read())
 
     def sane_js_query(self, sha512: str) -> Dict:
@@ -254,5 +280,5 @@ class Lookyloo():
             with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                 json.dump(cookies, _cookies)
-        self._set_report_cache(dirpath)
+        self._set_capture_cache(dirpath)
         return perma_uuid
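
For reference, a minimal sketch of how the refreshed cache flow can be consumed: each capture is a Redis hash keyed by its directory, 'redirects' is stored as a JSON list, and 'incomplete_redirects' marks captures whose full chain only becomes available once tree.pickle exists. The standalone helper below is an illustration only; the Redis connection and error handling are simplified assumptions, and ct.redirects comes from har2tree's CrawledTree as used in _set_capture_cache above.

import json
import pickle
from pathlib import Path
from typing import List

from redis import Redis

redis = Redis(decode_responses=True)  # assumed local instance, for illustration

def cached_redirects(capture_dir: Path) -> List[str]:
    # If the HAR alone did not expose the full chain, the cache entry is flagged
    # as incomplete; once tree.pickle exists (e.g. after /cache_tree/<uuid> was
    # visited), the redirect list can be refreshed from the pickled tree.
    if redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
        pickle_file = capture_dir / 'tree.pickle'
        if pickle_file.exists():
            with pickle_file.open('rb') as _p:
                ct = pickle.load(_p)
            redis.hmset(str(capture_dir), {'redirects': json.dumps(ct.redirects),
                                           'incomplete_redirects': 0})
    cached = redis.hgetall(str(capture_dir))
    return json.loads(cached.get('redirects', '[]'))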

poetry.lock (generated)

@@ -251,7 +251,7 @@ publicsuffix2 = "^2.20191221"
 six = "^1.14.0"
 [package.source]
-reference = "7656d60b3a200285205f991f6646d9a1e366c7cf"
+reference = "02c7e19229c33a62bfabbc3a1981f0401d7b3a71"
 type = "git"
 url = "https://github.com/viper-framework/har2tree.git"
 [[package]]
@@ -701,7 +701,7 @@ scrapy = "^1.8.0"
 scrapy-splash = "^0.7.2"
 [package.source]
-reference = "383fafad20a111e02fa53dc639b4cc53c7b8456c"
+reference = "bf18e5e1c88c9263b90a69348e0020ceccf2aa12"
 type = "git"
 url = "https://github.com/viper-framework/ScrapySplashWrapper.git"
 [[package]]
@@ -795,7 +795,7 @@ python-versions = "*"
 version = "1.4.1"
 [[package]]
-category = "dev"
+category = "main"
 description = "Backported and Experimental Type Hints for Python 3.5+"
 name = "typing-extensions"
 optional = false


@@ -47,9 +47,9 @@ lookyloo: Lookyloo = Lookyloo(splash_url=splash_url, loglevel=loglevel, only_glo
 # keep
-def load_tree(report_dir: Path) -> Tuple[dict, str, str, str, dict]:
+def load_tree(capture_dir: Path) -> Tuple[dict, str, str, str, dict]:
     session.clear()
-    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir)
+    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(capture_dir)
     session["tree"] = temp_file_name
     return tree_json, tree_time, tree_ua, tree_root_url, meta
@@ -137,20 +137,20 @@ def urlnode_details(node_uuid):
 @app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
 def image(tree_uuid):
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         return Response('Not available.', mimetype='text/text')
-    to_return = lookyloo.load_image(report_dir)
+    to_return = lookyloo.load_image(capture_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')
 
 @app.route('/redirects/<string:tree_uuid>', methods=['GET'])
 def redirects(tree_uuid):
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         return Response('Not available.', mimetype='text/text')
-    cache = lookyloo.report_cache(report_dir)
+    cache = lookyloo.capture_cache(capture_dir)
     if not cache['redirects']:
         return Response('No redirects.', mimetype='text/text')
     to_return = BytesIO('\n'.join(cache['redirects']).encode())
@@ -158,23 +158,31 @@ def redirects(tree_uuid):
                      as_attachment=True, attachment_filename='redirects.txt')
 
+@app.route('/cache_tree/<string:tree_uuid>', methods=['GET'])
+def cache_tree(tree_uuid):
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if capture_dir:
+        lookyloo.load_tree(capture_dir)
+    return redirect(url_for('index'))
+
 @app.route('/tree/<string:tree_uuid>', methods=['GET'])
 def tree(tree_uuid):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         flash(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.', 'error')
         return redirect(url_for('index'))
-    cache = lookyloo.report_cache(report_dir)
+    cache = lookyloo.capture_cache(capture_dir)
     if 'error' in cache:
         flash(cache['error'], 'error')
         return redirect(url_for('index'))
     try:
-        tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir)
+        tree_json, start_time, user_agent, root_url, meta = load_tree(capture_dir)
         return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                                user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
                                meta=meta)
@@ -190,12 +198,13 @@ def index():
     lookyloo.cleanup_old_tmpfiles()
     update_user_agents()
     titles = []
-    for report_dir in lookyloo.report_dirs:
-        cached = lookyloo.report_cache(report_dir)
+    for capture_dir in lookyloo.capture_dirs:
+        cached = lookyloo.capture_cache(capture_dir)
         if not cached or 'no_index' in cached or 'error' in cached:
             continue
         date, time = cached['timestamp'].split('T')
         time, _ = time.split('.', 1)
-        titles.append((cached['uuid'], cached['title'], date, time, cached['url'], cached['redirects']))
+        titles.append((cached['uuid'], cached['title'], date, time, cached['url'],
+                       cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
     titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
     return render_template('index.html', titles=titles)
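
The index route now threads an incomplete_redirects flag through to the template, which is what lets index.html below offer a "build the tree" link instead of the redirects download. As a small illustration of one row in titles (a hypothetical helper, not part of the commit; field order follows the hunk above):

def build_title_row(cached: dict) -> tuple:
    # cached is the dict returned by Lookyloo.capture_cache(); the timestamp is
    # an ISO 8601 string with a fractional-second part.
    date, time = cached['timestamp'].split('T')
    time, _ = time.split('.', 1)
    return (cached['uuid'], cached['title'], date, time, cached['url'],
            cached['redirects'], cached['incomplete_redirects'] == '1')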


@@ -28,7 +28,7 @@
       </tr>
     </thead>
     <tbody>
-    {% for uuid, page_title, date, time, url, redirects in titles %}
+    {% for uuid, page_title, date, time, url, redirects, incomplete_redirects in titles %}
     <tr>
       <td>
         <p title="{{ page_title }}"><a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></p>
@@ -46,7 +46,11 @@
         {%endif%}
         </p>
         {% endfor %}
-        <a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
+        {% if incomplete_redirects %}
+        <a style="float: right;" href="{{ url_for('cache_tree', tree_uuid=uuid) }}">Unable to find the redirects, click here to build the tree</a>
+        {%else%}
+        <a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
+        {%endif%}
         {% else%}
         No redirect
         {%endif%}