chg: Refactoring of the redirects, rename report -> capture

pull/79/head
Raphaël Vinot 2020-03-26 01:56:24 +01:00
parent f26a02ef86
commit 949ad58667
6 changed files with 108 additions and 69 deletions
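The gist of the change: a capture's HAR may not contain the full redirect chain, so the cache entry now records an incomplete_redirects flag and the chain can be filled in later from the pickled tree. A minimal sketch of that rule follows; this is not the project's code, only an illustration using the attribute names that appear in the diff below (initial_redirects, need_tree_redirects, redirects):

    import json

    def summarise_redirects(har, tree):
        # Prefer the redirects seen in the HAR; fall back to the tree when the
        # HAR alone is not enough; flag the entry when neither is complete yet.
        redirects = har.initial_redirects
        incomplete = False
        if redirects and har.need_tree_redirects:
            if tree is not None:      # tree.pickle was available on disk
                redirects = tree.redirects
            else:                     # tree not built yet: remember to retry
                incomplete = True
        return {'redirects': json.dumps(redirects),
                'incomplete_redirects': 1 if incomplete else 0}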

config/.keepdir (new, empty file)


@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from typing import List, Optional
from typing import List, Optional, Dict, Union, Any
from io import BufferedIOBase
from pathlib import Path
from .exceptions import MissingEnv, CreateDirectoryException
@@ -116,7 +116,7 @@ def update_user_agents():
try:
s = cloudscraper.create_scraper()
r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
except Exception as e:
except Exception:
traceback.print_exc()
return
soup = BeautifulSoup(r.text, 'html.parser')
@@ -165,7 +165,7 @@ def load_cookies(cookie_pseudofile: Optional[BufferedIOBase]=None) -> List[dict]
'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
'domain': u,
'value': cookie['Content raw']
}
}
to_return.append(to_add)
except Exception as e:
print(f'Unable to load the cookie file: {e}')


@@ -24,7 +24,7 @@ from .helpers import get_homedir, get_socket_path, load_cookies
from .exceptions import NoValidHarFile
from redis import Redis
from typing import Union, Dict, List, Tuple, Optional
from typing import Union, Dict, List, Tuple, Optional, Any
import logging
@@ -60,76 +60,93 @@ class Lookyloo():
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(loglevel)
def _set_report_cache(self, report_dir: Path) -> None:
if self.redis.exists(str(report_dir)):
def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:
if force or not self.redis.exists(str(capture_dir)):
# (re)build cache
pass
else:
return
with (report_dir / 'uuid').open() as f:
with (capture_dir / 'uuid').open() as f:
uuid = f.read().strip()
har_files = sorted(report_dir.glob('*.har'))
har_files = sorted(capture_dir.glob('*.har'))
error_cache: Dict[str, str] = {}
if (report_dir / 'error.txt').exists():
if (capture_dir / 'error.txt').exists():
# Something went wrong
with (Path(report_dir) / 'error.txt').open() as _error:
error_cache['error'] = f'Capture in {report_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
with (Path(capture_dir) / 'error.txt').open() as _error:
error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
elif not har_files:
error_cache['error'] = f'No har files in {report_dir}'
error_cache['error'] = f'No har files in {capture_dir}'
if error_cache:
self.logger.warning(error_cache['error'])
self.redis.hmset(str(report_dir), error_cache)
self.redis.hset('lookup_dirs', uuid, str(report_dir))
self.redis.hmset(str(capture_dir), error_cache)
self.redis.hset('lookup_dirs', uuid, str(capture_dir))
return
har = HarFile(har_files[0])
redirects = har.initial_redirects
incomplete_redirects = False
if redirects and har.need_tree_redirects:
# load tree from disk, get redirects
ct = self._load_pickle(capture_dir / 'tree.pickle')
if ct:
redirects = ct.redirects
else:
# Pickle not available
incomplete_redirects = True
cache: Dict[str, Union[str, int]] = {'uuid': uuid,
'title': har.initial_title,
'timestamp': har.initial_start_time,
'url': har.first_url,
'redirects': json.dumps(har.initial_redirects)}
if (report_dir / 'no_index').exists(): # If the folders claims anonymity
'redirects': json.dumps(redirects),
'incomplete_redirects': 1 if incomplete_redirects else 0}
if (capture_dir / 'no_index').exists(): # If the folders claims anonymity
cache['no_index'] = 1
if uuid and not self.redis.exists(str(report_dir)):
self.redis.hmset(str(report_dir), cache)
self.redis.hset('lookup_dirs', uuid, str(report_dir))
def report_cache(self, report_dir: Union[str, Path]) -> Optional[Dict[str, Union[str, int]]]:
if isinstance(report_dir, Path):
report_dir = str(report_dir)
cached = self.redis.hgetall(report_dir)
self.redis.hmset(str(capture_dir), cache)
self.redis.hset('lookup_dirs', uuid, str(capture_dir))
def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:
if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
# try to rebuild the cache
self._set_capture_cache(capture_dir, force=True)
cached = self.redis.hgetall(str(capture_dir))
if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
cached['redirects'] = json.loads(cached['redirects'])
return cached
elif 'error' in cached:
return cached
else:
self.logger.warning(f'Cache ({report_dir}) is invalid: {json.dumps(cached, indent=2)}')
self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
return None
def _init_existing_dumps(self) -> None:
for report_dir in self.report_dirs:
if report_dir.exists():
self._set_report_cache(report_dir)
for capture_dir in self.capture_dirs:
if capture_dir.exists():
self._set_capture_cache(capture_dir)
self.redis.set('cache_loaded', 1)
@property
def report_dirs(self) -> List[Path]:
for report_dir in self.scrape_dir.iterdir():
if report_dir.is_dir() and not report_dir.iterdir():
def capture_dirs(self) -> List[Path]:
for capture_dir in self.scrape_dir.iterdir():
if capture_dir.is_dir() and not capture_dir.iterdir():
# Cleanup self.scrape_dir of failed runs.
report_dir.rmdir()
if not (report_dir / 'uuid').exists():
capture_dir.rmdir()
if not (capture_dir / 'uuid').exists():
# Create uuid if missing
with (report_dir / 'uuid').open('w') as f:
with (capture_dir / 'uuid').open('w') as f:
f.write(str(uuid4()))
return sorted(self.scrape_dir.iterdir(), reverse=True)
def lookup_report_dir(self, uuid) -> Union[Path, None]:
report_dir = self.redis.hget('lookup_dirs', uuid)
if report_dir:
return Path(report_dir)
def lookup_capture_dir(self, uuid) -> Union[Path, None]:
capture_dir = self.redis.hget('lookup_dirs', uuid)
if capture_dir:
return Path(capture_dir)
return None
def enqueue_scrape(self, query: dict) -> str:
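A hypothetical driver showing how the new flag is meant to be consumed, mirroring the cache_tree route added further down. It assumes a running Redis, existing capture directories, and default values for the constructor arguments other than the (illustrative) splash URL; the import path is not shown in this diff and is assumed:

    from lookyloo.lookyloo import Lookyloo  # import path assumed

    lookyloo = Lookyloo(splash_url='http://127.0.0.1:8050')  # illustrative URL
    for capture_dir in lookyloo.capture_dirs:
        cached = lookyloo.capture_cache(capture_dir)
        if cached and cached.get('incomplete_redirects') == '1':
            # load_tree() writes tree.pickle into the capture directory, so the
            # next capture_cache() call force-rebuilds with the full redirects.
            lookyloo.load_tree(capture_dir)
            cached = lookyloo.capture_cache(capture_dir)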
@@ -152,18 +169,27 @@ class Lookyloo():
return True
return False
def load_tree(self, report_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
har_files = sorted(report_dir.glob('*.har'))
def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
if pickle_file.exists():
with pickle_file.open('rb') as _p:
return pickle.load(_p)
return None
def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
har_files = sorted(capture_dir.glob('*.har'))
pickle_file = capture_dir / 'tree.pickle'
try:
meta = {}
if (report_dir / 'meta').exists():
with open((report_dir / 'meta'), 'r') as f:
if (capture_dir / 'meta').exists():
# NOTE: Legacy, the meta file should be present
with open((capture_dir / 'meta'), 'r') as f:
meta = json.load(f)
ct = CrawledTree(har_files)
temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
pickle.dump(ct, temp)
temp.close()
return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
ct = self._load_pickle(pickle_file)
if not ct:
ct = CrawledTree(har_files)
with pickle_file.open('wb') as _p:
pickle.dump(ct, _p)
return pickle_file.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
except Har2TreeError as e:
raise NoValidHarFile(e.message)
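The tree is no longer dumped into a NamedTemporaryFile: load_tree now pickles it next to the HAR files as tree.pickle, so later calls and the cache rebuild above can reuse it. A simplified sketch of that round-trip, assuming har2tree's CrawledTree constructed from a list of HAR files as in the diff (error handling omitted):

    import pickle
    from pathlib import Path
    from har2tree import CrawledTree

    def load_or_build_tree(capture_dir: Path) -> CrawledTree:
        pickle_file = capture_dir / 'tree.pickle'
        if pickle_file.exists():
            with pickle_file.open('rb') as _p:
                return pickle.load(_p)   # reuse the cached tree
        ct = CrawledTree(sorted(capture_dir.glob('*.har')))
        with pickle_file.open('wb') as _p:
            pickle.dump(ct, _p)          # cache it for later calls and rebuilds
        return ct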
@@ -172,8 +198,8 @@ class Lookyloo():
if time.time() - tmpfile.stat().st_atime > 36000:
tmpfile.unlink()
def load_image(self, report_dir: Path) -> BytesIO:
with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
def load_image(self, capture_dir: Path) -> BytesIO:
with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:
return BytesIO(f.read())
def sane_js_query(self, sha512: str) -> Dict:
@@ -254,5 +280,5 @@ class Lookyloo():
with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
json.dump(cookies, _cookies)
self._set_report_cache(dirpath)
self._set_capture_cache(dirpath)
return perma_uuid

poetry.lock (generated, 6 lines changed)

@@ -251,7 +251,7 @@ publicsuffix2 = "^2.20191221"
six = "^1.14.0"
[package.source]
reference = "7656d60b3a200285205f991f6646d9a1e366c7cf"
reference = "02c7e19229c33a62bfabbc3a1981f0401d7b3a71"
type = "git"
url = "https://github.com/viper-framework/har2tree.git"
[[package]]
@@ -701,7 +701,7 @@ scrapy = "^1.8.0"
scrapy-splash = "^0.7.2"
[package.source]
reference = "383fafad20a111e02fa53dc639b4cc53c7b8456c"
reference = "bf18e5e1c88c9263b90a69348e0020ceccf2aa12"
type = "git"
url = "https://github.com/viper-framework/ScrapySplashWrapper.git"
[[package]]
@@ -795,7 +795,7 @@ python-versions = "*"
version = "1.4.1"
[[package]]
category = "dev"
category = "main"
description = "Backported and Experimental Type Hints for Python 3.5+"
name = "typing-extensions"
optional = false


@@ -47,9 +47,9 @@ lookyloo: Lookyloo = Lookyloo(splash_url=splash_url, loglevel=loglevel, only_glo
# keep
def load_tree(report_dir: Path) -> Tuple[dict, str, str, str, dict]:
def load_tree(capture_dir: Path) -> Tuple[dict, str, str, str, dict]:
session.clear()
temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir)
temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(capture_dir)
session["tree"] = temp_file_name
return tree_json, tree_time, tree_ua, tree_root_url, meta
@@ -137,20 +137,20 @@ def urlnode_details(node_uuid):
@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
def image(tree_uuid):
report_dir = lookyloo.lookup_report_dir(tree_uuid)
if not report_dir:
capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
if not capture_dir:
return Response('Not available.', mimetype='text/text')
to_return = lookyloo.load_image(report_dir)
to_return = lookyloo.load_image(capture_dir)
return send_file(to_return, mimetype='image/png',
as_attachment=True, attachment_filename='image.png')
@app.route('/redirects/<string:tree_uuid>', methods=['GET'])
def redirects(tree_uuid):
report_dir = lookyloo.lookup_report_dir(tree_uuid)
if not report_dir:
capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
if not capture_dir:
return Response('Not available.', mimetype='text/text')
cache = lookyloo.report_cache(report_dir)
cache = lookyloo.capture_cache(capture_dir)
if not cache['redirects']:
return Response('No redirects.', mimetype='text/text')
to_return = BytesIO('\n'.join(cache['redirects']).encode())
@@ -158,23 +158,31 @@ def redirects(tree_uuid):
as_attachment=True, attachment_filename='redirects.txt')
@app.route('/cache_tree/<string:tree_uuid>', methods=['GET'])
def cache_tree(tree_uuid):
capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
if capture_dir:
lookyloo.load_tree(capture_dir)
return redirect(url_for('index'))
@app.route('/tree/<string:tree_uuid>', methods=['GET'])
def tree(tree_uuid):
if tree_uuid == 'False':
flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
return redirect(url_for('index'))
report_dir = lookyloo.lookup_report_dir(tree_uuid)
if not report_dir:
capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
if not capture_dir:
flash(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.', 'error')
return redirect(url_for('index'))
cache = lookyloo.report_cache(report_dir)
cache = lookyloo.capture_cache(capture_dir)
if 'error' in cache:
flash(cache['error'], 'error')
return redirect(url_for('index'))
try:
tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir)
tree_json, start_time, user_agent, root_url, meta = load_tree(capture_dir)
return render_template('tree.html', tree_json=tree_json, start_time=start_time,
user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
meta=meta)
@@ -190,12 +198,13 @@ def index():
lookyloo.cleanup_old_tmpfiles()
update_user_agents()
titles = []
for report_dir in lookyloo.report_dirs:
cached = lookyloo.report_cache(report_dir)
for capture_dir in lookyloo.capture_dirs:
cached = lookyloo.capture_cache(capture_dir)
if not cached or 'no_index' in cached or 'error' in cached:
continue
date, time = cached['timestamp'].split('T')
time, _ = time.split('.', 1)
titles.append((cached['uuid'], cached['title'], date, time, cached['url'], cached['redirects']))
titles.append((cached['uuid'], cached['title'], date, time, cached['url'],
cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
return render_template('index.html', titles=titles)


@@ -28,7 +28,7 @@
</tr>
</thead>
<tbody>
{% for uuid, page_title, date, time, url, redirects in titles %}
{% for uuid, page_title, date, time, url, redirects, incomplete_redirects in titles %}
<tr>
<td>
<p title="{{ page_title }}"><a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></p>
@@ -46,7 +46,7 @@
{%endif%}
</p>
{% endfor %}
<a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
{% if incomplete_redirects %}
<a style="float: right;" href="{{ url_for('cache_tree', tree_uuid=uuid) }}">Unable to find the redirects, click here to build the tree</a>
{%else%}
<a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
{%endif%}
{% else%}
No redirect
{%endif%}