mirror of https://github.com/CIRCL/lookyloo

chg: Refactoring of the redirects, rename report -> capture

parent f26a02ef86
commit 949ad58667
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
-from typing import List, Optional
+from typing import List, Optional, Dict, Union, Any
 from io import BufferedIOBase
 from pathlib import Path
 from .exceptions import MissingEnv, CreateDirectoryException

@@ -116,7 +116,7 @@ def update_user_agents():
     try:
         s = cloudscraper.create_scraper()
         r = s.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
-    except Exception as e:
+    except Exception:
         traceback.print_exc()
         return
     soup = BeautifulSoup(r.text, 'html.parser')

@@ -165,7 +165,7 @@ def load_cookies(cookie_pseudofile: Optional[BufferedIOBase]=None) -> List[dict]
                      'expires': (datetime.now() + timedelta(days=10)).strftime('%Y-%m-%dT%H:%M:%S') + 'Z',
                      'domain': u,
                      'value': cookie['Content raw']
-                      }
+                     }
             to_return.append(to_add)
     except Exception as e:
         print(f'Unable to load the cookie file: {e}')


@@ -24,7 +24,7 @@ from .helpers import get_homedir, get_socket_path, load_cookies
 from .exceptions import NoValidHarFile
 from redis import Redis

-from typing import Union, Dict, List, Tuple, Optional
+from typing import Union, Dict, List, Tuple, Optional, Any

 import logging

@@ -60,76 +60,93 @@ class Lookyloo():
         self.logger = logging.getLogger(f'{self.__class__.__name__}')
         self.logger.setLevel(loglevel)

-    def _set_report_cache(self, report_dir: Path) -> None:
-        if self.redis.exists(str(report_dir)):
+    def _set_capture_cache(self, capture_dir: Path, force: bool=False) -> None:
+        if force or not self.redis.exists(str(capture_dir)):
+            # (re)build cache
+            pass
+        else:
             return
-        with (report_dir / 'uuid').open() as f:
+
+        with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
-        har_files = sorted(report_dir.glob('*.har'))
+
+        har_files = sorted(capture_dir.glob('*.har'))
+
         error_cache: Dict[str, str] = {}
-        if (report_dir / 'error.txt').exists():
+        if (capture_dir / 'error.txt').exists():
             # Something went wrong
-            with (Path(report_dir) / 'error.txt').open() as _error:
-                error_cache['error'] = f'Capture in {report_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
+            with (Path(capture_dir) / 'error.txt').open() as _error:
+                error_cache['error'] = f'Capture in {capture_dir} has an error: {_error.read()}, see https://splash.readthedocs.io/en/stable/scripting-ref.html#splash-go and https://doc.qt.io/qt-5/qnetworkreply.html#NetworkError-enum'
         elif not har_files:
-            error_cache['error'] = f'No har files in {report_dir}'
+            error_cache['error'] = f'No har files in {capture_dir}'

         if error_cache:
             self.logger.warning(error_cache['error'])
-            self.redis.hmset(str(report_dir), error_cache)
-            self.redis.hset('lookup_dirs', uuid, str(report_dir))
+            self.redis.hmset(str(capture_dir), error_cache)
+            self.redis.hset('lookup_dirs', uuid, str(capture_dir))
             return

         har = HarFile(har_files[0])
+
+        redirects = har.initial_redirects
+        incomplete_redirects = False
+        if redirects and har.need_tree_redirects:
+            # load tree from disk, get redirects
+            ct = self._load_pickle(capture_dir / 'tree.pickle')
+            if ct:
+                redirects = ct.redirects
+            else:
+                # Pickle not available
+                incomplete_redirects = True
+
         cache: Dict[str, Union[str, int]] = {'uuid': uuid,
                                              'title': har.initial_title,
                                              'timestamp': har.initial_start_time,
                                              'url': har.first_url,
-                                             'redirects': json.dumps(har.initial_redirects)}
-        if (report_dir / 'no_index').exists():  # If the folders claims anonymity
+                                             'redirects': json.dumps(redirects),
+                                             'incomplete_redirects': 1 if incomplete_redirects else 0}
+        if (capture_dir / 'no_index').exists():  # If the folders claims anonymity
             cache['no_index'] = 1
-        if uuid and not self.redis.exists(str(report_dir)):
-            self.redis.hmset(str(report_dir), cache)
-            self.redis.hset('lookup_dirs', uuid, str(report_dir))
-
-    def report_cache(self, report_dir: Union[str, Path]) -> Optional[Dict[str, Union[str, int]]]:
-        if isinstance(report_dir, Path):
-            report_dir = str(report_dir)
-        cached = self.redis.hgetall(report_dir)
+        self.redis.hmset(str(capture_dir), cache)
+        self.redis.hset('lookup_dirs', uuid, str(capture_dir))
+
+    def capture_cache(self, capture_dir: Path) -> Optional[Dict[str, Union[str, int]]]:
+        if self.redis.hget(str(capture_dir), 'incomplete_redirects') == '1':
+            # try to rebuild the cache
+            self._set_capture_cache(capture_dir, force=True)
+        cached = self.redis.hgetall(str(capture_dir))
         if all(key in cached.keys() for key in ['uuid', 'title', 'timestamp', 'url', 'redirects']):
             cached['redirects'] = json.loads(cached['redirects'])
             return cached
         elif 'error' in cached:
             return cached
         else:
-            self.logger.warning(f'Cache ({report_dir}) is invalid: {json.dumps(cached, indent=2)}')
+            self.logger.warning(f'Cache ({capture_dir}) is invalid: {json.dumps(cached, indent=2)}')
             return None

     def _init_existing_dumps(self) -> None:
-        for report_dir in self.report_dirs:
-            if report_dir.exists():
-                self._set_report_cache(report_dir)
+        for capture_dir in self.capture_dirs:
+            if capture_dir.exists():
+                self._set_capture_cache(capture_dir)
         self.redis.set('cache_loaded', 1)

     @property
-    def report_dirs(self) -> List[Path]:
-        for report_dir in self.scrape_dir.iterdir():
-            if report_dir.is_dir() and not report_dir.iterdir():
+    def capture_dirs(self) -> List[Path]:
+        for capture_dir in self.scrape_dir.iterdir():
+            if capture_dir.is_dir() and not capture_dir.iterdir():
                 # Cleanup self.scrape_dir of failed runs.
-                report_dir.rmdir()
-            if not (report_dir / 'uuid').exists():
+                capture_dir.rmdir()
+            if not (capture_dir / 'uuid').exists():
                 # Create uuid if missing
-                with (report_dir / 'uuid').open('w') as f:
+                with (capture_dir / 'uuid').open('w') as f:
                     f.write(str(uuid4()))
         return sorted(self.scrape_dir.iterdir(), reverse=True)

-    def lookup_report_dir(self, uuid) -> Union[Path, None]:
-        report_dir = self.redis.hget('lookup_dirs', uuid)
-        if report_dir:
-            return Path(report_dir)
+    def lookup_capture_dir(self, uuid) -> Union[Path, None]:
+        capture_dir = self.redis.hget('lookup_dirs', uuid)
+        if capture_dir:
+            return Path(capture_dir)
         return None

     def enqueue_scrape(self, query: dict) -> str:
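The hunk above is the core of the refactoring: the per-capture Redis hash now stores the redirect chain as JSON together with an incomplete_redirects flag, and capture_cache() forces a cache rebuild when that flag is set. A minimal sketch of that force-rebuild pattern, with a plain dict standing in for Redis (set_capture_cache, capture_cache and rebuild below are illustrative stand-ins, not the project's real API):

import json

cache_store: dict = {}  # stands in for the per-capture Redis hash

def set_capture_cache(capture_dir: str, redirects: list, complete: bool, force: bool = False) -> None:
    # same shape as the diff's logic: skip the work unless forced or not cached yet
    if not force and capture_dir in cache_store:
        return
    cache_store[capture_dir] = {'redirects': json.dumps(redirects),
                                'incomplete_redirects': 0 if complete else 1}

def capture_cache(capture_dir: str, rebuild) -> dict:
    entry = cache_store.get(capture_dir)
    if entry and entry['incomplete_redirects'] == 1:
        # the full redirect chain was unknown when the cache was built: rebuild it
        redirects, complete = rebuild(capture_dir)
        set_capture_cache(capture_dir, redirects, complete, force=True)
        entry = cache_store[capture_dir]
    if entry:
        entry = dict(entry, redirects=json.loads(entry['redirects']))
    return entry or {}

For example, capture_cache('some_dir', rebuild=lambda d: (['http://a', 'http://b'], True)) after an incomplete first pass would refresh the entry and return the decoded redirect list.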
@@ -152,18 +169,27 @@ class Lookyloo():
             return True
         return False

-    def load_tree(self, report_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
-        har_files = sorted(report_dir.glob('*.har'))
+    def _load_pickle(self, pickle_file: Path) -> Optional[CrawledTree]:
+        if pickle_file.exists():
+            with pickle_file.open('rb') as _p:
+                return pickle.load(_p)
+        return None
+
+    def load_tree(self, capture_dir: Path) -> Tuple[str, dict, str, str, str, dict]:
+        har_files = sorted(capture_dir.glob('*.har'))
+        pickle_file = capture_dir / 'tree.pickle'
         try:
             meta = {}
-            if (report_dir / 'meta').exists():
-                with open((report_dir / 'meta'), 'r') as f:
+            if (capture_dir / 'meta').exists():
+                # NOTE: Legacy, the meta file should be present
+                with open((capture_dir / 'meta'), 'r') as f:
                     meta = json.load(f)
-            ct = CrawledTree(har_files)
-            temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
-            pickle.dump(ct, temp)
-            temp.close()
-            return temp.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
+            ct = self._load_pickle(pickle_file)
+            if not ct:
+                ct = CrawledTree(har_files)
+                with pickle_file.open('wb') as _p:
+                    pickle.dump(ct, _p)
+            return pickle_file.name, ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url, meta
         except Har2TreeError as e:
             raise NoValidHarFile(e.message)

@@ -172,8 +198,8 @@ class Lookyloo():
             if time.time() - tmpfile.stat().st_atime > 36000:
                 tmpfile.unlink()

-    def load_image(self, report_dir: Path) -> BytesIO:
-        with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
+    def load_image(self, capture_dir: Path) -> BytesIO:
+        with open(list(capture_dir.glob('*.png'))[0], 'rb') as f:
             return BytesIO(f.read())

     def sane_js_query(self, sha512: str) -> Dict:

@@ -254,5 +280,5 @@ class Lookyloo():
             with (dirpath / '{0:0{width}}.cookies.json'.format(i, width=width)).open('w') as _cookies:
                 json.dump(cookies, _cookies)

-        self._set_report_cache(dirpath)
+        self._set_capture_cache(dirpath)
         return perma_uuid

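The load_tree() changes above drop the NamedTemporaryFile round-trip and keep a single tree.pickle next to the HAR files, so the tree is built once and reused by later requests (including the redirect extraction in _set_capture_cache). A self-contained sketch of that pickle-backed caching, where build_tree is a caller-supplied stand-in for the CrawledTree construction (not Lookyloo's API):

import pickle
from pathlib import Path
from typing import Callable, List

def load_or_build_tree(capture_dir: Path, build_tree: Callable[[List[Path]], object]) -> object:
    # return the cached tree if tree.pickle exists, otherwise build it once and cache it
    pickle_file = capture_dir / 'tree.pickle'
    if pickle_file.exists():
        with pickle_file.open('rb') as _p:
            return pickle.load(_p)
    tree = build_tree(sorted(capture_dir.glob('*.har')))  # the expensive step, done once
    with pickle_file.open('wb') as _p:
        pickle.dump(tree, _p)
    return tree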
@@ -251,7 +251,7 @@ publicsuffix2 = "^2.20191221"
 six = "^1.14.0"

 [package.source]
-reference = "7656d60b3a200285205f991f6646d9a1e366c7cf"
+reference = "02c7e19229c33a62bfabbc3a1981f0401d7b3a71"
 type = "git"
 url = "https://github.com/viper-framework/har2tree.git"
 [[package]]

@@ -701,7 +701,7 @@ scrapy = "^1.8.0"
 scrapy-splash = "^0.7.2"

 [package.source]
-reference = "383fafad20a111e02fa53dc639b4cc53c7b8456c"
+reference = "bf18e5e1c88c9263b90a69348e0020ceccf2aa12"
 type = "git"
 url = "https://github.com/viper-framework/ScrapySplashWrapper.git"
 [[package]]

@@ -795,7 +795,7 @@ python-versions = "*"
 version = "1.4.1"

 [[package]]
-category = "dev"
+category = "main"
 description = "Backported and Experimental Type Hints for Python 3.5+"
 name = "typing-extensions"
 optional = false


@@ -47,9 +47,9 @@ lookyloo: Lookyloo = Lookyloo(splash_url=splash_url, loglevel=loglevel, only_glo


 # keep
-def load_tree(report_dir: Path) -> Tuple[dict, str, str, str, dict]:
+def load_tree(capture_dir: Path) -> Tuple[dict, str, str, str, dict]:
     session.clear()
-    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(report_dir)
+    temp_file_name, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(capture_dir)
     session["tree"] = temp_file_name
     return tree_json, tree_time, tree_ua, tree_root_url, meta

@@ -137,20 +137,20 @@ def urlnode_details(node_uuid):

 @app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
 def image(tree_uuid):
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         return Response('Not available.', mimetype='text/text')
-    to_return = lookyloo.load_image(report_dir)
+    to_return = lookyloo.load_image(capture_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')


 @app.route('/redirects/<string:tree_uuid>', methods=['GET'])
 def redirects(tree_uuid):
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         return Response('Not available.', mimetype='text/text')
-    cache = lookyloo.report_cache(report_dir)
+    cache = lookyloo.capture_cache(capture_dir)
     if not cache['redirects']:
         return Response('No redirects.', mimetype='text/text')
     to_return = BytesIO('\n'.join(cache['redirects']).encode())

@@ -158,23 +158,31 @@ def redirects(tree_uuid):
                      as_attachment=True, attachment_filename='redirects.txt')


+@app.route('/cache_tree/<string:tree_uuid>', methods=['GET'])
+def cache_tree(tree_uuid):
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if capture_dir:
+        lookyloo.load_tree(capture_dir)
+    return redirect(url_for('index'))
+
+
 @app.route('/tree/<string:tree_uuid>', methods=['GET'])
 def tree(tree_uuid):
     if tree_uuid == 'False':
         flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
         return redirect(url_for('index'))
-    report_dir = lookyloo.lookup_report_dir(tree_uuid)
-    if not report_dir:
+    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
+    if not capture_dir:
         flash(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.', 'error')
         return redirect(url_for('index'))

-    cache = lookyloo.report_cache(report_dir)
+    cache = lookyloo.capture_cache(capture_dir)
     if 'error' in cache:
         flash(cache['error'], 'error')
         return redirect(url_for('index'))

     try:
-        tree_json, start_time, user_agent, root_url, meta = load_tree(report_dir)
+        tree_json, start_time, user_agent, root_url, meta = load_tree(capture_dir)
         return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                                user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid,
                                meta=meta)

@@ -190,12 +198,13 @@ def index():
     lookyloo.cleanup_old_tmpfiles()
     update_user_agents()
     titles = []
-    for report_dir in lookyloo.report_dirs:
-        cached = lookyloo.report_cache(report_dir)
+    for capture_dir in lookyloo.capture_dirs:
+        cached = lookyloo.capture_cache(capture_dir)
         if not cached or 'no_index' in cached or 'error' in cached:
             continue
         date, time = cached['timestamp'].split('T')
         time, _ = time.split('.', 1)
-        titles.append((cached['uuid'], cached['title'], date, time, cached['url'], cached['redirects']))
+        titles.append((cached['uuid'], cached['title'], date, time, cached['url'],
+                       cached['redirects'], True if cached['incomplete_redirects'] == '1' else False))
     titles = sorted(titles, key=lambda x: (x[2], x[3]), reverse=True)
     return render_template('index.html', titles=titles)

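The web changes above add a /cache_tree/<uuid> endpoint: when the index flags a capture as having incomplete redirects, the link triggers a one-off tree build (which completes the cache) and sends the user back to the index. A minimal Flask sketch of that flow, with an in-memory store instead of Lookyloo's Redis cache (CAPTURES and build_tree are illustrative placeholders, not the project's code):

from flask import Flask, redirect, url_for

app = Flask(__name__)
CAPTURES = {'example-uuid': {'incomplete_redirects': True}}  # placeholder store

def build_tree(tree_uuid: str) -> None:
    # stand-in for the expensive tree build that also completes the redirect cache
    CAPTURES[tree_uuid]['incomplete_redirects'] = False

@app.route('/cache_tree/<string:tree_uuid>', methods=['GET'])
def cache_tree(tree_uuid):
    if tree_uuid in CAPTURES:
        build_tree(tree_uuid)
    return redirect(url_for('index'))

@app.route('/', methods=['GET'])
def index():
    return 'index'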
@@ -28,7 +28,7 @@
       </tr>
     </thead>
     <tbody>
-    {% for uuid, page_title, date, time, url, redirects in titles %}
+    {% for uuid, page_title, date, time, url, redirects, incomplete_redirects in titles %}
     <tr>
       <td>
         <p title="{{ page_title }}"><a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></p>

@@ -46,7 +46,11 @@
           {%endif%}
           </p>
         {% endfor %}
-        <a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
+        {% if incomplete_redirects %}
+        <a style="float: right;" href="{{ url_for('cache_tree', tree_uuid=uuid) }}">Unable to find the redirects, click here to build the tree</a>
+        {%else%}
+        <a style="float: right;" href="{{ url_for('redirects', tree_uuid=uuid) }}">Download redirects</a>
+        {%endif%}
       {% else%}
         No redirect
       {%endif%}