#!/usr/bin/env python3
# -*- coding: utf-8 -*-
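"""Web front-end for lookyloo: scrape a URL through a local Splash instance,
store the resulting HAR/PNG/HTML files per run under HAR_DIR, and render the
tree built by har2tree. (Summary added by the editor, derived from the code.)"""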
import json
from har2tree import CrawledTree
from scrapysplashwrapper import crawl
from flask import Flask, render_template, request, session, send_file, redirect, url_for
from flask_bootstrap import Bootstrap
from datetime import datetime
import pickle
import tempfile
import pathlib
import time
from zipfile import ZipFile, ZIP_DEFLATED
from io import BytesIO
import base64
import os
from uuid import uuid4

from pysanejs import SaneJS

from .helpers import get_homedir

app = Flask(__name__)
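
# Persist the secret key on disk so Flask session cookies stay valid across
# restarts; regenerating the key would invalidate every existing session.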
secret_file_path = get_homedir() / 'secret_key'
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False

HAR_DIR = get_homedir() / 'scraped'
HAR_DIR.mkdir(parents=True, exist_ok=True)
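
# Local service endpoints: Splash does the actual page rendering; SaneJS is an
# optional lookup service queried with the SHA512 of a resource's content.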
SPLASH = 'http://127.0.0.1:8050'
SANE_JS = 'http://127.0.0.1:5007'

# Initialise the flag first so sane_js_query() cannot hit a NameError when
# SANE_JS is unset or the service is unreachable.
has_sane_js = False
if SANE_JS:
    sanejs = SaneJS(SANE_JS)
    if sanejs.is_up:
        has_sane_js = True


def get_report_dirs():
    # Cleanup HAR_DIR of failed runs.
    for report_dir in HAR_DIR.iterdir():
        # iterdir() returns a generator (always truthy), so test emptiness
        # with any(); otherwise empty directories are never removed.
        if report_dir.is_dir() and not any(report_dir.iterdir()):
            report_dir.rmdir()
            continue
        if not (report_dir / 'uuid').exists():
            # Create uuid if missing
            with (report_dir / 'uuid').open('w') as f:
                f.write(str(uuid4()))
    # Directory names are ISO timestamps, so a reverse sort is newest-first.
    return sorted(HAR_DIR.iterdir(), reverse=True)


def get_lookup_dirs():
    # Map the UUID stored in each report directory back to that directory.
    lookup_dirs = {}
    for report_dir in get_report_dirs():
        with (report_dir / 'uuid').open() as f:
            lookup_dirs[f.read().strip()] = report_dir
    return lookup_dirs
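
# NOTE: get_lookup_dirs() rebuilds the whole UUID -> directory mapping on
# every request that needs it, which is fine only while HAR_DIR stays small.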


def cleanup_old_tmpfiles():
    # Drop pickled trees that have not been accessed for 10 hours (36000 s).
    for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
        if time.time() - tmpfile.stat().st_atime > 36000:
            tmpfile.unlink()


def load_image(report_dir):
    # Return the scrape's screenshot as an in-memory buffer.
    with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
        return BytesIO(f.read())


def load_tree(report_dir):
    session.clear()
    har_files = sorted(report_dir.glob('*.har'))
    ct = CrawledTree(har_files)
    ct.find_parents()
    ct.join_trees()
    # Pickle the tree to a tempfile and keep only its path in the session:
    # the cookie stays small and the node-detail routes reload it from disk.
    temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
    pickle.dump(ct, temp)
    temp.close()
    session["tree"] = temp.name
    return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url


def sane_js_query(sha512: str):
    if has_sane_js:
        return sanejs.sha512(sha512)
    return {'response': []}
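
# Callers only check for a non-empty 'response' list, so the fallback above
# reads the same as a SaneJS "no match" when the service is unavailable.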


def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
    if not url.startswith('http'):
        url = f'http://{url}'
    items = crawl(SPLASH, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
    if not items:
        # broken: the crawl returned nothing
        pass
    if not perma_uuid:
        perma_uuid = str(uuid4())
    # Zero-pad the filenames so they sort in crawl order.
    width = len(str(len(items)))
    dirpath = HAR_DIR / datetime.now().isoformat()
    dirpath.mkdir()
    for i, item in enumerate(items):
        harfile = item['har']
        png = base64.b64decode(item['png'])
        child_frames = item['childFrames']
        html = item['html']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
            json.dump(harfile, f)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
            f.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
            f.write(html)
        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
            json.dump(child_frames, f)
    with (dirpath / 'uuid').open('w') as f:
        f.write(perma_uuid)
    return perma_uuid
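
# A scrape writes one zero-padded set of files per rendered page (<i>.har,
# <i>.png, <i>.html, <i>.frames.json) into a timestamped directory, plus a
# 'uuid' file that serves as the permalink for the /tree/ routes.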


@app.route('/scrape', methods=['GET', 'POST'])
def scrape_web():
    if request.form.get('url'):
        # Form values are strings: make sure depth reaches crawl() as an int.
        perma_uuid = scrape(request.form.get('url'), int(request.form.get('depth', 1)))
        return redirect(url_for('tree', tree_uuid=perma_uuid))
    return render_template('scrape.html')


@app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
def hostnode_details_text(node_uuid):
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
    urls = []
    for url in hostnode.urls:
        urls.append(url.name)
    content = '''# URLs
{}
'''.format('\n'.join(urls))
    to_return = BytesIO(content.encode())
    to_return.seek(0)
    return send_file(to_return, mimetype='text/markdown',
                     as_attachment=True, attachment_filename='file.md')


@app.route('/tree/hostname/<node_uuid>', methods=['GET'])
def hostnode_details(node_uuid):
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
    urls = []
    for url in hostnode.urls:
        # Enrich each URL with SaneJS details when the body hash is known.
        if hasattr(url, 'body_hash'):
            sane_js_r = sane_js_query(url.body_hash)
            if sane_js_r.get('response'):
                url.add_feature('sane_js_details', sane_js_r['response'])
                print('######## SANEJS ##### ', url.sane_js_details)
        urls.append(url.to_json())
    return json.dumps(urls)


@app.route('/tree/url/<node_uuid>', methods=['GET'])
def urlnode_details(node_uuid):
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    urlnode = ct.root_hartree.get_url_node_by_uuid(node_uuid)
    to_return = BytesIO()
    got_content = False
    if hasattr(urlnode, 'body'):
        body_content = urlnode.body.getvalue()
        if body_content:
            got_content = True
            with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile:
                zfile.writestr(urlnode.filename, body_content)
    if not got_content:
        with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile:
            zfile.writestr('file.txt', b'Response body empty')
    to_return.seek(0)
    return send_file(to_return, mimetype='application/zip',
                     as_attachment=True, attachment_filename='file.zip')
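
# NOTE: the response body is shipped inside a zip rather than raw, presumably
# so a potentially malicious payload is never rendered directly by the browser.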


@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
def image(tree_uuid):
    lookup_dirs = get_lookup_dirs()
    report_dir = lookup_dirs[tree_uuid]
    to_return = load_image(report_dir)
    return send_file(to_return, mimetype='image/png',
                     as_attachment=True, attachment_filename='image.png')


@app.route('/tree/<string:tree_uuid>', methods=['GET'])
def tree(tree_uuid):
    lookup_dirs = get_lookup_dirs()
    report_dir = lookup_dirs[tree_uuid]
    tree_json, start_time, user_agent, root_url = load_tree(report_dir)
    return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                           user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)


@app.route('/', methods=['GET'])
def index():
    cleanup_old_tmpfiles()
    session.clear()
    titles = []
    if not HAR_DIR.exists():
        HAR_DIR.mkdir(parents=True)
    for report_dir in get_report_dirs():
        har_files = sorted(report_dir.glob('*.har'))
        if not har_files:
            continue
        with har_files[0].open() as f:
            j = json.load(f)
            title = j['log']['pages'][0]['title']
        with (report_dir / 'uuid').open() as f:
            uuid = f.read().strip()
        titles.append((uuid, title))
    return render_template('index.html', titles=titles)


if __name__ == '__main__':
    app.run(port=5001, threaded=True)