2017-07-23 19:56:51 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
2017-09-22 00:26:38 +02:00
|
|
|
from har2tree import CrawledTree
|
2017-07-23 19:56:51 +02:00
|
|
|
from scrapysplashwrapper import crawl
|
|
|
|
|
2017-10-04 15:13:42 +02:00
|
|
|
from flask import Flask, render_template, request, session
|
2017-07-23 19:56:51 +02:00
|
|
|
from flask_bootstrap import Bootstrap
|
|
|
|
|
|
|
|
from glob import glob
|
|
|
|
import os
|
|
|
|
from datetime import datetime
|
|
|
|
|
2017-10-04 15:13:42 +02:00
|
|
|
import pickle
|
|
|
|
import tempfile
|
|
|
|
|
2017-07-23 19:56:51 +02:00
|
|
|
app = Flask(__name__)

# Secret key used to sign session cookies. It can be supplied through the
# SECRET_KEY environment variable; the hardcoded fallback is rejected below
# so a deployment can never silently run with a publicly-known key.
app.secret_key = os.environ.get('SECRET_KEY', 'changeme')

if app.secret_key == 'changeme':
    raise Exception('FFS, please set a proper secret key...')

Bootstrap(app)
# Serve the Bootstrap assets from this host instead of a CDN.
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.debug = True

# Directory holding the crawl results: one subdirectory per capture,
# each containing numbered *.har files.
HAR_DIR = 'scraped'
# Address of the Splash rendering service used by scrapysplashwrapper.
SPLASH = 'http://127.0.0.1:8050'
|
|
|
|
|
|
|
|
|
2017-10-04 15:13:42 +02:00
|
|
|
@app.before_request
def session_management():
    """Run before every request: keep the session cookie alive.

    Flask sessions expire with the browser session by default; marking the
    session permanent makes it last until it is explicitly cleared
    (load_tree() calls session.clear()).
    """
    session.permanent = True
|
|
|
|
|
|
|
|
|
2017-07-23 19:56:51 +02:00
|
|
|
def load_tree(report_dir):
    """Build the crawl tree for *report_dir* and cache it in the session.

    Loads all HAR files of the report, builds a CrawledTree, pickles it to
    a temporary file whose path is stored in the session (so the node
    detail endpoints can reload it later), and returns the data needed by
    the tree view.

    Returns a tuple (tree_json, start_time_iso, user_agent, root_url).
    """
    if session.get('tree'):
        # Remove the stale pickle left by a previously viewed tree so the
        # temporary files do not accumulate on disk.
        try:
            os.unlink(session['tree'])
        except OSError:
            # Already gone (or never written); nothing to clean up.
            pass
    session.clear()
    har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har')))
    ct = CrawledTree(har_files)
    ct.find_parents()
    ct.join_trees()
    # Pickle the tree so /tree/hostname/... and /tree/url/... can reload it.
    with tempfile.NamedTemporaryFile(delete=False) as temp:
        pickle.dump(ct, temp)
        session['tree'] = temp.name
    return ct.jsonify(), ct.start_time.isoformat(), ct.user_agent, ct.root_url
|
2017-07-23 19:56:51 +02:00
|
|
|
|
|
|
|
|
2017-09-07 13:44:32 +02:00
|
|
|
@app.route('/scrape', methods=['GET', 'POST'])
def scrape():
    """Render the scrape form (GET) or crawl a submitted URL (POST).

    On POST, crawls the URL at the requested depth through Splash, stores
    one numbered HAR file per crawled page in a timestamped directory
    under HAR_DIR, then renders the resulting tree.
    """
    if request.form.get('url'):
        url = request.form.get('url')
        depth = request.form.get('depth')
        if depth is None:
            depth = 1
        # The form delivers depth as a string; crawl expects an integer.
        items = crawl(SPLASH, url, int(depth))
        if not items:
            # The crawl failed entirely: show the form again instead of
            # creating an empty report directory.
            return render_template('scrape.html')
        # Zero-pad the filenames so lexicographic order == crawl order.
        width = len(str(len(items)))
        dirpath = os.path.join(HAR_DIR, datetime.now().isoformat())
        os.makedirs(dirpath)
        for i, item in enumerate(items, 1):
            harfile = item['har']
            with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f:
                json.dump(harfile, f)
        # The new capture is the most recent one, hence index 0.
        return tree(0)
    return render_template('scrape.html')
|
2017-07-23 19:56:51 +02:00
|
|
|
|
|
|
|
|
2017-08-12 19:19:24 +02:00
|
|
|
def get_report_dirs():
    """Return the report directory names under HAR_DIR, newest first.

    As a side effect, removes empty report directories left behind by
    failed crawl runs.
    """
    for report_dir in os.listdir(HAR_DIR):
        full_path = os.path.join(HAR_DIR, report_dir)
        # Guard against stray files in HAR_DIR: os.listdir() on a file
        # would raise NotADirectoryError.
        if not os.path.isdir(full_path):
            continue
        if not os.listdir(full_path):
            os.rmdir(full_path)
    # Directory names are ISO timestamps, so reverse lexicographic order
    # is newest-first.
    return sorted(os.listdir(HAR_DIR), reverse=True)
|
2017-08-12 19:19:24 +02:00
|
|
|
|
|
|
|
|
2017-10-04 15:13:42 +02:00
|
|
|
@app.route('/tree/hostname/<node_uuid>', methods=['GET'])
def hostnode_details(node_uuid):
    """Return the JSON details of every URL under the given host node."""
    # Reload the tree pickled by load_tree() for the current session.
    with open(session["tree"], 'rb') as pickled:
        crawled_tree = pickle.load(pickled)
    hostnode = crawled_tree.root_hartree.get_host_node_by_uuid(node_uuid)
    return json.dumps([url.jsonify() for url in hostnode.urls])
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/tree/url/<node_uuid>', methods=['GET'])
def urlnode_details(node_uuid):
    """Return the JSON details of a single URL node."""
    # Reload the tree pickled by load_tree() for the current session.
    with open(session["tree"], 'rb') as pickled:
        crawled_tree = pickle.load(pickled)
    urlnode = crawled_tree.root_hartree.get_url_node_by_uuid(node_uuid)
    return urlnode.jsonify()
|
|
|
|
|
|
|
|
|
2017-07-23 19:56:51 +02:00
|
|
|
@app.route('/tree/<int:tree_id>', methods=['GET'])
def tree(tree_id):
    """Render the tree view of the tree_id-th most recent report."""
    report_dir = get_report_dirs()[tree_id]
    tree_json, start_time, user_agent, root_url = load_tree(report_dir)
    return render_template(
        'tree.html',
        tree_json=tree_json,
        start_time=start_time,
        user_agent=user_agent,
        root_url=root_url,
    )
|
2017-07-23 19:56:51 +02:00
|
|
|
|
|
|
|
|
|
|
|
@app.route('/', methods=['GET'])
def index():
    """List the reports that produced HAR files, with their page titles."""
    if not os.path.exists(HAR_DIR):
        os.makedirs(HAR_DIR)
    titles = []
    for report_dir in get_report_dirs():
        har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har')))
        if not har_files:
            # Crawl produced no HAR files; nothing to show for this report.
            continue
        # The first HAR file is the landing page of the crawl.
        with open(har_files[0], 'r') as f:
            har = json.load(f)
        # The index of the entry doubles as the tree_id used by tree().
        titles.append((len(titles), har['log']['pages'][0]['title']))
    return render_template('index.html', titles=titles)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Development entry point: threaded so concurrent requests (e.g. the
    # tree view fetching node details) do not block each other.
    app.run(port=5001, threaded=True)
|