chg: Assign a UUID for each scraped page, allow permalinks

pull/27/head
Raphaël Vinot 2019-01-29 11:42:56 +01:00
parent a2c4d871fe
commit e14a0150a0
5 changed files with 80 additions and 70 deletions
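
The change replaces positional tree indexes (/tree/<int:tree_id>, an offset into the sorted listing of report directories, which shifts every time a new scrape lands) with stable UUIDs: each scrape now writes a uuid file into its report directory, and the routes resolve /tree/<string:tree_uuid> through a uuid-to-directory lookup table, so saved links behave as permalinks. A minimal sketch of the pattern; STORAGE_ROOT and both function names here are illustrative, only the uuid-file convention comes from the commit:

    # Sketch of the permalink scheme this commit introduces (hypothetical names).
    from pathlib import Path
    from uuid import uuid4

    STORAGE_ROOT = Path('reports')
    STORAGE_ROOT.mkdir(exist_ok=True)

    def tag_report_dir(report_dir: Path) -> str:
        # Write the permalink UUID marker that survives re-sorting of the listing.
        permalink = str(uuid4())
        (report_dir / 'uuid').write_text(permalink)
        return permalink

    def resolve_permalink(tree_uuid: str) -> Path:
        # Rebuild the uuid -> directory table by scanning the storage root.
        lookup = {(d / 'uuid').read_text().strip(): d
                  for d in STORAGE_ROOT.iterdir() if (d / 'uuid').exists()}
        return lookup[tree_uuid]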

.gitignore (vendored), 1 change

@@ -110,3 +110,4 @@ ENV/
 secret_key
 FileSaver.js
 d3.v5.min.js
+d3.v5.js

Pipfile.lock (generated), 2 changes

@@ -447,7 +447,7 @@
         "scrapysplashwrapper": {
             "editable": true,
             "git": "https://github.com/viper-framework/ScrapySplashWrapper.git",
-            "ref": "e6d9da1a971011b2925800b168d762e7f86b2b29"
+            "ref": "d0e129f9962098603f6686c8152030f7d31a604a"
         },
         "service-identity": {
             "hashes": [

lookyloo.py

@@ -6,7 +6,7 @@ import json
 from har2tree import CrawledTree
 from scrapysplashwrapper import crawl
-from flask import Flask, render_template, request, session, send_file
+from flask import Flask, render_template, request, session, send_file, redirect, url_for
 from flask_bootstrap import Bootstrap
 from datetime import datetime
@@ -19,8 +19,8 @@ import time
 from zipfile import ZipFile, ZIP_DEFLATED
 from io import BytesIO
 import base64
-import socket
 import os
+from uuid import uuid4

 from pysanejs import SaneJS
@@ -32,10 +32,10 @@ app = Flask(__name__)
 secret_file_path = get_homedir() / 'secret_key'
 if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
-    with open(secret_file_path, 'wb') as f:
+    with secret_file_path.open('wb') as f:
         f.write(os.urandom(64))
-with open(secret_file_path, 'rb') as f:
+with secret_file_path.open('rb') as f:
     app.config['SECRET_KEY'] = f.read()

 Bootstrap(app)
@@ -49,18 +49,6 @@ HAR_DIR.mkdir(parents=True, exist_ok=True)
 SPLASH = 'http://127.0.0.1:8050'
 SANE_JS = 'http://127.0.0.1:5007'


-def is_open(ip, port):
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.settimeout(2)
-    try:
-        s.connect((ip, int(port)))
-        s.shutdown(2)
-        return True
-    except Exception:
-        return False


 if SANE_JS:
     sanejs = SaneJS(SANE_JS)
     if sanejs.is_up:
@@ -69,6 +57,27 @@ if SANE_JS:
         has_sane_js = False


+def get_report_dirs():
+    # Cleanup HAR_DIR of failed runs.
+    for report_dir in HAR_DIR.iterdir():
+        if report_dir.is_dir() and not report_dir.iterdir():
+            report_dir.rmdir()
+        if not (report_dir / 'uuid').exists():
+            # Create uuid if missing
+            with (report_dir / 'uuid').open('w') as f:
+                f.write(str(uuid4()))
+    return sorted(HAR_DIR.iterdir(), reverse=True)
+
+
+def get_lookup_dirs():
+    # Build lookup table trees
+    lookup_dirs = {}
+    for report_dir in get_report_dirs():
+        with (report_dir / 'uuid').open() as f:
+            lookup_dirs[f.read().strip()] = report_dir
+    return lookup_dirs
+
+
 def cleanup_old_tmpfiles():
     for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
         if time.time() - tmpfile.stat().st_atime > 36000:
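
Two notes on the helpers added above. First, report_dir.iterdir() returns a generator, which is always truthy, so the "not report_dir.iterdir()" branch meant to prune empty run directories never fires; "not any(report_dir.iterdir())" would actually test emptiness. Second, get_lookup_dirs() rescans every report directory on each call, a cost every /tree request pays; that is fine for a small instance. A hypothetical memoized variant (not part of this commit), assuming it lives in the same module as HAR_DIR and get_lookup_dirs:

    # Hypothetical caching sketch: reuse the table until HAR_DIR's mtime changes
    # (i.e. until a report directory is added or removed).
    import functools

    @functools.lru_cache(maxsize=4)
    def _lookup_dirs_for(har_dir_mtime: float) -> dict:
        # mtime is only a cache key; the table itself comes from the scan.
        return get_lookup_dirs()

    def cached_lookup_dirs() -> dict:
        return _lookup_dirs_for(HAR_DIR.stat().st_mtime)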
@@ -93,53 +102,50 @@ def load_tree(report_dir):
     return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url


-def sane_js_query(sha512):
+def sane_js_query(sha512: str):
     if has_sane_js:
         return sanejs.sha512(sha512)
     return {'response': []}


+def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
+    if not url.startswith('http'):
+        url = f'http://{url}'
+    items = crawl(SPLASH, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
+    if not items:
+        # broken
+        pass
+    if not perma_uuid:
+        perma_uuid = str(uuid4())
+    width = len(str(len(items)))
+    dirpath = HAR_DIR / datetime.now().isoformat()
+    dirpath.mkdir()
+    for i, item in enumerate(items):
+        harfile = item['har']
+        png = base64.b64decode(item['png'])
+        child_frames = item['childFrames']
+        html = item['html']
+        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
+            json.dump(harfile, f)
+        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
+            f.write(png)
+        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
+            f.write(html)
+        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
+            json.dump(child_frames, f)
+    with (dirpath / 'uuid').open('w') as f:
+        f.write(perma_uuid)
+    return perma_uuid
+
+
 @app.route('/scrape', methods=['GET', 'POST'])
-def scrape():
+def scrape_web():
     if request.form.get('url'):
-        url = request.form.get('url')
-        if not url.startswith('http'):
-            url = f'http://{url}'
-        depth = request.form.get('depth')
-        if depth is None:
-            depth = 1
-        items = crawl(SPLASH, url, depth, log_enabled=True, log_level='INFO')
-        if not items:
-            # broken
-            pass
-        width = len(str(len(items)))
-        dirpath = HAR_DIR / datetime.now().isoformat()
-        dirpath.mkdir()
-        for i, item in enumerate(items):
-            harfile = item['har']
-            png = base64.b64decode(item['png'])
-            child_frames = item['childFrames']
-            html = item['html']
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
-                json.dump(harfile, f)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
-                f.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
-                f.write(html)
-            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
-                json.dump(child_frames, f)
-        return tree(0)
+        perma_uuid = scrape(request.form.get('url'), request.form.get('depth'))
+        return redirect(url_for('tree', tree_uuid=perma_uuid))
     return render_template('scrape.html')


-def get_report_dirs():
-    # Cleanup HAR_DIR of failed runs.
-    for report_dir in HAR_DIR.iterdir():
-        if report_dir.is_dir() and not report_dir.iterdir():
-            report_dir.rmdir()
-    return sorted(HAR_DIR.iterdir(), reverse=True)


 @app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
 def hostnode_details_text(node_uuid):
     with open(session["tree"], 'rb') as f:
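
A small worked example of the filename scheme in the new scrape() helper: width = len(str(len(items))) zero-pads each page index so that sorted(report_dir.glob('*.har')), used later when listing reports, returns files in crawl order.

    # Zero-padded indexes keep lexicographic order equal to numeric order.
    items = list(range(12))        # pretend the crawl returned 12 pages
    width = len(str(len(items)))   # len('12') -> 2
    names = ['{0:0{width}}.har'.format(i, width=width) for i in range(len(items))]
    print(names[:3], names[-1])    # ['00.har', '01.har', '02.har'] 11.har
    assert names == sorted(names)  # sorted() preserves crawl order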
@@ -169,7 +175,7 @@ def hostnode_details(node_uuid):
             sane_js_r = sane_js_query(url.body_hash)
             if sane_js_r.get('response'):
                 url.add_feature('sane_js_details', sane_js_r['response'])
-                print(url.sane_js_details)
+                print('######## SANEJS ##### ', url.sane_js_details)
         urls.append(url.to_json())
     return json.dumps(urls)
@@ -195,27 +201,28 @@ def urlnode_details(node_uuid):
                      as_attachment=True, attachment_filename='file.zip')


-@app.route('/tree/<int:tree_id>/image', methods=['GET'])
-def image(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
+def image(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     to_return = load_image(report_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')


-@app.route('/tree/<int:tree_id>', methods=['GET'])
-def tree(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>', methods=['GET'])
+def tree(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     tree_json, start_time, user_agent, root_url = load_tree(report_dir)
     return render_template('tree.html', tree_json=tree_json, start_time=start_time,
-                           user_agent=user_agent, root_url=root_url, tree_id=tree_id)
+                           user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)


 @app.route('/', methods=['GET'])
 def index():
     cleanup_old_tmpfiles()
     session.clear()
-    i = 0
     titles = []
     if not HAR_DIR.exists():
         HAR_DIR.mkdir(parents=True)
@@ -223,10 +230,12 @@ def index():
         har_files = sorted(report_dir.glob('*.har'))
         if not har_files:
             continue
-        with open(har_files[0], 'r') as f:
+        with har_files[0].open() as f:
             j = json.load(f)
-        titles.append((i, j['log']['pages'][0]['title']))
-        i += 1
+        title = j['log']['pages'][0]['title']
+        with (report_dir / 'uuid').open() as f:
+            uuid = f.read().strip()
+        titles.append((uuid, title))
     return render_template('index.html', titles=titles)
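
One consequence of the direct dictionary lookup in the rewritten routes above: a stale or mistyped permalink makes lookup_dirs[tree_uuid] raise KeyError, which Flask surfaces as a 500. A hypothetical hardening of the tree route (not part of this commit) could return a 404 instead:

    # Hypothetical guard, not in this commit: resolve the UUID defensively.
    from flask import abort

    @app.route('/tree/<string:tree_uuid>', methods=['GET'])
    def tree(tree_uuid):
        report_dir = get_lookup_dirs().get(tree_uuid)
        if report_dir is None:
            abort(404)  # unknown or stale permalink, rather than a KeyError/500
        tree_json, start_time, user_agent, root_url = load_tree(report_dir)
        return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                               user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)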

templates/index.html

@@ -4,11 +4,11 @@
 {% block content %}
 <center>
-  <h2><a href="{{ url_for('scrape') }}">Scrape a page</a></h2></br></br>
+  <h2><a href="{{ url_for('scrape_web') }}">Scrape a page</a></h2></br></br>
 </center>
 <center>
-  {% for id, page_title in titles %}
-  <a href="{{ url_for('tree', tree_id=id) }}">{{ page_title }}</a></br>
+  {% for uuid, page_title in titles %}
+  <a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></br>
   </br>
   {% endfor %}
 </center>

templates/tree.html

@@ -59,7 +59,7 @@
 <b>Root URL</b>: {{ root_url }}</br>
 <b>Start time</b>: {{ start_time }}</br>
 <b>User Agent</b>: {{ user_agent }}</br>
-<center><a href="{{ url_for('image', tree_id=tree_id) }}">Download Image</a></center>
+<center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
 </div>
 {% endblock content %}