From e14a0150a00bb73711cbb2ec4311620c42c0d49f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?=
Date: Tue, 29 Jan 2019 11:42:56 +0100
Subject: [PATCH] chg: Assign a UUID for each scraped page, allow permalinks

---
 .gitignore                    |   1 +
 Pipfile.lock                  |   2 +-
 lookyloo/__init__.py          | 139 ++++++++++++++++++----------------
 lookyloo/templates/index.html |   6 +-
 lookyloo/templates/tree.html  |   2 +-
 5 files changed, 80 insertions(+), 70 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8ed9d7f..ad1b45a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -110,3 +110,4 @@ ENV/
 secret_key
 FileSaver.js
 d3.v5.min.js
+d3.v5.js
diff --git a/Pipfile.lock b/Pipfile.lock
index 7c35c8a..fb2f2d4 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -447,7 +447,7 @@
         "scrapysplashwrapper": {
             "editable": true,
             "git": "https://github.com/viper-framework/ScrapySplashWrapper.git",
-            "ref": "e6d9da1a971011b2925800b168d762e7f86b2b29"
+            "ref": "d0e129f9962098603f6686c8152030f7d31a604a"
         },
         "service-identity": {
             "hashes": [
diff --git a/lookyloo/__init__.py b/lookyloo/__init__.py
index 8a4c183..32df3fd 100644
--- a/lookyloo/__init__.py
+++ b/lookyloo/__init__.py
@@ -6,7 +6,7 @@ import json
 from har2tree import CrawledTree
 from scrapysplashwrapper import crawl
-from flask import Flask, render_template, request, session, send_file
+from flask import Flask, render_template, request, session, send_file, redirect, url_for
 from flask_bootstrap import Bootstrap
 
 from datetime import datetime
 
@@ -19,8 +19,8 @@ import time
 from zipfile import ZipFile, ZIP_DEFLATED
 from io import BytesIO
 import base64
-import socket
 import os
+from uuid import uuid4
 
 from pysanejs import SaneJS
 
@@ -32,10 +32,10 @@ app = Flask(__name__)
 
 secret_file_path = get_homedir() / 'secret_key'
 if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
-    with open(secret_file_path, 'wb') as f:
+    with secret_file_path.open('wb') as f:
         f.write(os.urandom(64))
 
-with open(secret_file_path, 'rb') as f:
+with secret_file_path.open('rb') as f:
     app.config['SECRET_KEY'] = f.read()
 
 Bootstrap(app)
@@ -49,18 +49,6 @@ HAR_DIR.mkdir(parents=True, exist_ok=True)
 SPLASH = 'http://127.0.0.1:8050'
 SANE_JS = 'http://127.0.0.1:5007'
 
-
-def is_open(ip, port):
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.settimeout(2)
-    try:
-        s.connect((ip, int(port)))
-        s.shutdown(2)
-        return True
-    except Exception:
-        return False
-
-
 if SANE_JS:
     sanejs = SaneJS(SANE_JS)
     if sanejs.is_up:
@@ -69,6 +57,27 @@ if SANE_JS:
         has_sane_js = False
 
 
+def get_report_dirs():
+    # Cleanup HAR_DIR of failed runs.
+    for report_dir in HAR_DIR.iterdir():
+        if report_dir.is_dir() and not report_dir.iterdir():
+            report_dir.rmdir()
+        if not (report_dir / 'uuid').exists():
+            # Create uuid if missing
+            with (report_dir / 'uuid').open('w') as f:
+                f.write(str(uuid4()))
+    return sorted(HAR_DIR.iterdir(), reverse=True)
+
+
+def get_lookup_dirs():
+    # Build lookup table trees
+    lookup_dirs = {}
+    for report_dir in get_report_dirs():
+        with (report_dir / 'uuid').open() as f:
+            lookup_dirs[f.read().strip()] = report_dir
+    return lookup_dirs
+
+
 def cleanup_old_tmpfiles():
     for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
         if time.time() - tmpfile.stat().st_atime > 36000:
@@ -93,53 +102,50 @@ def load_tree(report_dir):
     return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url
 
 
-def sane_js_query(sha512):
+def sane_js_query(sha512: str):
     if has_sane_js:
         return sanejs.sha512(sha512)
     return {'response': []}
 
 
+def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
+    if not url.startswith('http'):
+        url = f'http://{url}'
+    items = crawl(SPLASH, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
+    if not items:
+        # broken
+        pass
+    if not perma_uuid:
+        perma_uuid = str(uuid4())
+    width = len(str(len(items)))
+    dirpath = HAR_DIR / datetime.now().isoformat()
+    dirpath.mkdir()
+    for i, item in enumerate(items):
+        harfile = item['har']
+        png = base64.b64decode(item['png'])
+        child_frames = item['childFrames']
+        html = item['html']
+        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
+            json.dump(harfile, f)
+        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
+            f.write(png)
+        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
+            f.write(html)
+        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
+            json.dump(child_frames, f)
+    with (dirpath / 'uuid').open('w') as f:
+        f.write(perma_uuid)
+    return perma_uuid
+
+
 @app.route('/scrape', methods=['GET', 'POST'])
-def scrape():
+def scrape_web():
     if request.form.get('url'):
-        url = request.form.get('url')
-        if not url.startswith('http'):
-            url = f'http://{url}'
-        depth = request.form.get('depth')
-        if depth is None:
-            depth = 1
-        items = crawl(SPLASH, url, depth, log_enabled=True, log_level='INFO')
-        if not items:
-            # broken
-            pass
-        width = len(str(len(items)))
-        dirpath = HAR_DIR / datetime.now().isoformat()
-        dirpath.mkdir()
-        for i, item in enumerate(items):
-            harfile = item['har']
-            png = base64.b64decode(item['png'])
-            child_frames = item['childFrames']
-            html = item['html']
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
-                json.dump(harfile, f)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
-                f.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
-                f.write(html)
-            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
-                json.dump(child_frames, f)
-        return tree(0)
+        perma_uuid = scrape(request.form.get('url'), request.form.get('depth'))
+        return redirect(url_for('tree', tree_uuid=perma_uuid))
     return render_template('scrape.html')
 
 
-def get_report_dirs():
-    # Cleanup HAR_DIR of failed runs.
-    for report_dir in HAR_DIR.iterdir():
-        if report_dir.is_dir() and not report_dir.iterdir():
-            report_dir.rmdir()
-    return sorted(HAR_DIR.iterdir(), reverse=True)
-
-
 @app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
 def hostnode_details_text(node_uuid):
     with open(session["tree"], 'rb') as f:
@@ -169,7 +175,7 @@ def hostnode_details(node_uuid):
         sane_js_r = sane_js_query(url.body_hash)
         if sane_js_r.get('response'):
             url.add_feature('sane_js_details', sane_js_r['response'])
-            print(url.sane_js_details)
+            print('######## SANEJS ##### ', url.sane_js_details)
         urls.append(url.to_json())
     return json.dumps(urls)
 
@@ -195,27 +201,28 @@ def urlnode_details(node_uuid):
                      as_attachment=True, attachment_filename='file.zip')
 
 
-@app.route('/tree/<int:tree_id>/image', methods=['GET'])
-def image(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
+def image(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     to_return = load_image(report_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')
 
 
-@app.route('/tree/<int:tree_id>', methods=['GET'])
-def tree(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>', methods=['GET'])
+def tree(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     tree_json, start_time, user_agent, root_url = load_tree(report_dir)
     return render_template('tree.html', tree_json=tree_json, start_time=start_time,
-                           user_agent=user_agent, root_url=root_url, tree_id=tree_id)
+                           user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)
 
 
 @app.route('/', methods=['GET'])
 def index():
     cleanup_old_tmpfiles()
     session.clear()
-    i = 0
     titles = []
     if not HAR_DIR.exists():
         HAR_DIR.mkdir(parents=True)
@@ -223,10 +230,12 @@ def index():
         har_files = sorted(report_dir.glob('*.har'))
         if not har_files:
             continue
-        with open(har_files[0], 'r') as f:
+        with har_files[0].open() as f:
             j = json.load(f)
-        titles.append((i, j['log']['pages'][0]['title']))
-        i += 1
+        title = j['log']['pages'][0]['title']
+        with (report_dir / 'uuid').open() as f:
+            uuid = f.read().strip()
+        titles.append((uuid, title))
     return render_template('index.html', titles=titles)
 
 
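Taken together, the __init__.py changes split the crawl out of the view function and key every capture by the UUID written next to its HAR files, so a tree URL no longer shifts when newer captures land in HAR_DIR. A minimal sketch of the resulting flow, assuming the package is importable as `lookyloo` (the module above is lookyloo/__init__.py) and a Splash instance is listening on 127.0.0.1:8050; the /scrape route and form fields come from the patch, the test-client code itself is only illustrative:

    from lookyloo import app

    client = app.test_client()
    # POST triggers scrape(): the page is crawled via Splash, the HAR/PNG/
    # HTML land in HAR_DIR/<timestamp>/, and a 'uuid' file is written there.
    # Note: the 'depth' value reaches crawl() as the raw form string.
    response = client.post('/scrape', data={'url': 'example.com', 'depth': '1'})
    # scrape_web() now redirects to the permalink instead of rendering tree(0).
    assert response.status_code == 302
    permalink = response.headers['Location']  # ends with /tree/<uuid4>

diff continues with the templates, which must switch from positional ids to the new UUIDs: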
diff --git a/lookyloo/templates/index.html b/lookyloo/templates/index.html
index da090cb..3ca9145 100644
--- a/lookyloo/templates/index.html
+++ b/lookyloo/templates/index.html
@@ -4,11 +4,11 @@
 {% block content %}
   <center>
-    <a href="{{ url_for('scrape') }}">Scrape a page</a>
+    <a href="{{ url_for('scrape_web') }}">Scrape a page</a>
   </center>
   </br>
   </br>
-  {% for id, page_title in titles %}
-    <a href="{{ url_for('tree', tree_id=id) }}">{{ page_title }}</a>
+  {% for uuid, page_title in titles %}
+    <a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a>
     </br>
   {% endfor %}
 
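With the index template receiving (uuid, page_title) pairs from index(), url_for builds stable permalinks straight from the stored identifiers. A quick check of what the loop above generates, assuming the same `lookyloo` import as before; the UUID value is a made-up example:

    from flask import url_for

    from lookyloo import app

    with app.test_request_context():
        # Mirrors the template's url_for('tree', tree_uuid=uuid).
        print(url_for('tree', tree_uuid='f6f0a56e-9e8c-4c0f-9dc3-0b8dbe4f8e2a'))
        # -> /tree/f6f0a56e-9e8c-4c0f-9dc3-0b8dbe4f8e2a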
diff --git a/lookyloo/templates/tree.html b/lookyloo/templates/tree.html
index 02ebcb9..7cfb3e4 100644
--- a/lookyloo/templates/tree.html
+++ b/lookyloo/templates/tree.html
@@ -59,7 +59,7 @@ Root URL: {{ root_url }}</br>
 Start time: {{ start_time }}</br>
 User Agent: {{ user_agent }}</br>
 </br>
-<a href="{{ url_for('image', tree_id=tree_id) }}">Download Image</a>
+<a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a>
 </br>
 
 {% endblock content %}
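One consequence worth noting: captures made before this patch have no 'uuid' file, and get_report_dirs() backfills one the next time the index is rendered, so old captures become addressable by permalink as well. A standalone sketch of that backfill behaviour, using a scratch directory as a stand-in for HAR_DIR:

    from pathlib import Path
    from uuid import uuid4

    har_dir = Path('scraped-example')  # stand-in for HAR_DIR
    old_capture = har_dir / '2019-01-28T10:00:00.000000'
    old_capture.mkdir(parents=True, exist_ok=True)
    (old_capture / '0.har').write_text('{}')  # pre-patch capture, no uuid yet

    # Same backfill as get_report_dirs(): any capture directory without
    # a 'uuid' file gets one assigned on the spot.
    for report_dir in har_dir.iterdir():
        if report_dir.is_dir() and not (report_dir / 'uuid').exists():
            (report_dir / 'uuid').write_text(str(uuid4()))

    print((old_capture / 'uuid').read_text())  # now permanently addressable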