chg: Assign a UUID for each scraped page, allow permalinks

pull/27/head
Raphaël Vinot 2019-01-29 11:42:56 +01:00
parent a2c4d871fe
commit e14a0150a0
5 changed files with 80 additions and 70 deletions
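
The change replaces positional tree indexes (/tree/<int:tree_id>, an offset into the sorted listing of report directories, which shifts every time a new scrape lands) with stable UUIDs: each scrape now writes a uuid file into its report directory, and the routes resolve /tree/<string:tree_uuid> through a uuid-to-directory lookup table, so saved links behave as permalinks. A minimal sketch of the pattern; STORAGE_ROOT and both function names here are illustrative, only the uuid-file convention comes from the commit:

    # Sketch of the permalink scheme this commit introduces (hypothetical names).
    from pathlib import Path
    from uuid import uuid4

    STORAGE_ROOT = Path('reports')
    STORAGE_ROOT.mkdir(exist_ok=True)

    def tag_report_dir(report_dir: Path) -> str:
        # Write the permalink UUID marker that survives re-sorting of the listing.
        permalink = str(uuid4())
        (report_dir / 'uuid').write_text(permalink)
        return permalink

    def resolve_permalink(tree_uuid: str) -> Path:
        # Rebuild the uuid -> directory table by scanning the storage root.
        lookup = {(d / 'uuid').read_text().strip(): d
                  for d in STORAGE_ROOT.iterdir() if (d / 'uuid').exists()}
        return lookup[tree_uuid]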

.gitignore (vendored), 1 change

@@ -110,3 +110,4 @@ ENV/
 secret_key
 FileSaver.js
 d3.v5.min.js
+d3.v5.js

Pipfile.lock (generated), 2 changes

@@ -447,7 +447,7 @@
         "scrapysplashwrapper": {
             "editable": true,
             "git": "https://github.com/viper-framework/ScrapySplashWrapper.git",
-            "ref": "e6d9da1a971011b2925800b168d762e7f86b2b29"
+            "ref": "d0e129f9962098603f6686c8152030f7d31a604a"
         },
         "service-identity": {
             "hashes": [

lookyloo.py

@@ -6,7 +6,7 @@ import json
 from har2tree import CrawledTree
 from scrapysplashwrapper import crawl
-from flask import Flask, render_template, request, session, send_file
+from flask import Flask, render_template, request, session, send_file, redirect, url_for
 from flask_bootstrap import Bootstrap
 from datetime import datetime
@@ -19,8 +19,8 @@ import time
 from zipfile import ZipFile, ZIP_DEFLATED
 from io import BytesIO
 import base64
-import socket
 import os
+from uuid import uuid4

 from pysanejs import SaneJS
@@ -32,10 +32,10 @@ app = Flask(__name__)
 secret_file_path = get_homedir() / 'secret_key'
 if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
-    with open(secret_file_path, 'wb') as f:
+    with secret_file_path.open('wb') as f:
         f.write(os.urandom(64))
-with open(secret_file_path, 'rb') as f:
+with secret_file_path.open('rb') as f:
     app.config['SECRET_KEY'] = f.read()

 Bootstrap(app)
@@ -49,18 +49,6 @@ HAR_DIR.mkdir(parents=True, exist_ok=True)
 SPLASH = 'http://127.0.0.1:8050'
 SANE_JS = 'http://127.0.0.1:5007'


-def is_open(ip, port):
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.settimeout(2)
-    try:
-        s.connect((ip, int(port)))
-        s.shutdown(2)
-        return True
-    except Exception:
-        return False


 if SANE_JS:
     sanejs = SaneJS(SANE_JS)
     if sanejs.is_up:
@@ -69,6 +57,27 @@ if SANE_JS:
         has_sane_js = False


+def get_report_dirs():
+    # Cleanup HAR_DIR of failed runs.
+    for report_dir in HAR_DIR.iterdir():
+        if report_dir.is_dir() and not report_dir.iterdir():
+            report_dir.rmdir()
+        if not (report_dir / 'uuid').exists():
+            # Create uuid if missing
+            with (report_dir / 'uuid').open('w') as f:
+                f.write(str(uuid4()))
+    return sorted(HAR_DIR.iterdir(), reverse=True)
+
+
+def get_lookup_dirs():
+    # Build lookup table trees
+    lookup_dirs = {}
+    for report_dir in get_report_dirs():
+        with (report_dir / 'uuid').open() as f:
+            lookup_dirs[f.read().strip()] = report_dir
+    return lookup_dirs
+
+
 def cleanup_old_tmpfiles():
     for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
         if time.time() - tmpfile.stat().st_atime > 36000:
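
Two notes on the helpers added above. First, report_dir.iterdir() returns a generator, which is always truthy, so the "not report_dir.iterdir()" branch meant to prune empty run directories never fires; "not any(report_dir.iterdir())" would actually test emptiness. Second, get_lookup_dirs() rescans every report directory on each call, a cost every /tree request pays; that is fine for a small instance. A hypothetical memoized variant (not part of this commit), assuming it lives in the same module as HAR_DIR and get_lookup_dirs:

    # Hypothetical caching sketch: reuse the table until HAR_DIR's mtime changes
    # (i.e. until a report directory is added or removed).
    import functools

    @functools.lru_cache(maxsize=4)
    def _lookup_dirs_for(har_dir_mtime: float) -> dict:
        # mtime is only a cache key; the table itself comes from the scan.
        return get_lookup_dirs()

    def cached_lookup_dirs() -> dict:
        return _lookup_dirs_for(HAR_DIR.stat().st_mtime)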
@@ -93,53 +102,50 @@ def load_tree(report_dir):
     return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url


-def sane_js_query(sha512):
+def sane_js_query(sha512: str):
     if has_sane_js:
         return sanejs.sha512(sha512)
     return {'response': []}


+def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
+    if not url.startswith('http'):
+        url = f'http://{url}'
+    items = crawl(SPLASH, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
+    if not items:
+        # broken
+        pass
+    if not perma_uuid:
+        perma_uuid = str(uuid4())
+    width = len(str(len(items)))
+    dirpath = HAR_DIR / datetime.now().isoformat()
+    dirpath.mkdir()
+    for i, item in enumerate(items):
+        harfile = item['har']
+        png = base64.b64decode(item['png'])
+        child_frames = item['childFrames']
+        html = item['html']
+        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
+            json.dump(harfile, f)
+        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
+            f.write(png)
+        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
+            f.write(html)
+        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
+            json.dump(child_frames, f)
+    with (dirpath / 'uuid').open('w') as f:
+        f.write(perma_uuid)
+    return perma_uuid
+
+
 @app.route('/scrape', methods=['GET', 'POST'])
-def scrape():
+def scrape_web():
     if request.form.get('url'):
-        url = request.form.get('url')
-        if not url.startswith('http'):
-            url = f'http://{url}'
-        depth = request.form.get('depth')
-        if depth is None:
-            depth = 1
-        items = crawl(SPLASH, url, depth, log_enabled=True, log_level='INFO')
-        if not items:
-            # broken
-            pass
-        width = len(str(len(items)))
-        dirpath = HAR_DIR / datetime.now().isoformat()
-        dirpath.mkdir()
-        for i, item in enumerate(items):
-            harfile = item['har']
-            png = base64.b64decode(item['png'])
-            child_frames = item['childFrames']
-            html = item['html']
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
-                json.dump(harfile, f)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
-                f.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
-                f.write(html)
-            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
-                json.dump(child_frames, f)
-        return tree(0)
+        perma_uuid = scrape(request.form.get('url'), request.form.get('depth'))
+        return redirect(url_for('tree', tree_uuid=perma_uuid))
     return render_template('scrape.html')


-def get_report_dirs():
-    # Cleanup HAR_DIR of failed runs.
-    for report_dir in HAR_DIR.iterdir():
-        if report_dir.is_dir() and not report_dir.iterdir():
-            report_dir.rmdir()
-    return sorted(HAR_DIR.iterdir(), reverse=True)


 @app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
 def hostnode_details_text(node_uuid):
     with open(session["tree"], 'rb') as f:
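
A small worked example of the filename scheme in the new scrape() helper: width = len(str(len(items))) zero-pads each page index so that sorted(report_dir.glob('*.har')), used later when listing reports, returns files in crawl order.

    # Zero-padded indexes keep lexicographic order equal to numeric order.
    items = list(range(12))        # pretend the crawl returned 12 pages
    width = len(str(len(items)))   # len('12') -> 2
    names = ['{0:0{width}}.har'.format(i, width=width) for i in range(len(items))]
    print(names[:3], names[-1])    # ['00.har', '01.har', '02.har'] 11.har
    assert names == sorted(names)  # sorted() preserves crawl order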
@@ -169,7 +175,7 @@ def hostnode_details(node_uuid):
             sane_js_r = sane_js_query(url.body_hash)
             if sane_js_r.get('response'):
                 url.add_feature('sane_js_details', sane_js_r['response'])
-                print(url.sane_js_details)
+                print('######## SANEJS ##### ', url.sane_js_details)
         urls.append(url.to_json())
     return json.dumps(urls)
@@ -195,27 +201,28 @@ def urlnode_details(node_uuid):
                      as_attachment=True, attachment_filename='file.zip')


-@app.route('/tree/<int:tree_id>/image', methods=['GET'])
-def image(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
+def image(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     to_return = load_image(report_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')


-@app.route('/tree/<int:tree_id>', methods=['GET'])
-def tree(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>', methods=['GET'])
+def tree(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     tree_json, start_time, user_agent, root_url = load_tree(report_dir)
     return render_template('tree.html', tree_json=tree_json, start_time=start_time,
-                           user_agent=user_agent, root_url=root_url, tree_id=tree_id)
+                           user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)


 @app.route('/', methods=['GET'])
 def index():
     cleanup_old_tmpfiles()
     session.clear()
-    i = 0
     titles = []
     if not HAR_DIR.exists():
         HAR_DIR.mkdir(parents=True)
@@ -223,10 +230,12 @@ def index():
         har_files = sorted(report_dir.glob('*.har'))
         if not har_files:
             continue
-        with open(har_files[0], 'r') as f:
+        with har_files[0].open() as f:
             j = json.load(f)
-        titles.append((i, j['log']['pages'][0]['title']))
-        i += 1
+        title = j['log']['pages'][0]['title']
+        with (report_dir / 'uuid').open() as f:
+            uuid = f.read().strip()
+        titles.append((uuid, title))
     return render_template('index.html', titles=titles)
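
One consequence of the direct dictionary lookup in the rewritten routes above: a stale or mistyped permalink makes lookup_dirs[tree_uuid] raise KeyError, which Flask surfaces as a 500. A hypothetical hardening of the tree route (not part of this commit) could return a 404 instead:

    # Hypothetical guard, not in this commit: resolve the UUID defensively.
    from flask import abort

    @app.route('/tree/<string:tree_uuid>', methods=['GET'])
    def tree(tree_uuid):
        report_dir = get_lookup_dirs().get(tree_uuid)
        if report_dir is None:
            abort(404)  # unknown or stale permalink, rather than a KeyError/500
        tree_json, start_time, user_agent, root_url = load_tree(report_dir)
        return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                               user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)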

templates/index.html

@@ -4,11 +4,11 @@
 {% block content %}
 <center>
-  <h2><a href="{{ url_for('scrape') }}">Scrape a page</a></h2></br></br>
+  <h2><a href="{{ url_for('scrape_web') }}">Scrape a page</a></h2></br></br>
 </center>
 <center>
-  {% for id, page_title in titles %}
-  <a href="{{ url_for('tree', tree_id=id) }}">{{ page_title }}</a></br>
+  {% for uuid, page_title in titles %}
+  <a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></br>
   </br>
   {% endfor %}
 </center>

templates/tree.html

@@ -59,7 +59,7 @@
 <b>Root URL</b>: {{ root_url }}</br>
 <b>Start time</b>: {{ start_time }}</br>
 <b>User Agent</b>: {{ user_agent }}</br>
-<center><a href="{{ url_for('image', tree_id=tree_id) }}">Download Image</a></center>
+<center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
 </div>
 {% endblock content %}