mirror of https://github.com/CIRCL/lookyloo
chg: Assign a UUID for each scraped page, allow permalinks
parent a2c4d871fe
commit e14a0150a0

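Identified by content, the hunks below touch the ignore list, the Pipfile.lock pin of ScrapySplashWrapper, the Flask application, and two Jinja templates. The core change: each scrape now writes a `uuid` file into its report directory, and the `/tree/<uuid>` routes resolve that UUID back to a directory, so a report URL stays valid as new scrapes arrive. A minimal, self-contained sketch of that mechanism, with illustrative names and a temp directory standing in for the real HAR_DIR:

# Sketch: one directory per capture, each holding a 'uuid' file;
# permalinks resolve UUID -> directory by scanning those files.
from pathlib import Path
from uuid import uuid4
import tempfile

HAR_DIR = Path(tempfile.mkdtemp())          # stand-in for the real HAR_DIR

def new_capture(dirname: str) -> str:
    report_dir = HAR_DIR / dirname
    report_dir.mkdir()
    perma_uuid = str(uuid4())
    (report_dir / 'uuid').write_text(perma_uuid)
    return perma_uuid

def lookup(perma_uuid: str) -> Path:
    # Equivalent of get_lookup_dirs() below: UUID -> report directory.
    dirs = {(d / 'uuid').read_text().strip(): d
            for d in HAR_DIR.iterdir() if (d / 'uuid').exists()}
    return dirs[perma_uuid]

u = new_capture('2018-04-12T00:00:00')
assert lookup(u).name == '2018-04-12T00:00:00'
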
@@ -110,3 +110,4 @@ ENV/
+secret_key
 FileSaver.js
 d3.v5.min.js
 d3.v5.js

@@ -447,7 +447,7 @@
         "scrapysplashwrapper": {
             "editable": true,
             "git": "https://github.com/viper-framework/ScrapySplashWrapper.git",
-            "ref": "e6d9da1a971011b2925800b168d762e7f86b2b29"
+            "ref": "d0e129f9962098603f6686c8152030f7d31a604a"
         },
         "service-identity": {
             "hashes": [

@@ -6,7 +6,7 @@ import json
 from har2tree import CrawledTree
 from scrapysplashwrapper import crawl
 
-from flask import Flask, render_template, request, session, send_file
+from flask import Flask, render_template, request, session, send_file, redirect, url_for
 from flask_bootstrap import Bootstrap
 
 from datetime import datetime

@@ -19,8 +19,8 @@ import time
 from zipfile import ZipFile, ZIP_DEFLATED
 from io import BytesIO
 import base64
-import socket
 import os
+from uuid import uuid4
 
 from pysanejs import SaneJS
 

@@ -32,10 +32,10 @@ app = Flask(__name__)
 secret_file_path = get_homedir() / 'secret_key'
 
 if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
-    with open(secret_file_path, 'wb') as f:
+    with secret_file_path.open('wb') as f:
         f.write(os.urandom(64))
 
-with open(secret_file_path, 'rb') as f:
+with secret_file_path.open('rb') as f:
     app.config['SECRET_KEY'] = f.read()
 
 Bootstrap(app)

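The secret-key hunk above is the usual persist-once pattern: Flask signs session cookies with SECRET_KEY, so regenerating the key on every start would invalidate all live sessions. A small sketch of the same idea, under an assumed demo path rather than the app's own:

# Persist 64 CSPRNG bytes once and reuse them on every restart.
import os
from pathlib import Path

def load_or_create_secret(path: Path, size: int = 64) -> bytes:
    if not path.exists() or path.stat().st_size < size:
        path.write_bytes(os.urandom(size))  # created exactly once
    return path.read_bytes()

demo = Path('/tmp/secret_key_demo')          # assumed path, not the app's
assert load_or_create_secret(demo) == load_or_create_secret(demo)
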
@@ -49,18 +49,6 @@ HAR_DIR.mkdir(parents=True, exist_ok=True)
 SPLASH = 'http://127.0.0.1:8050'
 SANE_JS = 'http://127.0.0.1:5007'
 
-
-def is_open(ip, port):
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.settimeout(2)
-    try:
-        s.connect((ip, int(port)))
-        s.shutdown(2)
-        return True
-    except Exception:
-        return False
-
-
 if SANE_JS:
     sanejs = SaneJS(SANE_JS)
     if sanejs.is_up:

@@ -69,6 +57,27 @@ if SANE_JS:
     has_sane_js = False
 
 
+def get_report_dirs():
+    # Cleanup HAR_DIR of failed runs.
+    for report_dir in HAR_DIR.iterdir():
+        if report_dir.is_dir() and not report_dir.iterdir():
+            report_dir.rmdir()
+        if not (report_dir / 'uuid').exists():
+            # Create uuid if missing
+            with (report_dir / 'uuid').open('w') as f:
+                f.write(str(uuid4()))
+    return sorted(HAR_DIR.iterdir(), reverse=True)
+
+
+def get_lookup_dirs():
+    # Build lookup table trees
+    lookup_dirs = {}
+    for report_dir in get_report_dirs():
+        with (report_dir / 'uuid').open() as f:
+            lookup_dirs[f.read().strip()] = report_dir
+    return lookup_dirs
+
+
 def cleanup_old_tmpfiles():
     for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
        if time.time() - tmpfile.stat().st_atime > 36000:

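One pathlib detail in the `get_report_dirs()` added above: `Path.iterdir()` returns a generator, and a generator object is truthy even when it will yield nothing, so `not report_dir.iterdir()` is always False and the empty-directory cleanup never fires; testing emptiness requires consuming the iterator. A quick demonstration:

# Generators are truthy regardless of contents; emptiness needs any().
from pathlib import Path
import tempfile

empty_dir = Path(tempfile.mkdtemp())
print(bool(empty_dir.iterdir()))    # True -- the generator object itself
print(any(empty_dir.iterdir()))     # False -- it yields nothing
if not any(empty_dir.iterdir()):
    empty_dir.rmdir()               # this branch actually runs
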
@@ -93,53 +102,50 @@ def load_tree(report_dir):
     return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url
 
 
-def sane_js_query(sha512):
+def sane_js_query(sha512: str):
     if has_sane_js:
         return sanejs.sha512(sha512)
     return {'response': []}
 
 
+def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
+    if not url.startswith('http'):
+        url = f'http://{url}'
+    items = crawl(SPLASH, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
+    if not items:
+        # broken
+        pass
+    if not perma_uuid:
+        perma_uuid = str(uuid4())
+    width = len(str(len(items)))
+    dirpath = HAR_DIR / datetime.now().isoformat()
+    dirpath.mkdir()
+    for i, item in enumerate(items):
+        harfile = item['har']
+        png = base64.b64decode(item['png'])
+        child_frames = item['childFrames']
+        html = item['html']
+        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
+            json.dump(harfile, f)
+        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
+            f.write(png)
+        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
+            f.write(html)
+        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
+            json.dump(child_frames, f)
+    with (dirpath / 'uuid').open('w') as f:
+        f.write(perma_uuid)
+    return perma_uuid
+
+
 @app.route('/scrape', methods=['GET', 'POST'])
-def scrape():
+def scrape_web():
     if request.form.get('url'):
-        url = request.form.get('url')
-        if not url.startswith('http'):
-            url = f'http://{url}'
-        depth = request.form.get('depth')
-        if depth is None:
-            depth = 1
-        items = crawl(SPLASH, url, depth, log_enabled=True, log_level='INFO')
-        if not items:
-            # broken
-            pass
-        width = len(str(len(items)))
-        dirpath = HAR_DIR / datetime.now().isoformat()
-        dirpath.mkdir()
-        for i, item in enumerate(items):
-            harfile = item['har']
-            png = base64.b64decode(item['png'])
-            child_frames = item['childFrames']
-            html = item['html']
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
-                json.dump(harfile, f)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
-                f.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
-                f.write(html)
-            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
-                json.dump(child_frames, f)
-        return tree(0)
+        perma_uuid = scrape(request.form.get('url'), request.form.get('depth'))
+        return redirect(url_for('tree', tree_uuid=perma_uuid))
     return render_template('scrape.html')
 
 
-def get_report_dirs():
-    # Cleanup HAR_DIR of failed runs.
-    for report_dir in HAR_DIR.iterdir():
-        if report_dir.is_dir() and not report_dir.iterdir():
-            report_dir.rmdir()
-    return sorted(HAR_DIR.iterdir(), reverse=True)
-
-
 @app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
 def hostnode_details_text(node_uuid):
     with open(session["tree"], 'rb') as f:

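The `'{0:0{width}}'` pattern in the new `scrape()` zero-pads each item index to as many digits as the total item count needs, so the per-depth files sort the same way lexicographically and numerically:

# Zero-padding keeps filename order stable once a crawl exceeds 9 items.
items = list(range(12))
width = len(str(len(items)))                 # -> 2
names = ['{0:0{width}}.har'.format(i, width=width) for i in items]
print(names[0], names[-1])                   # 00.har 11.har
assert names == sorted(names)                # lexicographic == numeric
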
@@ -169,7 +175,7 @@ def hostnode_details(node_uuid):
         sane_js_r = sane_js_query(url.body_hash)
         if sane_js_r.get('response'):
             url.add_feature('sane_js_details', sane_js_r['response'])
-            print(url.sane_js_details)
+            print('######## SANEJS ##### ', url.sane_js_details)
         urls.append(url.to_json())
     return json.dumps(urls)
 

@@ -195,27 +201,28 @@ def urlnode_details(node_uuid):
                      as_attachment=True, attachment_filename='file.zip')
 
 
-@app.route('/tree/<int:tree_id>/image', methods=['GET'])
-def image(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
+def image(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     to_return = load_image(report_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')
 
 
-@app.route('/tree/<int:tree_id>', methods=['GET'])
-def tree(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>', methods=['GET'])
+def tree(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     tree_json, start_time, user_agent, root_url = load_tree(report_dir)
     return render_template('tree.html', tree_json=tree_json, start_time=start_time,
-                           user_agent=user_agent, root_url=root_url, tree_id=tree_id)
+                           user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)
 
 
 @app.route('/', methods=['GET'])
 def index():
     cleanup_old_tmpfiles()
     session.clear()
-    i = 0
     titles = []
     if not HAR_DIR.exists():
         HAR_DIR.mkdir(parents=True)

@@ -223,10 +230,12 @@ def index():
         har_files = sorted(report_dir.glob('*.har'))
         if not har_files:
             continue
-        with open(har_files[0], 'r') as f:
+        with har_files[0].open() as f:
             j = json.load(f)
-        titles.append((i, j['log']['pages'][0]['title']))
-        i += 1
+        title = j['log']['pages'][0]['title']
+        with (report_dir / 'uuid').open() as f:
+            uuid = f.read().strip()
+        titles.append((uuid, title))
 
     return render_template('index.html', titles=titles)
 

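For the `j['log']['pages'][0]['title']` access in `index()` above: a HAR capture is plain JSON whose top-level `log.pages` array holds one record per page load, title included. A toy HAR-shaped document is enough to show the path:

# Where the page title lives inside a HAR document.
import json

har_text = json.dumps({'log': {'pages': [{'id': 'page_1',
                                          'title': 'Example Domain'}],
                               'entries': []}})
j = json.loads(har_text)                     # as if read from a .har file
print(j['log']['pages'][0]['title'])         # Example Domain
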
@@ -4,11 +4,11 @@
 
 {% block content %}
 <center>
-  <h2><a href="{{ url_for('scrape') }}">Scrape a page</a></h2></br></br>
+  <h2><a href="{{ url_for('scrape_web') }}">Scrape a page</a></h2></br></br>
 </center>
 <center>
-  {% for id, page_title in titles %}
-  <a href="{{ url_for('tree', tree_id=id) }}">{{ page_title }}</a></br>
+  {% for uuid, page_title in titles %}
+  <a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></br>
   </br>
   {% endfor %}
 </center>

@@ -59,7 +59,7 @@
     <b>Root URL</b>: {{ root_url }}</br>
     <b>Start time</b>: {{ start_time }}</br>
     <b>User Agent</b>: {{ user_agent }}</br>
-    <center><a href="{{ url_for('image', tree_id=tree_id) }}">Download Image</a></center>
+    <center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
 </div>
 
 {% endblock content %}