From 247a4a26b0b0c7c666d17f2e8fdf9c956c72f7e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Thu, 22 Mar 2018 17:22:19 +0100 Subject: [PATCH 1/7] chg: Change cookie name, update legend, remove old pickled trees Fix #18 --- lookyloo/__init__.py | 7 +++++-- lookyloo/templates/tree.html | 29 +++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/lookyloo/__init__.py b/lookyloo/__init__.py index 02679168..d0497dce 100644 --- a/lookyloo/__init__.py +++ b/lookyloo/__init__.py @@ -30,6 +30,7 @@ if app.secret_key == 'changeme': Bootstrap(app) app.config['BOOTSTRAP_SERVE_LOCAL'] = True +app.config['SESSION_COOKIE_NAME'] = 'lookyloo' app.debug = True HAR_DIR = 'scraped' @@ -46,8 +47,7 @@ def session_management(): def load_tree(report_dir): if session.get('tree'): - # TODO delete file - pass + os.unlink(session.get('tree')) session.clear() har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har'))) ct = CrawledTree(har_files) @@ -80,10 +80,13 @@ def scrape(): harfile = item['har'] png = base64.b64decode(item['png']) child_frames = item['childFrames'] + html = item['html'] with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f: json.dump(harfile, f) with open(os.path.join(dirpath, '{0:0{width}}.png'.format(i, width=width)), 'wb') as f: f.write(png) + with open(os.path.join(dirpath, '{0:0{width}}.html'.format(i, width=width)), 'w') as f: + f.write(html) with open(os.path.join(dirpath, '{0:0{width}}.frames.json'.format(i, width=width)), 'w') as f: json.dump(child_frames, f) return tree(0) diff --git a/lookyloo/templates/tree.html b/lookyloo/templates/tree.html index b2a99726..4232deb1 100644 --- a/lookyloo/templates/tree.html +++ b/lookyloo/templates/tree.html @@ -27,8 +27,33 @@ Redirect Redirect
- Cookie in URL Cookie in URL
+ Font Font
+ + HTML HTML
+ + JSON JSON
+ + CSS CSS
+ + EXE EXE
+ + Image Image
+ + Video Video
+ + iFrame iFrame
+ + Content type not set/unknown Content type not set/unknown
+
Tree details

Root URL: {{ root_url }}
From fb195971e8d1364a9bbff8e59e693e2d0d400cd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Thu, 22 Mar 2018 18:33:42 +0100 Subject: [PATCH 2/7] chg: use pathlib everywhere, remove old tmpfiles --- lookyloo/__init__.py | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/lookyloo/__init__.py b/lookyloo/__init__.py index d0497dce..abdc4401 100644 --- a/lookyloo/__init__.py +++ b/lookyloo/__init__.py @@ -9,13 +9,12 @@ from scrapysplashwrapper import crawl from flask import Flask, render_template, request, session, send_file from flask_bootstrap import Bootstrap -from glob import glob -import os from datetime import datetime import pickle import tempfile import pathlib +import time from zipfile import ZipFile, ZIP_DEFLATED from io import BytesIO @@ -33,10 +32,10 @@ app.config['BOOTSTRAP_SERVE_LOCAL'] = True app.config['SESSION_COOKIE_NAME'] = 'lookyloo' app.debug = True -HAR_DIR = 'scraped' +HAR_DIR = pathlib.Path('scraped') SPLASH = 'http://127.0.0.1:8050' -pathlib.Path(HAR_DIR).mkdir(parents=True, exist_ok=True) +HAR_DIR.mkdir(parents=True, exist_ok=True) @app.before_request @@ -45,11 +44,15 @@ def session_management(): session.permanent = True +def cleanup_old_tmpfiles(): + for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'): + if time.time() - tmpfile.stat().st_atime > 36000: + tmpfile.unlink() + + def load_tree(report_dir): - if session.get('tree'): - os.unlink(session.get('tree')) session.clear() - har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har'))) + har_files = sorted(report_dir.glob('*.har')) ct = CrawledTree(har_files) ct.find_parents() ct.join_trees() @@ -74,20 +77,20 @@ def scrape(): # broken pass width = len(str(len(items))) - dirpath = os.path.join(HAR_DIR, datetime.now().isoformat()) - os.makedirs(dirpath) + dirpath = HAR_DIR / datetime.now().isoformat() + dirpath.mkdir() for i, item in enumerate(items): harfile = item['har'] png = base64.b64decode(item['png']) child_frames = item['childFrames'] html = item['html'] - with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f: + with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f: json.dump(harfile, f) - with open(os.path.join(dirpath, '{0:0{width}}.png'.format(i, width=width)), 'wb') as f: + with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f: f.write(png) - with open(os.path.join(dirpath, '{0:0{width}}.html'.format(i, width=width)), 'w') as f: + with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f: f.write(html) - with open(os.path.join(dirpath, '{0:0{width}}.frames.json'.format(i, width=width)), 'w') as f: + with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f: json.dump(child_frames, f) return tree(0) return render_template('scrape.html') @@ -95,10 +98,10 @@ def scrape(): def get_report_dirs(): # Cleanup HAR_DIR of failed runs. - for report_dir in os.listdir(HAR_DIR): - if not os.listdir(os.path.join(HAR_DIR, report_dir)): - os.rmdir(os.path.join(HAR_DIR, report_dir)) - return sorted(os.listdir(HAR_DIR), reverse=True) + for report_dir in HAR_DIR.iterdir(): + if report_dir.is_dir() and not report_dir.iterdir(): + report_dir.rmdir() + return sorted(HAR_DIR.iterdir(), reverse=True) @app.route('/tree/hostname/', methods=['GET']) @@ -141,12 +144,13 @@ def tree(tree_id): @app.route('/', methods=['GET']) def index(): + cleanup_old_tmpfiles() i = 0 titles = [] - if not os.path.exists(HAR_DIR): - os.makedirs(HAR_DIR) + if not HAR_DIR.exists(): + HAR_DIR.mkdir(parents=True) for report_dir in get_report_dirs(): - har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har'))) + har_files = sorted(report_dir.glob('*.har')) if not har_files: continue with open(har_files[0], 'r') as f: From 28d078aec48a45da5aa2ebc75edd40ba711a05a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Thu, 22 Mar 2018 18:48:41 +0100 Subject: [PATCH 3/7] fix: disable file-wrapper Reason: https://github.com/unbit/uwsgi/issues/1126 --- lookyloo.ini | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lookyloo.ini b/lookyloo.ini index 33457ec1..9cca9753 100644 --- a/lookyloo.ini +++ b/lookyloo.ini @@ -9,3 +9,5 @@ chmod-socket = 660 vacuum = true die-on-term = true + +wsgi-disable-file-wrapper = true From 72759850ac9fd4efa878ef54bbb22ecc465f2ed7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Thu, 5 Apr 2018 11:17:26 +0200 Subject: [PATCH 4/7] chg: Use d3v5, cleanup. --- README.md | 2 +- lookyloo/__init__.py | 7 +------ lookyloo/static/tree.js | 17 ++++------------- lookyloo/templates/main.html | 2 +- 4 files changed, 7 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 420ac014..fae9c2a9 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disa ```bash pip install -r requirements.txt pip install -e . -wget https://d3js.org/d3.v4.min.js -O lookyloo/static/d3.v4.min.js +wget https://d3js.org/d3.v5.min.js -O lookyloo/static/d3.v5.min.js wget https://cdn.rawgit.com/eligrey/FileSaver.js/5733e40e5af936eb3f48554cf6a8a7075d71d18a/FileSaver.js -O lookyloo/static/FileSaver.js ``` # Run the app locally diff --git a/lookyloo/__init__.py b/lookyloo/__init__.py index abdc4401..648872b8 100644 --- a/lookyloo/__init__.py +++ b/lookyloo/__init__.py @@ -38,12 +38,6 @@ SPLASH = 'http://127.0.0.1:8050' HAR_DIR.mkdir(parents=True, exist_ok=True) -@app.before_request -def session_management(): - # make the session last indefinitely until it is cleared - session.permanent = True - - def cleanup_old_tmpfiles(): for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'): if time.time() - tmpfile.stat().st_atime > 36000: @@ -145,6 +139,7 @@ def tree(tree_id): @app.route('/', methods=['GET']) def index(): cleanup_old_tmpfiles() + session.clear() i = 0 titles = [] if not HAR_DIR.exists(): diff --git a/lookyloo/static/tree.js b/lookyloo/static/tree.js index 4508cb85..59c56258 100644 --- a/lookyloo/static/tree.js +++ b/lookyloo/static/tree.js @@ -88,17 +88,9 @@ function str2bytes (str) { function urlnode_click(d) { var url = "url/" + d.data.uuid; - var xhr = new XMLHttpRequest(); - xhr.open('GET', url, true); - xhr.responseType = "blob"; - xhr.withCredentials = true; - xhr.onreadystatechange = function (){ - if (xhr.readyState === 4) { - var blob = xhr.response; - saveAs(blob, 'file.zip'); - } - }; - xhr.send(); + d3.blob(url, {credentials: 'same-origin'}).then(function(data) { + saveAs(data, 'file.zip'); + }); }; d3.selection.prototype.moveToFront = function() { @@ -154,9 +146,8 @@ function hostnode_click(d) { // Modal display var url = "/tree/hostname/" + d.data.uuid; - d3.json(url, function(error, urls) { + d3.json(url, {credentials: 'same-origin'}).then(function(urls) { var interval_entries = 40; - if (error) throw error; urls.forEach(function(url, index, array) { var jdata = JSON.parse(url) overlay_hostname.datum({'data': jdata}); diff --git a/lookyloo/templates/main.html b/lookyloo/templates/main.html index 99be2ffd..b65ea613 100644 --- a/lookyloo/templates/main.html +++ b/lookyloo/templates/main.html @@ -3,7 +3,7 @@ {% block scripts %} {{ super() }} - + {% endblock %} {% block head %} From ad530c6ee00cea94a5ece72520202c052d5253e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Thu, 5 Apr 2018 11:22:13 +0200 Subject: [PATCH 5/7] chg: Update docker file for d3v5 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index e486de6b..63f7a77b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,7 @@ WORKDIR lookyloo RUN pip3 install -r requirements.txt RUN pip3 install -e . -RUN wget https://d3js.org/d3.v4.min.js -O lookyloo/static/d3.v4.min.js +RUN wget https://d3js.org/d3.v5.min.js -O lookyloo/static/d3.v5.min.js RUN wget https://cdn.rawgit.com/eligrey/FileSaver.js/5733e40e5af936eb3f48554cf6a8a7075d71d18a/FileSaver.js -O lookyloo/static/FileSaver.js RUN sed -i "s/SPLASH = 'http:\/\/127.0.0.1:8050'/SPLASH = 'http:\/\/splash:8050'/g" lookyloo/__init__.py From d7cf92db21ebbbf3b2c8b3359fc6326fa01312c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Thu, 5 Apr 2018 22:59:45 +0200 Subject: [PATCH 6/7] new: Close button for the overlays --- lookyloo/static/tree.js | 72 ++++++++++++++++++++++++++++------------- 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/lookyloo/static/tree.js b/lookyloo/static/tree.js index 59c56258..df39dd32 100644 --- a/lookyloo/static/tree.js +++ b/lookyloo/static/tree.js @@ -29,11 +29,7 @@ var background = main_svg.append('rect') .attr('y', 0) .attr('width', width) .attr('height', height) - .style('fill', "url(#backstripes)") - .on('click', function(d) { - // Remove the - main_svg.selectAll('.overlay').remove() - }); + .style('fill', "url(#backstripes)"); // append the svg object to the body of the page // appends a 'group' element to 'svg' @@ -78,14 +74,6 @@ function getBB(selection) { }) }; -function str2bytes (str) { - var bytes = new Uint8Array(str.length); - for (var i=0; i Date: Fri, 6 Apr 2018 18:06:41 +0200 Subject: [PATCH 7/7] new: allow to download the URLs list in a text file --- lookyloo/__init__.py | 38 ++++++++++++++----- lookyloo/static/tree.js | 84 +++++++++++++++++++++++++++++++---------- 2 files changed, 93 insertions(+), 29 deletions(-) diff --git a/lookyloo/__init__.py b/lookyloo/__init__.py index 648872b8..090a714f 100644 --- a/lookyloo/__init__.py +++ b/lookyloo/__init__.py @@ -62,7 +62,7 @@ def scrape(): if request.form.get('url'): url = request.form.get('url') if not url.startswith('http'): - url = 'http://{}'.format(url) + url = f'http://{url}' depth = request.form.get('depth') if depth is None: depth = 1 @@ -98,6 +98,24 @@ def get_report_dirs(): return sorted(HAR_DIR.iterdir(), reverse=True) +@app.route('/tree/hostname//text', methods=['GET']) +def hostnode_details_text(node_uuid): + with open(session["tree"], 'rb') as f: + ct = pickle.load(f) + hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid) + urls = [] + for url in hostnode.urls: + urls.append(url.name) + content = '''# URLs + +{} +'''.format('\n'.join(urls)) + to_return = BytesIO(content.encode()) + to_return.seek(0) + return send_file(to_return, mimetype='text/markdown', + as_attachment=True, attachment_filename='file.md') + + @app.route('/tree/hostname/', methods=['GET']) def hostnode_details(node_uuid): with open(session["tree"], 'rb') as f: @@ -114,16 +132,18 @@ def urlnode_details(node_uuid): with open(session["tree"], 'rb') as f: ct = pickle.load(f) urlnode = ct.root_hartree.get_url_node_by_uuid(node_uuid) - to_return = BytesIO() + got_content = False if hasattr(urlnode, 'body'): - with ZipFile(to_return, 'a', ZIP_DEFLATED, False) as zfile: - zfile.writestr(urlnode.filename, urlnode.body.getvalue()) - to_return.seek(0) - # return send_file(urlnode.body, mimetype='application/zip', - # as_attachment=True, attachment_filename='file.zip') - with open('foo.bin', 'wb') as f: - f.write(to_return.getvalue()) + body_content = urlnode.body.getvalue() + if body_content: + got_content = True + with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile: + zfile.writestr(urlnode.filename, urlnode.body.getvalue()) + if not got_content: + with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile: + zfile.writestr('file.txt', b'Response body empty') + to_return.seek(0) return send_file(to_return, mimetype='application/zip', as_attachment=True, attachment_filename='file.zip') diff --git a/lookyloo/static/tree.js b/lookyloo/static/tree.js index df39dd32..f8427ae9 100644 --- a/lookyloo/static/tree.js +++ b/lookyloo/static/tree.js @@ -6,12 +6,19 @@ var margin = {top: 20, right: 200, bottom: 30, left: 90}, height = 10000 - margin.top - margin.bottom; var node_width = 0; +var max_overlay_width = 1500; var node_height = 45; var main_svg = d3.select("body").append("svg") .attr("width", width + margin.right + margin.left) .attr("height", height + margin.top + margin.bottom) +main_svg.append("clipPath") + .attr("id", "textOverlay") + .append("rect") + .attr('width', max_overlay_width - 25) + .attr('height', node_height); + // Add background pattern var pattern = main_svg.append("defs").append('pattern') .attr('id', 'backstripes') @@ -75,7 +82,7 @@ function getBB(selection) { }; function urlnode_click(d) { - var url = "url/" + d.data.uuid; + var url = "tree/url/" + d.data.uuid; d3.blob(url, {credentials: 'same-origin'}).then(function(data) { saveAs(data, 'file.zip'); }); @@ -102,7 +109,9 @@ function hostnode_click(d) { cur_node.append('line') .attr('id', 'overlay_link') - .style("stroke", "black"); + .style("opacity", "0.95") + .attr("stroke-width", "2") + .style("stroke", "gray"); var top_margin = 15; var overlay_header_height = 50; @@ -120,8 +129,8 @@ function hostnode_click(d) { d3.select(this) .attr("transform", "translate(" + d.x + "," + d.y + ")"); cur_node.select('#overlay_link') - .attr("x2", d.x + top_margin) - .attr("y2", d.y + 12); + .attr("x2", d.x + left_margin + 3) + .attr("y2", d.y + top_margin + 7); })); overlay_hostname.append('rect') @@ -139,6 +148,14 @@ function hostnode_click(d) { // Modal display var url = "/tree/hostname/" + d.data.uuid; d3.json(url, {credentials: 'same-origin'}).then(function(urls) { + overlay_hostname + .append('circle') + .attr('id', 'overlay_circle_' + d.data.uuid) + .attr('height', overlay_header_height) + .attr('cx', left_margin + 10) + .attr('cy', top_margin + 15) + .attr('r', 12); + overlay_hostname .append('text') .attr('id', 'overlay_close_' + d.data.uuid) @@ -147,12 +164,7 @@ function hostnode_click(d) { .attr('y', top_margin + 25) .style("font-size", overlay_header_height - 20) .text('\u2716') - .on("mouseover", function(d) { - d3.select(this).style("cursor", "pointer"); - }) - .on("mouseout", function(d) { - d3.select(this).style("cursor", "default"); - }) + .attr('cursor', 'pointer') .on("click", function() { main_svg.selectAll('#overlay_' + d.data.uuid).remove(); cur_node.select('#overlay_link').remove(); @@ -160,7 +172,7 @@ function hostnode_click(d) { ); overlay_hostname.append('line') - .attr('id', 'overlay_separator_' + d.data.uuid) + .attr('id', 'overlay_separator_header' + d.data.uuid) .style("stroke", "gray") .style('stroke-width', 2) .attr('x1', 15) @@ -176,23 +188,53 @@ function hostnode_click(d) { height_text = text_node.node().getBBox().height; icon_list(overlay_hostname, left_margin + 5, top_margin + height_text + overlay_header_height + (interval_entries * index)); }); + overlay_hostname.append('line') + .attr('id', 'overlay_separator_footer' + d.data.uuid) + .style("stroke", "gray") + .style('stroke-width', 2) + .attr('x1', 15) + .attr('y1', overlay_hostname.node().getBBox().height + 15) + .attr('x2', 500) + .attr('y2', overlay_hostname.node().getBBox().height); + + overlay_hostname + .append('text') + .attr('id', 'overlay_download_' + d.data.uuid) + .attr('height', overlay_header_height - 10) + .attr('x', left_margin) + .attr('y', overlay_hostname.node().getBBox().height + 40) + .style("font-size", overlay_header_height - 30) + .text('Download URLs as text') + .attr('cursor', 'pointer') + .on("click", function() { + var url = "/tree/hostname/" + d.data.uuid + '/text'; + d3.blob(url, {credentials: 'same-origin'}).then(function(data) { + saveAs(data, 'file.md'); + }); + }); + overlay_bbox = overlay_hostname.node().getBBox(); overlay_hostname.select('rect') - .attr('width', overlay_bbox.width + left_margin) - .attr('height', overlay_bbox.height + top_margin); + .attr('width', function() { + optimal_size = overlay_bbox.width + left_margin + return optimal_size < max_overlay_width ? optimal_size : max_overlay_width; + }) + .attr('height', overlay_bbox.height + 10); overlay_hostname.select('#overlay_close_' + d.data.uuid) - .attr('x', overlay_bbox.width); + .attr('x', overlay_hostname.select('rect').node().getBBox().width - left_margin); - overlay_hostname.select('#overlay_separator_' + d.data.uuid) - .attr('x2', overlay_bbox.width + left_margin + 15); + overlay_hostname.select('#overlay_separator_header' + d.data.uuid) + .attr('x2', overlay_hostname.select('rect').node().getBBox().width + 14); + overlay_hostname.select('#overlay_separator_footer' + d.data.uuid) + .attr('x2', overlay_hostname.select('rect').node().getBBox().width + 14); cur_node.select('#overlay_link') - .attr("x1", cur_node.x) - .attr("y1", cur_node.y) - .attr("x2", top_margin) - .attr("y2", 12); + .attr("x1", 10) + .attr("y1", 0) + .attr("x2", left_margin + 3) + .attr("y2", top_margin + 7); }); }; @@ -273,6 +315,8 @@ function text_entry(parent_svg, relative_x_pos, relative_y_pos, onclick_callback .style("font-size", "16px") .attr("stroke-width", ".2px") .style("opacity", .9) + .attr('cursor', 'pointer') + .attr("clip-path", "url(#textOverlay)") .text(function(d) { d.data.total_width = 0; // reset total_width to_display = d.data.name