Merge remote-tracking branch 'upstream/master'

pull/20/head
Michael Schaefer 2018-04-08 17:45:36 +02:00
commit 51ed616b48
7 changed files with 193 additions and 81 deletions

View File

@@ -8,7 +8,7 @@ WORKDIR lookyloo
 RUN pip3 install -r requirements.txt
 RUN pip3 install -e .
-RUN wget https://d3js.org/d3.v4.min.js -O lookyloo/static/d3.v4.min.js
+RUN wget https://d3js.org/d3.v5.min.js -O lookyloo/static/d3.v5.min.js
 RUN wget https://cdn.rawgit.com/eligrey/FileSaver.js/5733e40e5af936eb3f48554cf6a8a7075d71d18a/FileSaver.js -O lookyloo/static/FileSaver.js
 RUN sed -i "s/SPLASH = 'http:\/\/127.0.0.1:8050'/SPLASH = 'http:\/\/splash:8050'/g" lookyloo/__init__.py

View File

@@ -48,7 +48,7 @@ sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disa
 ```bash
 pip install -r requirements.txt
 pip install -e .
-wget https://d3js.org/d3.v4.min.js -O lookyloo/static/d3.v4.min.js
+wget https://d3js.org/d3.v5.min.js -O lookyloo/static/d3.v5.min.js
 wget https://cdn.rawgit.com/eligrey/FileSaver.js/5733e40e5af936eb3f48554cf6a8a7075d71d18a/FileSaver.js -O lookyloo/static/FileSaver.js
 ```
 # Run the app locally

View File

@ -9,3 +9,5 @@ chmod-socket = 660
vacuum = true vacuum = true
die-on-term = true die-on-term = true
wsgi-disable-file-wrapper = true
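
The added `wsgi-disable-file-wrapper = true` is presumably needed because the app now serves downloads built entirely in memory (`BytesIO` objects handed to Flask's `send_file`, see the zip and markdown exports further down): uWSGI's `wsgi.file_wrapper` optimization expects a real file object with a usable `fileno()`, which a `BytesIO` does not provide, so disabling it keeps those downloads from failing. A minimal sketch of the pattern this setting accommodates; the route name and payload are illustrative, not taken from the diff:

```python
# Minimal sketch (illustrative route/payload): serving an in-memory file with
# Flask's send_file. Under uWSGI this is the pattern that
# 'wsgi-disable-file-wrapper = true' keeps working, since wsgi.file_wrapper
# cannot use a file descriptor that BytesIO does not have.
from io import BytesIO
from flask import Flask, send_file

app = Flask(__name__)

@app.route('/demo-download')
def demo_download():
    buf = BytesIO(b'hello from an in-memory file')
    buf.seek(0)
    # Flask 1.x signature (attachment_filename), matching the code in this commit.
    return send_file(buf, mimetype='text/plain',
                     as_attachment=True, attachment_filename='demo.txt')
```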

View File

@@ -9,13 +9,12 @@ from scrapysplashwrapper import crawl
 from flask import Flask, render_template, request, session, send_file
 from flask_bootstrap import Bootstrap
-from glob import glob
-import os
 from datetime import datetime
 import pickle
 import tempfile
 import pathlib
+import time
 from zipfile import ZipFile, ZIP_DEFLATED
 from io import BytesIO
@@ -30,26 +29,24 @@ if app.secret_key == 'changeme':
 Bootstrap(app)
 app.config['BOOTSTRAP_SERVE_LOCAL'] = True
+app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
 app.debug = True

-HAR_DIR = 'scraped'
+HAR_DIR = pathlib.Path('scraped')
 SPLASH = 'http://127.0.0.1:8050'

-pathlib.Path(HAR_DIR).mkdir(parents=True, exist_ok=True)
+HAR_DIR.mkdir(parents=True, exist_ok=True)

-@app.before_request
-def session_management():
-    # make the session last indefinitely until it is cleared
-    session.permanent = True
+def cleanup_old_tmpfiles():
+    for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
+        if time.time() - tmpfile.stat().st_atime > 36000:
+            tmpfile.unlink()


 def load_tree(report_dir):
-    if session.get('tree'):
-        # TODO delete file
-        pass
     session.clear()
-    har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har')))
+    har_files = sorted(report_dir.glob('*.har'))
     ct = CrawledTree(har_files)
     ct.find_parents()
     ct.join_trees()
@@ -65,7 +62,7 @@ def scrape():
     if request.form.get('url'):
         url = request.form.get('url')
         if not url.startswith('http'):
-            url = 'http://{}'.format(url)
+            url = f'http://{url}'
         depth = request.form.get('depth')
         if depth is None:
             depth = 1
@@ -74,17 +71,20 @@ def scrape():
             # broken
             pass
         width = len(str(len(items)))
-        dirpath = os.path.join(HAR_DIR, datetime.now().isoformat())
-        os.makedirs(dirpath)
+        dirpath = HAR_DIR / datetime.now().isoformat()
+        dirpath.mkdir()
         for i, item in enumerate(items):
             harfile = item['har']
             png = base64.b64decode(item['png'])
             child_frames = item['childFrames']
-            with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f:
+            html = item['html']
+            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
                 json.dump(harfile, f)
-            with open(os.path.join(dirpath, '{0:0{width}}.png'.format(i, width=width)), 'wb') as f:
+            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
                 f.write(png)
-            with open(os.path.join(dirpath, '{0:0{width}}.frames.json'.format(i, width=width)), 'w') as f:
+            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
+                f.write(html)
+            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
                 json.dump(child_frames, f)
         return tree(0)
     return render_template('scrape.html')
@@ -92,10 +92,28 @@ def scrape():
 def get_report_dirs():
     # Cleanup HAR_DIR of failed runs.
-    for report_dir in os.listdir(HAR_DIR):
-        if not os.listdir(os.path.join(HAR_DIR, report_dir)):
-            os.rmdir(os.path.join(HAR_DIR, report_dir))
-    return sorted(os.listdir(HAR_DIR), reverse=True)
+    for report_dir in HAR_DIR.iterdir():
+        if report_dir.is_dir() and not report_dir.iterdir():
+            report_dir.rmdir()
+    return sorted(HAR_DIR.iterdir(), reverse=True)
+
+
+@app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
+def hostnode_details_text(node_uuid):
+    with open(session["tree"], 'rb') as f:
+        ct = pickle.load(f)
+    hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
+    urls = []
+    for url in hostnode.urls:
+        urls.append(url.name)
+    content = '''# URLs
+{}
+'''.format('\n'.join(urls))
+    to_return = BytesIO(content.encode())
+    to_return.seek(0)
+    return send_file(to_return, mimetype='text/markdown',
+                     as_attachment=True, attachment_filename='file.md')
+
+
 @app.route('/tree/hostname/<node_uuid>', methods=['GET'])
@@ -114,16 +132,18 @@ def urlnode_details(node_uuid):
     with open(session["tree"], 'rb') as f:
         ct = pickle.load(f)
     urlnode = ct.root_hartree.get_url_node_by_uuid(node_uuid)
     to_return = BytesIO()
+    got_content = False
     if hasattr(urlnode, 'body'):
-        with ZipFile(to_return, 'a', ZIP_DEFLATED, False) as zfile:
-            zfile.writestr(urlnode.filename, urlnode.body.getvalue())
-        to_return.seek(0)
-        # return send_file(urlnode.body, mimetype='application/zip',
-        #                  as_attachment=True, attachment_filename='file.zip')
-        with open('foo.bin', 'wb') as f:
-            f.write(to_return.getvalue())
+        body_content = urlnode.body.getvalue()
+        if body_content:
+            got_content = True
+            with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile:
+                zfile.writestr(urlnode.filename, urlnode.body.getvalue())
+    if not got_content:
+        with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile:
+            zfile.writestr('file.txt', b'Response body empty')
+    to_return.seek(0)
     return send_file(to_return, mimetype='application/zip',
                      as_attachment=True, attachment_filename='file.zip')
@@ -138,12 +158,14 @@ def tree(tree_id):
 @app.route('/', methods=['GET'])
 def index():
+    cleanup_old_tmpfiles()
+    session.clear()
     i = 0
     titles = []
-    if not os.path.exists(HAR_DIR):
-        os.makedirs(HAR_DIR)
+    if not HAR_DIR.exists():
+        HAR_DIR.mkdir(parents=True)
     for report_dir in get_report_dirs():
-        har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har')))
+        har_files = sorted(report_dir.glob('*.har'))
         if not har_files:
             continue
         with open(har_files[0], 'r') as f:
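
Most of the Python-side churn above is a migration from `os.path`/`glob` string handling to `pathlib.Path` objects: `HAR_DIR` becomes a `Path`, directories are joined with `/`, files are opened via `.open()`, and listings come from `.glob()`/`.iterdir()`. As a quick reference, here is a self-contained sketch of those idioms; the directory and file names are illustrative, not part of the commit:

```python
# pathlib equivalents of the os.path/glob calls removed above (illustrative names).
import pathlib
from datetime import datetime

HAR_DIR = pathlib.Path('scraped')
HAR_DIR.mkdir(parents=True, exist_ok=True)        # create the base directory if missing

dirpath = HAR_DIR / datetime.now().isoformat()    # '/' joins paths, replacing os.path.join(...)
dirpath.mkdir()                                   # replaces os.makedirs(dirpath)

with (dirpath / '00.har').open('w') as f:         # replaces open(os.path.join(dirpath, '00.har'), 'w')
    f.write('{}')

har_files = sorted(dirpath.glob('*.har'))         # replaces sorted(glob(os.path.join(dirpath, '*.har')))
print(har_files)
```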

View File

@@ -6,12 +6,19 @@ var margin = {top: 20, right: 200, bottom: 30, left: 90},
     height = 10000 - margin.top - margin.bottom;

 var node_width = 0;
+var max_overlay_width = 1500;
 var node_height = 45;

 var main_svg = d3.select("body").append("svg")
     .attr("width", width + margin.right + margin.left)
     .attr("height", height + margin.top + margin.bottom)

+main_svg.append("clipPath")
+    .attr("id", "textOverlay")
+    .append("rect")
+    .attr('width', max_overlay_width - 25)
+    .attr('height', node_height);
+
 // Add background pattern
 var pattern = main_svg.append("defs").append('pattern')
     .attr('id', 'backstripes')
@@ -29,11 +36,7 @@ var background = main_svg.append('rect')
     .attr('y', 0)
     .attr('width', width)
     .attr('height', height)
-    .style('fill', "url(#backstripes)")
-    .on('click', function(d) {
-        // Remove the
-        main_svg.selectAll('.overlay').remove()
-    });
+    .style('fill', "url(#backstripes)");

 // append the svg object to the body of the page
 // appends a 'group' element to 'svg'
@@ -78,27 +81,11 @@ function getBB(selection) {
     })
 };

-function str2bytes (str) {
-    var bytes = new Uint8Array(str.length);
-    for (var i=0; i<str.length; i++) {
-        bytes[i] = str.charCodeAt(i);
-    }
-    return bytes;
-}
-
 function urlnode_click(d) {
-    var url = "url/" + d.data.uuid;
-    var xhr = new XMLHttpRequest();
-    xhr.open('GET', url, true);
-    xhr.responseType = "blob";
-    xhr.withCredentials = true;
-    xhr.onreadystatechange = function (){
-        if (xhr.readyState === 4) {
-            var blob = xhr.response;
-            saveAs(blob, 'file.zip');
-        }
-    };
-    xhr.send();
+    var url = "tree/url/" + d.data.uuid;
+    d3.blob(url, {credentials: 'same-origin'}).then(function(data) {
+        saveAs(data, 'file.zip');
+    });
 };

 d3.selection.prototype.moveToFront = function() {
@@ -107,6 +94,9 @@ d3.selection.prototype.moveToFront = function() {
     });
 };

 // What happen when clicking on a domain (load a modal display)
 function hostnode_click(d) {
     // Move the node to the front (end of the list)
@@ -118,14 +108,17 @@ function hostnode_click(d) {
         .attr('class', 'overlay');

     cur_node.append('line')
-        .attr('class', 'overlay')
-        .style("stroke", "black");
+        .attr('id', 'overlay_link')
+        .style("opacity", "0.95")
+        .attr("stroke-width", "2")
+        .style("stroke", "gray");

     var top_margin = 15;
+    var overlay_header_height = 50;
     var left_margin = 30;

     overlay_hostname
-        .datum({x: 0, y: 0})
+        .datum({x: 0, y: 0, overlay_uuid: d.data.uuid})
         .attr('id', 'overlay_' + d.data.uuid)
         .attr("transform", "translate(" + 0 + "," + 0 + ")")
         .call(d3.drag().on("drag", function(d, i) {
@@ -135,9 +128,9 @@ function hostnode_click(d) {
             d.y += d3.event.dy
             d3.select(this)
                 .attr("transform", "translate(" + d.x + "," + d.y + ")");
-            cur_node.select('line')
-                .attr("x2", d.x + top_margin)
-                .attr("y2", d.y + left_margin);
+            cur_node.select('#overlay_link')
+                .attr("x2", d.x + left_margin + 3)
+                .attr("y2", d.y + top_margin + 7);
         }));

     overlay_hostname.append('rect')
@@ -154,26 +147,94 @@ function hostnode_click(d) {
     // Modal display
     var url = "/tree/hostname/" + d.data.uuid;
-    d3.json(url, function(error, urls) {
+    d3.json(url, {credentials: 'same-origin'}).then(function(urls) {
+
+        overlay_hostname
+            .append('circle')
+            .attr('id', 'overlay_circle_' + d.data.uuid)
+            .attr('height', overlay_header_height)
+            .attr('cx', left_margin + 10)
+            .attr('cy', top_margin + 15)
+            .attr('r', 12);
+
+        overlay_hostname
+            .append('text')
+            .attr('id', 'overlay_close_' + d.data.uuid)
+            .attr('height', overlay_header_height)
+            .attr('x', left_margin + 500)
+            .attr('y', top_margin + 25)
+            .style("font-size", overlay_header_height - 20)
+            .text('\u2716')
+            .attr('cursor', 'pointer')
+            .on("click", function() {
+                main_svg.selectAll('#overlay_' + d.data.uuid).remove();
+                cur_node.select('#overlay_link').remove();
+            }
+        );
+
+        overlay_hostname.append('line')
+            .attr('id', 'overlay_separator_header' + d.data.uuid)
+            .style("stroke", "gray")
+            .style('stroke-width', 2)
+            .attr('x1', 15)
+            .attr('y1', overlay_header_height)
+            .attr('x2', 500)
+            .attr('y2', overlay_header_height);
+
         var interval_entries = 40;
-        if (error) throw error;
         urls.forEach(function(url, index, array) {
             var jdata = JSON.parse(url)
             overlay_hostname.datum({'data': jdata});
-            var text_node = text_entry(overlay_hostname, left_margin, top_margin + (interval_entries * index), urlnode_click);
+            var text_node = text_entry(overlay_hostname, left_margin, top_margin + overlay_header_height + (interval_entries * index), urlnode_click);
             height_text = text_node.node().getBBox().height;
-            icon_list(overlay_hostname, left_margin + 5, top_margin + height_text + (interval_entries * index));
+            icon_list(overlay_hostname, left_margin + 5, top_margin + height_text + overlay_header_height + (interval_entries * index));
         });
+
+        overlay_hostname.append('line')
+            .attr('id', 'overlay_separator_footer' + d.data.uuid)
+            .style("stroke", "gray")
+            .style('stroke-width', 2)
+            .attr('x1', 15)
+            .attr('y1', overlay_hostname.node().getBBox().height + 15)
+            .attr('x2', 500)
+            .attr('y2', overlay_hostname.node().getBBox().height);
+
+        overlay_hostname
+            .append('text')
+            .attr('id', 'overlay_download_' + d.data.uuid)
+            .attr('height', overlay_header_height - 10)
+            .attr('x', left_margin)
+            .attr('y', overlay_hostname.node().getBBox().height + 40)
+            .style("font-size", overlay_header_height - 30)
+            .text('Download URLs as text')
+            .attr('cursor', 'pointer')
+            .on("click", function() {
+                var url = "/tree/hostname/" + d.data.uuid + '/text';
+                d3.blob(url, {credentials: 'same-origin'}).then(function(data) {
+                    saveAs(data, 'file.md');
+                });
+            });
+
         overlay_bbox = overlay_hostname.node().getBBox();
         overlay_hostname.select('rect')
-            .attr('width', overlay_bbox.width + left_margin)
-            .attr('height', overlay_bbox.height + top_margin);
+            .attr('width', function() {
+                optimal_size = overlay_bbox.width + left_margin
+                return optimal_size < max_overlay_width ? optimal_size : max_overlay_width;
+            })
+            .attr('height', overlay_bbox.height + 10);

-        cur_node.select('line')
-            .attr("x1", cur_node.x)
-            .attr("y1", cur_node.y)
-            .attr("x2", top_margin)
-            .attr("y2", left_margin);
+        overlay_hostname.select('#overlay_close_' + d.data.uuid)
+            .attr('x', overlay_hostname.select('rect').node().getBBox().width - left_margin);
+
+        overlay_hostname.select('#overlay_separator_header' + d.data.uuid)
+            .attr('x2', overlay_hostname.select('rect').node().getBBox().width + 14);
+
+        overlay_hostname.select('#overlay_separator_footer' + d.data.uuid)
+            .attr('x2', overlay_hostname.select('rect').node().getBBox().width + 14);
+
+        cur_node.select('#overlay_link')
+            .attr("x1", 10)
+            .attr("y1", 0)
+            .attr("x2", left_margin + 3)
+            .attr("y2", top_margin + 7);
     });
 };
@@ -243,7 +304,7 @@ function text_entry(parent_svg, relative_x_pos, relative_y_pos, onclick_callback
     // Avoid hiding the content after the circle
     var nodeContent = parent_svg
         .append('svg')
-        .attr('height',node_height)
+        .attr('height', node_height)
         .attr('x', relative_x_pos)
         .attr('y', relative_y_pos);
@@ -254,6 +315,8 @@ function text_entry(parent_svg, relative_x_pos, relative_y_pos, onclick_callback
         .style("font-size", "16px")
         .attr("stroke-width", ".2px")
         .style("opacity", .9)
+        .attr('cursor', 'pointer')
+        .attr("clip-path", "url(#textOverlay)")
         .text(function(d) {
             d.data.total_width = 0; // reset total_width
             to_display = d.data.name

View File

@@ -3,7 +3,7 @@
 {% block scripts %}
     {{ super() }}
     <script src='{{ url_for('static', filename='FileSaver.js') }}'></script>
-    <script src='{{ url_for('static', filename='d3.v4.min.js') }}'></script>
+    <script src='{{ url_for('static', filename='d3.v5.min.js') }}'></script>
 {% endblock %}

 {% block head %}

View File

@@ -27,8 +27,33 @@
       <img src="{{ url_for('static', filename='redirect.png') }}"
         alt="Redirect" height="20" width="20"> Redirect</br>
-      <img src="{{ url_for('static', filename='cookie_in_url.png') }}"
-        alt="Cookie in URL" height="20" width="20"> Cookie in URL</br>
+      <img src="{{ url_for('static', filename='font.png') }}"
+        alt="Font" height="20" width="20"> Font</br>
+      <img src="{{ url_for('static', filename='html.png') }}"
+        alt="HTML" height="20" width="20"> HTML</br>
+      <img src="{{ url_for('static', filename='json.png') }}"
+        alt="JSON" height="20" width="20"> JSON</br>
+      <img src="{{ url_for('static', filename='css.png') }}"
+        alt="CSS" height="20" width="20"> CSS</br>
+      <img src="{{ url_for('static', filename='exe.png') }}"
+        alt="EXE" height="20" width="20"> EXE</br>
+      <img src="{{ url_for('static', filename='img.png') }}"
+        alt="Image" height="20" width="20"> Image</br>
+      <img src="{{ url_for('static', filename='video.png') }}"
+        alt="Video" height="20" width="20"> Video</br>
+      <img src="{{ url_for('static', filename='ifr.png') }}"
+        alt="iFrame" height="20" width="20"> iFrame</br>
+      <img src="{{ url_for('static', filename='wtf.png') }}"
+        alt="Content type not set/unknown" height="20" width="20"> Content type not set/unknown</br>
     </div>
     <div id=tree-details><center><b>Tree details</b></center></br>
       <b>Root URL</b>: {{ root_url }}</br>
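
The expanded legend above lists one icon per resource type shown in the tree (font, HTML, JSON, CSS, executable, image, video, iframe, and an "unknown" fallback). Purely as an illustration of what those categories correspond to, here is a hypothetical sketch mapping a response's Content-Type onto the icon filenames referenced in the template; this is not lookyloo's actual classification code, which lives in the tree-building logic rather than in this template:

```python
# Hypothetical mapping from a Content-Type header to the legend icon filenames
# used above. Illustrative only; not taken from the lookyloo codebase.
# (redirect.png and ifr.png are structural markers, not content types, so they
# are not covered by this function.)
def icon_for(content_type: str) -> str:
    ct = (content_type or '').lower()
    if 'font' in ct:
        return 'font.png'
    if 'html' in ct:
        return 'html.png'
    if 'json' in ct:
        return 'json.png'
    if 'css' in ct:
        return 'css.png'
    if 'octet-stream' in ct or 'msdownload' in ct:
        return 'exe.png'
    if ct.startswith('image/'):
        return 'img.png'
    if ct.startswith('video/'):
        return 'video.png'
    return 'wtf.png'  # content type not set / unknown


print(icon_for('text/html; charset=utf-8'))  # html.png
print(icon_for(''))                          # wtf.png
```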