Merge remote-tracking branch 'upstream/master'

pull/20/head
Michael Schaefer 2018-04-08 17:45:36 +02:00
commit 51ed616b48
7 changed files with 193 additions and 81 deletions

View File

@ -8,7 +8,7 @@ WORKDIR lookyloo
RUN pip3 install -r requirements.txt
RUN pip3 install -e .
RUN wget https://d3js.org/d3.v4.min.js -O lookyloo/static/d3.v4.min.js
RUN wget https://d3js.org/d3.v5.min.js -O lookyloo/static/d3.v5.min.js
RUN wget https://cdn.rawgit.com/eligrey/FileSaver.js/5733e40e5af936eb3f48554cf6a8a7075d71d18a/FileSaver.js -O lookyloo/static/FileSaver.js
RUN sed -i "s/SPLASH = 'http:\/\/127.0.0.1:8050'/SPLASH = 'http:\/\/splash:8050'/g" lookyloo/__init__.py

View File

@ -48,7 +48,7 @@ sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash --disable-ui --disa
```bash
pip install -r requirements.txt
pip install -e .
wget https://d3js.org/d3.v4.min.js -O lookyloo/static/d3.v4.min.js
wget https://d3js.org/d3.v5.min.js -O lookyloo/static/d3.v5.min.js
wget https://cdn.rawgit.com/eligrey/FileSaver.js/5733e40e5af936eb3f48554cf6a8a7075d71d18a/FileSaver.js -O lookyloo/static/FileSaver.js
```
# Run the app locally

View File

@ -9,3 +9,5 @@ chmod-socket = 660
vacuum = true
die-on-term = true
wsgi-disable-file-wrapper = true

View File

@ -9,13 +9,12 @@ from scrapysplashwrapper import crawl
from flask import Flask, render_template, request, session, send_file
from flask_bootstrap import Bootstrap
from glob import glob
import os
from datetime import datetime
import pickle
import tempfile
import pathlib
import time
from zipfile import ZipFile, ZIP_DEFLATED
from io import BytesIO
@ -30,26 +29,24 @@ if app.secret_key == 'changeme':
Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = True
HAR_DIR = 'scraped'
HAR_DIR = pathlib.Path('scraped')
SPLASH = 'http://127.0.0.1:8050'
pathlib.Path(HAR_DIR).mkdir(parents=True, exist_ok=True)
HAR_DIR.mkdir(parents=True, exist_ok=True)
@app.before_request
def session_management():
    """Flag the session as permanent before each request is handled.

    The intent is that the session is kept until it is explicitly cleared
    instead of ending with the browser session.
    NOTE(review): the actual lifetime of a permanent session is governed by
    the Flask session configuration, which is not visible here -- confirm.
    """
    session.permanent = True
def cleanup_old_tmpfiles():
    """Delete stale ``lookyloo*`` files from the system temporary directory.

    A file is considered stale when its last access time is more than
    36000 seconds (10 hours) in the past. Entries that are not regular
    files are left alone, and files that disappear while the cleanup is
    running are ignored, so the cleanup never fails because of a directory
    or a concurrent removal.
    """
    max_age = 36000  # seconds, i.e. 10 hours
    now = time.time()
    for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
        try:
            # unlink() raises on directories, so only regular files qualify.
            if not tmpfile.is_file():
                continue
            if now - tmpfile.stat().st_atime > max_age:
                tmpfile.unlink()
        except FileNotFoundError:
            # The file was removed between the listing and stat()/unlink();
            # there is nothing left to clean up for this entry.
            continue
def load_tree(report_dir):
if session.get('tree'):
# TODO delete file
pass
session.clear()
har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har')))
har_files = sorted(report_dir.glob('*.har'))
ct = CrawledTree(har_files)
ct.find_parents()
ct.join_trees()
@ -65,7 +62,7 @@ def scrape():
if request.form.get('url'):
url = request.form.get('url')
if not url.startswith('http'):
url = 'http://{}'.format(url)
url = f'http://{url}'
depth = request.form.get('depth')
if depth is None:
depth = 1
@ -74,17 +71,20 @@ def scrape():
# broken
pass
width = len(str(len(items)))
dirpath = os.path.join(HAR_DIR, datetime.now().isoformat())
os.makedirs(dirpath)
dirpath = HAR_DIR / datetime.now().isoformat()
dirpath.mkdir()
for i, item in enumerate(items):
harfile = item['har']
png = base64.b64decode(item['png'])
child_frames = item['childFrames']
with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f:
html = item['html']
with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
json.dump(harfile, f)
with open(os.path.join(dirpath, '{0:0{width}}.png'.format(i, width=width)), 'wb') as f:
with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
f.write(png)
with open(os.path.join(dirpath, '{0:0{width}}.frames.json'.format(i, width=width)), 'w') as f:
with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
f.write(html)
with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
json.dump(child_frames, f)
return tree(0)
return render_template('scrape.html')
@ -92,10 +92,28 @@ def scrape():
def get_report_dirs():
    """Remove empty report directories and list the entries of ``HAR_DIR``.

    Returns:
        The entries found in ``HAR_DIR`` after the cleanup, as
        ``pathlib.Path`` objects sorted in descending order.
    """
    # Cleanup HAR_DIR of failed runs.
    # Snapshot the listing first so entries are not removed from a
    # directory that is still being iterated.
    for report_dir in list(HAR_DIR.iterdir()):
        # iterdir() returns a generator, which is always truthy; any() is
        # what actually tells whether the directory has at least one entry.
        if report_dir.is_dir() and not any(report_dir.iterdir()):
            report_dir.rmdir()
    return sorted(HAR_DIR.iterdir(), reverse=True)
@app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
def hostnode_details_text(node_uuid):
    """Send the URL names of one host node as a Markdown attachment.

    The crawled tree is unpickled from the file whose path is stored under
    ``session["tree"]``, the host node is looked up by ``node_uuid``, and
    the ``name`` of each of its URLs is written on its own line below a
    ``# URLs`` heading. The result is returned as ``file.md``.
    """
    # NOTE(review): pickle.load can execute arbitrary code from the file it
    # reads; presumably this path only ever points at a file written by
    # this application -- confirm.
    with open(session["tree"], 'rb') as tree_file:
        crawled_tree = pickle.load(tree_file)
    host = crawled_tree.root_hartree.get_host_node_by_uuid(node_uuid)
    url_names = [entry.name for entry in host.urls]
    content = '''# URLs
{}
'''.format('\n'.join(url_names))
    payload = BytesIO(content.encode())
    payload.seek(0)
    return send_file(payload, mimetype='text/markdown',
                     as_attachment=True, attachment_filename='file.md')
@app.route('/tree/hostname/<node_uuid>', methods=['GET'])
@ -114,16 +132,18 @@ def urlnode_details(node_uuid):
with open(session["tree"], 'rb') as f:
ct = pickle.load(f)
urlnode = ct.root_hartree.get_url_node_by_uuid(node_uuid)
to_return = BytesIO()
got_content = False
if hasattr(urlnode, 'body'):
with ZipFile(to_return, 'a', ZIP_DEFLATED, False) as zfile:
zfile.writestr(urlnode.filename, urlnode.body.getvalue())
to_return.seek(0)
# return send_file(urlnode.body, mimetype='application/zip',
# as_attachment=True, attachment_filename='file.zip')
with open('foo.bin', 'wb') as f:
f.write(to_return.getvalue())
body_content = urlnode.body.getvalue()
if body_content:
got_content = True
with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile:
zfile.writestr(urlnode.filename, urlnode.body.getvalue())
if not got_content:
with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile:
zfile.writestr('file.txt', b'Response body empty')
to_return.seek(0)
return send_file(to_return, mimetype='application/zip',
as_attachment=True, attachment_filename='file.zip')
@ -138,12 +158,14 @@ def tree(tree_id):
@app.route('/', methods=['GET'])
def index():
cleanup_old_tmpfiles()
session.clear()
i = 0
titles = []
if not os.path.exists(HAR_DIR):
os.makedirs(HAR_DIR)
if not HAR_DIR.exists():
HAR_DIR.mkdir(parents=True)
for report_dir in get_report_dirs():
har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har')))
har_files = sorted(report_dir.glob('*.har'))
if not har_files:
continue
with open(har_files[0], 'r') as f:

View File

@ -6,12 +6,19 @@ var margin = {top: 20, right: 200, bottom: 30, left: 90},
height = 10000 - margin.top - margin.bottom;
var node_width = 0;
var max_overlay_width = 1500;
var node_height = 45;
var main_svg = d3.select("body").append("svg")
.attr("width", width + margin.right + margin.left)
.attr("height", height + margin.top + margin.bottom)
main_svg.append("clipPath")
.attr("id", "textOverlay")
.append("rect")
.attr('width', max_overlay_width - 25)
.attr('height', node_height);
// Add background pattern
var pattern = main_svg.append("defs").append('pattern')
.attr('id', 'backstripes')
@ -29,11 +36,7 @@ var background = main_svg.append('rect')
.attr('y', 0)
.attr('width', width)
.attr('height', height)
.style('fill', "url(#backstripes)")
.on('click', function(d) {
// Remove the
main_svg.selectAll('.overlay').remove()
});
.style('fill', "url(#backstripes)");
// append the svg object to the body of the page
// appends a 'group' element to 'svg'
@ -78,27 +81,11 @@ function getBB(selection) {
})
};
/**
 * Copy a string into a Uint8Array, one element per UTF-16 code unit.
 *
 * Each element receives the value of charCodeAt() for its index; because
 * the target is a Uint8Array, only the low 8 bits of each value are kept.
 */
function str2bytes (str) {
    var length = str.length;
    var bytes = new Uint8Array(length);
    var index = 0;
    while (index < length) {
        bytes[index] = str.charCodeAt(index);
        index += 1;
    }
    return bytes;
}
/**
 * Download the content served for a URL node and save it as 'file.zip'.
 *
 * A single request is made through d3.blob (session cookie included via
 * same-origin credentials) and the resulting blob is handed to saveAs.
 *
 * @param d  Node datum; d.data.uuid identifies the URL node to download.
 */
function urlnode_click(d) {
    // Absolute path, like the "/tree/hostname/" requests in this file, so
    // the request resolves the same way whatever the current page path is.
    // NOTE(review): assumes the server route is "/tree/url/<uuid>" -- confirm.
    var url = "/tree/url/" + d.data.uuid;
    d3.blob(url, {credentials: 'same-origin'}).then(function(data) {
        saveAs(data, 'file.zip');
    });
}
d3.selection.prototype.moveToFront = function() {
@ -107,6 +94,9 @@ d3.selection.prototype.moveToFront = function() {
});
};
// What happens when clicking on a domain (loads a modal display)
function hostnode_click(d) {
// Move the node to the front (end of the list)
@ -118,14 +108,17 @@ function hostnode_click(d) {
.attr('class', 'overlay');
cur_node.append('line')
.attr('class', 'overlay')
.style("stroke", "black");
.attr('id', 'overlay_link')
.style("opacity", "0.95")
.attr("stroke-width", "2")
.style("stroke", "gray");
var top_margin = 15;
var overlay_header_height = 50;
var left_margin = 30;
overlay_hostname
.datum({x: 0, y: 0})
.datum({x: 0, y: 0, overlay_uuid: d.data.uuid})
.attr('id', 'overlay_' + d.data.uuid)
.attr("transform", "translate(" + 0 + "," + 0 + ")")
.call(d3.drag().on("drag", function(d, i) {
@ -135,9 +128,9 @@ function hostnode_click(d) {
d.y += d3.event.dy
d3.select(this)
.attr("transform", "translate(" + d.x + "," + d.y + ")");
cur_node.select('line')
.attr("x2", d.x + top_margin)
.attr("y2", d.y + left_margin);
cur_node.select('#overlay_link')
.attr("x2", d.x + left_margin + 3)
.attr("y2", d.y + top_margin + 7);
}));
overlay_hostname.append('rect')
@ -154,26 +147,94 @@ function hostnode_click(d) {
// Modal display
var url = "/tree/hostname/" + d.data.uuid;
d3.json(url, function(error, urls) {
d3.json(url, {credentials: 'same-origin'}).then(function(urls) {
overlay_hostname
.append('circle')
.attr('id', 'overlay_circle_' + d.data.uuid)
.attr('height', overlay_header_height)
.attr('cx', left_margin + 10)
.attr('cy', top_margin + 15)
.attr('r', 12);
overlay_hostname
.append('text')
.attr('id', 'overlay_close_' + d.data.uuid)
.attr('height', overlay_header_height)
.attr('x', left_margin + 500)
.attr('y', top_margin + 25)
.style("font-size", overlay_header_height - 20)
.text('\u2716')
.attr('cursor', 'pointer')
.on("click", function() {
main_svg.selectAll('#overlay_' + d.data.uuid).remove();
cur_node.select('#overlay_link').remove();
}
);
overlay_hostname.append('line')
.attr('id', 'overlay_separator_header' + d.data.uuid)
.style("stroke", "gray")
.style('stroke-width', 2)
.attr('x1', 15)
.attr('y1', overlay_header_height)
.attr('x2', 500)
.attr('y2', overlay_header_height);
var interval_entries = 40;
if (error) throw error;
urls.forEach(function(url, index, array) {
var jdata = JSON.parse(url)
overlay_hostname.datum({'data': jdata});
var text_node = text_entry(overlay_hostname, left_margin, top_margin + (interval_entries * index), urlnode_click);
var text_node = text_entry(overlay_hostname, left_margin, top_margin + overlay_header_height + (interval_entries * index), urlnode_click);
height_text = text_node.node().getBBox().height;
icon_list(overlay_hostname, left_margin + 5, top_margin + height_text + (interval_entries * index));
icon_list(overlay_hostname, left_margin + 5, top_margin + height_text + overlay_header_height + (interval_entries * index));
});
overlay_hostname.append('line')
.attr('id', 'overlay_separator_footer' + d.data.uuid)
.style("stroke", "gray")
.style('stroke-width', 2)
.attr('x1', 15)
.attr('y1', overlay_hostname.node().getBBox().height + 15)
.attr('x2', 500)
.attr('y2', overlay_hostname.node().getBBox().height);
overlay_hostname
.append('text')
.attr('id', 'overlay_download_' + d.data.uuid)
.attr('height', overlay_header_height - 10)
.attr('x', left_margin)
.attr('y', overlay_hostname.node().getBBox().height + 40)
.style("font-size", overlay_header_height - 30)
.text('Download URLs as text')
.attr('cursor', 'pointer')
.on("click", function() {
var url = "/tree/hostname/" + d.data.uuid + '/text';
d3.blob(url, {credentials: 'same-origin'}).then(function(data) {
saveAs(data, 'file.md');
});
});
overlay_bbox = overlay_hostname.node().getBBox();
overlay_hostname.select('rect')
.attr('width', overlay_bbox.width + left_margin)
.attr('height', overlay_bbox.height + top_margin);
.attr('width', function() {
optimal_size = overlay_bbox.width + left_margin
return optimal_size < max_overlay_width ? optimal_size : max_overlay_width;
})
.attr('height', overlay_bbox.height + 10);
cur_node.select('line')
.attr("x1", cur_node.x)
.attr("y1", cur_node.y)
.attr("x2", top_margin)
.attr("y2", left_margin);
overlay_hostname.select('#overlay_close_' + d.data.uuid)
.attr('x', overlay_hostname.select('rect').node().getBBox().width - left_margin);
overlay_hostname.select('#overlay_separator_header' + d.data.uuid)
.attr('x2', overlay_hostname.select('rect').node().getBBox().width + 14);
overlay_hostname.select('#overlay_separator_footer' + d.data.uuid)
.attr('x2', overlay_hostname.select('rect').node().getBBox().width + 14);
cur_node.select('#overlay_link')
.attr("x1", 10)
.attr("y1", 0)
.attr("x2", left_margin + 3)
.attr("y2", top_margin + 7);
});
};
@ -243,7 +304,7 @@ function text_entry(parent_svg, relative_x_pos, relative_y_pos, onclick_callback
// Avoid hiding the content after the circle
var nodeContent = parent_svg
.append('svg')
.attr('height',node_height)
.attr('height', node_height)
.attr('x', relative_x_pos)
.attr('y', relative_y_pos);
@ -254,6 +315,8 @@ function text_entry(parent_svg, relative_x_pos, relative_y_pos, onclick_callback
.style("font-size", "16px")
.attr("stroke-width", ".2px")
.style("opacity", .9)
.attr('cursor', 'pointer')
.attr("clip-path", "url(#textOverlay)")
.text(function(d) {
d.data.total_width = 0; // reset total_width
to_display = d.data.name

View File

@ -3,7 +3,7 @@
{% block scripts %}
{{ super() }}
<script src='{{ url_for('static', filename='FileSaver.js') }}'></script>
<script src='{{ url_for('static', filename='d3.v4.min.js') }}'></script>
<script src='{{ url_for('static', filename='d3.v5.min.js') }}'></script>
{% endblock %}
{% block head %}

View File

@ -27,8 +27,33 @@
<img src="{{ url_for('static', filename='redirect.png') }}"
alt="Redirect" height="20" width="20"> Redirect<br>
<img src="{{ url_for('static', filename='cookie_in_url.png') }}"
alt="Cookie in URL" height="20" width="20"> Cookie in URL<br>
<img src="{{ url_for('static', filename='font.png') }}"
alt="Font" height="20" width="20"> Font<br>
<img src="{{ url_for('static', filename='html.png') }}"
alt="HTML" height="20" width="20"> HTML<br>
<img src="{{ url_for('static', filename='json.png') }}"
alt="JSON" height="20" width="20"> JSON<br>
<img src="{{ url_for('static', filename='css.png') }}"
alt="CSS" height="20" width="20"> CSS<br>
<img src="{{ url_for('static', filename='exe.png') }}"
alt="EXE" height="20" width="20"> EXE<br>
<img src="{{ url_for('static', filename='img.png') }}"
alt="Image" height="20" width="20"> Image<br>
<img src="{{ url_for('static', filename='video.png') }}"
alt="Video" height="20" width="20"> Video<br>
<img src="{{ url_for('static', filename='ifr.png') }}"
alt="iFrame" height="20" width="20"> iFrame<br>
<img src="{{ url_for('static', filename='wtf.png') }}"
alt="Content type not set/unknown" height="20" width="20"> Content type not set/unknown<br>
</div>
<div id=tree-details><center><b>Tree details</b></center><br>
<b>Root URL</b>: {{ root_url }}<br>