mirror of https://github.com/CIRCL/lookyloo
chg: Assign a UUID for each scraped page, allow permalinks
parent a2c4d871fe
commit e14a0150a0

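Identified by content, the hunks below touch the ignore list, the Pipfile.lock pin of ScrapySplashWrapper, the Flask application, and two Jinja templates. The core change: each scrape now writes a `uuid` file into its report directory, and the `/tree/<uuid>` routes resolve that UUID back to a directory, so a report URL stays valid as new scrapes arrive. A minimal, self-contained sketch of that mechanism, with illustrative names and a temp directory standing in for the real HAR_DIR:

# Sketch: one directory per capture, each holding a 'uuid' file;
# permalinks resolve UUID -> directory by scanning those files.
from pathlib import Path
from uuid import uuid4
import tempfile

HAR_DIR = Path(tempfile.mkdtemp())          # stand-in for the real HAR_DIR

def new_capture(dirname: str) -> str:
    report_dir = HAR_DIR / dirname
    report_dir.mkdir()
    perma_uuid = str(uuid4())
    (report_dir / 'uuid').write_text(perma_uuid)
    return perma_uuid

def lookup(perma_uuid: str) -> Path:
    # Equivalent of get_lookup_dirs() below: UUID -> report directory.
    dirs = {(d / 'uuid').read_text().strip(): d
            for d in HAR_DIR.iterdir() if (d / 'uuid').exists()}
    return dirs[perma_uuid]

u = new_capture('2018-04-12T00:00:00')
assert lookup(u).name == '2018-04-12T00:00:00'
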
@@ -110,3 +110,4 @@ ENV/
+secret_key
 FileSaver.js
 d3.v5.min.js
 d3.v5.js

@@ -447,7 +447,7 @@
         "scrapysplashwrapper": {
             "editable": true,
             "git": "https://github.com/viper-framework/ScrapySplashWrapper.git",
-            "ref": "e6d9da1a971011b2925800b168d762e7f86b2b29"
+            "ref": "d0e129f9962098603f6686c8152030f7d31a604a"
         },
         "service-identity": {
             "hashes": [

@@ -6,7 +6,7 @@ import json
 from har2tree import CrawledTree
 from scrapysplashwrapper import crawl
 
-from flask import Flask, render_template, request, session, send_file
+from flask import Flask, render_template, request, session, send_file, redirect, url_for
 from flask_bootstrap import Bootstrap
 
 from datetime import datetime

@@ -19,8 +19,8 @@ import time
 from zipfile import ZipFile, ZIP_DEFLATED
 from io import BytesIO
 import base64
-import socket
 import os
+from uuid import uuid4
 
 from pysanejs import SaneJS
 

@@ -32,10 +32,10 @@ app = Flask(__name__)
 secret_file_path = get_homedir() / 'secret_key'
 
 if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
-    with open(secret_file_path, 'wb') as f:
+    with secret_file_path.open('wb') as f:
         f.write(os.urandom(64))
 
-with open(secret_file_path, 'rb') as f:
+with secret_file_path.open('rb') as f:
     app.config['SECRET_KEY'] = f.read()
 
 Bootstrap(app)

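The secret-key hunk above is the usual persist-once pattern: Flask signs session cookies with SECRET_KEY, so regenerating the key on every start would invalidate all live sessions. A small sketch of the same idea, under an assumed demo path rather than the app's own:

# Persist 64 CSPRNG bytes once and reuse them on every restart.
import os
from pathlib import Path

def load_or_create_secret(path: Path, size: int = 64) -> bytes:
    if not path.exists() or path.stat().st_size < size:
        path.write_bytes(os.urandom(size))  # created exactly once
    return path.read_bytes()

demo = Path('/tmp/secret_key_demo')          # assumed path, not the app's
assert load_or_create_secret(demo) == load_or_create_secret(demo)
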
@@ -49,18 +49,6 @@ HAR_DIR.mkdir(parents=True, exist_ok=True)
 SPLASH = 'http://127.0.0.1:8050'
 SANE_JS = 'http://127.0.0.1:5007'
 
-
-def is_open(ip, port):
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.settimeout(2)
-    try:
-        s.connect((ip, int(port)))
-        s.shutdown(2)
-        return True
-    except Exception:
-        return False
-
-
 if SANE_JS:
     sanejs = SaneJS(SANE_JS)
     if sanejs.is_up:

@@ -69,6 +57,27 @@ if SANE_JS:
     has_sane_js = False
 
 
+def get_report_dirs():
+    # Cleanup HAR_DIR of failed runs.
+    for report_dir in HAR_DIR.iterdir():
+        if report_dir.is_dir() and not report_dir.iterdir():
+            report_dir.rmdir()
+        if not (report_dir / 'uuid').exists():
+            # Create uuid if missing
+            with (report_dir / 'uuid').open('w') as f:
+                f.write(str(uuid4()))
+    return sorted(HAR_DIR.iterdir(), reverse=True)
+
+
+def get_lookup_dirs():
+    # Build lookup table trees
+    lookup_dirs = {}
+    for report_dir in get_report_dirs():
+        with (report_dir / 'uuid').open() as f:
+            lookup_dirs[f.read().strip()] = report_dir
+    return lookup_dirs
+
+
 def cleanup_old_tmpfiles():
     for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
        if time.time() - tmpfile.stat().st_atime > 36000:

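One pathlib detail in the `get_report_dirs()` added above: `Path.iterdir()` returns a generator, and a generator object is truthy even when it will yield nothing, so `not report_dir.iterdir()` is always False and the empty-directory cleanup never fires; testing emptiness requires consuming the iterator. A quick demonstration:

# Generators are truthy regardless of contents; emptiness needs any().
from pathlib import Path
import tempfile

empty_dir = Path(tempfile.mkdtemp())
print(bool(empty_dir.iterdir()))    # True -- the generator object itself
print(any(empty_dir.iterdir()))     # False -- it yields nothing
if not any(empty_dir.iterdir()):
    empty_dir.rmdir()               # this branch actually runs
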
@@ -93,53 +102,50 @@ def load_tree(report_dir):
     return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url
 
 
-def sane_js_query(sha512):
+def sane_js_query(sha512: str):
     if has_sane_js:
         return sanejs.sha512(sha512)
     return {'response': []}
 
 
+def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
+    if not url.startswith('http'):
+        url = f'http://{url}'
+    items = crawl(SPLASH, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
+    if not items:
+        # broken
+        pass
+    if not perma_uuid:
+        perma_uuid = str(uuid4())
+    width = len(str(len(items)))
+    dirpath = HAR_DIR / datetime.now().isoformat()
+    dirpath.mkdir()
+    for i, item in enumerate(items):
+        harfile = item['har']
+        png = base64.b64decode(item['png'])
+        child_frames = item['childFrames']
+        html = item['html']
+        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
+            json.dump(harfile, f)
+        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
+            f.write(png)
+        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
+            f.write(html)
+        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
+            json.dump(child_frames, f)
+    with (dirpath / 'uuid').open('w') as f:
+        f.write(perma_uuid)
+    return perma_uuid
+
+
 @app.route('/scrape', methods=['GET', 'POST'])
-def scrape():
+def scrape_web():
     if request.form.get('url'):
-        url = request.form.get('url')
-        if not url.startswith('http'):
-            url = f'http://{url}'
-        depth = request.form.get('depth')
-        if depth is None:
-            depth = 1
-        items = crawl(SPLASH, url, depth, log_enabled=True, log_level='INFO')
-        if not items:
-            # broken
-            pass
-        width = len(str(len(items)))
-        dirpath = HAR_DIR / datetime.now().isoformat()
-        dirpath.mkdir()
-        for i, item in enumerate(items):
-            harfile = item['har']
-            png = base64.b64decode(item['png'])
-            child_frames = item['childFrames']
-            html = item['html']
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
-                json.dump(harfile, f)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
-                f.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
-                f.write(html)
-            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
-                json.dump(child_frames, f)
-        return tree(0)
+        perma_uuid = scrape(request.form.get('url'), request.form.get('depth'))
+        return redirect(url_for('tree', tree_uuid=perma_uuid))
     return render_template('scrape.html')
 
 
-def get_report_dirs():
-    # Cleanup HAR_DIR of failed runs.
-    for report_dir in HAR_DIR.iterdir():
-        if report_dir.is_dir() and not report_dir.iterdir():
-            report_dir.rmdir()
-    return sorted(HAR_DIR.iterdir(), reverse=True)
-
-
 @app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
 def hostnode_details_text(node_uuid):
     with open(session["tree"], 'rb') as f:

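The `'{0:0{width}}'` pattern in the new `scrape()` zero-pads each item index to as many digits as the total item count needs, so the per-depth files sort the same way lexicographically and numerically:

# Zero-padding keeps filename order stable once a crawl exceeds 9 items.
items = list(range(12))
width = len(str(len(items)))                 # -> 2
names = ['{0:0{width}}.har'.format(i, width=width) for i in items]
print(names[0], names[-1])                   # 00.har 11.har
assert names == sorted(names)                # lexicographic == numeric
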
@@ -169,7 +175,7 @@ def hostnode_details(node_uuid):
         sane_js_r = sane_js_query(url.body_hash)
         if sane_js_r.get('response'):
             url.add_feature('sane_js_details', sane_js_r['response'])
-            print(url.sane_js_details)
+            print('######## SANEJS ##### ', url.sane_js_details)
         urls.append(url.to_json())
     return json.dumps(urls)
 

@@ -195,27 +201,28 @@ def urlnode_details(node_uuid):
                      as_attachment=True, attachment_filename='file.zip')
 
 
-@app.route('/tree/<int:tree_id>/image', methods=['GET'])
-def image(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
+def image(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     to_return = load_image(report_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')
 
 
-@app.route('/tree/<int:tree_id>', methods=['GET'])
-def tree(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>', methods=['GET'])
+def tree(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     tree_json, start_time, user_agent, root_url = load_tree(report_dir)
     return render_template('tree.html', tree_json=tree_json, start_time=start_time,
-                           user_agent=user_agent, root_url=root_url, tree_id=tree_id)
+                           user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)
 
 
 @app.route('/', methods=['GET'])
 def index():
     cleanup_old_tmpfiles()
     session.clear()
-    i = 0
     titles = []
     if not HAR_DIR.exists():
         HAR_DIR.mkdir(parents=True)

@@ -223,10 +230,12 @@ def index():
         har_files = sorted(report_dir.glob('*.har'))
         if not har_files:
             continue
-        with open(har_files[0], 'r') as f:
+        with har_files[0].open() as f:
             j = json.load(f)
-        titles.append((i, j['log']['pages'][0]['title']))
-        i += 1
+        title = j['log']['pages'][0]['title']
+        with (report_dir / 'uuid').open() as f:
+            uuid = f.read().strip()
+        titles.append((uuid, title))
 
     return render_template('index.html', titles=titles)
 

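For the `j['log']['pages'][0]['title']` access in `index()` above: a HAR capture is plain JSON whose top-level `log.pages` array holds one record per page load, title included. A toy HAR-shaped document is enough to show the path:

# Where the page title lives inside a HAR document.
import json

har_text = json.dumps({'log': {'pages': [{'id': 'page_1',
                                          'title': 'Example Domain'}],
                               'entries': []}})
j = json.loads(har_text)                     # as if read from a .har file
print(j['log']['pages'][0]['title'])         # Example Domain
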
@@ -4,11 +4,11 @@
 
 {% block content %}
 <center>
-  <h2><a href="{{ url_for('scrape') }}">Scrape a page</a></h2></br></br>
+  <h2><a href="{{ url_for('scrape_web') }}">Scrape a page</a></h2></br></br>
 </center>
 <center>
-  {% for id, page_title in titles %}
-  <a href="{{ url_for('tree', tree_id=id) }}">{{ page_title }}</a></br>
+  {% for uuid, page_title in titles %}
+  <a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></br>
   </br>
   {% endfor %}
 </center>

@@ -59,7 +59,7 @@
     <b>Root URL</b>: {{ root_url }}</br>
     <b>Start time</b>: {{ start_time }}</br>
     <b>User Agent</b>: {{ user_agent }}</br>
-    <center><a href="{{ url_for('image', tree_id=tree_id) }}">Download Image</a></center>
+    <center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
 </div>
 
 {% endblock content %}