2019-01-30 14:30:01 +01:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
import json
|
|
|
|
import pickle
|
|
|
|
from zipfile import ZipFile, ZIP_DEFLATED
|
|
|
|
from io import BytesIO
|
|
|
|
import os
|
2020-01-06 15:32:38 +01:00
|
|
|
from pathlib import Path
|
2019-01-30 14:30:01 +01:00
|
|
|
|
2020-03-17 14:17:18 +01:00
|
|
|
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response, flash
|
2020-01-06 15:32:38 +01:00
|
|
|
from flask_bootstrap import Bootstrap # type: ignore
|
2019-01-30 14:30:01 +01:00
|
|
|
|
2019-03-29 20:11:44 +01:00
|
|
|
from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents
|
2019-01-30 14:30:01 +01:00
|
|
|
from lookyloo.lookyloo import Lookyloo
|
2019-02-18 13:52:48 +01:00
|
|
|
from lookyloo.exceptions import NoValidHarFile
|
2019-01-30 14:30:01 +01:00
|
|
|
|
2020-01-06 15:32:38 +01:00
|
|
|
from typing import Tuple
|
2019-01-30 14:30:01 +01:00
|
|
|
|
2020-01-06 15:32:38 +01:00
|
|
|
app: Flask = Flask(__name__)

# The secret key signs the session cookie; persist it on disk so sessions
# survive application restarts.
secret_file_path: Path = get_homedir() / 'secret_key'

# (Re)generate the key if it is missing or too short to be cryptographically safe.
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    secret_file_path.write_bytes(os.urandom(64))

app.config['SECRET_KEY'] = secret_file_path.read_bytes()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False

lookyloo: Lookyloo = Lookyloo()
|
2019-01-30 14:30:01 +01:00
|
|
|
|
|
|
|
|
|
|
|
# keep
|
2020-03-26 01:56:24 +01:00
|
|
|
def load_tree(capture_dir: Path) -> Tuple[dict, str, str, str, dict]:
    """Load a capture tree, remember its pickle path in the session, and
    return the data needed to render it."""
    # Drop any previously-loaded tree from the session first.
    session.clear()
    pickle_path, tree_json, tree_time, tree_ua, tree_root_url, meta = lookyloo.load_tree(capture_dir)
    session["tree"] = pickle_path
    return tree_json, tree_time, tree_ua, tree_root_url, meta
|
2019-01-30 14:30:01 +01:00
|
|
|
|
|
|
|
|
|
|
|
@app.route('/submit', methods=['POST', 'GET'])
def submit():
    """Enqueue a scrape job from a JSON payload and return its permanent UUID."""
    payload = request.get_json(force=True)
    perma_uuid = lookyloo.enqueue_scrape(payload)
    return Response(perma_uuid, mimetype='text/text')
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/scrape', methods=['GET', 'POST'])
def scrape_web():
    """Start a capture from the web form, or render the form when no URL was posted."""
    url = request.form.get('url')
    if url:
        # A cookies export may optionally be attached to the request.
        cookie_file = None
        uploaded = request.files.get('cookies')
        if uploaded is not None and uploaded.filename:
            cookie_file = uploaded.stream
        perma_uuid = lookyloo.scrape(url=url,
                                     cookies_pseudofile=cookie_file,
                                     depth=request.form.get('depth'),
                                     listing=request.form.get('listing'),
                                     user_agent=request.form.get('user_agent'),
                                     os=request.form.get('os'),
                                     browser=request.form.get('browser'))
        return redirect(url_for('tree', tree_uuid=perma_uuid))
    # Plain GET (or no URL): show the submission form.
    user_agents = get_user_agents()
    # 'by_frequency' is an aggregate entry, not a selectable user agent.
    user_agents.pop('by_frequency')
    return render_template('scrape.html', user_agents=user_agents)
|
2019-01-30 14:30:01 +01:00
|
|
|
|
|
|
|
|
|
|
|
@app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
def hostnode_details_text(node_uuid):
    """Return the URLs under a host node as a downloadable markdown file."""
    # load_tree stored the pickled tree's path in the session.
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
    url_names = [url.name for url in hostnode.urls]
    content = '''# URLs

{}
'''.format('\n'.join(url_names))
    to_return = BytesIO(content.encode())
    to_return.seek(0)
    return send_file(to_return, mimetype='text/markdown',
                     as_attachment=True, attachment_filename='file.md')
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/tree/hostname/<node_uuid>', methods=['GET'])
def hostnode_details(node_uuid):
    """Return the URL nodes under a host node as a JSON list.

    Each URL with a body hash is enriched with SaneJS lookup details
    (known-library identification) before serialization.
    """
    # load_tree stored the pickled tree's path in the session.
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
    urls = []
    for url in hostnode.urls:
        if hasattr(url, 'body_hash'):
            # Ask SaneJS whether this response body matches a known library file.
            sane_js_r = lookyloo.sane_js_query(url.body_hash)
            if sane_js_r.get('response'):
                url.add_feature('sane_js_details', sane_js_r['response'])
        urls.append(url.to_json())
    return json.dumps(urls)
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/tree/url/<node_uuid>', methods=['GET'])
def urlnode_details(node_uuid):
    """Return the response body of a URL node as a zipped download."""
    # load_tree stored the pickled tree's path in the session.
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    urlnode = ct.root_hartree.get_url_node_by_uuid(node_uuid)
    # Default to a placeholder entry; replaced when a non-empty body exists.
    entry_name, payload = 'file.txt', b'Response body empty'
    if hasattr(urlnode, 'body'):
        body = urlnode.body.getvalue()
        if body:
            entry_name, payload = urlnode.filename, body
    to_return = BytesIO()
    with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile:
        zfile.writestr(entry_name, payload)
    to_return.seek(0)
    return send_file(to_return, mimetype='application/zip',
                     as_attachment=True, attachment_filename='file.zip')
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
def image(tree_uuid):
    """Serve the screenshot of a capture as a PNG download."""
    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
    if not capture_dir:
        # Unknown UUID: nothing to serve.
        return Response('Not available.', mimetype='text/text')
    screenshot = lookyloo.load_image(capture_dir)
    return send_file(screenshot, mimetype='image/png', as_attachment=True,
                     attachment_filename='image.png')
|
|
|
|
|
|
|
|
|
2020-03-23 12:45:57 +01:00
|
|
|
@app.route('/redirects/<string:tree_uuid>', methods=['GET'])
def redirects(tree_uuid):
    """Serve the redirect chain of a capture as a plain-text download."""
    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
    if not capture_dir:
        return Response('Not available.', mimetype='text/text')
    cache = lookyloo.capture_cache(capture_dir)
    if not cache['redirects']:
        return Response('No redirects.', mimetype='text/text')
    redirect_list = '\n'.join(cache['redirects'])
    return send_file(BytesIO(redirect_list.encode()), mimetype='text/text',
                     as_attachment=True, attachment_filename='redirects.txt')
|
|
|
|
|
|
|
|
|
2020-03-26 01:56:24 +01:00
|
|
|
@app.route('/cache_tree/<string:tree_uuid>', methods=['GET'])
def cache_tree(tree_uuid):
    """Force (re)loading of a capture tree, then redirect to the index."""
    found_dir = lookyloo.lookup_capture_dir(tree_uuid)
    if found_dir:
        lookyloo.load_tree(found_dir)
    return redirect(url_for('index'))
|
|
|
|
|
|
|
|
|
2019-01-30 14:30:01 +01:00
|
|
|
@app.route('/tree/<string:tree_uuid>', methods=['GET'])
def tree(tree_uuid):
    """Render a capture tree page, or flash an error and go back to the index."""
    # The scrape endpoint redirects here with the literal string 'False'
    # when the capture could not be started at all.
    if tree_uuid == 'False':
        flash("Unable to process your request. The domain may not exist, or splash isn't started", 'error')
        return redirect(url_for('index'))

    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
    if not capture_dir:
        flash(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.', 'error')
        return redirect(url_for('index'))

    cached_info = lookyloo.capture_cache(capture_dir)
    if 'error' in cached_info:
        flash(cached_info['error'], 'error')
        return redirect(url_for('index'))

    try:
        tree_json, start_time, user_agent, root_url, meta = load_tree(capture_dir)
    except NoValidHarFile as e:
        # The capture directory exists but holds no usable HAR data.
        return render_template('error.html', error_message=e)
    return render_template('tree.html', tree_json=tree_json,
                           start_time=start_time, user_agent=user_agent,
                           root_url=root_url, tree_uuid=tree_uuid, meta=meta)
|
2019-01-30 14:30:01 +01:00
|
|
|
|
|
|
|
|
|
|
|
@app.route('/', methods=['GET'])
def index():
    """Landing page: list all indexable cached captures, newest first."""
    if request.method == 'HEAD':
        # Just returns ack if the webserver is running
        return 'Ack'
    lookyloo.cleanup_old_tmpfiles()
    update_user_agents()
    titles = []
    for capture_dir in lookyloo.capture_dirs:
        cached = lookyloo.capture_cache(capture_dir)
        # Skip captures that are missing, hidden, or failed.
        if not cached or 'no_index' in cached or 'error' in cached:
            continue
        # Timestamp is ISO 8601: split off the date, drop sub-second precision.
        day, clock = cached['timestamp'].split('T')
        clock, _ = clock.split('.', 1)
        titles.append((cached['uuid'], cached['title'], day, clock, cached['url'],
                       cached['redirects'], cached['incomplete_redirects'] == '1'))
    # Most recent capture first (ISO dates sort lexicographically).
    titles.sort(key=lambda entry: (entry[2], entry[3]), reverse=True)
    return render_template('index.html', titles=titles)
|