lookyloo/website/web/__init__.py

211 lines
7.9 KiB
Python
Raw Normal View History

2019-01-30 14:30:01 +01:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import pickle
from zipfile import ZipFile, ZIP_DEFLATED
from io import BytesIO
import os
import logging
2020-01-06 15:32:38 +01:00
from pathlib import Path
2019-01-30 14:30:01 +01:00
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response, flash
2020-01-06 15:32:38 +01:00
from flask_bootstrap import Bootstrap # type: ignore
2019-01-30 14:30:01 +01:00
from lookyloo.helpers import get_homedir, update_user_agents, get_user_agents
2019-01-30 14:30:01 +01:00
from lookyloo.lookyloo import Lookyloo
2019-02-18 13:52:48 +01:00
from lookyloo.exceptions import NoValidHarFile
2019-01-30 14:30:01 +01:00
2020-01-06 15:32:38 +01:00
from typing import Tuple
2019-01-30 14:30:01 +01:00
2020-01-06 15:32:38 +01:00
app: Flask = Flask(__name__)

# The Flask secret key lives in a file inside the directory returned by
# get_homedir(). It is (re)generated with 64 random bytes whenever the file
# is missing or holds fewer than 64 bytes.
secret_file_path: Path = get_homedir() / 'secret_key'
if not (secret_file_path.exists() and secret_file_path.stat().st_size >= 64):
    secret_file_path.write_bytes(os.urandom(64))
app.config['SECRET_KEY'] = secret_file_path.read_bytes()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False
2020-01-06 15:32:38 +01:00
# API entry point for splash: a non-empty SPLASH_URL environment variable
# overrides the local default.
splash_url: str = os.environ.get('SPLASH_URL') or 'http://127.0.0.1:8050'
# Splash log level
loglevel = logging.DEBUG
# Set it to True if your instance is publicly available so users aren't able to scan your internal network
only_global_lookups: bool = False
lookyloo: Lookyloo = Lookyloo(
    splash_url=splash_url,
    loglevel=loglevel,
    only_global_lookups=only_global_lookups,
)
2019-01-30 14:30:01 +01:00
# keep
def load_tree(capture_dir: Path) -> Tuple[dict, str, str, str, dict]:
    """Load the tree of a capture and remember its temporary file in the session.

    The session is emptied first. The first element returned by
    ``lookyloo.load_tree`` is stored under ``session["tree"]``; the five
    remaining elements are handed back to the caller unchanged.

    :param capture_dir: Directory of the capture to load.
    :return: ``(tree_json, tree_time, tree_ua, tree_root_url, meta)``.
    """
    session.clear()
    loaded = lookyloo.load_tree(capture_dir)
    tmp_tree_file, json_tree, capture_time, capture_ua, root_url, meta = loaded
    session["tree"] = tmp_tree_file
    return json_tree, capture_time, capture_ua, root_url, meta
2019-01-30 14:30:01 +01:00
@app.route('/submit', methods=['POST', 'GET'])
def submit():
    """Enqueue a scrape described by the JSON body of the request.

    The body is parsed as JSON regardless of the declared content type
    (``force=True``) and passed to ``lookyloo.enqueue_scrape``; whatever that
    call returns is sent back as the response body.
    """
    return Response(lookyloo.enqueue_scrape(request.get_json(force=True)),
                    mimetype='text/text')
@app.route('/scrape', methods=['GET', 'POST'])
def scrape_web():
    """Show the scrape form, or run a scrape when a URL was submitted.

    Without a ``url`` form field the ``scrape.html`` template is rendered with
    the known user agents (minus the ``by_frequency`` entry). With one, the
    form values are passed to ``lookyloo.scrape`` and the client is redirected
    to the tree view of the returned identifier.
    """
    form = request.form
    if not form.get('url'):
        # Nothing to scrape yet: display the form.
        agents = get_user_agents()
        agents.pop('by_frequency')
        return render_template('scrape.html', user_agents=agents)

    # check if the post request has the file part
    cookie_file = None
    if 'cookies' in request.files and request.files['cookies'].filename:
        cookie_file = request.files['cookies'].stream

    perma_uuid = lookyloo.scrape(
        url=form.get('url'),
        cookies_pseudofile=cookie_file,
        depth=form.get('depth'),
        listing=form.get('listing'),
        user_agent=form.get('user_agent'),
        os=form.get('os'),
        browser=form.get('browser'),
    )
    return redirect(url_for('tree', tree_uuid=perma_uuid))
2019-01-30 14:30:01 +01:00
@app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
def hostnode_details_text(node_uuid):
    """Send the URL names of a host node as a markdown attachment.

    The tree is unpickled from the file referenced by ``session["tree"]`` and
    the host node is looked up by ``node_uuid``.
    """
    with open(session["tree"], 'rb') as tree_file:
        crawled_tree = pickle.load(tree_file)
    hostnode = crawled_tree.root_hartree.get_host_node_by_uuid(node_uuid)
    url_names = [url.name for url in hostnode.urls]
    content = '''# URLs
{}
'''.format('\n'.join(url_names))
    # A freshly created BytesIO is already positioned at offset 0.
    attachment = BytesIO(content.encode())
    return send_file(attachment, mimetype='text/markdown',
                     as_attachment=True, attachment_filename='file.md')
@app.route('/tree/hostname/<node_uuid>', methods=['GET'])
def hostnode_details(node_uuid):
    """Return the URLs of a host node as a JSON-encoded list.

    The tree is unpickled from the file referenced by ``session["tree"]`` and
    the host node is looked up by ``node_uuid``. Every URL that carries a
    ``body_hash`` attribute is checked with ``lookyloo.sane_js_query``; a
    non-empty ``response`` is attached to the URL as the ``sane_js_details``
    feature before the URL is serialised with ``to_json()``.
    """
    # NOTE(review): pickle.load runs arbitrary code if the file is tampered
    # with. The path comes from the session, presumably only ever set by
    # load_tree() - confirm nothing else can write session["tree"].
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
    urls = []
    for url in hostnode.urls:
        if hasattr(url, 'body_hash'):
            sane_js_r = lookyloo.sane_js_query(url.body_hash)
            if sane_js_r.get('response'):
                url.add_feature('sane_js_details', sane_js_r['response'])
                # Goes through the application logger instead of a bare
                # print() so it can be filtered by log level.
                app.logger.debug('SaneJS details: %s', url.sane_js_details)
        urls.append(url.to_json())
    return json.dumps(urls)
@app.route('/tree/url/<node_uuid>', methods=['GET'])
def urlnode_details(node_uuid):
    """Send the response body of a URL node as a zip attachment.

    The tree is unpickled from the file referenced by ``session["tree"]`` and
    the URL node is looked up by ``node_uuid``. When the node has no ``body``
    attribute, or the body is empty, the archive contains a ``file.txt``
    placeholder instead.
    """
    with open(session["tree"], 'rb') as tree_file:
        crawled_tree = pickle.load(tree_file)
    urlnode = crawled_tree.root_hartree.get_url_node_by_uuid(node_uuid)

    # Placeholder entry, replaced below when a non-empty body is available.
    entry_name, entry_data = 'file.txt', b'Response body empty'
    if hasattr(urlnode, 'body'):
        body_content = urlnode.body.getvalue()
        if body_content:
            entry_name, entry_data = urlnode.filename, body_content

    archive = BytesIO()
    with ZipFile(archive, 'w', ZIP_DEFLATED) as zfile:
        zfile.writestr(entry_name, entry_data)
    # Rewind so send_file streams the archive from its first byte.
    archive.seek(0)
    return send_file(archive, mimetype='application/zip',
                     as_attachment=True, attachment_filename='file.zip')
@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
def image(tree_uuid):
    """Send the image of a capture as a PNG attachment.

    Responds with the text ``Not available.`` when no capture directory is
    found for ``tree_uuid``.
    """
    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
    if capture_dir:
        picture = lookyloo.load_image(capture_dir)
        return send_file(picture, mimetype='image/png',
                         as_attachment=True, attachment_filename='image.png')
    return Response('Not available.', mimetype='text/text')
2020-03-23 12:45:57 +01:00
@app.route('/redirects/<string:tree_uuid>', methods=['GET'])
def redirects(tree_uuid):
    """Send the redirects of a capture as a text attachment, one per line.

    Responds with the text ``Not available.`` when no capture directory is
    found for ``tree_uuid`` and with ``No redirects.`` when the cache entry of
    the capture is empty or lists no redirects.
    """
    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
    if not capture_dir:
        return Response('Not available.', mimetype='text/text')
    cache = lookyloo.capture_cache(capture_dir)
    # The cache entry may be empty or only describe an error (the index and
    # tree views check for both), in which case there is no 'redirects' key.
    if not cache or not cache.get('redirects'):
        return Response('No redirects.', mimetype='text/text')
    to_return = BytesIO('\n'.join(cache['redirects']).encode())
    return send_file(to_return, mimetype='text/text',
                     as_attachment=True, attachment_filename='redirects.txt')
@app.route('/cache_tree/<string:tree_uuid>', methods=['GET'])
def cache_tree(tree_uuid):
    """Load the tree of a capture, then redirect to the index page.

    ``lookyloo.load_tree`` is only called when a capture directory is found
    for ``tree_uuid``; its return value is discarded. The redirect happens in
    both cases.
    """
    found_dir = lookyloo.lookup_capture_dir(tree_uuid)
    if found_dir:
        lookyloo.load_tree(found_dir)
    return redirect(url_for('index'))
2019-01-30 14:30:01 +01:00
@app.route('/tree/<string:tree_uuid>', methods=['GET'])
def tree(tree_uuid):
    """Render the tree page of a capture.

    The client is sent back to the index page with an error message when
    ``tree_uuid`` is the literal string ``False``, when no capture directory
    is found, or when the cache entry of the capture contains an ``error``
    key. A ``NoValidHarFile`` raised while loading or rendering the tree
    results in the ``error.html`` page instead.
    """
    def back_to_index(message):
        # Flash the error and bounce the client to the index page.
        flash(message, 'error')
        return redirect(url_for('index'))

    if tree_uuid == 'False':
        return back_to_index("Unable to process your request. The domain may not exist, or splash isn't started")

    capture_dir = lookyloo.lookup_capture_dir(tree_uuid)
    if not capture_dir:
        return back_to_index(f'Unable to find this UUID ({tree_uuid}). The capture may still be ongoing, try again later.')

    cache = lookyloo.capture_cache(capture_dir)
    if 'error' in cache:
        return back_to_index(cache['error'])

    try:
        loaded = load_tree(capture_dir)
        tree_json, start_time, user_agent, root_url, meta = loaded
        context = {
            'tree_json': tree_json,
            'start_time': start_time,
            'user_agent': user_agent,
            'root_url': root_url,
            'tree_uuid': tree_uuid,
            'meta': meta,
        }
        return render_template('tree.html', **context)
    except NoValidHarFile as har_error:
        return render_template('error.html', error_message=har_error)
2019-01-30 14:30:01 +01:00
@app.route('/', methods=['GET'])
def index():
    """Render the index page listing the cached captures, newest first.

    A HEAD request is answered with ``Ack`` without doing any work. Otherwise
    old temporary files are cleaned up, the user agents are refreshed, and one
    entry is built per capture whose cache entry is non-empty and contains
    neither a ``no_index`` nor an ``error`` key.
    """
    if request.method == 'HEAD':
        # Just returns ack if the webserver is running
        return 'Ack'
    lookyloo.cleanup_old_tmpfiles()
    update_user_agents()
    titles = []
    for capture_dir in lookyloo.capture_dirs:
        cached = lookyloo.capture_cache(capture_dir)
        if not cached or 'no_index' in cached or 'error' in cached:
            continue
        # partition() never raises: a timestamp without fractional seconds
        # (or without the 'T' separator) no longer breaks the whole page.
        day, _, clock = cached['timestamp'].partition('T')
        clock = clock.partition('.')[0]
        titles.append((cached['uuid'], cached['title'], day, clock, cached['url'],
                       cached['redirects'], cached['incomplete_redirects'] == '1'))
    # Most recent capture first, ordered by date and then time of day.
    titles.sort(key=lambda entry: (entry[2], entry[3]), reverse=True)
    return render_template('index.html', titles=titles)