#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json

from har2tree import CrawledTree
from scrapysplashwrapper import crawl
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
from flask_bootstrap import Bootstrap

from datetime import datetime

import pickle
import tempfile
import pathlib
import time

from zipfile import ZipFile, ZIP_DEFLATED
from io import BytesIO
import base64
import os
from uuid import uuid4

from pysanejs import SaneJS

from .helpers import get_homedir, get_socket_path
from redis import Redis

app = Flask(__name__)

# (Re)generate the Flask secret key if it is missing or too short.
secret_file_path = get_homedir() / 'secret_key'
if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))

with secret_file_path.open('rb') as f:
    app.config['SECRET_KEY'] = f.read()

Bootstrap(app)
app.config['BOOTSTRAP_SERVE_LOCAL'] = True
app.config['SESSION_COOKIE_NAME'] = 'lookyloo'
app.debug = False

HAR_DIR = get_homedir() / 'scraped'
HAR_DIR.mkdir(parents=True, exist_ok=True)

SPLASH = 'http://127.0.0.1:8050'
SANE_JS = 'http://127.0.0.1:5007'

# Default to False so sane_js_query() never raises a NameError when
# SANE_JS is unset or the service is down.
has_sane_js = False
if SANE_JS:
    sanejs = SaneJS(SANE_JS)
    if sanejs.is_up:
        has_sane_js = True

r = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)


def get_report_dirs():
    # Cleanup HAR_DIR of failed runs.
    for report_dir in HAR_DIR.iterdir():
        if not report_dir.is_dir():
            continue
        if not any(report_dir.iterdir()):
            # Leftover of a failed run: remove the empty directory.
            report_dir.rmdir()
            continue
        if not (report_dir / 'uuid').exists():
            # Create uuid if missing
            with (report_dir / 'uuid').open('w') as f:
                f.write(str(uuid4()))
    return sorted(HAR_DIR.iterdir(), reverse=True)


def get_lookup_dirs():
    # Build lookup table: permanent UUID -> report directory.
    lookup_dirs = {}
    for report_dir in get_report_dirs():
        with (report_dir / 'uuid').open() as f:
            lookup_dirs[f.read().strip()] = report_dir
    return lookup_dirs


def cleanup_old_tmpfiles():
    # Remove pickled trees that haven't been accessed in the last 10 hours.
    for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
        if time.time() - tmpfile.stat().st_atime > 36000:
            tmpfile.unlink()


def load_image(report_dir):
    with open(list(report_dir.glob('*.png'))[0], 'rb') as f:
        return BytesIO(f.read())


def load_tree(report_dir):
    session.clear()
    har_files = sorted(report_dir.glob('*.har'))
    ct = CrawledTree(har_files)
    ct.find_parents()
    ct.join_trees()
    # Pickle the tree to a temporary file so the per-node AJAX endpoints
    # can reload it from the path stored in the session.
    temp = tempfile.NamedTemporaryFile(prefix='lookyloo', delete=False)
    pickle.dump(ct, temp)
    temp.close()
    session["tree"] = temp.name
    return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url


def sane_js_query(sha512: str):
    if has_sane_js:
        return sanejs.sha512(sha512)
    return {'response': []}


def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
    if not url.startswith('http'):
        url = f'http://{url}'
    items = crawl(SPLASH, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
    if not items:
        # The crawl failed: nothing to save.
        return None
    if not perma_uuid:
        perma_uuid = str(uuid4())
    # Zero-pad the filenames so they sort in crawl order.
    width = len(str(len(items)))
    dirpath = HAR_DIR / datetime.now().isoformat()
    dirpath.mkdir()
    for i, item in enumerate(items):
        harfile = item['har']
        png = base64.b64decode(item['png'])
        child_frames = item['childFrames']
        html = item['html']
        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
            json.dump(harfile, f)
        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
            f.write(png)
        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
            f.write(html)
        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
            json.dump(child_frames, f)
    with (dirpath / 'uuid').open('w') as f:
        f.write(perma_uuid)
    return perma_uuid
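
# Illustrative call of scrape() from a Python shell (a minimal sketch; the
# import path is an assumption, and a Splash instance must be reachable at
# SPLASH for the crawl to succeed):
#
#   >>> from lookyloo import scrape
#   >>> perma_uuid = scrape('example.com', depth=1)
#   >>> perma_uuid  # permanent UUID of the capture stored under HAR_DIR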

@app.route('/submit', methods=['POST', 'GET'])
def submit():
    # Queue a URL for asynchronous scraping: store the request parameters
    # in a Redis hash keyed by a fresh UUID, and add that UUID to the
    # 'to_scrape' set for a worker to pick up.
    to_query = request.get_json(force=True)
    perma_uuid = str(uuid4())
    p = r.pipeline()
    p.hmset(perma_uuid, to_query)
    p.sadd('to_scrape', perma_uuid)
    p.execute()
    return Response(perma_uuid, mimetype='text/plain')


@app.route('/scrape', methods=['GET', 'POST'])
def scrape_web():
    if request.form.get('url'):
        # The form sends the depth as a string; scrape() expects an int.
        depth = int(request.form.get('depth', 1))
        perma_uuid = scrape(request.form.get('url'), depth)
        return redirect(url_for('tree', tree_uuid=perma_uuid))
    return render_template('scrape.html')


@app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
def hostnode_details_text(node_uuid):
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
    urls = []
    for url in hostnode.urls:
        urls.append(url.name)
    content = '''# URLs

{}
'''.format('\n'.join(urls))
    to_return = BytesIO(content.encode())
    to_return.seek(0)
    return send_file(to_return, mimetype='text/markdown',
                     as_attachment=True, attachment_filename='file.md')


@app.route('/tree/hostname/<node_uuid>', methods=['GET'])
def hostnode_details(node_uuid):
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    hostnode = ct.root_hartree.get_host_node_by_uuid(node_uuid)
    urls = []
    for url in hostnode.urls:
        if hasattr(url, 'body_hash'):
            # Enrich the URL node with SaneJS details when the body hash
            # matches a known JavaScript library.
            sane_js_r = sane_js_query(url.body_hash)
            if sane_js_r.get('response'):
                url.add_feature('sane_js_details', sane_js_r['response'])
        urls.append(url.to_json())
    return json.dumps(urls)


@app.route('/tree/url/<node_uuid>', methods=['GET'])
def urlnode_details(node_uuid):
    with open(session["tree"], 'rb') as f:
        ct = pickle.load(f)
    urlnode = ct.root_hartree.get_url_node_by_uuid(node_uuid)
    # Ship the response body in a zip file so the browser never interprets it.
    to_return = BytesIO()
    got_content = False
    if hasattr(urlnode, 'body'):
        body_content = urlnode.body.getvalue()
        if body_content:
            got_content = True
            with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile:
                zfile.writestr(urlnode.filename, body_content)
    if not got_content:
        with ZipFile(to_return, 'w', ZIP_DEFLATED) as zfile:
            zfile.writestr('file.txt', b'Response body empty')
    to_return.seek(0)
    return send_file(to_return, mimetype='application/zip',
                     as_attachment=True, attachment_filename='file.zip')


@app.route('/tree/<tree_uuid>/image', methods=['GET'])
def image(tree_uuid):
    lookup_dirs = get_lookup_dirs()
    report_dir = lookup_dirs[tree_uuid]
    to_return = load_image(report_dir)
    return send_file(to_return, mimetype='image/png',
                     as_attachment=True, attachment_filename='image.png')


@app.route('/tree/<tree_uuid>', methods=['GET'])
def tree(tree_uuid):
    lookup_dirs = get_lookup_dirs()
    report_dir = lookup_dirs[tree_uuid]
    tree_json, start_time, user_agent, root_url = load_tree(report_dir)
    return render_template('tree.html', tree_json=tree_json, start_time=start_time,
                           user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)


@app.route('/', methods=['GET'])
def index():
    if request.method == 'HEAD':
        # Just returns ack if the webserver is running
        return 'Ack'
    cleanup_old_tmpfiles()
    session.clear()
    titles = []
    if not HAR_DIR.exists():
        HAR_DIR.mkdir(parents=True)
    for report_dir in get_report_dirs():
        har_files = sorted(report_dir.glob('*.har'))
        if not har_files:
            continue
        with har_files[0].open() as f:
            j = json.load(f)
            title = j['log']['pages'][0]['title']
        with (report_dir / 'uuid').open() as f:
            uuid = f.read().strip()
        titles.append((uuid, title))
    return render_template('index.html', titles=titles)


if __name__ == '__main__':
    app.run(port=5001, threaded=True)
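
# Illustrative client call for /submit (a sketch only: the consumer of the
# 'to_scrape' Redis set lives in a separate process and is not defined in
# this file, so the 'url' and 'depth' keys are assumptions about what that
# consumer reads back from the hash):
#
#   import requests
#   response = requests.post('http://127.0.0.1:5001/submit',
#                            json={'url': 'https://example.com', 'depth': 1})
#   print(response.text)  # permanent UUID, later viewable at /tree/<uuid>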