mirror of https://github.com/CIRCL/lookyloo

chg: Assign a UUID for each scraped page, allow permalinks

parent: a2c4d871fe
commit: e14a0150a0
author: Raphaël Vinot
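The permalink flow can be exercised end to end once the app is running. A minimal client-side sketch, assuming a local instance on Flask's default port 5000 and the requests library; the form fields 'url' and 'depth' are the ones scrape_web() reads in the diff below:

import requests

# Submit a URL to /scrape and capture the permalink the app now
# redirects to (local instance on Flask's default port 5000 assumed;
# this client snippet is not part of the commit).
r = requests.post('http://127.0.0.1:5000/scrape',
                  data={'url': 'example.com', 'depth': 1},
                  allow_redirects=False)

# scrape_web() answers with a redirect to /tree/<tree_uuid>, which is
# stable across restarts and re-indexing.
permalink = r.headers['Location']
print(permalink)  # /tree/<uuid4>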
@@ -110,3 +110,4 @@ ENV/
 secret_key
 FileSaver.js
 d3.v5.min.js
+d3.v5.js
@@ -447,7 +447,7 @@
         "scrapysplashwrapper": {
             "editable": true,
             "git": "https://github.com/viper-framework/ScrapySplashWrapper.git",
-            "ref": "e6d9da1a971011b2925800b168d762e7f86b2b29"
+            "ref": "d0e129f9962098603f6686c8152030f7d31a604a"
         },
         "service-identity": {
             "hashes": [
@@ -6,7 +6,7 @@ import json
 from har2tree import CrawledTree
 from scrapysplashwrapper import crawl
 
-from flask import Flask, render_template, request, session, send_file
+from flask import Flask, render_template, request, session, send_file, redirect, url_for
 from flask_bootstrap import Bootstrap
 
 from datetime import datetime
@@ -19,8 +19,8 @@ import time
 from zipfile import ZipFile, ZIP_DEFLATED
 from io import BytesIO
 import base64
-import socket
 import os
+from uuid import uuid4
 
 from pysanejs import SaneJS
 
@@ -32,10 +32,10 @@ app = Flask(__name__)
 secret_file_path = get_homedir() / 'secret_key'
 
 if not secret_file_path.exists() or secret_file_path.stat().st_size < 64:
-    with open(secret_file_path, 'wb') as f:
+    with secret_file_path.open('wb') as f:
        f.write(os.urandom(64))
 
-with open(secret_file_path, 'rb') as f:
+with secret_file_path.open('rb') as f:
     app.config['SECRET_KEY'] = f.read()
 
 Bootstrap(app)
@@ -49,18 +49,6 @@ HAR_DIR.mkdir(parents=True, exist_ok=True)
 SPLASH = 'http://127.0.0.1:8050'
 SANE_JS = 'http://127.0.0.1:5007'
 
-
-def is_open(ip, port):
-    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    s.settimeout(2)
-    try:
-        s.connect((ip, int(port)))
-        s.shutdown(2)
-        return True
-    except Exception:
-        return False
-
-
 if SANE_JS:
     sanejs = SaneJS(SANE_JS)
     if sanejs.is_up:
@@ -69,6 +57,27 @@ if SANE_JS:
         has_sane_js = False
 
 
+def get_report_dirs():
+    # Cleanup HAR_DIR of failed runs.
+    for report_dir in HAR_DIR.iterdir():
+        if report_dir.is_dir() and not report_dir.iterdir():
+            report_dir.rmdir()
+        if not (report_dir / 'uuid').exists():
+            # Create uuid if missing
+            with (report_dir / 'uuid').open('w') as f:
+                f.write(str(uuid4()))
+    return sorted(HAR_DIR.iterdir(), reverse=True)
+
+
+def get_lookup_dirs():
+    # Build lookup table trees
+    lookup_dirs = {}
+    for report_dir in get_report_dirs():
+        with (report_dir / 'uuid').open() as f:
+            lookup_dirs[f.read().strip()] = report_dir
+    return lookup_dirs
+
+
 def cleanup_old_tmpfiles():
     for tmpfile in pathlib.Path(tempfile.gettempdir()).glob('lookyloo*'):
         if time.time() - tmpfile.stat().st_atime > 36000:
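The two helpers added above persist one uuid file per report directory and then invert that mapping. A standalone sketch of the same scheme, with an illustrative reports/ root and directory name standing in for HAR_DIR and the real capture directories:

from pathlib import Path
from uuid import uuid4

root = Path('reports')                       # stands in for HAR_DIR
root.mkdir(exist_ok=True)
report_dir = root / '2018-06-04T12:00:00'    # one directory per capture
report_dir.mkdir(exist_ok=True)

# Assign a permanent UUID to the capture exactly once, as
# get_report_dirs() does for directories that predate this commit.
uuid_file = report_dir / 'uuid'
if not uuid_file.exists():
    uuid_file.write_text(str(uuid4()))

# Invert the mapping, as get_lookup_dirs() does: UUID -> directory.
lookup_dirs = {(d / 'uuid').read_text().strip(): d
               for d in root.iterdir() if (d / 'uuid').exists()}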
@@ -93,53 +102,50 @@ def load_tree(report_dir):
     return ct.to_json(), ct.start_time.isoformat(), ct.user_agent, ct.root_url
 
 
-def sane_js_query(sha512):
+def sane_js_query(sha512: str):
     if has_sane_js:
         return sanejs.sha512(sha512)
     return {'response': []}
 
 
+def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
+    if not url.startswith('http'):
+        url = f'http://{url}'
+    items = crawl(SPLASH, url, depth, user_agent=user_agent, log_enabled=True, log_level='INFO')
+    if not items:
+        # broken
+        pass
+    if not perma_uuid:
+        perma_uuid = str(uuid4())
+    width = len(str(len(items)))
+    dirpath = HAR_DIR / datetime.now().isoformat()
+    dirpath.mkdir()
+    for i, item in enumerate(items):
+        harfile = item['har']
+        png = base64.b64decode(item['png'])
+        child_frames = item['childFrames']
+        html = item['html']
+        with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
+            json.dump(harfile, f)
+        with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
+            f.write(png)
+        with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
+            f.write(html)
+        with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
+            json.dump(child_frames, f)
+        with (dirpath / 'uuid').open('w') as f:
+            f.write(perma_uuid)
+    return perma_uuid
+
+
 @app.route('/scrape', methods=['GET', 'POST'])
-def scrape():
+def scrape_web():
     if request.form.get('url'):
-        url = request.form.get('url')
-        if not url.startswith('http'):
-            url = f'http://{url}'
-        depth = request.form.get('depth')
-        if depth is None:
-            depth = 1
-        items = crawl(SPLASH, url, depth, log_enabled=True, log_level='INFO')
-        if not items:
-            # broken
-            pass
-        width = len(str(len(items)))
-        dirpath = HAR_DIR / datetime.now().isoformat()
-        dirpath.mkdir()
-        for i, item in enumerate(items):
-            harfile = item['har']
-            png = base64.b64decode(item['png'])
-            child_frames = item['childFrames']
-            html = item['html']
-            with (dirpath / '{0:0{width}}.har'.format(i, width=width)).open('w') as f:
-                json.dump(harfile, f)
-            with (dirpath / '{0:0{width}}.png'.format(i, width=width)).open('wb') as f:
-                f.write(png)
-            with (dirpath / '{0:0{width}}.html'.format(i, width=width)).open('w') as f:
-                f.write(html)
-            with (dirpath / '{0:0{width}}.frames.json'.format(i, width=width)).open('w') as f:
-                json.dump(child_frames, f)
-        return tree(0)
+        perma_uuid = scrape(request.form.get('url'), request.form.get('depth'))
+        return redirect(url_for('tree', tree_uuid=perma_uuid))
     return render_template('scrape.html')
 
 
-def get_report_dirs():
-    # Cleanup HAR_DIR of failed runs.
-    for report_dir in HAR_DIR.iterdir():
-        if report_dir.is_dir() and not report_dir.iterdir():
-            report_dir.rmdir()
-    return sorted(HAR_DIR.iterdir(), reverse=True)
-
-
 @app.route('/tree/hostname/<node_uuid>/text', methods=['GET'])
 def hostnode_details_text(node_uuid):
     with open(session["tree"], 'rb') as f:
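Splitting the crawl out of the Flask view also makes it reusable outside the web UI. A sketch of direct use, assuming the module is importable as lookyloo (name assumed, not confirmed by this diff) and the Splash instance configured above is reachable:

# Hypothetical scripted capture using the new scrape() helper.
from lookyloo import scrape

perma_uuid = scrape('example.com', depth=1)
print(f'/tree/{perma_uuid}')    # permanent link to the new capture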
@@ -169,7 +175,7 @@ def hostnode_details(node_uuid):
             sane_js_r = sane_js_query(url.body_hash)
             if sane_js_r.get('response'):
                 url.add_feature('sane_js_details', sane_js_r['response'])
-                print(url.sane_js_details)
+                print('######## SANEJS ##### ', url.sane_js_details)
         urls.append(url.to_json())
     return json.dumps(urls)
 
@@ -195,27 +201,28 @@
                      as_attachment=True, attachment_filename='file.zip')
 
 
-@app.route('/tree/<int:tree_id>/image', methods=['GET'])
-def image(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>/image', methods=['GET'])
+def image(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     to_return = load_image(report_dir)
     return send_file(to_return, mimetype='image/png',
                      as_attachment=True, attachment_filename='image.png')
 
 
-@app.route('/tree/<int:tree_id>', methods=['GET'])
-def tree(tree_id):
-    report_dir = get_report_dirs()[tree_id]
+@app.route('/tree/<string:tree_uuid>', methods=['GET'])
+def tree(tree_uuid):
+    lookup_dirs = get_lookup_dirs()
+    report_dir = lookup_dirs[tree_uuid]
     tree_json, start_time, user_agent, root_url = load_tree(report_dir)
     return render_template('tree.html', tree_json=tree_json, start_time=start_time,
-                           user_agent=user_agent, root_url=root_url, tree_id=tree_id)
+                           user_agent=user_agent, root_url=root_url, tree_uuid=tree_uuid)
 
 
 @app.route('/', methods=['GET'])
 def index():
     cleanup_old_tmpfiles()
     session.clear()
-    i = 0
     titles = []
     if not HAR_DIR.exists():
         HAR_DIR.mkdir(parents=True)
@@ -223,10 +230,12 @@ def index():
         har_files = sorted(report_dir.glob('*.har'))
         if not har_files:
             continue
-        with open(har_files[0], 'r') as f:
+        with har_files[0].open() as f:
             j = json.load(f)
-            titles.append((i, j['log']['pages'][0]['title']))
-        i += 1
+            title = j['log']['pages'][0]['title']
+        with (report_dir / 'uuid').open() as f:
+            uuid = f.read().strip()
+        titles.append((uuid, title))
 
     return render_template('index.html', titles=titles)
 
@@ -4,11 +4,11 @@
 
 {% block content %}
   <center>
-    <h2><a href="{{ url_for('scrape') }}">Scrape a page</a></h2></br></br>
+    <h2><a href="{{ url_for('scrape_web') }}">Scrape a page</a></h2></br></br>
   </center>
   <center>
-    {% for id, page_title in titles %}
-      <a href="{{ url_for('tree', tree_id=id) }}">{{ page_title }}</a></br>
+    {% for uuid, page_title in titles %}
+      <a href="{{ url_for('tree', tree_uuid=uuid) }}">{{ page_title }}</a></br>
     </br>
     {% endfor %}
   </center>
@@ -59,7 +59,7 @@
     <b>Root URL</b>: {{ root_url }}</br>
     <b>Start time</b>: {{ start_time }}</br>
     <b>User Agent</b>: {{ user_agent }}</br>
-    <center><a href="{{ url_for('image', tree_id=tree_id) }}">Download Image</a></center>
+    <center><a href="{{ url_for('image', tree_uuid=tree_uuid) }}">Download Image</a></center>
 </div>
 
 {% endblock content %}