diff --git a/.gitignore b/.gitignore index 7bbc71c..0799bd9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# Local exclude +scraped/ +*.swp +lookyloo/ete3_webserver/webapi.py + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index 1185e85..e5ff2f5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,44 @@ -# lookyloo -*Lookyloo* is a web interface allowing to scrape a website and then displays a tree of domains calling each other. +Lookyloo +======== + +*Lookyloo* is a web interface that scrapes a website and then displays a +tree of domains calling each other. + + +What is that name?! +=================== + + +``` +1. People who just come to look. +2. People who go out of their way to look at people or something often causing crowds and more disruption. +3. People who enjoy staring at watching other peoples misfortune. Oftentimes car onlookers to car accidents. +Same as Looky Lou; often spelled as Looky-loo (hyphen) or lookylou +In L.A. usually the lookyloo's cause more accidents by not paying full attention to what is ahead of them. +``` + +Source: Urban Dictionary + + +Implementation details +====================== + +This code is very heavily inspired by https://github.com/etetoolkit/webplugin and adapted to use flask as backend. + +Installation of har2tree +======================== + +The core dependency is ETE Toolkit, which you can install following the guide +on the official website: http://etetoolkit.org/download/ + +Protip +====== + +If you like using virtualenv and have `pew` installed you can also do it this way: + +```bash +sudo apt-get install python-qt4 +pip install -r requirements.txt +pew toggleglobalsitepackages # PyQt4 is not easily installable in a virtualenv +pip install -e . 
+```
diff --git a/lookyloo/__init__.py b/lookyloo/__init__.py
new file mode 100644
index 0000000..806e635
--- /dev/null
+++ b/lookyloo/__init__.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import json
+
+from har2tree import CrawledTree, hostname_treestyle
+from scrapysplashwrapper import crawl
+from ete3_webserver import NodeActions, WebTreeHandler
+
+from flask import Flask, render_template, request
+from flask_bootstrap import Bootstrap
+
+from glob import glob
+import os
+from datetime import datetime
+
+app = Flask(__name__)
+
+Bootstrap(app)
+app.config['BOOTSTRAP_SERVE_LOCAL'] = True
+app.debug = True
+
+HAR_DIR = 'scraped'
+SPLASH = 'http://127.0.0.1:8050'
+
+
+def load_tree(report_dir):
+    """Build the WebTreeHandler for one report directory of HAR_DIR.
+
+    All the ``*.har`` files of the directory are loaded, in sorted filename
+    order, into one CrawledTree whose hostname tree is handed to the handler.
+    """
+    har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har')))
+    ct = CrawledTree(har_files)
+    ct.find_parents()
+    ct.join_trees()
+    ct.root_hartree.make_hostname_tree()
+    actions = NodeActions()
+    style = hostname_treestyle()
+    return WebTreeHandler(ct.root_hartree.hostname_tree, actions, style)
+
+
+@app.route('/scrap', methods=['GET', 'POST'])
+def scrap():
+    """Crawl the submitted URL, store one HAR file per result, show the tree.
+
+    The submission form is rendered instead when no ``url`` form field is
+    sent (e.g. on a plain GET) or when the crawl returns nothing.
+    """
+    url = request.form.get('url')
+    if not url:
+        return render_template('scrap.html')
+    depth = request.form.get('depth')
+    items = crawl(SPLASH, url, depth)
+    if not items:
+        # Nothing was captured: creating an empty report directory would
+        # only leave an entry that cannot be rendered as a tree.
+        return render_template('scrap.html')
+    # Zero-pad the file names so that sorting them keeps the crawl order.
+    width = len(str(len(items)))
+    dirpath = os.path.join(HAR_DIR, datetime.now().isoformat())
+    os.makedirs(dirpath)
+    for i, item in enumerate(items, start=1):
+        har_path = os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width))
+        with open(har_path, 'w') as f:
+            json.dump(item['har'], f)
+    # The directory is named after the current timestamp, so it is expected
+    # to be the last entry of the sorted listing used by tree().
+    return tree(-1)
+
+
+# NOTE(review): the '<int:tree_id>' URL variable is missing from the reviewed
+# text although the view requires a tree_id argument; restored here -- confirm.
+@app.route('/tree/<int:tree_id>', methods=['GET'])
+def tree(tree_id):
+    """Render the report at position tree_id of the sorted HAR_DIR listing."""
+    report_dir = sorted(os.listdir(HAR_DIR))[tree_id]
+    handler = load_tree(report_dir)
+    nodes, faces, base64_img = handler.redraw()
+    return render_template('tree.html', nodes=nodes, faces=faces, base64_img=base64_img)
+
+
+@app.route('/', methods=['GET'])
+def index():
+    """List the first page title of every report that contains HAR files."""
+    titles = []
+    # Number the directories exactly as tree() indexes them (empty ones
+    # included), otherwise the ids handed to the template no longer match
+    # tree() as soon as one report directory is empty.
+    for i, report_dir in enumerate(sorted(os.listdir(HAR_DIR))):
+        har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har')))
+        if not har_files:
+            continue
+        with open(har_files[0], 'r') as f:
+            j = json.load(f)
+        titles.append((i, j['log']['pages'][0]['title']))
+
+    return render_template('index.html', titles=titles)
+
+
+if __name__ == '__main__':
+    app.run(port=5001)
diff --git a/lookyloo/ete3_webserver/__init__.py b/lookyloo/ete3_webserver/__init__.py
new file mode 100644
index 0000000..f1964af
--- /dev/null
+++ b/lookyloo/ete3_webserver/__init__.py
@@ -0,0 +1,3 @@
+from .tree_handler import WebTreeHandler, NodeActions
+
+__all__ = ['WebTreeHandler', 'NodeActions']
diff --git a/lookyloo/ete3_webserver/tree_handler.py b/lookyloo/ete3_webserver/tree_handler.py
new file mode 100644
index 0000000..9f451dc
--- /dev/null
+++ b/lookyloo/ete3_webserver/tree_handler.py
@@ -0,0 +1,104 @@
+import time
+import string
+import random
+from functools import wraps
+# import logging as log
+from ete3 import Tree  # , TreeStyle
+from ete3.parser.newick import NewickError
+
+
+def timeit(f):
+    """Decorate f so that every call prints its wall-clock duration."""
+    @wraps(f)  # keep the name and docstring of the decorated function
+    def a_wrapper_accepting_arguments(*args, **kargs):
+        t1 = time.time()
+        r = f(*args, **kargs)
+        print(" %0.3f secs: %s" % (time.time() - t1, f.__name__))
+        return r
+    return a_wrapper_accepting_arguments
+
+
+def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
+    """Return a random string of size characters picked from chars."""
+    return ''.join(random.choice(chars) for _ in range(size))
+
+
+class WebTreeHandler(object):
+    """Wrap an ete3 Tree with what is needed to draw it as an image map."""
+
+    def __init__(self, newick, actions, style):
+        """Store the tree, attach actions and style to it, number its nodes.
+
+        newick is either an ete3 Tree, used as is, or a value handed to
+        Tree(); when that raises NewickError, format=1 is tried.
+        """
+        if isinstance(newick, Tree):
+            self.tree = newick
+        else:
+            try:
+                self.tree = Tree(newick)
+            except NewickError:
+                self.tree = Tree(newick, format=1)
+
+        self.tree.actions = actions
+        self.tree.tree_style = style
+
+        # Initialize node internal IDs
+        for index, n in enumerate(self.tree.traverse('preorder')):
+            n._nid = index
+
+    @timeit
+    def redraw(self):
+        """Render the tree; return (nodes, faces, image data as str)."""
+        base64_img, img_map = self.tree.render("%%return.PNG", tree_style=self.tree.tree_style)
+        nodes, faces = self.get_html_map(img_map)
+        return nodes, faces, base64_img.data().decode()
+
+    def get_html_map(self, img_map):
+        """Return the "nodes" and "faces" entries of img_map as two lists.
+
+        Each entry is extended with the area registered for its node in
+        img_map["node_areas"], or [0, 0, 0, 0] when there is none.
+        """
+        def with_areas(key):
+            # A missing or empty section gives an empty list.
+            return [[x1, y1, x2, y2, nodeid, text,
+                     img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])]
+                    for x1, y1, x2, y2, nodeid, text in img_map.get(key) or []]
+
+        return with_areas("nodes"), with_areas("faces")
+
+    def get_avail_actions(self, nodeid):
+        """Return [index, name] for each action whose show_fn accepts the node."""
+        target = self.tree.search_nodes(_nid=int(nodeid))[0]
+        return [[aindex, aname]
+                for aindex, aname, show_fn, run_fn in self.tree.actions
+                if show_fn(target)]
+
+    def run_action(self, aindex, nodeid):
+        """Run the action registered as aindex on the node nodeid."""
+        target = self.tree.search_nodes(_nid=int(nodeid))[0]
+        run_fn = self.tree.actions.actions[aindex][2]
+        return run_fn(self.tree, target)
+
+
+class NodeActions(object):
+    """Registry of node actions: name, show function and run function."""
+
+    def __str__(self):
+        return '\n'.join("%s: %s, %s, %s" % action for action in self)
+
+    def __iter__(self):
+        for aindex, (aname, show_fn, run_fn) in self.actions.items():
+            yield (aindex, aname, show_fn, run_fn)
+
+    def __init__(self):
+        self.actions = {}
+
+    def clear_default_actions(self):
+        self.actions = {}
+
+    def add_action(self, action_name, show_fn, run_fn):
+        """Register an action under a newly generated "act_..." index."""
+        aindex = "act_" + id_generator()
+        self.actions[aindex] = [action_name, show_fn, run_fn]
diff --git a/lookyloo/static/ete.css b/lookyloo/static/ete.css
new file mode 100644
index 0000000..dbba244
--- /dev/null
+++ b/lookyloo/static/ete.css
@@ -0,0 +1,8 @@
+#highlighter {
+  position: absolute;
+  visibility: visible;
+  z-index:100;
+  top:0; left:0;
+  width: 70px; height: 70px;
+  border: 2px solid indianred;
+}
diff --git a/lookyloo/static/ete.js b/lookyloo/static/ete.js
new file mode 100644
index 0000000..5525187
--- /dev/null
+++ b/lookyloo/static/ete.js
@@
-0,0 +1,20 @@
+/**
+ * Show the #highlighter box over the area (x, y, width, height) of the
+ * #img element; x and y are added to the page offset of the image.
+ */
+function highlight_node(x, y, width, height){
+  var origin = $('#img').offset();
+
+  $("#highlighter").show().css({
+    top: origin.top+y-1,
+    left: origin.left+x-1,
+    width: width+1,
+    height: height+1
+  });
+}
+
+/** Hide the #highlighter box. */
+function unhighlight_node(){
+  $("#highlighter").hide();
+}
+
diff --git a/lookyloo/static/loader.gif b/lookyloo/static/loader.gif
new file mode 100644
index 0000000..fe36aa0
Binary files /dev/null and b/lookyloo/static/loader.gif differ
diff --git a/lookyloo/templates/index.html b/lookyloo/templates/index.html
new file mode 100644
index 0000000..e4948d2
--- /dev/null
+++ b/lookyloo/templates/index.html
@@ -0,0 +1,15 @@
+{% extends "main.html" %}
+
+{% block title %}Tree{% endblock %}
+
+{% block content %}
+
+

Scrap a page



+
+
+ {% for id, page_title in titles %} + {{ page_title }}
+
+ {% endfor %} +
+{% endblock %} diff --git a/lookyloo/templates/main.html b/lookyloo/templates/main.html new file mode 100644 index 0000000..0a2674d --- /dev/null +++ b/lookyloo/templates/main.html @@ -0,0 +1,11 @@ +{% extends "bootstrap/base.html" %} + +{% block scripts %} + {{ super() }} + +{% endblock %} + +{% block head %} + {{ super() }} + +{% endblock %} diff --git a/lookyloo/templates/scrap.html b/lookyloo/templates/scrap.html new file mode 100644 index 0000000..8df33d2 --- /dev/null +++ b/lookyloo/templates/scrap.html @@ -0,0 +1,16 @@ +{% extends "main.html" %} +{% block title %}Scrap{% endblock %} + +{% block content %} +
+

Scrap a page

+ +
+
+ + +
+ +
+
+{% endblock %} diff --git a/lookyloo/templates/tree.html b/lookyloo/templates/tree.html new file mode 100644 index 0000000..00a87de --- /dev/null +++ b/lookyloo/templates/tree.html @@ -0,0 +1,31 @@ +{% extends "main.html" %} + +{% block title %}Tree{% endblock %} + +{% block content %} + + + {% for x1, y1, x2, y2, nodeid, text, area in nodes %} + + {% endfor %} + {% for x1, y1, x2, y2, nodeid, text, area in faces %} + + {% endfor %} + +
+ +
Powered by etetoolkit
+
+{% endblock %}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..60353e2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,10 @@
+# Web thing
+flask
+flask-bootstrap
+
+# Backend libs
+git+https://github.com/viper-framework/har2tree.git
+git+https://github.com/viper-framework/ScrapySplashWrapper.git
+
+# Required for the drawing (latest version)
+git+https://github.com/etetoolkit/ete.git
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..345696b
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+from setuptools import setup
+
+
+setup(
+    name='lookyloo',
+    version='0.1',
+    author='Raphaël Vinot',
+    author_email='raphael.vinot@circl.lu',
+    maintainer='Raphaël Vinot',
+    url='https://github.com/CIRCL/lookyloo',
+    description='Web interface to track the trackers.',
+    # Subpackages are not discovered automatically: list ete3_webserver too.
+    packages=['lookyloo', 'lookyloo.ete3_webserver'],
+    include_package_data=True,
+    classifiers=[
+        'License :: OSI Approved :: BSD License',
+        'Operating System :: POSIX :: Linux',
+        'Intended Audience :: Science/Research',
+        'Intended Audience :: Telecommunications Industry',
+        'Intended Audience :: Information Technology',
+        'Programming Language :: Python :: 3',
+        'Topic :: Security',
+        'Topic :: Internet',
+    ],
+)