From f7f1abc9e43ff7b9f2cffb5777678f826ad1edd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20Vinot?= Date: Sun, 23 Jul 2017 19:56:51 +0200 Subject: [PATCH] Add initial web interface --- .gitignore | 5 ++ README.md | 46 +++++++++++- lookyloo/__init__.py | 85 ++++++++++++++++++++++ lookyloo/ete3_webserver/__init__.py | 3 + lookyloo/ete3_webserver/tree_handler.py | 90 ++++++++++++++++++++++++ lookyloo/static/ete.css | 8 +++ lookyloo/static/ete.js | 19 +++++ lookyloo/static/loader.gif | Bin 0 -> 7364 bytes lookyloo/templates/index.html | 15 ++++ lookyloo/templates/main.html | 11 +++ lookyloo/templates/scrap.html | 16 +++++ lookyloo/templates/tree.html | 31 ++++++++ requirements.txt | 10 +++ setup.py | 26 +++++++ 14 files changed, 363 insertions(+), 2 deletions(-) create mode 100644 lookyloo/__init__.py create mode 100644 lookyloo/ete3_webserver/__init__.py create mode 100644 lookyloo/ete3_webserver/tree_handler.py create mode 100644 lookyloo/static/ete.css create mode 100644 lookyloo/static/ete.js create mode 100644 lookyloo/static/loader.gif create mode 100644 lookyloo/templates/index.html create mode 100644 lookyloo/templates/main.html create mode 100644 lookyloo/templates/scrap.html create mode 100644 lookyloo/templates/tree.html create mode 100644 requirements.txt create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 7bbc71c..0799bd9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# Local exclude +scraped/ +*.swp +lookyloo/ete3_webserver/webapi.py + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index 1185e85..e5ff2f5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,44 @@ -# lookyloo -*Lookyloo* is a web interface allowing to scrape a website and then displays a tree of domains calling each other. +Lookyloo +======== + +*Lookyloo* is a web interface allowing to scrape a website and then displays a +tree of domains calling each other. + + +What is that name?! 
+=================== + + +``` +1. People who just come to look. +2. People who go out of their way to look at people or something often causing crowds and more disruption. +3. People who enjoy staring at watching other peoples misfortune. Oftentimes car onlookers to car accidents. +Same as Looky Lou; often spelled as Looky-loo (hyphen) or lookylou +In L.A. usually the lookyloo's cause more accidents by not paying full attention to what is ahead of them. +``` + +Source: Urban Dictionary + + +Implementation details +====================== + +This code is very heavily inspired by https://github.com/etetoolkit/webplugin and adapted to use Flask as the backend. + +Installation of har2tree +======================== + +The core dependency is ETE Toolkit, which you can install by following the guide +on the official website: http://etetoolkit.org/download/ + +Protip +====== + +If you like using virtualenv and have `pew` installed, you can also do it this way: + +```bash +sudo apt-get install python-qt4 +pip install -r requirements.txt +pew toggleglobalsitepackages # PyQt4 is not easily installable in a virtualenv +pip install -e . 
+``` diff --git a/lookyloo/__init__.py b/lookyloo/__init__.py new file mode 100644 index 0000000..806e635 --- /dev/null +++ b/lookyloo/__init__.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import json + +from har2tree import CrawledTree, hostname_treestyle +from scrapysplashwrapper import crawl +from ete3_webserver import NodeActions, WebTreeHandler + +from flask import Flask, render_template, request +from flask_bootstrap import Bootstrap + +from glob import glob +import os +from datetime import datetime + +app = Flask(__name__) + +Bootstrap(app) +app.config['BOOTSTRAP_SERVE_LOCAL'] = True +app.debug = True + +HAR_DIR = 'scraped' +SPLASH = 'http://127.0.0.1:8050' + + +def load_tree(report_dir): + har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har'))) + ct = CrawledTree(har_files) + ct.find_parents() + ct.join_trees() + ct.root_hartree.make_hostname_tree() + actions = NodeActions() + style = hostname_treestyle() + return WebTreeHandler(ct.root_hartree.hostname_tree, actions, style) + + +@app.route('/scrap', methods=['GET', 'POST']) +def scrap(): + if request.form.get('url'): + url = request.form.get('url') + depth = request.form.get('depth') + items = crawl(SPLASH, url, depth) + if not items: + # broken + pass + width = len(str(len(items))) + i = 1 + dirpath = os.path.join(HAR_DIR, datetime.now().isoformat()) + os.makedirs(dirpath) + for item in items: + harfile = item['har'] + with open(os.path.join(dirpath, '{0:0{width}}.har'.format(i, width=width)), 'w') as f: + json.dump(harfile, f) + i += 1 + return tree(-1) + return render_template('scrap.html') + + +@app.route('/tree/', methods=['GET']) +def tree(tree_id): + report_dir = sorted(os.listdir(HAR_DIR))[tree_id] + tree = load_tree(report_dir) + nodes, faces, base64 = tree.redraw() + return render_template('tree.html', nodes=nodes, faces=faces, base64_img=base64) + + +@app.route('/', methods=['GET']) +def index(): + i = 0 + titles = [] + for report_dir in 
sorted(os.listdir(HAR_DIR)): + har_files = sorted(glob(os.path.join(HAR_DIR, report_dir, '*.har'))) + if not har_files: + continue + with open(har_files[0], 'r') as f: + j = json.load(f) + titles.append((i, j['log']['pages'][0]['title'])) + i += 1 + + return render_template('index.html', titles=titles) + + +if __name__ == '__main__': + app.run(port=5001) diff --git a/lookyloo/ete3_webserver/__init__.py b/lookyloo/ete3_webserver/__init__.py new file mode 100644 index 0000000..f1964af --- /dev/null +++ b/lookyloo/ete3_webserver/__init__.py @@ -0,0 +1,3 @@ +from .tree_handler import WebTreeHandler, NodeActions + +__all__ = ['WebTreeHandler', 'NodeActions'] diff --git a/lookyloo/ete3_webserver/tree_handler.py b/lookyloo/ete3_webserver/tree_handler.py new file mode 100644 index 0000000..9f451dc --- /dev/null +++ b/lookyloo/ete3_webserver/tree_handler.py @@ -0,0 +1,90 @@ +import time +import string +import random +# import logging as log +from ete3 import Tree # , TreeStyle +from ete3.parser.newick import NewickError + + +def timeit(f): + def a_wrapper_accepting_arguments(*args, **kargs): + t1 = time.time() + r = f(*args, **kargs) + print(" %0.3f secs: %s" % (time.time() - t1, f.__name__)) + return r + return a_wrapper_accepting_arguments + + +def id_generator(size=6, chars=string.ascii_uppercase + string.digits): + return ''.join(random.choice(chars) for _ in range(size)) + + +class WebTreeHandler(object): + def __init__(self, newick, actions, style): + if isinstance(newick, Tree): + self.tree = newick + else: + try: + self.tree = Tree(newick) + except NewickError: + self.tree = Tree(newick, format=1) + + self.tree.actions = actions + self.tree.tree_style = style + + # Initialze node internal IDs + for index, n in enumerate(self.tree.traverse('preorder')): + n._nid = index + + @timeit + def redraw(self): + base64_img, img_map = self.tree.render("%%return.PNG", tree_style=self.tree.tree_style) + nodes, faces = self.get_html_map(img_map) + base64 = 
base64_img.data().decode() + return nodes, faces, base64 + + def get_html_map(self, img_map): + nodes = [] + if img_map.get("nodes"): + for x1, y1, x2, y2, nodeid, text in img_map["nodes"]: + nodes.append([x1, y1, x2, y2, nodeid, text, img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])]) + faces = [] + if img_map.get("faces"): + for x1, y1, x2, y2, nodeid, text in img_map["faces"]: + faces.append([x1, y1, x2, y2, nodeid, text, img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])]) + return nodes, faces + + def get_avail_actions(self, nodeid): + target = self.tree.search_nodes(_nid=int(nodeid))[0] + action_list = [] + for aindex, aname, show_fn, run_fn in self.tree.actions: + if show_fn(target): + action_list.append([aindex, aname]) + return action_list + + def run_action(self, aindex, nodeid): + target = self.tree.search_nodes(_nid=int(nodeid))[0] + run_fn = self.tree.actions.actions[aindex][2] + return run_fn(self.tree, target) + + +class NodeActions(object): + def __str__(self): + text = [] + for aindex, aname, show_fn, run_fn in self: + text.append("%s: %s, %s, %s" % (aindex, aname, show_fn, run_fn)) + return '\n'.join(text) + + def __iter__(self): + for aindex, (aname, show_fn, run_fn) in self.actions.items(): + yield (aindex, aname, show_fn, run_fn) + + def __init__(self): + self.actions = {} + + def clear_default_actions(self): + self.actions = {} + + def add_action(self, action_name, show_fn, run_fn): + aindex = "act_" + id_generator() + self.actions[aindex] = [action_name, show_fn, run_fn] diff --git a/lookyloo/static/ete.css b/lookyloo/static/ete.css new file mode 100644 index 0000000..dbba244 --- /dev/null +++ b/lookyloo/static/ete.css @@ -0,0 +1,8 @@ +#highlighter { + position: absolute; + visibility: visible; + z-index:100; + top:0; left:0; + width: 70px; height: 70px; + border: 2px solid indianred; +} diff --git a/lookyloo/static/ete.js b/lookyloo/static/ete.js new file mode 100644 index 0000000..5525187 --- /dev/null +++ b/lookyloo/static/ete.js @@ 
-0,0 +1,19 @@ +function highlight_node(x, y, width, height){ + //console.log(treeid, nodeid, x, y, width, height); + var img = $('#img'); + var offset = img.offset(); + // console.log(img); + // console.log(offset); + + $("#highlighter").show(); + $("#highlighter").css("top", offset.top+y-1); + $("#highlighter").css("left", offset.left+x-1); + $("#highlighter").css("width", width+1); + $("#highlighter").css("height", height+1); + +} +function unhighlight_node(){ + // console.log("unhighlight"); + $("#highlighter").hide(); +} + diff --git a/lookyloo/static/loader.gif b/lookyloo/static/loader.gif new file mode 100644 index 0000000000000000000000000000000000000000..fe36aa0dc23c803529e1259df9a8a53b055d9e2e GIT binary patch literal 7364 zcmeI1X;c&GwuZk{Wu6H0pkfjRnGFais91r3;D8YknZ$sAfQo@e8)&2KdG=&PJ#z8p#XFDgy#LGl7jItl4fQ>odf0uf`_|95 zp1yo~?e?|b-~B#xZz$z(%GXD}o_;=Ef3AMb-Zd+Bt{b`@CbIbk7`(3K8 z^=a#C&ej~NKjgp9|J0>Z*-hD-(l^ED$9p7tfPa$1`b6>f4&30y<$F6haA1t)u~{EY zypm%gQ(~f-U&p7!F(VTqQ+Mo&jCP1mjNJuT@317Y<|@0-FZlT*0>UQ|5KE*PhcabZ z**Up+`FXl_*!cxznRvEdDR8i@mdv+<%M0e2nc0Xb+HDfOYIc1VktjAbm!GlK?-4WF zv|TzfNqTz5V7}pXYl8K4L$PI#z6y}2m%#UJCSr^02Eb-O%)D+Srcx+$!X=&IIlyuN zZ!*Cium$i_9KCaGJD~NF;_QF1hZUI6LxfK{#P(=~k_RYg`V#Q0Hz#?Z&mx$QDy+D8 zQ;rGWN$E!&;Gm)r;TScJMg3sLBy!XDj^dl^-`_$kFj9MpNxT0dR=JS1$VjNz2?dbV`3cO@%?eLDl9%PapNhqp zJ->CW5zT(}Yk7(vOAxMe62_J2(v12~b~TKX@|Py|buacN;2XD_xiGn}UL=u7OypwP z-di2iyTP}r|N85V2ajq-%H0nn%|KMfz^$5P`vM@fg<5pRN{o2F_+cEtwD3j|eENtp zmFHT28D$f!6o_<%{mTbtEz=uoGw-a&h4?)+FFc091qD=M?(dc(8E?KBiNuUyL_pdT zGGq+iT~_%ErGg((Cb>$s^%o1&p1FP<$6$cX*4klY2Afzm5hEDI8^f|{UIh;UsNc1L zMQGcAlzGPfNX&I~M~4nseCHF&(z-i&4;nM?eo~T?(p{gm;^D--c#adJ;eb^>TZG~` zY!Gg^`^jck#%mJ5Sa|<%6rR_b0x z7MOvCnDzU7x8zpFtUCOy^d`Y(={Ap{dC2l-AjaZYz@D^K|2UT*TW%%1ri5df@D-}7%J4&&Ne4uOe>&jaC7TO? 
zNPxjFe0}6JU}n0}#Pqa*#yA9%cN}o08qxbL+LN&rl=L^oeqeb}YOpqcus;zC?OQVd z3}L*ZjY=j+cP=`*a9knwi&zA8Jc{8~Z^3U=0EvJD@1%_Mh=FBNza zP6ZBvJyB|cd9%MqI^+ovo40&_!1?l=myfR2tgTGj`x5AlF8Dq~gn>TcnydGh5m7}t zkuA?+69aiQc61KsmKok%V6q6Z7f3efvXey+HQaXq!n)Sg?3zH@M>V=aWil1DdWDwt*R1CZUc zPDV3KC%Wob11@eN&=?Z0!YtAkH5Bc28>%b9{}64y1IGR?g2uJ`v`ZdMI5n~a?tbe~ zLVnZg9O~%ogPh|;ejFy3hz>@M=UpwzNYZ&(oTqT(MA;Lf0~C96Qz|OzczE}MCftV# z1q3-Wn)Bh(xYDAdqrTRl9Rjg7uEj*tk{WVvOOjID`c#$>yZHVNBKDl1z4+!iQsn{h z0_#{Gx=!1%abj=%$$FvtZMo_r896+A!S)wuQT|c;6R$kZj>%_I4=>DsQ)XW%u!dE> z%s(Xnk&cHo_5X_i_{BJ^1W!F$p8wf^s%li+)Y6dk2mjyJh~cxi`tLHn(0{S?*m=+; z7i-8jR>IAsR&WZvcH-2it+w_;T_t@)L;Xdm!QY30ysp(8GwTb$`l(Psm*g6M5Q3Sq&rb`foPZHWp)=AeRv_ilAag4X>g#b&7y zzhY%mxYvA6L7XK!s|gvs&z!eccb$0#>#FbC)`7zy=Y(EVg%ntE*mecMhpu~Kp3BD7 zEUjSqR-S&;85u-2C$8Vlj3`!@z{K`D#Tj$?8!&4UDgcgQEo-d37a3I@7^p&uwfY29i{YLE^AxBiYtrgtf9bz)%}$K$I;q zvH^=oRAahj)B_H6)4;~DBkPk6GKcRt<+2irP(nh5lFX)1H!rhV{4Bk-+s;}=M6Xa- z2JEjGYZlhs#bSqpLhp`?pWLcm5FQ%ez0ukb+*PEA50xvTc#wYmtZ~=tD%l z)^!RRn%>@fn`wNPSts}nXI%siZ;Vr6=C~DBQZG2ZMoZsv%wpg}`mYrMTn7b$Ur`?f zoBgJL*{^al?#Ois6=xhFEtM8lruoeaWJm{xn=AoVFJ{}8vO!3>&BA#+DkN#r(_;Cg z`TLw&$hj~!icl)#9xfot1}(Ls%w>(BuVqy$MQ@6jo7_cSZ`;;bSpk=Yyi)=06$w&V z-)go(!Ge3!!(IAe@+FzF`aM_IBigC7<@J#xxu z!r$Imlr$OCz>wc~ieLhG@J$Caob*UJB%HJRes}_!K036D=BqyUwYXh^Wf=Ka$sgQV ztYN)6IrPPoLlV}3=XYNCLiqOWSF6L~Z@+ku&hq56@3l6t3)Udc{DQdZ55$e+O=s1! zW<~~BE0kd}x=n^jj!emRG0S?Qf*CxaQmW@)A=&z&Vcg%mGQ>$jaGk66($f#@0vJm@cFxxo3_{9U0!LRmv(3way-Qcx|X3JM2Sx(Ud zM6t)6OSuI(G&CANf0&^>ViWz+E6as56tTYQWf!ukXF|+`z1rG6X-0wSCK{0P>-K4Y z=ZgnB&mzuPL4EI$c{BWnB!D21q`cKeyWEdBcb--w+ znCT;yR$?-&Kp}i`sGYa~WOwP<0CRsLfb_eJuP63Zm@lxt6oAwc0hP<54-wT9ZXR@} zr?6f%5s(b~81>HC_g(CO0e?wZM>j3XV7a30Je1+lcT^)m+kS=A#;WC%9USNWQb}C! 
zj~R3RWx;>R0>UC&>)niyx*nWsbmi(41S5GnV=(uibas2%U**Tl41Jn{7Zs=GGh)H6 zcgk&EDF-JGXD=;^i}l+Aa{JWQs5W#XO*6izBl5wa)yU&lhfqxny~>2&9xL}<`_#HV zNF=Ry9S0Dvi_Ov+ZHI8gyDJ6Gd3Gvvc6mQ%383Ir6-t5&PtZVS(No!GF3-;}m&;CY zqKl~Q9BGA%fUm4{jUJ8BjP61{7xsD6Tjn5hli-8)3|zvAB7_jo%85CJ<O^4OwRc=V=P!W znkpiPJqMzKR;0DY*@s~*%hL$rnYV(NTF5M^);VsrYefD7>T(kxSeNnv)_%$kwxS~7 ziqj>W2l2I}4a|gYweL9-HF-eU8G3^0Tul)jNhd?GiStK&)8(!vlAVR!nst{(3zw5U zPwpDLG^3+T43)Cd?p0sezY@WJlnDMulsGhOJbGjQ@1*!2b>ybi%%@BP^UsL@o7yyI z;JDX1lHy9nucN2omCMrP4@{kUa||Cq@}ZMcgYeQ6&`W0XoP59HoGzBoJ?#S91(6w= zj-iV7XX~QCbI<5A1~qC6POjLQStwHP_NHJx4jL&1Mf2#ka|bSwO~H2& zX5U~qI+;x(R~6n623D(qaqXk z-ILFn0GRVd6Xq0xcM3!sDU#3GfbVl$6g~qwRq$9Db3Sdfmh*)B1X)y%!3sJnbJm@^ zBGYmqdrs^&nEUcEG^kgZEhPEgVSV2Ih?2g@F=i|RrIW!XBp&1pEM)JaD>gIIH36Xk zn73Xl)LOMwx~JQ7n0(U9WD?1=_2_-F`)JiFEo(WMGi?d=S>K=<&MHWm67{uQ;felf zL!t8RzHd)}4Q_=R3)A#CZ?Z16&sk6Q3*JLR2LMn~J{2mbO+S&hS8$uR3AlY;# zZidVWzD+}*K%7-l-5C`wFK1E0#tkC+PhAnsSR<7s5py9mJ7R9gUfxv^Owx6hUq;h0 zIM-cn2v~PlwMj>J9+FfWH^^E>sZT(P3IPt*F{mh08`$8-jj}kQdY2Zr0|lk>uA5H0RR}_x4qGD4)i29+g|BZ@0jl61;l9Cy$Sz$m|0PYkL(V6kR)Ot3pwL zWLNBZdga5BW*kj8pR2z;C|-;WA>)vDth*On&P;|PMV0y=(fwk5;=FXluF`Oif z*cmPs&kxv!cE6ciP``Gy`uskqQxaS_dh^(M?dfBQviS=mPSbAtG^{%rae*T;bsE2Z z_<`s94IQ(u;4pe5PMdA)`v_TEwqzikk+^!hRgITCXwbRNJo_z2G9I%~|1}8Q z +

Scrap a page



+ +
+ {% for id, page_title in titles %} + {{ page_title }}
+
+ {% endfor %} +
+{% endblock %} diff --git a/lookyloo/templates/main.html b/lookyloo/templates/main.html new file mode 100644 index 0000000..0a2674d --- /dev/null +++ b/lookyloo/templates/main.html @@ -0,0 +1,11 @@ +{% extends "bootstrap/base.html" %} + +{% block scripts %} + {{ super() }} + +{% endblock %} + +{% block head %} + {{ super() }} + +{% endblock %} diff --git a/lookyloo/templates/scrap.html b/lookyloo/templates/scrap.html new file mode 100644 index 0000000..8df33d2 --- /dev/null +++ b/lookyloo/templates/scrap.html @@ -0,0 +1,16 @@ +{% extends "main.html" %} +{% block title %}Scrap{% endblock %} + +{% block content %} +
+

Scrap a page

+ +
+
+ + +
+ +
+
+{% endblock %} diff --git a/lookyloo/templates/tree.html b/lookyloo/templates/tree.html new file mode 100644 index 0000000..00a87de --- /dev/null +++ b/lookyloo/templates/tree.html @@ -0,0 +1,31 @@ +{% extends "main.html" %} + +{% block title %}Tree{% endblock %} + +{% block content %} + + + {% for x1, y1, x2, y2, nodeid, text, area in nodes %} + + {% endfor %} + {% for x1, y1, x2, y2, nodeid, text, area in faces %} + + {% endfor %} + + +{% endblock %} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..60353e2 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +# Web thing +flask +flask-bootstrap + +# Backend libs +git+https://github.com/viper-framework/har2tree.git +git+https://github.com/viper-framework/ScrapySplashWrapper.git + +# Required for the drawing (latest version) +git+https://github.com/etetoolkit/ete.git diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..345696b --- /dev/null +++ b/setup.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +from setuptools import setup + + +setup( + name='lookyloo', + version='0.1', + author='Raphaël Vinot', + author_email='raphael.vinot@circl.lu', + maintainer='Raphaël Vinot', + url='https://github.com/CIRCL/lookyloo', + description='Web interface to track the trackers.', + packages=['lookyloo'], + include_package_data=True, + classifiers=[ + 'License :: OSI Approved :: BSD License', + 'Operating System :: POSIX :: Linux', + 'Intended Audience :: Science/Research', + 'Intended Audience :: Telecommunications Industry', + 'Intended Audience :: Information Technology', + 'Programming Language :: Python :: 3', + 'Topic :: Security', + 'Topic :: Internet', + ], +)