new: Initial commit for client and async scraping

Branch: pull/27/head
Author: Raphaël Vinot, 2019-01-29 18:37:13 +01:00
Parent: e14a0150a0
Commit: 6bc316ebcf
15 changed files with 1633 additions and 103 deletions

.gitignore (vendored), 3 lines changed

@@ -111,3 +111,6 @@ secret_key
 FileSaver.js
 d3.v5.min.js
 d3.v5.js
+cache.pid
+dump.rdb

Pipfile, modified

@@ -15,6 +15,7 @@ flask-bootstrap = "*"
 gunicorn = {extras = ["eventlet"],version = "*"}
 lookyloo = {editable = true,path = "."}
 cchardet = "*"
+redis = "*"

 [requires]
 python_version = "3.6"

Pipfile.lock (generated), 12 lines changed

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "6468b6eae67fd44ea812c2d4d5cdfe2faf39a109374cc8ef526b6c9419927968"
+            "sha256": "c3c9657b345f0168789235083c9309852a08bdcfec02df214e6d54f5927e9f20"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -413,7 +413,7 @@
         "pysanejs": {
             "editable": true,
             "git": "https://github.com/CIRCL/PySaneJS.git",
-            "ref": "a91ebf014754b1b84e1c2874759315446d4a6b85"
+            "ref": "9153b38c1819d93725aee70c8b0195d7e662f978"
         },
         "queuelib": {
             "hashes": [
@@ -422,6 +422,14 @@
             ],
             "version": "==1.5.0"
         },
+        "redis": {
+            "hashes": [
+                "sha256:74c892041cba46078ae1ef845241548baa3bd3634f9a6f0f952f006eb1619c71",
+                "sha256:7ba8612bbfd966dea8c62322543fed0095da2834dbd5a7c124afbc617a156aa7"
+            ],
+            "index": "pypi",
+            "version": "==3.1.0"
+        },
         "requests": {
             "hashes": [
                 "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",

bin/async_scrape.py (new executable file, 35 lines)

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
import logging

from lookyloo.abstractmanager import AbstractManager
from lookyloo.helpers import get_homedir, get_socket_path
from lookyloo import scrape

from redis import Redis

logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO, datefmt='%I:%M:%S')


class AsyncScraper(AbstractManager):

    def __init__(self, storage_directory: Path=None, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

    def _to_run_forever(self):
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return
        to_scrape = self.redis.hgetall(uuid)
        to_scrape['perma_uuid'] = uuid
        scrape(**to_scrape)


if __name__ == '__main__':
    m = AsyncScraper()
    m.run(sleep_in_sec=1)

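Not part of the diff: AsyncScraper pops a UUID from the `to_scrape` set and reads the scrape parameters from the hash stored under that UUID. A minimal sketch of how a job could be enqueued directly in Redis, mirroring what the new /submit endpoint further down does; the example URL and parameters are placeholders:

```python
from uuid import uuid4

from redis import Redis

from lookyloo.helpers import get_socket_path

# Connect to the same cache Redis the scraper uses (unix socket, decoded responses).
r = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

uuid = str(uuid4())
# Store the scrape parameters in a hash keyed by the UUID...
r.hmset(uuid, {'url': 'https://www.circl.lu/', 'depth': 1})
# ...and add the UUID to the set AsyncScraper polls.
r.sadd('to_scrape', uuid)
```
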
bin/run_backend.py (new executable file, 67 lines)

@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lookyloo.helpers import get_homedir, check_running
from subprocess import Popen
import time
from pathlib import Path
import argparse


def launch_cache(storage_directory: Path=None):
    if not storage_directory:
        storage_directory = get_homedir()
    if not check_running('cache'):
        Popen(["./run_redis.sh"], cwd=(storage_directory / 'cache'))


def shutdown_cache(storage_directory: Path=None):
    if not storage_directory:
        storage_directory = get_homedir()
    Popen(["./shutdown_redis.sh"], cwd=(storage_directory / 'cache'))


def launch_all():
    launch_cache()


def check_all(stop=False):
    backends = [['cache', False]]
    while True:
        for b in backends:
            try:
                b[1] = check_running(b[0])
            except Exception:
                b[1] = False
        if stop:
            if not any(b[1] for b in backends):
                break
        else:
            if all(b[1] for b in backends):
                break
        for b in backends:
            if not stop and not b[1]:
                print(f"Waiting on {b[0]}")
            if stop and b[1]:
                print(f"Waiting on {b[0]}")
        time.sleep(1)


def stop_all():
    shutdown_cache()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Manage backend DBs.')
    parser.add_argument("--start", action='store_true', default=False, help="Start all")
    parser.add_argument("--stop", action='store_true', default=False, help="Stop all")
    parser.add_argument("--status", action='store_true', default=True, help="Show status")
    args = parser.parse_args()

    if args.start:
        launch_all()
    if args.stop:
        stop_all()
    if not args.stop and args.status:
        check_all()

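Not part of the diff: a sketch of how another script could drive run_backend.py, following the same pattern bin/start.py uses just below; it assumes the bin scripts are installed on the PATH (setup.py lists them under scripts) and that the bundled Redis is built so cache/run_redis.sh can find it:

```python
from subprocess import Popen

from lookyloo.helpers import get_homedir

# Fail early if LOOKYLOO_HOME is not set, as start.py does.
get_homedir()

# --start launches the cache Redis; since --status defaults to True,
# the call only returns once check_all() sees the socket answering PING.
p = Popen(['run_backend.py', '--start'])
p.wait()

# ... later, ask the cache Redis to shut down again.
p = Popen(['run_backend.py', '--stop'])
p.wait()
```
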
bin/start.py (new executable file, 13 lines)

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from subprocess import Popen
from lookyloo.helpers import get_homedir
import time

if __name__ == '__main__':
    # Just fail if the env isn't set.
    get_homedir()
    p = Popen(['run_backend.py', '--start'])
    p.wait()
    Popen(['start_website.py'])

cache/cache.conf (vendored file, 1378 lines)

File diff suppressed because it is too large.

cache/run_redis.sh (vendored executable file, 6 lines)

@@ -0,0 +1,6 @@
#!/bin/bash

set -e
set -x

../../redis/src/redis-server ./cache.conf

cache/shutdown_redis.sh (vendored executable file, 6 lines)

@@ -0,0 +1,6 @@
#!/bin/bash

# set -e
set -x

../../redis/src/redis-cli -s ./cache.sock shutdown

lookyloo Flask application module, modified

@@ -6,7 +6,7 @@ import json
 from har2tree import CrawledTree
 from scrapysplashwrapper import crawl
-from flask import Flask, render_template, request, session, send_file, redirect, url_for
+from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
 from flask_bootstrap import Bootstrap
 from datetime import datetime
@@ -24,7 +24,8 @@ from uuid import uuid4
 from pysanejs import SaneJS
-from .helpers import get_homedir
+from .helpers import get_homedir, get_socket_path
+from redis import Redis
 app = Flask(__name__)
@@ -56,6 +57,8 @@ if SANE_JS:
 else:
     has_sane_js = False
+r = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
 def get_report_dirs():
     # Cleanup HAR_DIR of failed runs.
@@ -138,6 +141,17 @@ def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
     return perma_uuid
+@app.route('/submit', methods=['POST', 'GET'])
+def submit():
+    to_query = request.get_json(force=True)
+    perma_uuid = str(uuid4())
+    p = r.pipeline()
+    p.hmset(perma_uuid, to_query)
+    p.sadd('to_scrape', perma_uuid)
+    p.execute()
+    return Response(perma_uuid, mimetype='text/text')
@@ -221,6 +235,9 @@ def tree(tree_uuid):
 @app.route('/', methods=['GET'])
 def index():
+    if request.method == 'HEAD':
+        # Just returns ack if the webserver is running
+        return 'Ack'
     cleanup_old_tmpfiles()
     session.clear()
     titles = []

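Not part of the diff: a sketch of how a client could call the new /submit endpoint over HTTP, assuming the web interface is reachable at http://127.0.0.1:5100 (adjust host and port to your deployment); the submitted URL is a placeholder:

```python
import requests

# Parameters matching what scrape() accepts (see the route above).
to_query = {'url': 'https://www.circl.lu/', 'depth': 1}

# /submit stores the query in Redis, queues it for AsyncScraper,
# and returns the permanent UUID as the response body.
response = requests.post('http://127.0.0.1:5100/submit', json=to_query)
perma_uuid = response.text
print(perma_uuid)
```
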
lookyloo/abstractmanager.py (new file, 35 lines)

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from abc import ABC
import logging

from .helpers import long_sleep, shutdown_requested


class AbstractManager(ABC):

    def __init__(self, loglevel: int=logging.DEBUG):
        self.loglevel = loglevel
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)
        self.logger.info(f'Initializing {self.__class__.__name__}')

    async def _to_run_forever_async(self):
        pass

    def _to_run_forever(self):
        pass

    def run(self, sleep_in_sec: int):
        self.logger.info(f'Launching {self.__class__.__name__}')
        while True:
            if shutdown_requested():
                break
            try:
                self._to_run_forever()
            except Exception:
                self.logger.exception(f'Something went terribly wrong in {self.__class__.__name__}.')
            if not long_sleep(sleep_in_sec):
                break
        self.logger.info(f'Shutting down {self.__class__.__name__}')

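Not part of the diff: a minimal sketch of a subclass, following the same pattern AsyncScraper uses above; it assumes the cache Redis is running, since shutdown_requested() and long_sleep() talk to it. The class name and log message are placeholders:

```python
#!/usr/bin/env python3
import logging

from lookyloo.abstractmanager import AbstractManager


class Heartbeat(AbstractManager):

    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)

    def _to_run_forever(self):
        # Called once per loop iteration; run() catches and logs any exception,
        # then sleeps sleep_in_sec seconds and stops once a shutdown is requested.
        self.logger.info('still alive')


if __name__ == '__main__':
    Heartbeat().run(sleep_in_sec=10)
```
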
Deleted file, 3 lines removed

@@ -1,3 +0,0 @@
-from .tree_handler import WebTreeHandler, NodeActions
-__all__ = ['WebTreeHandler', 'NodeActions']

Deleted file, 95 lines removed

@@ -1,95 +0,0 @@
import time
import string
import random
# import logging as log

from ete3 import Tree  # , TreeStyle
from ete3.parser.newick import NewickError
import tempfile
import base64


def timeit(f):
    def a_wrapper_accepting_arguments(*args, **kargs):
        t1 = time.time()
        r = f(*args, **kargs)
        print("    %0.3f secs: %s" % (time.time() - t1, f.__name__))
        return r
    return a_wrapper_accepting_arguments


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))


class WebTreeHandler(object):
    def __init__(self, newick, actions, style):
        if isinstance(newick, Tree):
            self.tree = newick
        else:
            try:
                self.tree = Tree(newick)
            except NewickError:
                self.tree = Tree(newick, format=1)
        self.tree.actions = actions
        self.tree.tree_style = style
        # Initialze node internal IDs
        for index, n in enumerate(self.tree.traverse('preorder')):
            n._nid = index

    @timeit
    def redraw(self):
        with tempfile.NamedTemporaryFile(suffix='.PNG') as temp:
            img_map = self.tree.render(temp.name, tree_style=self.tree.tree_style)
            temp.seek(0)
            base64_img = base64.b64encode(temp.read())
            nodes, faces = self.get_html_map(img_map)
            base64_img = base64_img.decode()
            return nodes, faces, base64_img

    def get_html_map(self, img_map):
        nodes = []
        if img_map.get("nodes"):
            for x1, y1, x2, y2, nodeid, text in img_map["nodes"]:
                nodes.append([x1, y1, x2, y2, nodeid, text, img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])])
        faces = []
        if img_map.get("faces"):
            for x1, y1, x2, y2, nodeid, text in img_map["faces"]:
                faces.append([x1, y1, x2, y2, nodeid, text, img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])])
        return nodes, faces

    def get_avail_actions(self, nodeid):
        target = self.tree.search_nodes(_nid=int(nodeid))[0]
        action_list = []
        for aindex, aname, show_fn, run_fn in self.tree.actions:
            if show_fn(target):
                action_list.append([aindex, aname])
        return action_list

    def run_action(self, aindex, nodeid):
        target = self.tree.search_nodes(_nid=int(nodeid))[0]
        run_fn = self.tree.actions.actions[aindex][2]
        return run_fn(self.tree, target)


class NodeActions(object):
    def __str__(self):
        text = []
        for aindex, aname, show_fn, run_fn in self:
            text.append("%s: %s, %s, %s" % (aindex, aname, show_fn, run_fn))
        return '\n'.join(text)

    def __iter__(self):
        for aindex, (aname, show_fn, run_fn) in self.actions.items():
            yield (aindex, aname, show_fn, run_fn)

    def __init__(self):
        self.actions = {}

    def clear_default_actions(self):
        self.actions = {}

    def add_action(self, action_name, show_fn, run_fn):
        aindex = "act_" + id_generator()
        self.actions[aindex] = [action_name, show_fn, run_fn]

lookyloo/helpers.py, modified

@@ -3,6 +3,10 @@
 import os
 from pathlib import Path
 from .exceptions import MissingEnv
+from redis import Redis
+from redis.exceptions import ConnectionError
+from datetime import datetime, timedelta
+import time

 def get_homedir():
@@ -12,3 +16,58 @@ def get_homedir():
 Run the following command (assuming you run the code from the clonned repository):\
 export LOOKYLOO_HOME='{guessed_home}'")
     return Path(os.environ['LOOKYLOO_HOME'])
+
+
+def set_running(name: str) -> None:
+    r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
+    r.hset('running', name, 1)
+
+
+def unset_running(name: str) -> None:
+    r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
+    r.hdel('running', name)
+
+
+def is_running() -> dict:
+    r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
+    return r.hgetall('running')
+
+
+def get_socket_path(name: str) -> str:
+    mapping = {
+        'cache': Path('cache', 'cache.sock'),
+        'storage': Path('storage', 'storage.sock'),
+    }
+    return str(get_homedir() / mapping[name])
+
+
+def check_running(name: str) -> bool:
+    socket_path = get_socket_path(name)
+    print(socket_path)
+    try:
+        r = Redis(unix_socket_path=socket_path)
+        if r.ping():
+            return True
+    except ConnectionError:
+        return False
+
+
+def shutdown_requested() -> bool:
+    try:
+        r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
+        return r.exists('shutdown')
+    except ConnectionRefusedError:
+        return True
+    except ConnectionError:
+        return True
+
+
+def long_sleep(sleep_in_sec: int, shutdown_check: int=10) -> bool:
+    if shutdown_check > sleep_in_sec:
+        shutdown_check = sleep_in_sec
+    sleep_until = datetime.now() + timedelta(seconds=sleep_in_sec)
+    while sleep_until > datetime.now():
+        time.sleep(shutdown_check)
+        if shutdown_requested():
+            return False
+    return True

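Not part of the diff: a sketch of how a long-running service could use the new helpers, assuming the cache Redis (db 1) is reachable on its unix socket; the service name is a placeholder:

```python
from lookyloo.helpers import set_running, unset_running, is_running, long_sleep

# Announce the service in the 'running' hash so other tools can see it.
set_running('my_service')
try:
    # long_sleep() wakes up every shutdown_check seconds and returns False
    # as soon as a 'shutdown' key appears in the cache Redis.
    while long_sleep(60):
        pass  # periodic work would go here
finally:
    # Remove the entry again so is_running() no longer reports it.
    unset_running('my_service')
```
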
setup.py, modified

@@ -12,7 +12,7 @@ setup(
     url='https://github.com/CIRCL/lookyloo',
     description='Web interface to track the trackers.',
     packages=['lookyloo'],
-    scripts=['bin/start_website.py'],
+    scripts=['bin/start_website.py', 'bin/start.py', 'bin/run_backend.py', 'bin/async_scrape.py'],
     include_package_data=True,
     classifiers=[
         'License :: OSI Approved :: BSD License',