new: Initial commit for client and async scraping

pull/27/head
Raphaël Vinot 2019-01-29 18:37:13 +01:00
parent e14a0150a0
commit 6bc316ebcf
15 changed files with 1633 additions and 103 deletions

3
.gitignore vendored

@@ -111,3 +111,6 @@ secret_key
FileSaver.js
d3.v5.min.js
d3.v5.js
cache.pid
dump.rdb


@@ -15,6 +15,7 @@ flask-bootstrap = "*"
gunicorn = {extras = ["eventlet"],version = "*"}
lookyloo = {editable = true,path = "."}
cchardet = "*"
redis = "*"
[requires]
python_version = "3.6"

12
Pipfile.lock generated

@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "6468b6eae67fd44ea812c2d4d5cdfe2faf39a109374cc8ef526b6c9419927968"
"sha256": "c3c9657b345f0168789235083c9309852a08bdcfec02df214e6d54f5927e9f20"
},
"pipfile-spec": 6,
"requires": {
@@ -413,7 +413,7 @@
"pysanejs": {
"editable": true,
"git": "https://github.com/CIRCL/PySaneJS.git",
"ref": "a91ebf014754b1b84e1c2874759315446d4a6b85"
"ref": "9153b38c1819d93725aee70c8b0195d7e662f978"
},
"queuelib": {
"hashes": [
@@ -422,6 +422,14 @@
],
"version": "==1.5.0"
},
"redis": {
"hashes": [
"sha256:74c892041cba46078ae1ef845241548baa3bd3634f9a6f0f952f006eb1619c71",
"sha256:7ba8612bbfd966dea8c62322543fed0095da2834dbd5a7c124afbc617a156aa7"
],
"index": "pypi",
"version": "==3.1.0"
},
"requests": {
"hashes": [
"sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",

35
bin/async_scrape.py Executable file

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from pathlib import Path
import logging
from lookyloo.abstractmanager import AbstractManager
from lookyloo.helpers import get_homedir, get_socket_path
from lookyloo import scrape
from redis import Redis
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
level=logging.INFO, datefmt='%I:%M:%S')
class AsyncScraper(AbstractManager):
def __init__(self, storage_directory: Path=None, loglevel: int=logging.INFO):
super().__init__(loglevel)
if not storage_directory:
self.storage_directory = get_homedir() / 'scraped'
self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
def _to_run_forever(self):
uuid = self.redis.spop('to_scrape')
if not uuid:
return
to_scrape = self.redis.hgetall(uuid)
to_scrape['perma_uuid'] = uuid
scrape(**to_scrape)
if __name__ == '__main__':
m = AsyncScraper()
m.run(sleep_in_sec=1)
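
For context, a minimal sketch (not part of this commit) of how a job reaches this scraper: a submitter stores the scrape parameters in a hash keyed by a fresh UUID and adds that UUID to the 'to_scrape' set, which is the exact shape _to_run_forever() pops and forwards to scrape(**to_scrape). The URL and depth below are placeholders.

#!/usr/bin/env python3
# Hypothetical enqueue side of the async scraper (an assumption, mirroring what /submit does).
from uuid import uuid4
from redis import Redis
from lookyloo.helpers import get_socket_path

# Same cache Redis the scraper polls: unix socket, values decoded to str.
r = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
job_uuid = str(uuid4())
# Keyword arguments later expanded into scrape(**to_scrape).
r.hmset(job_uuid, {'url': 'https://www.example.com', 'depth': 1})
# Enqueue the UUID for the next _to_run_forever() iteration.
r.sadd('to_scrape', job_uuid)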

67
bin/run_backend.py Executable file

@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from lookyloo.helpers import get_homedir, check_running
from subprocess import Popen
import time
from pathlib import Path
import argparse
def launch_cache(storage_directory: Path=None):
if not storage_directory:
storage_directory = get_homedir()
if not check_running('cache'):
Popen(["./run_redis.sh"], cwd=(storage_directory / 'cache'))
def shutdown_cache(storage_directory: Path=None):
if not storage_directory:
storage_directory = get_homedir()
Popen(["./shutdown_redis.sh"], cwd=(storage_directory / 'cache'))
def launch_all():
launch_cache()
def check_all(stop=False):
backends = [['cache', False]]
while True:
for b in backends:
try:
b[1] = check_running(b[0])
except Exception:
b[1] = False
if stop:
if not any(b[1] for b in backends):
break
else:
if all(b[1] for b in backends):
break
for b in backends:
if not stop and not b[1]:
print(f"Waiting on {b[0]}")
if stop and b[1]:
print(f"Waiting on {b[0]}")
time.sleep(1)
def stop_all():
shutdown_cache()
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Manage backend DBs.')
parser.add_argument("--start", action='store_true', default=False, help="Start all")
parser.add_argument("--stop", action='store_true', default=False, help="Stop all")
parser.add_argument("--status", action='store_true', default=True, help="Show status")
args = parser.parse_args()
if args.start:
launch_all()
if args.stop:
stop_all()
if not args.stop and args.status:
check_all()

13
bin/start.py Executable file

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from subprocess import Popen
from lookyloo.helpers import get_homedir
import time
if __name__ == '__main__':
# Just fail if the env isn't set.
get_homedir()
p = Popen(['run_backend.py', '--start'])
p.wait()
Popen(['start_website.py'])

1378
cache/cache.conf vendored Normal file

File diff suppressed because it is too large

6
cache/run_redis.sh vendored Executable file

@@ -0,0 +1,6 @@
#!/bin/bash
set -e
set -x
../../redis/src/redis-server ./cache.conf

6
cache/shutdown_redis.sh vendored Executable file

@@ -0,0 +1,6 @@
#!/bin/bash
# set -e
set -x
../../redis/src/redis-cli -s ./cache.sock shutdown


@@ -6,7 +6,7 @@ import json
from har2tree import CrawledTree
from scrapysplashwrapper import crawl
from flask import Flask, render_template, request, session, send_file, redirect, url_for
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
from flask_bootstrap import Bootstrap
from datetime import datetime
@@ -24,7 +24,8 @@ from uuid import uuid4
from pysanejs import SaneJS
from .helpers import get_homedir
from .helpers import get_homedir, get_socket_path
from redis import Redis
app = Flask(__name__)
@@ -56,6 +57,8 @@ if SANE_JS:
else:
has_sane_js = False
r = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
def get_report_dirs():
# Cleanup HAR_DIR of failed runs.
@@ -138,6 +141,17 @@ def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
return perma_uuid
@app.route('/submit', methods=['POST', 'GET'])
def submit():
to_query = request.get_json(force=True)
perma_uuid = str(uuid4())
p = r.pipeline()
p.hmset(perma_uuid, to_query)
p.sadd('to_scrape', perma_uuid)
p.execute()
return Response(perma_uuid, mimetype='text/text')
@app.route('/scrape', methods=['GET', 'POST'])
def scrape_web():
if request.form.get('url'):
@@ -221,6 +235,9 @@ def tree(tree_uuid):
@app.route('/', methods=['GET'])
def index():
if request.method == 'HEAD':
# Just returns ack if the webserver is running
return 'Ack'
cleanup_old_tmpfiles()
session.clear()
titles = []
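
The commit message mentions a client; a minimal sketch of what a call against the new /submit endpoint could look like, assuming the Flask app listens on http://127.0.0.1:5000 (host and port are assumptions, not part of this commit). The endpoint stores the JSON payload in Redis, queues it for the async scraper, and returns the permanent UUID as plain text.

#!/usr/bin/env python3
# Hypothetical client call to /submit.
import requests

response = requests.post('http://127.0.0.1:5000/submit',
                         json={'url': 'https://www.example.com', 'depth': 1})
response.raise_for_status()
perma_uuid = response.text  # plain-text UUID of the queued capture
print(perma_uuid)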


@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from abc import ABC
import logging
from .helpers import long_sleep, shutdown_requested
class AbstractManager(ABC):
def __init__(self, loglevel: int=logging.DEBUG):
self.loglevel = loglevel
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(loglevel)
self.logger.info(f'Initializing {self.__class__.__name__}')
async def _to_run_forever_async(self):
pass
def _to_run_forever(self):
pass
def run(self, sleep_in_sec: int):
self.logger.info(f'Launching {self.__class__.__name__}')
while True:
if shutdown_requested():
break
try:
self._to_run_forever()
except Exception:
self.logger.exception(f'Something went terribly wrong in {self.__class__.__name__}.')
if not long_sleep(sleep_in_sec):
break
self.logger.info(f'Shutting down {self.__class__.__name__}')
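
A minimal sketch (hypothetical class, not in this commit) of the contract a concrete manager follows: subclass AbstractManager, put one unit of work in _to_run_forever(), and call run() with the pause between iterations; the loop exits once shutdown_requested() returns true.

#!/usr/bin/env python3
import logging
from lookyloo.abstractmanager import AbstractManager

class ExampleManager(AbstractManager):
    def _to_run_forever(self):
        # One iteration of work; exceptions are caught and logged by run().
        self.logger.info('doing one unit of work')

if __name__ == '__main__':
    ExampleManager(loglevel=logging.INFO).run(sleep_in_sec=10)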


@@ -1,3 +0,0 @@
from .tree_handler import WebTreeHandler, NodeActions
__all__ = ['WebTreeHandler', 'NodeActions']


@@ -1,95 +0,0 @@
import time
import string
import random
# import logging as log
from ete3 import Tree # , TreeStyle
from ete3.parser.newick import NewickError
import tempfile
import base64
def timeit(f):
def a_wrapper_accepting_arguments(*args, **kargs):
t1 = time.time()
r = f(*args, **kargs)
print(" %0.3f secs: %s" % (time.time() - t1, f.__name__))
return r
return a_wrapper_accepting_arguments
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
return ''.join(random.choice(chars) for _ in range(size))
class WebTreeHandler(object):
def __init__(self, newick, actions, style):
if isinstance(newick, Tree):
self.tree = newick
else:
try:
self.tree = Tree(newick)
except NewickError:
self.tree = Tree(newick, format=1)
self.tree.actions = actions
self.tree.tree_style = style
# Initialze node internal IDs
for index, n in enumerate(self.tree.traverse('preorder')):
n._nid = index
@timeit
def redraw(self):
with tempfile.NamedTemporaryFile(suffix='.PNG') as temp:
img_map = self.tree.render(temp.name, tree_style=self.tree.tree_style)
temp.seek(0)
base64_img = base64.b64encode(temp.read())
nodes, faces = self.get_html_map(img_map)
base64_img = base64_img.decode()
return nodes, faces, base64_img
def get_html_map(self, img_map):
nodes = []
if img_map.get("nodes"):
for x1, y1, x2, y2, nodeid, text in img_map["nodes"]:
nodes.append([x1, y1, x2, y2, nodeid, text, img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])])
faces = []
if img_map.get("faces"):
for x1, y1, x2, y2, nodeid, text in img_map["faces"]:
faces.append([x1, y1, x2, y2, nodeid, text, img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])])
return nodes, faces
def get_avail_actions(self, nodeid):
target = self.tree.search_nodes(_nid=int(nodeid))[0]
action_list = []
for aindex, aname, show_fn, run_fn in self.tree.actions:
if show_fn(target):
action_list.append([aindex, aname])
return action_list
def run_action(self, aindex, nodeid):
target = self.tree.search_nodes(_nid=int(nodeid))[0]
run_fn = self.tree.actions.actions[aindex][2]
return run_fn(self.tree, target)
class NodeActions(object):
def __str__(self):
text = []
for aindex, aname, show_fn, run_fn in self:
text.append("%s: %s, %s, %s" % (aindex, aname, show_fn, run_fn))
return '\n'.join(text)
def __iter__(self):
for aindex, (aname, show_fn, run_fn) in self.actions.items():
yield (aindex, aname, show_fn, run_fn)
def __init__(self):
self.actions = {}
def clear_default_actions(self):
self.actions = {}
def add_action(self, action_name, show_fn, run_fn):
aindex = "act_" + id_generator()
self.actions[aindex] = [action_name, show_fn, run_fn]


@@ -3,6 +3,10 @@
import os
from pathlib import Path
from .exceptions import MissingEnv
from redis import Redis
from redis.exceptions import ConnectionError
from datetime import datetime, timedelta
import time
def get_homedir():
@@ -12,3 +16,58 @@ def get_homedir():
Run the following command (assuming you run the code from the clonned repository):\
export LOOKYLOO_HOME='{guessed_home}'")
return Path(os.environ['LOOKYLOO_HOME'])
def set_running(name: str) -> None:
r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
r.hset('running', name, 1)
def unset_running(name: str) -> None:
r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
r.hdel('running', name)
def is_running() -> dict:
r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
return r.hgetall('running')
def get_socket_path(name: str) -> str:
mapping = {
'cache': Path('cache', 'cache.sock'),
'storage': Path('storage', 'storage.sock'),
}
return str(get_homedir() / mapping[name])
def check_running(name: str) -> bool:
socket_path = get_socket_path(name)
print(socket_path)
try:
r = Redis(unix_socket_path=socket_path)
if r.ping():
return True
except ConnectionError:
return False
def shutdown_requested() -> bool:
try:
r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
return r.exists('shutdown')
except ConnectionRefusedError:
return True
except ConnectionError:
return True
def long_sleep(sleep_in_sec: int, shutdown_check: int=10) -> bool:
if shutdown_check > sleep_in_sec:
shutdown_check = sleep_in_sec
sleep_until = datetime.now() + timedelta(seconds=sleep_in_sec)
while sleep_until > datetime.now():
time.sleep(shutdown_check)
if shutdown_requested():
return False
return True
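
A minimal sketch (not part of this commit) of how these helpers fit together at shutdown: shutdown_requested() looks for a 'shutdown' key in db 1 of the cache Redis, so setting that key makes long_sleep() return False and every AbstractManager.run() loop stop; is_running() reports which services have not yet called unset_running().

#!/usr/bin/env python3
from redis import Redis
from lookyloo.helpers import get_socket_path, is_running

r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
r.set('shutdown', 1)   # picked up by shutdown_requested() in each manager
print(is_running())    # services still marked as running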


@@ -12,7 +12,7 @@ setup(
url='https://github.com/CIRCL/lookyloo',
description='Web interface to track the trackers.',
packages=['lookyloo'],
scripts=['bin/start_website.py'],
scripts=['bin/start_website.py', 'bin/start.py', 'bin/run_backend.py', 'bin/async_scrape.py'],
include_package_data=True,
classifiers=[
'License :: OSI Approved :: BSD License',