mirror of https://github.com/CIRCL/lookyloo
new: Initial commit for client and async scraping
parent e14a0150a0
commit 6bc316ebcf
Ignore-list additions (likely .gitignore):

@@ -111,3 +111,6 @@ secret_key
 FileSaver.js
 d3.v5.min.js
 d3.v5.js
+cache.pid
+dump.rdb
Pipfile (1 line changed):

@@ -15,6 +15,7 @@ flask-bootstrap = "*"
 gunicorn = {extras = ["eventlet"],version = "*"}
 lookyloo = {editable = true,path = "."}
 cchardet = "*"
+redis = "*"

 [requires]
 python_version = "3.6"
Lock file (likely Pipfile.lock): hash update, PySaneJS ref bump, and the new redis pin:

@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "6468b6eae67fd44ea812c2d4d5cdfe2faf39a109374cc8ef526b6c9419927968"
+            "sha256": "c3c9657b345f0168789235083c9309852a08bdcfec02df214e6d54f5927e9f20"
         },
         "pipfile-spec": 6,
         "requires": {

@@ -413,7 +413,7 @@
         "pysanejs": {
             "editable": true,
             "git": "https://github.com/CIRCL/PySaneJS.git",
-            "ref": "a91ebf014754b1b84e1c2874759315446d4a6b85"
+            "ref": "9153b38c1819d93725aee70c8b0195d7e662f978"
         },
         "queuelib": {
             "hashes": [

@@ -422,6 +422,14 @@
             ],
             "version": "==1.5.0"
         },
+        "redis": {
+            "hashes": [
+                "sha256:74c892041cba46078ae1ef845241548baa3bd3634f9a6f0f952f006eb1619c71",
+                "sha256:7ba8612bbfd966dea8c62322543fed0095da2834dbd5a7c124afbc617a156aa7"
+            ],
+            "index": "pypi",
+            "version": "==3.1.0"
+        },
         "requests": {
             "hashes": [
                 "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
New file (likely bin/async_scrape.py), 35 lines, all added:

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from pathlib import Path
import logging

from lookyloo.abstractmanager import AbstractManager
from lookyloo.helpers import get_homedir, get_socket_path
from lookyloo import scrape
from redis import Redis

logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO, datefmt='%I:%M:%S')


class AsyncScraper(AbstractManager):

    def __init__(self, storage_directory: Path=None, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        if not storage_directory:
            self.storage_directory = get_homedir() / 'scraped'
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

    def _to_run_forever(self):
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            return
        to_scrape = self.redis.hgetall(uuid)
        to_scrape['perma_uuid'] = uuid
        scrape(**to_scrape)


if __name__ == '__main__':
    m = AsyncScraper()
    m.run(sleep_in_sec=1)
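The scraper above does not take jobs directly: it pops UUIDs from the 'to_scrape' set and reads the matching hash from the cache Redis. For reference, a minimal sketch of enqueueing one job by hand, mirroring what the new /submit route further down in this diff does; the URL is illustrative and the sketch assumes LOOKYLOO_HOME is exported and the cache Redis from this commit is running:

# Sketch: enqueue a scrape job the same way the /submit route does.
from uuid import uuid4
from redis import Redis
from lookyloo.helpers import get_socket_path

r = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
perma_uuid = str(uuid4())
r.hmset(perma_uuid, {'url': 'https://www.circl.lu'})  # hash fields become scrape() keyword arguments
r.sadd('to_scrape', perma_uuid)
print(perma_uuid)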
New file (likely bin/run_backend.py), 67 lines, all added:

@@ -0,0 +1,67 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lookyloo.helpers import get_homedir, check_running
from subprocess import Popen
import time
from pathlib import Path

import argparse


def launch_cache(storage_directory: Path=None):
    if not storage_directory:
        storage_directory = get_homedir()
    if not check_running('cache'):
        Popen(["./run_redis.sh"], cwd=(storage_directory / 'cache'))


def shutdown_cache(storage_directory: Path=None):
    if not storage_directory:
        storage_directory = get_homedir()
    Popen(["./shutdown_redis.sh"], cwd=(storage_directory / 'cache'))


def launch_all():
    launch_cache()


def check_all(stop=False):
    backends = [['cache', False]]
    while True:
        for b in backends:
            try:
                b[1] = check_running(b[0])
            except Exception:
                b[1] = False
        if stop:
            if not any(b[1] for b in backends):
                break
        else:
            if all(b[1] for b in backends):
                break
        for b in backends:
            if not stop and not b[1]:
                print(f"Waiting on {b[0]}")
            if stop and b[1]:
                print(f"Waiting on {b[0]}")
        time.sleep(1)


def stop_all():
    shutdown_cache()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Manage backend DBs.')
    parser.add_argument("--start", action='store_true', default=False, help="Start all")
    parser.add_argument("--stop", action='store_true', default=False, help="Stop all")
    parser.add_argument("--status", action='store_true', default=True, help="Show status")
    args = parser.parse_args()

    if args.start:
        launch_all()
    if args.stop:
        stop_all()
    if not args.stop and args.status:
        check_all()
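A sketch of driving the same helpers from Python instead of the command line; this is illustrative only and assumes LOOKYLOO_HOME is exported and that <LOOKYLOO_HOME>/cache/run_redis.sh exists (the script is added later in this diff):

# Hypothetical driver: start the cache Redis and block until it answers on its socket.
import time
from subprocess import Popen
from lookyloo.helpers import get_homedir, check_running

Popen(["./run_redis.sh"], cwd=(get_homedir() / 'cache'))  # same call launch_cache() makes
while not check_running('cache'):
    time.sleep(1)
print('cache backend is up')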
New file (likely bin/start.py), 13 lines, all added:

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from subprocess import Popen
from lookyloo.helpers import get_homedir
import time

if __name__ == '__main__':
    # Just fail if the env isn't set.
    get_homedir()
    p = Popen(['run_backend.py', '--start'])
    p.wait()
    Popen(['start_website.py'])
(One file's diff is suppressed because it is too large.)
New file (likely cache/run_redis.sh), 6 lines, all added:

@@ -0,0 +1,6 @@
#!/bin/bash

set -e
set -x

../../redis/src/redis-server ./cache.conf
New file (likely cache/shutdown_redis.sh), 6 lines, all added:

@@ -0,0 +1,6 @@
#!/bin/bash

# set -e
set -x

../../redis/src/redis-cli -s ./cache.sock shutdown
Web application module (likely lookyloo/__init__.py, given the `from lookyloo import scrape` import above):

@@ -6,7 +6,7 @@ import json
 from har2tree import CrawledTree
 from scrapysplashwrapper import crawl
 
-from flask import Flask, render_template, request, session, send_file, redirect, url_for
+from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
 from flask_bootstrap import Bootstrap
 
 from datetime import datetime

@@ -24,7 +24,8 @@ from uuid import uuid4
 
 from pysanejs import SaneJS
 
-from .helpers import get_homedir
+from .helpers import get_homedir, get_socket_path
+from redis import Redis
 
 app = Flask(__name__)
 

@@ -56,6 +57,8 @@ if SANE_JS:
 else:
     has_sane_js = False
 
+r = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
+
 
 def get_report_dirs():
     # Cleanup HAR_DIR of failed runs.
@@ -138,6 +141,17 @@ def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
     return perma_uuid
 
 
+@app.route('/submit', methods=['POST', 'GET'])
+def submit():
+    to_query = request.get_json(force=True)
+    perma_uuid = str(uuid4())
+    p = r.pipeline()
+    p.hmset(perma_uuid, to_query)
+    p.sadd('to_scrape', perma_uuid)
+    p.execute()
+    return Response(perma_uuid, mimetype='text/text')
+
+
 @app.route('/scrape', methods=['GET', 'POST'])
 def scrape_web():
     if request.form.get('url'):
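The new /submit route is the asynchronous counterpart of the existing /scrape form: it only stores the request in Redis and returns the permanent UUID, and AsyncScraper (above) performs the crawl later. A hypothetical client sketch; the base URL is an assumption, not part of this diff:

# Hypothetical client for the new /submit endpoint.
import requests

payload = {'url': 'https://www.circl.lu', 'depth': 1}
response = requests.post('http://127.0.0.1:5100/submit', json=payload)  # host/port assumed
perma_uuid = response.text  # plain-text UUID of the queued capture
print(perma_uuid)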
@@ -221,6 +235,9 @@ def tree(tree_uuid):
 
 @app.route('/', methods=['GET'])
 def index():
+    if request.method == 'HEAD':
+        # Just returns ack if the webserver is running
+        return 'Ack'
     cleanup_old_tmpfiles()
     session.clear()
     titles = []
New file (likely lookyloo/abstractmanager.py), 35 lines, all added:

@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from abc import ABC
import logging

from .helpers import long_sleep, shutdown_requested


class AbstractManager(ABC):

    def __init__(self, loglevel: int=logging.DEBUG):
        self.loglevel = loglevel
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)
        self.logger.info(f'Initializing {self.__class__.__name__}')

    async def _to_run_forever_async(self):
        pass

    def _to_run_forever(self):
        pass

    def run(self, sleep_in_sec: int):
        self.logger.info(f'Launching {self.__class__.__name__}')
        while True:
            if shutdown_requested():
                break
            try:
                self._to_run_forever()
            except Exception:
                self.logger.exception(f'Something went terribly wrong in {self.__class__.__name__}.')
            if not long_sleep(sleep_in_sec):
                break
        self.logger.info(f'Shutting down {self.__class__.__name__}')
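AbstractManager is the base class for long-running workers such as AsyncScraper: a subclass overrides _to_run_forever() and run() supplies the loop, error logging, and shutdown handling. A hypothetical subclass sketch (the class name and body are illustrative only):

# Hypothetical worker built on AbstractManager; only _to_run_forever() is specific to it.
import logging
from lookyloo.abstractmanager import AbstractManager


class HeartbeatLogger(AbstractManager):

    def _to_run_forever(self):
        # One unit of work per iteration; exceptions are caught and logged by run().
        self.logger.info('still alive')


if __name__ == '__main__':
    HeartbeatLogger(loglevel=logging.INFO).run(sleep_in_sec=10)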
Removed file (likely the old tree-viewer package __init__), 3 lines, all removed:

@@ -1,3 +0,0 @@
from .tree_handler import WebTreeHandler, NodeActions

__all__ = ['WebTreeHandler', 'NodeActions']
Removed file (likely the old ete3-based tree_handler module), 95 lines, all removed:

@@ -1,95 +0,0 @@
import time
import string
import random
# import logging as log
from ete3 import Tree  # , TreeStyle
from ete3.parser.newick import NewickError
import tempfile
import base64


def timeit(f):
    def a_wrapper_accepting_arguments(*args, **kargs):
        t1 = time.time()
        r = f(*args, **kargs)
        print(" %0.3f secs: %s" % (time.time() - t1, f.__name__))
        return r
    return a_wrapper_accepting_arguments


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))


class WebTreeHandler(object):
    def __init__(self, newick, actions, style):
        if isinstance(newick, Tree):
            self.tree = newick
        else:
            try:
                self.tree = Tree(newick)
            except NewickError:
                self.tree = Tree(newick, format=1)

        self.tree.actions = actions
        self.tree.tree_style = style

        # Initialze node internal IDs
        for index, n in enumerate(self.tree.traverse('preorder')):
            n._nid = index

    @timeit
    def redraw(self):
        with tempfile.NamedTemporaryFile(suffix='.PNG') as temp:
            img_map = self.tree.render(temp.name, tree_style=self.tree.tree_style)
            temp.seek(0)
            base64_img = base64.b64encode(temp.read())
            nodes, faces = self.get_html_map(img_map)
            base64_img = base64_img.decode()
            return nodes, faces, base64_img

    def get_html_map(self, img_map):
        nodes = []
        if img_map.get("nodes"):
            for x1, y1, x2, y2, nodeid, text in img_map["nodes"]:
                nodes.append([x1, y1, x2, y2, nodeid, text, img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])])
        faces = []
        if img_map.get("faces"):
            for x1, y1, x2, y2, nodeid, text in img_map["faces"]:
                faces.append([x1, y1, x2, y2, nodeid, text, img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])])
        return nodes, faces

    def get_avail_actions(self, nodeid):
        target = self.tree.search_nodes(_nid=int(nodeid))[0]
        action_list = []
        for aindex, aname, show_fn, run_fn in self.tree.actions:
            if show_fn(target):
                action_list.append([aindex, aname])
        return action_list

    def run_action(self, aindex, nodeid):
        target = self.tree.search_nodes(_nid=int(nodeid))[0]
        run_fn = self.tree.actions.actions[aindex][2]
        return run_fn(self.tree, target)


class NodeActions(object):
    def __str__(self):
        text = []
        for aindex, aname, show_fn, run_fn in self:
            text.append("%s: %s, %s, %s" % (aindex, aname, show_fn, run_fn))
        return '\n'.join(text)

    def __iter__(self):
        for aindex, (aname, show_fn, run_fn) in self.actions.items():
            yield (aindex, aname, show_fn, run_fn)

    def __init__(self):
        self.actions = {}

    def clear_default_actions(self):
        self.actions = {}

    def add_action(self, action_name, show_fn, run_fn):
        aindex = "act_" + id_generator()
        self.actions[aindex] = [action_name, show_fn, run_fn]
Helpers module (likely lookyloo/helpers.py):

@@ -3,6 +3,10 @@
 import os
 from pathlib import Path
 from .exceptions import MissingEnv
+from redis import Redis
+from redis.exceptions import ConnectionError
+from datetime import datetime, timedelta
+import time
 
 
 def get_homedir():

@@ -12,3 +16,58 @@ def get_homedir():
 Run the following command (assuming you run the code from the clonned repository):\
 export LOOKYLOO_HOME='{guessed_home}'")
     return Path(os.environ['LOOKYLOO_HOME'])
+
+
+def set_running(name: str) -> None:
+    r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
+    r.hset('running', name, 1)
+
+
+def unset_running(name: str) -> None:
+    r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
+    r.hdel('running', name)
+
+
+def is_running() -> dict:
+    r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
+    return r.hgetall('running')
+
+
+def get_socket_path(name: str) -> str:
+    mapping = {
+        'cache': Path('cache', 'cache.sock'),
+        'storage': Path('storage', 'storage.sock'),
+    }
+    return str(get_homedir() / mapping[name])
+
+
+def check_running(name: str) -> bool:
+    socket_path = get_socket_path(name)
+    print(socket_path)
+    try:
+        r = Redis(unix_socket_path=socket_path)
+        if r.ping():
+            return True
+    except ConnectionError:
+        return False
+
+
+def shutdown_requested() -> bool:
+    try:
+        r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
+        return r.exists('shutdown')
+    except ConnectionRefusedError:
+        return True
+    except ConnectionError:
+        return True
+
+
+def long_sleep(sleep_in_sec: int, shutdown_check: int=10) -> bool:
+    if shutdown_check > sleep_in_sec:
+        shutdown_check = sleep_in_sec
+    sleep_until = datetime.now() + timedelta(seconds=sleep_in_sec)
+    while sleep_until > datetime.now():
+        time.sleep(shutdown_check)
+        if shutdown_requested():
+            return False
+    return True
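The 'running' hash and the 'shutdown' key both live in db 1 of the cache Redis, which is how a graceful stop of AbstractManager-based workers can be requested from outside. A hypothetical sketch of that flow (not part of this commit; it assumes workers register themselves with set_running()/unset_running()):

# Hypothetical: request a shutdown, then wait until no worker reports as running.
import time
from redis import Redis
from lookyloo.helpers import get_socket_path, is_running

r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
r.set('shutdown', 1)  # shutdown_requested() only checks that this key exists
while is_running():   # reflects workers that called set_running() and not yet unset_running()
    time.sleep(1)
r.delete('shutdown')
print('all workers stopped')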
setup.py (2 lines changed):

@@ -12,7 +12,7 @@ setup(
     url='https://github.com/CIRCL/lookyloo',
     description='Web interface to track the trackers.',
     packages=['lookyloo'],
-    scripts=['bin/start_website.py'],
+    scripts=['bin/start_website.py', 'bin/start.py', 'bin/run_backend.py', 'bin/async_scrape.py'],
     include_package_data=True,
     classifiers=[
         'License :: OSI Approved :: BSD License',