mirror of https://github.com/CIRCL/lookyloo
new: Initial commit for client and async scraping
parent
e14a0150a0
commit
6bc316ebcf
|
@ -111,3 +111,6 @@ secret_key
|
||||||
FileSaver.js
|
FileSaver.js
|
||||||
d3.v5.min.js
|
d3.v5.min.js
|
||||||
d3.v5.js
|
d3.v5.js
|
||||||
|
|
||||||
|
cache.pid
|
||||||
|
dump.rdb
|
||||||
|
|
1
Pipfile
1
Pipfile
|
@ -15,6 +15,7 @@ flask-bootstrap = "*"
|
||||||
gunicorn = {extras = ["eventlet"],version = "*"}
|
gunicorn = {extras = ["eventlet"],version = "*"}
|
||||||
lookyloo = {editable = true,path = "."}
|
lookyloo = {editable = true,path = "."}
|
||||||
cchardet = "*"
|
cchardet = "*"
|
||||||
|
redis = "*"
|
||||||
|
|
||||||
[requires]
|
[requires]
|
||||||
python_version = "3.6"
|
python_version = "3.6"
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
{
|
{
|
||||||
"_meta": {
|
"_meta": {
|
||||||
"hash": {
|
"hash": {
|
||||||
"sha256": "6468b6eae67fd44ea812c2d4d5cdfe2faf39a109374cc8ef526b6c9419927968"
|
"sha256": "c3c9657b345f0168789235083c9309852a08bdcfec02df214e6d54f5927e9f20"
|
||||||
},
|
},
|
||||||
"pipfile-spec": 6,
|
"pipfile-spec": 6,
|
||||||
"requires": {
|
"requires": {
|
||||||
|
@ -413,7 +413,7 @@
|
||||||
"pysanejs": {
|
"pysanejs": {
|
||||||
"editable": true,
|
"editable": true,
|
||||||
"git": "https://github.com/CIRCL/PySaneJS.git",
|
"git": "https://github.com/CIRCL/PySaneJS.git",
|
||||||
"ref": "a91ebf014754b1b84e1c2874759315446d4a6b85"
|
"ref": "9153b38c1819d93725aee70c8b0195d7e662f978"
|
||||||
},
|
},
|
||||||
"queuelib": {
|
"queuelib": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
|
@ -422,6 +422,14 @@
|
||||||
],
|
],
|
||||||
"version": "==1.5.0"
|
"version": "==1.5.0"
|
||||||
},
|
},
|
||||||
|
"redis": {
|
||||||
|
"hashes": [
|
||||||
|
"sha256:74c892041cba46078ae1ef845241548baa3bd3634f9a6f0f952f006eb1619c71",
|
||||||
|
"sha256:7ba8612bbfd966dea8c62322543fed0095da2834dbd5a7c124afbc617a156aa7"
|
||||||
|
],
|
||||||
|
"index": "pypi",
|
||||||
|
"version": "==3.1.0"
|
||||||
|
},
|
||||||
"requests": {
|
"requests": {
|
||||||
"hashes": [
|
"hashes": [
|
||||||
"sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
|
"sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from lookyloo.abstractmanager import AbstractManager
|
||||||
|
from lookyloo.helpers import get_homedir, get_socket_path
|
||||||
|
from lookyloo import scrape
|
||||||
|
from redis import Redis
|
||||||
|
|
||||||
|
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
|
||||||
|
level=logging.INFO, datefmt='%I:%M:%S')
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncScraper(AbstractManager):
    """Worker that consumes scrape requests queued in the 'cache' Redis instance."""

    def __init__(self, storage_directory: Path=None, loglevel: int=logging.INFO):
        """Set up logging (via AbstractManager) and the Redis connection.

        :param storage_directory: Directory kept on the instance as
            ``self.storage_directory``; defaults to ``<homedir>/scraped``.
        :param loglevel: Level forwarded to AbstractManager.
        """
        super().__init__(loglevel)
        # Always set the attribute: previously a caller-supplied directory was
        # silently dropped and the attribute was left undefined.
        if not storage_directory:
            storage_directory = get_homedir() / 'scraped'
        self.storage_directory = storage_directory
        self.redis = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)

    def _to_run_forever(self):
        """Pop one UUID from the 'to_scrape' set and scrape the request stored under it."""
        uuid = self.redis.spop('to_scrape')
        if not uuid:
            # Nothing queued.
            return
        # The hash stored under the UUID holds the keyword arguments for scrape().
        # NOTE(review): values come back as strings (decode_responses=True) --
        # confirm scrape() copes with a string `depth`.
        to_scrape = self.redis.hgetall(uuid)
        to_scrape['perma_uuid'] = uuid
        scrape(**to_scrape)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Start the scraping worker; run() is called with a one second sleep.
    AsyncScraper().run(sleep_in_sec=1)
|
|
@ -0,0 +1,67 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from lookyloo.helpers import get_homedir, check_running
|
||||||
|
from subprocess import Popen
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
def launch_cache(storage_directory: Path=None):
    """Run ``./run_redis.sh`` from ``<storage_directory>/cache`` unless 'cache' is already running.

    :param storage_directory: Base directory; defaults to get_homedir().
    """
    base_dir = storage_directory if storage_directory else get_homedir()
    if check_running('cache'):
        return
    Popen(["./run_redis.sh"], cwd=(base_dir / 'cache'))
|
||||||
|
|
||||||
|
|
||||||
|
def shutdown_cache(storage_directory: Path=None):
    """Run ``./shutdown_redis.sh`` from ``<storage_directory>/cache``.

    :param storage_directory: Base directory; defaults to get_homedir().
    """
    base_dir = storage_directory if storage_directory else get_homedir()
    Popen(["./shutdown_redis.sh"], cwd=(base_dir / 'cache'))
|
||||||
|
|
||||||
|
|
||||||
|
def launch_all():
    """Start every backend; currently that is only the cache."""
    launch_cache()
|
||||||
|
|
||||||
|
|
||||||
|
def check_all(stop=False):
    """Poll the backends once per second until they reach the expected state.

    :param stop: When false, wait until every backend is running; when true,
        wait until none of them is running.
    """
    backends = [['cache', False]]
    while True:
        # Refresh the running flag of each backend; any failure counts as "down".
        for backend in backends:
            try:
                backend[1] = check_running(backend[0])
            except Exception:
                backend[1] = False

        states = [backend[1] for backend in backends]
        if stop:
            if not any(states):
                break
        elif all(states):
            break

        # Report the backends that are not yet in the expected state.
        for name, running in backends:
            if (stop and running) or (not stop and not running):
                print(f"Waiting on {name}")
        time.sleep(1)
|
||||||
|
|
||||||
|
|
||||||
|
def stop_all():
    """Stop every backend; currently that is only the cache."""
    shutdown_cache()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Manage backend DBs.')
    # NOTE: --status defaults to True, so the status check runs whenever
    # --stop is not given.
    for flag, default, help_text in (("--start", False, "Start all"),
                                     ("--stop", False, "Stop all"),
                                     ("--status", True, "Show status")):
        parser.add_argument(flag, action='store_true', default=default, help=help_text)
    args = parser.parse_args()

    if args.start:
        launch_all()
    if args.stop:
        stop_all()
    elif args.status:
        check_all()
|
|
@ -0,0 +1,13 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from subprocess import Popen
|
||||||
|
from lookyloo.helpers import get_homedir
|
||||||
|
import time
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Just fail if the env isn't set.
    get_homedir()
    # Wait for the backend launcher to finish before starting the website.
    backend = Popen(['run_backend.py', '--start'])
    backend.wait()
    Popen(['start_website.py'])
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,6 @@
|
||||||
|
#!/bin/bash

# Exit on the first failing command and echo each command as it runs.
set -ex

../../redis/src/redis-server ./cache.conf
|
|
@ -0,0 +1,6 @@
|
||||||
|
#!/bin/bash

# errexit is left disabled in this script (set -e is not enabled).
# Echo each command as it runs.
set -o xtrace

../../redis/src/redis-cli -s ./cache.sock shutdown
|
|
@ -6,7 +6,7 @@ import json
|
||||||
from har2tree import CrawledTree
|
from har2tree import CrawledTree
|
||||||
from scrapysplashwrapper import crawl
|
from scrapysplashwrapper import crawl
|
||||||
|
|
||||||
from flask import Flask, render_template, request, session, send_file, redirect, url_for
|
from flask import Flask, render_template, request, session, send_file, redirect, url_for, Response
|
||||||
from flask_bootstrap import Bootstrap
|
from flask_bootstrap import Bootstrap
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
@ -24,7 +24,8 @@ from uuid import uuid4
|
||||||
|
|
||||||
from pysanejs import SaneJS
|
from pysanejs import SaneJS
|
||||||
|
|
||||||
from .helpers import get_homedir
|
from .helpers import get_homedir, get_socket_path
|
||||||
|
from redis import Redis
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
@ -56,6 +57,8 @@ if SANE_JS:
|
||||||
else:
|
else:
|
||||||
has_sane_js = False
|
has_sane_js = False
|
||||||
|
|
||||||
|
r = Redis(unix_socket_path=get_socket_path('cache'), decode_responses=True)
|
||||||
|
|
||||||
|
|
||||||
def get_report_dirs():
|
def get_report_dirs():
|
||||||
# Cleanup HAR_DIR of failed runs.
|
# Cleanup HAR_DIR of failed runs.
|
||||||
|
@ -138,6 +141,17 @@ def scrape(url, depth: int=1, user_agent: str=None, perma_uuid: str=None):
|
||||||
return perma_uuid
|
return perma_uuid
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/submit', methods=['POST', 'GET'])
def submit():
    """Queue a scrape request and return the UUID it was stored under.

    The JSON body is stored as a Redis hash under a fresh UUID, and that UUID
    is added to the 'to_scrape' set; both writes go through one pipeline.

    :return: Plain-text response holding the UUID, or a 400 response when the
        body is not a non-empty JSON object.
    """
    to_query = request.get_json(force=True)
    # hmset needs a non-empty mapping; reject anything else up front instead of
    # letting the Redis client raise and surface as a server error.
    if not isinstance(to_query, dict) or not to_query:
        return Response('Expected a non-empty JSON object.', status=400, mimetype='text/plain')
    perma_uuid = str(uuid4())
    p = r.pipeline()
    p.hmset(perma_uuid, to_query)
    p.sadd('to_scrape', perma_uuid)
    p.execute()
    # 'text/text' is not a registered media type; 'text/plain' is the correct one.
    return Response(perma_uuid, mimetype='text/plain')
|
||||||
|
|
||||||
|
|
||||||
@app.route('/scrape', methods=['GET', 'POST'])
|
@app.route('/scrape', methods=['GET', 'POST'])
|
||||||
def scrape_web():
|
def scrape_web():
|
||||||
if request.form.get('url'):
|
if request.form.get('url'):
|
||||||
|
@ -221,6 +235,9 @@ def tree(tree_uuid):
|
||||||
|
|
||||||
@app.route('/', methods=['GET'])
|
@app.route('/', methods=['GET'])
|
||||||
def index():
|
def index():
|
||||||
|
if request.method == 'HEAD':
|
||||||
|
# Just returns ack if the webserver is running
|
||||||
|
return 'Ack'
|
||||||
cleanup_old_tmpfiles()
|
cleanup_old_tmpfiles()
|
||||||
session.clear()
|
session.clear()
|
||||||
titles = []
|
titles = []
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from abc import ABC
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from .helpers import long_sleep, shutdown_requested
|
||||||
|
|
||||||
|
|
||||||
|
class AbstractManager(ABC):
    """Base class for workers that repeat one task until a shutdown is requested."""

    def __init__(self, loglevel: int=logging.DEBUG):
        """Create a logger named after the concrete class and set its level."""
        self.loglevel = loglevel
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)
        self.logger.info(f'Initializing {self.__class__.__name__}')

    async def _to_run_forever_async(self):
        """Asynchronous hook for subclasses; does nothing here."""
        pass

    def _to_run_forever(self):
        """Synchronous hook called on every iteration of run(); does nothing here."""
        pass

    def run(self, sleep_in_sec: int):
        """Call _to_run_forever() repeatedly, sleeping between calls.

        The loop ends when shutdown_requested() is true before an iteration,
        or when long_sleep() returns a false value after one.

        :param sleep_in_sec: Value passed to long_sleep() after each iteration.
        """
        self.logger.info(f'Launching {self.__class__.__name__}')
        while not shutdown_requested():
            try:
                self._to_run_forever()
            except Exception:
                # Log and keep going: one failed iteration must not stop the worker.
                self.logger.exception(f'Something went terribly wrong in {self.__class__.__name__}.')
            if not long_sleep(sleep_in_sec):
                break
        self.logger.info(f'Shutting down {self.__class__.__name__}')
|
|
@ -1,3 +0,0 @@
|
||||||
from .tree_handler import WebTreeHandler, NodeActions
|
|
||||||
|
|
||||||
__all__ = ['WebTreeHandler', 'NodeActions']
|
|
|
@ -1,95 +0,0 @@
|
||||||
import time
|
|
||||||
import string
|
|
||||||
import random
|
|
||||||
# import logging as log
|
|
||||||
from ete3 import Tree # , TreeStyle
|
|
||||||
from ete3.parser.newick import NewickError
|
|
||||||
import tempfile
|
|
||||||
import base64
|
|
||||||
|
|
||||||
|
|
||||||
def timeit(f):
    """Decorator that prints how long each call to *f* took.

    :param f: Callable to wrap.
    :return: Wrapper that forwards all arguments to *f* and returns its result.
    """
    import functools

    # functools.wraps keeps the wrapped function's __name__ and docstring.
    @functools.wraps(f)
    def a_wrapper_accepting_arguments(*args, **kargs):
        t1 = time.time()
        r = f(*args, **kargs)
        print(" %0.3f secs: %s" % (time.time() - t1, f.__name__))
        return r
    return a_wrapper_accepting_arguments
|
|
||||||
|
|
||||||
|
|
||||||
def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a random string of *size* characters drawn from *chars*."""
    picked = [random.choice(chars) for _ in range(size)]
    return ''.join(picked)
|
|
||||||
|
|
||||||
|
|
||||||
class WebTreeHandler(object):
    """Wrap a tree together with its actions and style for rendering and node actions."""

    def __init__(self, newick, actions, style):
        """Build or adopt the tree, attach actions and style, and number the nodes.

        :param newick: A Tree instance, or a value passed to Tree() to build one.
        :param actions: Stored on the tree as ``actions``.
        :param style: Stored on the tree as ``tree_style``.
        """
        if isinstance(newick, Tree):
            tree = newick
        else:
            try:
                tree = Tree(newick)
            except NewickError:
                # Retry with format=1 when the default parse fails.
                tree = Tree(newick, format=1)
        self.tree = tree

        self.tree.actions = actions
        self.tree.tree_style = style

        # Initialize node internal IDs
        for position, node in enumerate(self.tree.traverse('preorder')):
            node._nid = position

    @timeit
    def redraw(self):
        """Render the tree to a temporary PNG.

        :return: ``(nodes, faces, base64_img)`` where the image is a base64 string.
        """
        with tempfile.NamedTemporaryFile(suffix='.PNG') as png:
            img_map = self.tree.render(png.name, tree_style=self.tree.tree_style)
            png.seek(0)
            encoded = base64.b64encode(png.read())
        nodes, faces = self.get_html_map(img_map)
        return nodes, faces, encoded.decode()

    def get_html_map(self, img_map):
        """Extend each 'nodes' and 'faces' entry of *img_map* with its node area.

        :return: ``(nodes, faces)``; each entry is the original six values plus
            the matching ``node_areas`` value, or ``[0, 0, 0, 0]`` when absent.
        """
        def _with_areas(key):
            entries = img_map.get(key)
            if not entries:
                return []
            return [[x1, y1, x2, y2, nodeid, text,
                     img_map["node_areas"].get(int(nodeid), [0, 0, 0, 0])]
                    for x1, y1, x2, y2, nodeid, text in entries]

        return _with_areas("nodes"), _with_areas("faces")

    def get_avail_actions(self, nodeid):
        """Return ``[index, name]`` for every action whose show function accepts the node."""
        target = self.tree.search_nodes(_nid=int(nodeid))[0]
        return [[aindex, aname]
                for aindex, aname, show_fn, run_fn in self.tree.actions
                if show_fn(target)]

    def run_action(self, aindex, nodeid):
        """Run the action stored under *aindex* on the node with id *nodeid*."""
        target = self.tree.search_nodes(_nid=int(nodeid))[0]
        # Entries are [action_name, show_fn, run_fn]; index 2 is the run function.
        run_fn = self.tree.actions.actions[aindex][2]
        return run_fn(self.tree, target)
|
|
||||||
|
|
||||||
|
|
||||||
class NodeActions(object):
    """Registry of node actions keyed by a generated action index."""

    def __str__(self):
        """Return one ``index: name, show_fn, run_fn`` line per registered action."""
        return '\n'.join("%s: %s, %s, %s" % (aindex, aname, show_fn, run_fn)
                         for aindex, aname, show_fn, run_fn in self)

    def __iter__(self):
        """Yield ``(index, name, show_fn, run_fn)`` for each registered action."""
        for aindex, entry in self.actions.items():
            aname, show_fn, run_fn = entry
            yield (aindex, aname, show_fn, run_fn)

    def __init__(self):
        """Start with no registered actions."""
        self.actions = {}

    def clear_default_actions(self):
        """Drop every registered action."""
        self.actions = {}

    def add_action(self, action_name, show_fn, run_fn):
        """Register an action under a fresh ``act_``-prefixed index."""
        key = "act_" + id_generator()
        self.actions[key] = [action_name, show_fn, run_fn]
|
|
|
@ -3,6 +3,10 @@
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from .exceptions import MissingEnv
|
from .exceptions import MissingEnv
|
||||||
|
from redis import Redis
|
||||||
|
from redis.exceptions import ConnectionError
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
def get_homedir():
|
def get_homedir():
|
||||||
|
@ -12,3 +16,58 @@ def get_homedir():
|
||||||
Run the following command (assuming you run the code from the clonned repository):\
|
Run the following command (assuming you run the code from the clonned repository):\
|
||||||
export LOOKYLOO_HOME='{guessed_home}'")
|
export LOOKYLOO_HOME='{guessed_home}'")
|
||||||
return Path(os.environ['LOOKYLOO_HOME'])
|
return Path(os.environ['LOOKYLOO_HOME'])
|
||||||
|
|
||||||
|
|
||||||
|
def set_running(name: str) -> None:
    """Set field *name* to 1 in the 'running' hash (db 1 of the cache Redis)."""
    cache = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
    cache.hset('running', name, 1)
|
||||||
|
|
||||||
|
|
||||||
|
def unset_running(name: str) -> None:
    """Remove field *name* from the 'running' hash (db 1 of the cache Redis)."""
    cache = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
    cache.hdel('running', name)
|
||||||
|
|
||||||
|
|
||||||
|
def is_running() -> dict:
    """Return the whole 'running' hash from db 1 of the cache Redis."""
    cache = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
    running = cache.hgetall('running')
    return running
|
||||||
|
|
||||||
|
|
||||||
|
def get_socket_path(name: str) -> str:
    """Return the path of the unix socket for backend *name* under the home directory.

    :param name: Either 'cache' or 'storage'.
    :raises KeyError: If *name* is not a known backend.
    """
    sockets = {
        'cache': Path('cache', 'cache.sock'),
        'storage': Path('storage', 'storage.sock'),
    }
    home = get_homedir()
    return str(home / sockets[name])
|
||||||
|
|
||||||
|
|
||||||
|
def check_running(name: str) -> bool:
    """Tell whether the Redis backend *name* answers a PING on its unix socket.

    :param name: Backend name understood by get_socket_path().
    :return: True when the ping succeeds, False when it does not or when the
        connection fails.
    """
    socket_path = get_socket_path(name)
    try:
        # Always return a real bool; a falsy ping used to fall through and
        # return None.
        return bool(Redis(unix_socket_path=socket_path).ping())
    except ConnectionError:
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def shutdown_requested() -> bool:
    """Tell whether the 'shutdown' key exists in db 1 of the cache Redis.

    :return: True when the key exists, or when the cache cannot be reached
        (an unreachable cache is treated as a shutdown request).
    """
    try:
        r = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True)
        # exists() returns a count; convert it to match the annotated bool.
        return bool(r.exists('shutdown'))
    except (ConnectionRefusedError, ConnectionError):
        return True
|
||||||
|
|
||||||
|
|
||||||
|
def long_sleep(sleep_in_sec: int, shutdown_check: int=10) -> bool:
    """Sleep for *sleep_in_sec* seconds, checking for a shutdown request along the way.

    :param sleep_in_sec: Total time to sleep, in seconds.
    :param shutdown_check: Longest time to sleep between two shutdown checks;
        capped at *sleep_in_sec*.
    :return: False as soon as shutdown_requested() is true, True once the
        whole duration has elapsed.
    """
    if shutdown_check > sleep_in_sec:
        shutdown_check = sleep_in_sec
    sleep_until = datetime.now() + timedelta(seconds=sleep_in_sec)
    while True:
        remaining = (sleep_until - datetime.now()).total_seconds()
        if remaining <= 0:
            break
        # Never nap past the deadline: a fixed-size nap overshoots whenever
        # sleep_in_sec is not a multiple of shutdown_check.
        time.sleep(min(shutdown_check, remaining))
        if shutdown_requested():
            return False
    return True
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -12,7 +12,7 @@ setup(
|
||||||
url='https://github.com/CIRCL/lookyloo',
|
url='https://github.com/CIRCL/lookyloo',
|
||||||
description='Web interface to track the trackers.',
|
description='Web interface to track the trackers.',
|
||||||
packages=['lookyloo'],
|
packages=['lookyloo'],
|
||||||
scripts=['bin/start_website.py'],
|
scripts=['bin/start_website.py', 'bin/start.py', 'bin/run_backend.py', 'bin/async_scrape.py'],
|
||||||
include_package_data=True,
|
include_package_data=True,
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'License :: OSI Approved :: BSD License',
|
'License :: OSI Approved :: BSD License',
|
||||||
|
|
Loading…
Reference in New Issue