mirror of https://github.com/CIRCL/AIL-framework
chg: [Crawler UI] Crawler major refactor (end) + basic UI for manual crawler
parent
7b32d7f34e
commit
c0d72e7d2a
|
@ -30,6 +30,14 @@ def load_blacklist(service_type):
|
|||
except Exception:
|
||||
pass
|
||||
|
||||
def update_auto_crawler():
    """Promote due auto-crawl entries to their per-service priority queue.

    Reads every member of the ``crawler_auto_queue`` sorted set whose score is
    at most the current epoch, splits it on its last ``;`` into the queued
    message and the service type, adds the message to
    ``<service_type>_crawler_priority_queue`` and removes the entry from the
    sorted set.
    """
    current_epoch = int(time.time())
    due_entries = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)
    for entry in due_entries:
        # Only the last ';' separates the service type; the message part can
        # itself contain ';', hence rsplit with maxsplit=1.
        # `service_type` (not `type`) so the builtin is not shadowed.
        message, service_type = entry.rsplit(';', 1)
        redis_crawler.sadd('{}_crawler_priority_queue'.format(service_type), message)
        redis_crawler.zrem('crawler_auto_queue', entry)
|
||||
|
||||
# Extract info from url (url, domain, domain url, ...)
|
||||
def unpack_url(url):
|
||||
to_crawl = {}
|
||||
|
@ -76,14 +84,14 @@ def get_elem_to_crawl(rotation_mode):
|
|||
for service_type in rotation_mode:
|
||||
message = redis_crawler.spop('{}_crawler_priority_queue'.format(service_type))
|
||||
if message is not None:
|
||||
domain_service_type = type_service
|
||||
domain_service_type = service_type
|
||||
break
|
||||
#load_normal_queue
|
||||
if message is None:
|
||||
for service_type in rotation_mode:
|
||||
message = redis_crawler.spop('{}_crawler_queue'.format(service_type))
|
||||
if message is not None:
|
||||
domain_service_type = type_service
|
||||
domain_service_type = service_type
|
||||
break
|
||||
|
||||
if message:
|
||||
|
@ -109,6 +117,10 @@ def get_crawler_config(redis_server, mode, service_type, domain):
|
|||
crawler_options[option] = config[option]
|
||||
else:
|
||||
crawler_options[option] = default_crawler_config[option]
|
||||
if mode == 'auto':
|
||||
crawler_options['time'] = int(config['time'])
|
||||
elif mode == 'manual':
|
||||
redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
|
||||
return crawler_options
|
||||
|
||||
def load_crawler_config(service_type, domain, paste, date):
|
||||
|
@ -239,12 +251,12 @@ def search_potential_source_domain(type_service, domain):
|
|||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if len(sys.argv) != 3:
|
||||
print('usage:', 'Crawler.py', 'mode', 'splash_port')
|
||||
if len(sys.argv) != 2:
|
||||
print('usage:', 'Crawler.py', 'splash_port')
|
||||
exit(1)
|
||||
##################################################
|
||||
#mode = sys.argv[1]
|
||||
splash_port = sys.argv[2]
|
||||
splash_port = sys.argv[1]
|
||||
|
||||
rotation_mode = ['onion', 'regular']
|
||||
default_proto_map = {'http': 80, 'https': 443}
|
||||
|
@ -303,13 +315,11 @@ if __name__ == '__main__':
|
|||
|
||||
while True:
|
||||
|
||||
update_auto_crawler()
|
||||
|
||||
to_crawl = get_elem_to_crawl(rotation_mode)
|
||||
if to_crawl:
|
||||
print(to_crawl)
|
||||
print(to_crawl['url'])
|
||||
url_data = unpack_url(to_crawl['url'])
|
||||
print('url')
|
||||
print(url_data)
|
||||
# remove domain from queue
|
||||
redis_crawler.srem('{}_domain_crawler_queue'.format(to_crawl['type_service']), url_data['domain'])
|
||||
|
||||
|
@ -328,14 +338,15 @@ if __name__ == '__main__':
|
|||
'date_month': datetime.datetime.now().strftime("%Y%m"),
|
||||
'epoch': int(time.time())}
|
||||
|
||||
# Update crawler status type
|
||||
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
|
||||
|
||||
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], date)
|
||||
print(crawler_config)
|
||||
# check if default crawler
|
||||
#if not crawler_config['requested']:
|
||||
# # Auto crawl only if service not up this month
|
||||
# if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']):
|
||||
# continue
|
||||
if not crawler_config['requested']:
|
||||
# Auto crawl only if service not up this month
|
||||
if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']):
|
||||
continue
|
||||
|
||||
set_crawled_domain_metadata(to_crawl['type_service'], date, url_data['domain'], to_crawl['paste'])
|
||||
|
||||
|
@ -379,14 +390,20 @@ if __name__ == '__main__':
|
|||
############################
|
||||
|
||||
# update list, last crawled domains
|
||||
redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), url_data['domain'])
|
||||
redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{};{}'.format(url_data['domain'], date['epoch']))
|
||||
redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
|
||||
|
||||
#update crawler status
|
||||
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
|
||||
r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
|
||||
|
||||
time.sleep(60)
|
||||
# Update crawler status type
|
||||
r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port)
|
||||
|
||||
# add next auto Crawling in queue:
|
||||
if to_crawl['paste'] == 'auto':
|
||||
redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
|
||||
|
||||
else:
|
||||
print(' Blacklisted Domain')
|
||||
print()
|
||||
|
|
|
@ -237,7 +237,7 @@ function launching_crawler {
|
|||
sleep 0.1
|
||||
|
||||
for ((i=first_port;i<=last_port;i++)); do
|
||||
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py onion $i; read x"
|
||||
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py $i; read x"
|
||||
sleep 0.1
|
||||
done
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@ import redis
|
|||
import datetime
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
from pyfaup.faup import Faup
|
||||
from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for
|
||||
|
||||
|
@ -94,13 +95,16 @@ def get_domain_type(domain):
|
|||
|
||||
def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
|
||||
list_crawled_metadata = []
|
||||
for domain in list_domains_crawled:
|
||||
for domain_epoch in list_domains_crawled:
|
||||
domain, epoch = domain_epoch.rsplit(';', 1)
|
||||
metadata_domain = {}
|
||||
# get Domain type
|
||||
if type is None:
|
||||
type = get_domain_type(domain)
|
||||
|
||||
metadata_domain['domain'] = domain
|
||||
metadata_domain['epoch'] = epoch
|
||||
print(epoch)
|
||||
metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
|
||||
if metadata_domain['last_check'] is None:
|
||||
metadata_domain['last_check'] = '********'
|
||||
|
@ -118,9 +122,9 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
|
|||
list_crawled_metadata.append(metadata_domain)
|
||||
return list_crawled_metadata
|
||||
|
||||
def get_crawler_splash_status(mode, type):
|
||||
def get_crawler_splash_status(type):
|
||||
crawler_metadata = []
|
||||
all_crawlers = r_cache.smembers('all_crawler:{}:{}'.format(mode, type))
|
||||
all_crawlers = r_cache.smembers('{}_crawlers'.format(type))
|
||||
for crawler in all_crawlers:
|
||||
crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
|
||||
started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
|
||||
|
@ -132,10 +136,21 @@ def get_crawler_splash_status(mode, type):
|
|||
status=False
|
||||
crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
|
||||
|
||||
crawler_metadata.append({'crawler_info': '8050 - 2019/02/18 - 16:49.54', 'crawling_domain': 'test', 'status_info': 'Crawling', 'status': True})
|
||||
crawler_metadata.append({'crawler_info': '8051 - 2019/02/18 - 16:49.54', 'crawling_domain': 'test', 'status_info': 'Crawling', 'status': True})
|
||||
return crawler_metadata
|
||||
|
||||
def create_crawler_config(mode, service_type, crawler_config, domain):
    """Store the JSON-encoded crawler options of one domain.

    The options are written under ``crawler_config:<mode>:<service_type>:<domain>``:
    in ``r_cache`` when ``mode`` is ``'manual'``, in ``r_serv_onion`` when it is
    ``'auto'``. Any other mode stores nothing.
    """
    print(crawler_config)

    # Pick the Redis handle that matches the crawling mode.
    if mode == 'manual':
        config_store = r_cache
    elif mode == 'auto':
        config_store = r_serv_onion
    else:
        return

    config_store.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain),
                     json.dumps(crawler_config))
|
||||
|
||||
def send_url_to_crawl_in_queue(mode, service_type, url):
    """Push ``url`` onto the priority crawl queue of ``service_type``.

    The queued message is ``'<url>;<mode>'``. When ``mode`` is ``'auto'`` the
    url is additionally added to the ``auto_crawler_url:<service_type>`` set.
    """
    priority_queue = '{}_crawler_priority_queue'.format(service_type)
    queue_message = '{};{}'.format(url, mode)
    r_serv_onion.sadd(priority_queue, queue_message)

    # add auto crawled url for user UI
    if mode == 'auto':
        auto_url_set = 'auto_crawler_url:{}'.format(service_type)
        r_serv_onion.sadd(auto_url_set, url)
|
||||
|
||||
# ============= ROUTES ==============
|
||||
|
||||
@hiddenServices.route("/hiddenServices/2", methods=['GET'])
|
||||
|
@ -160,7 +175,7 @@ def crawler_splash_onion():
|
|||
statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue')
|
||||
|
||||
list_onion = get_last_crawled_domains_metadata(last_onions, date, type='onion')
|
||||
crawler_metadata = get_crawler_splash_status('automatic', 'onion')
|
||||
crawler_metadata = get_crawler_splash_status('onion')
|
||||
|
||||
date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
|
||||
return render_template("Crawler_Splash_onion.html", last_onions=list_onion, statDomains=statDomains,
|
||||
|
@ -267,6 +282,81 @@ def unblacklist_onion():
|
|||
else:
|
||||
return redirect(url_for('hiddenServices.blacklisted_onion', page=page, unblacklist_onion=0))
|
||||
|
||||
def _parse_int_field(raw_value, minimum):
    """Return ``raw_value`` as an int, or None if it is not an integer >= ``minimum``."""
    try:
        value = int(raw_value)
    except (TypeError, ValueError):
        # TypeError: field missing from the form (None); ValueError: not a number.
        return None
    if value < minimum:
        return None
    return value


@hiddenServices.route("/crawlers/create_spider_splash", methods=['POST'])
def create_spider_splash():
    """Validate the manual-crawler form and queue the requested crawl.

    Form fields read: ``url_to_crawl``, ``crawler_type`` (set => automatic
    crawling), ``crawler_epoch``, ``screenshot``, ``har``, ``depth_limit`` and
    ``max_pages``.

    Returns:
        A plain ``'incorrect ...'`` string when a field is invalid, otherwise a
        redirect to ``hiddenServices.manual`` after the crawler config has been
        stored and the url pushed to the priority queue.
    """
    url = request.form.get('url_to_crawl')
    automatic = bool(request.form.get('crawler_type'))
    crawler_time = request.form.get('crawler_epoch')
    #html = request.form.get('html_content_id')
    screenshot = request.form.get('screenshot')
    har = request.form.get('har')
    depth_limit = request.form.get('depth_limit')
    max_pages = request.form.get('max_pages')

    # validate url: reject a missing, empty or whitespace-only value, and drop
    # surrounding whitespace so it does not end up in the queued message
    url = (url or '').strip()
    if not url:
        return 'incorrect url'

    crawler_config = {}

    # unchecked boxes are absent from the form: explicitly disable the option
    if not screenshot:
        crawler_config['png'] = 0
    if not har:
        crawler_config['har'] = 0

    # verify user input
    if depth_limit:
        depth_limit = _parse_int_field(depth_limit, 0)
        if depth_limit is None:
            return 'incorrect depth_limit'
        crawler_config['depth_limit'] = depth_limit
    if max_pages:
        max_pages = _parse_int_field(max_pages, 1)
        if max_pages is None:
            return 'incorrect max_pages'
        crawler_config['closespider_pagecount'] = max_pages

    # get service_type
    faup.decode(url)
    url_parts = faup.get()
    # NOTE(review): assumes faup reports a domain it could not extract as
    # None - confirm; without this guard .decode() raises AttributeError.
    if url_parts['domain'] is None:
        return 'incorrect url'
    domain = url_parts['domain'].decode()
    if url_parts['tld'] == b'onion':
        service_type = 'onion'
    else:
        service_type = 'regular'

    if automatic:
        mode = 'auto'
        # seconds between two crawls of this url
        crawler_time = _parse_int_field(crawler_time, 0)
        if crawler_time is None:
            return 'incorrect epoch'
        crawler_config['time'] = crawler_time
    else:
        mode = 'manual'

    create_crawler_config(mode, service_type, crawler_config, domain)
    send_url_to_crawl_in_queue(mode, service_type, url)

    return redirect(url_for('hiddenServices.manual'))
|
||||
|
||||
@hiddenServices.route("/hiddenServices/", methods=['GET'])
|
||||
def hiddenServices_page():
|
||||
last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
|
||||
|
|
|
@ -0,0 +1,180 @@
|
|||
<!DOCTYPE html>
|
||||
|
||||
<html>
|
||||
<head>
|
||||
<title>AIL-Framework</title>
|
||||
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
|
||||
<!-- Core CSS -->
|
||||
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
|
||||
<link href="{{ url_for('static', filename='css/daterangepicker.min.css') }}" rel="stylesheet">
|
||||
|
||||
<!-- JS -->
|
||||
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
|
||||
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
|
||||
<script language="javascript" src="{{ url_for('static', filename='js/moment.min.js') }}"></script>
|
||||
<script language="javascript" src="{{ url_for('static', filename='js/jquery.daterangepicker.min.js') }}"></script>
|
||||
<script language="javascript" src="{{ url_for('static', filename='js/d3.min.js') }}"></script>
|
||||
|
||||
</head>
|
||||
|
||||
<body>
|
||||
|
||||
{% include 'nav_bar.html' %}
|
||||
|
||||
<div class="container-fluid">
|
||||
<div class="row">
|
||||
|
||||
<div class="col-12 col-lg-2 p-0 bg-light border-right">
|
||||
|
||||
|
||||
<nav class="navbar navbar-expand navbar-light bg-light flex-md-column flex-row align-items-start py-2">
|
||||
<h5 class="d-flex text-muted w-100">
|
||||
<span>Splash Crawlers </span>
|
||||
<a class="ml-auto" href="#">
|
||||
<i class="fas fa-plus-circle ml-auto"></i>
|
||||
</a>
|
||||
</h5>
|
||||
<ul class="nav flex-md-column flex-row navbar-nav justify-content-between w-100"> <!--nav-pills-->
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="#">
|
||||
<i class="fas fa-search"></i>
|
||||
<span>Dashboard</span>
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link active" href="#">
|
||||
<i class="fas fa-sync"></i>
|
||||
Automatic Onion Crawler
|
||||
</a>
|
||||
</li>
|
||||
<li class="nav-item">
|
||||
<a class="nav-link" href="#">
|
||||
<i class="fas fa-clock"></i>
|
||||
Manual Splash Crawler
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
|
||||
<div class="col-12 col-lg-10">
|
||||
|
||||
|
||||
<div class="card text-white bg-dark mb-3 mt-1">
|
||||
<div class="card-header">
|
||||
<h5 class="card-title">Crawl a Domain</h5>
|
||||
</div>
|
||||
<div class="card-body">
|
||||
<p class="card-text">Enter a domain and choose what kind of data you want.</p>
|
||||
<form action="{{ url_for('hiddenServices.create_spider_splash') }}" method='post'>
|
||||
<div class="row">
|
||||
<div class="col-12 col-lg-6">
|
||||
<div class="input-group" id="date-range-from">
|
||||
<input type="text" class="form-control" id="url_to_crawl" name="url_to_crawl" placeholder="Address or Domain">
|
||||
</div>
|
||||
<div class="d-flex mt-1">
|
||||
<i class="fas fa-user-ninja mt-1"></i> Manual
|
||||
<div class="custom-control custom-switch">
|
||||
<input class="custom-control-input" type="checkbox" name="crawler_type" value="True" id="crawler_type">
|
||||
<label class="custom-control-label" for="crawler_type">
|
||||
<i class="fas fa-clock"></i> Automatic
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="input-group mt-2 mb-2" id="crawler_epoch_input">
|
||||
<div class="input-group-prepend">
|
||||
<span class="input-group-text bg-light"><i class="fas fa-clock"></i> </span>
|
||||
</div>
|
||||
<input class="form-control" type="number" id="crawler_epoch" value="3600" name="crawler_epoch" required>
|
||||
<div class="input-group-append">
|
||||
<span class="input-group-text">Time (seconds) between each crawling</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="col-12 col-lg-6 mt-2 mt-lg-0">
|
||||
|
||||
<div class="row">
|
||||
<div class="col-12 col-xl-6">
|
||||
<div class="custom-control custom-switch">
|
||||
<input class="custom-control-input" type="checkbox" name="html_content" value="True" id="html_content_id" checked disabled>
|
||||
<label class="custom-control-label" for="html_content_id">
|
||||
<i class="fab fa-html5"></i> HTML
|
||||
</label>
|
||||
</div>
|
||||
<div class="custom-control custom-switch mt-1">
|
||||
<input class="custom-control-input" type="checkbox" name="screenshot" value="True" id="screenshot_id">
|
||||
<label class="custom-control-label" for="screenshot_id">
|
||||
<i class="fas fa-image"></i> Screenshot
|
||||
</label>
|
||||
</div>
|
||||
<div class="custom-control custom-switch mt-1">
|
||||
<input class="custom-control-input" type="checkbox" name="har" value="True" id="har_id">
|
||||
<label class="custom-control-label" for="har_id">
|
||||
<i class="fas fa-file"></i> HAR
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-12 col-xl-6">
|
||||
<div class="input-group form-group mb-0">
|
||||
<div class="input-group-prepend">
|
||||
<span class="input-group-text bg-light"><i class="fas fa-water"></i></span>
|
||||
</div>
|
||||
<input class="form-control" type="number" id="depth_limit" name="depth_limit" value="0" required>
|
||||
<div class="input-group-append">
|
||||
<span class="input-group-text">Depth Limit</span>
|
||||
</div>
|
||||
</div>
|
||||
<div class="input-group mt-2">
|
||||
<div class="input-group-prepend">
|
||||
<span class="input-group-text bg-light"><i class="fas fa-copy"></i> </span>
|
||||
</div>
|
||||
<input class="form-control" type="number" id="max_pages" name="max_pages" value="1" required>
|
||||
<div class="input-group-append">
|
||||
<span class="input-group-text">Max Pages</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<button class="btn btn-primary mt-2">
|
||||
<i class="fas fa-spider"></i> Send to Spider
|
||||
</button>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
</body>
|
||||
|
||||
<script>
|
||||
var chart = {};
|
||||
$(document).ready(function(){
|
||||
$("#page-Crawler").addClass("active");
|
||||
manual_crawler_input_controler();
|
||||
|
||||
$('#crawler_type').change(function () {
|
||||
manual_crawler_input_controler();
|
||||
});
|
||||
});
|
||||
|
||||
// Show the "time between each crawling" input only while the
// Manual/Automatic switch (#crawler_type) is set to Automatic.
function manual_crawler_input_controler() {
  var is_automatic = $('#crawler_type').is(':checked');
  $("#crawler_epoch_input").toggle(is_automatic);
}
|
||||
|
||||
</script>
|
Loading…
Reference in New Issue