chg: [Crawler UI] Crawler major refactor (end) + basic UI for manual crawler

pull/342/head
Terrtia 2019-02-26 14:50:48 +01:00
parent 7b32d7f34e
commit c0d72e7d2a
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
4 changed files with 310 additions and 23 deletions

View File

@ -30,6 +30,14 @@ def load_blacklist(service_type):
except Exception:
pass
def update_auto_crawler():
    """Promote scheduled auto-crawl tasks whose time has come.

    Tasks live in the 'crawler_auto_queue' sorted set, scored by the epoch
    at which they should next run. Each due entry is 'message;service_type':
    the message is pushed onto the matching
    '<service_type>_crawler_priority_queue' set and the entry is removed
    from the schedule.
    """
    current_epoch = int(time.time())
    # every entry scheduled at or before now
    list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)
    for elem_to_crawl in list_to_crawl:
        # rsplit on the last ';' — the message itself may contain ';'
        # (renamed from 'type', which shadowed the builtin)
        mess, service_type = elem_to_crawl.rsplit(';', 1)
        redis_crawler.sadd('{}_crawler_priority_queue'.format(service_type), mess)
        redis_crawler.zrem('crawler_auto_queue', elem_to_crawl)
# Extract info form url (url, domain, domain url, ...)
def unpack_url(url):
to_crawl = {}
@ -76,14 +84,14 @@ def get_elem_to_crawl(rotation_mode):
for service_type in rotation_mode:
message = redis_crawler.spop('{}_crawler_priority_queue'.format(service_type))
if message is not None:
domain_service_type = type_service
domain_service_type = service_type
break
#load_normal_queue
if message is None:
for service_type in rotation_mode:
message = redis_crawler.spop('{}_crawler_queue'.format(service_type))
if message is not None:
domain_service_type = type_service
domain_service_type = service_type
break
if message:
@ -109,6 +117,10 @@ def get_crawler_config(redis_server, mode, service_type, domain):
crawler_options[option] = config[option]
else:
crawler_options[option] = default_crawler_config[option]
if mode == 'auto':
crawler_options['time'] = int(config['time'])
elif mode == 'manual':
redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
return crawler_options
def load_crawler_config(service_type, domain, paste, date):
@ -239,12 +251,12 @@ def search_potential_source_domain(type_service, domain):
if __name__ == '__main__':
if len(sys.argv) != 3:
print('usage:', 'Crawler.py', 'mode', 'splash_port')
if len(sys.argv) != 2:
print('usage:', 'Crawler.py', 'splash_port')
exit(1)
##################################################
#mode = sys.argv[1]
splash_port = sys.argv[2]
splash_port = sys.argv[1]
rotation_mode = ['onion', 'regular']
default_proto_map = {'http': 80, 'https': 443}
@ -303,13 +315,11 @@ if __name__ == '__main__':
while True:
update_auto_crawler()
to_crawl = get_elem_to_crawl(rotation_mode)
if to_crawl:
print(to_crawl)
print(to_crawl['url'])
url_data = unpack_url(to_crawl['url'])
print('url')
print(url_data)
# remove domain from queue
redis_crawler.srem('{}_domain_crawler_queue'.format(to_crawl['type_service']), url_data['domain'])
@ -328,14 +338,15 @@ if __name__ == '__main__':
'date_month': datetime.datetime.now().strftime("%Y%m"),
'epoch': int(time.time())}
# Update crawler status type
r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], date)
print(crawler_config)
# check if default crawler
#if not crawler_config['requested']:
# # Auto crawl only if service not up this month
# if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']):
# continue
if not crawler_config['requested']:
# Auto crawl only if service not up this month
if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']):
continue
set_crawled_domain_metadata(to_crawl['type_service'], date, url_data['domain'], to_crawl['paste'])
@ -379,14 +390,20 @@ if __name__ == '__main__':
############################
# update list, last crawled domains
redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), url_data['domain'])
redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{};{}'.format(url_data['domain'], date['epoch']))
redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
#update crawler status
r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
time.sleep(60)
# Update crawler status type
r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port)
# add next auto Crawling in queue:
if to_crawl['paste'] == 'auto':
redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
else:
print(' Blacklisted Domain')
print()

View File

@ -237,7 +237,7 @@ function launching_crawler {
sleep 0.1
for ((i=first_port;i<=last_port;i++)); do
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py onion $i; read x"
screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py $i; read x"
sleep 0.1
done

View File

@ -8,6 +8,7 @@ import redis
import datetime
import sys
import os
import json
from pyfaup.faup import Faup
from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for
@ -94,13 +95,16 @@ def get_domain_type(domain):
def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
list_crawled_metadata = []
for domain in list_domains_crawled:
for domain_epoch in list_domains_crawled:
domain, epoch = domain_epoch.rsplit(';', 1)
metadata_domain = {}
# get Domain type
if type is None:
type = get_domain_type(domain)
metadata_domain['domain'] = domain
metadata_domain['epoch'] = epoch
print(epoch)
metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
if metadata_domain['last_check'] is None:
metadata_domain['last_check'] = '********'
@ -118,9 +122,9 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
list_crawled_metadata.append(metadata_domain)
return list_crawled_metadata
def get_crawler_splash_status(mode, type):
def get_crawler_splash_status(type):
crawler_metadata = []
all_crawlers = r_cache.smembers('all_crawler:{}:{}'.format(mode, type))
all_crawlers = r_cache.smembers('{}_crawlers'.format(type))
for crawler in all_crawlers:
crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
@ -132,10 +136,21 @@ def get_crawler_splash_status(mode, type):
status=False
crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
crawler_metadata.append({'crawler_info': '8050 - 2019/02/18 - 16:49.54', 'crawling_domain': 'test', 'status_info': 'Crawling', 'status': True})
crawler_metadata.append({'crawler_info': '8051 - 2019/02/18 - 16:49.54', 'crawling_domain': 'test', 'status_info': 'Crawling', 'status': True})
return crawler_metadata
def create_crawler_config(mode, service_type, crawler_config, domain):
    """Persist a crawler configuration for a domain.

    Manual configs are one-shot: stored in the cache DB (r_cache) and
    deleted by the crawler once consumed. Auto configs are persistent:
    stored in the onion DB (r_serv_onion) so they survive between
    scheduled re-crawls.

    :param mode: 'manual' or 'auto'
    :param service_type: 'onion' or 'regular'
    :param crawler_config: dict of crawler options, stored as JSON
    :param domain: domain the config applies to
    """
    # (removed leftover debug print of the config dict)
    config_key = 'crawler_config:{}:{}:{}'.format(mode, service_type, domain)
    if mode == 'manual':
        r_cache.set(config_key, json.dumps(crawler_config))
    elif mode == 'auto':
        r_serv_onion.set(config_key, json.dumps(crawler_config))
def send_url_to_crawl_in_queue(mode, service_type, url):
    """Queue a URL for crawling in the priority queue of its service type.

    In auto mode the URL is additionally recorded in a per-service set so
    the user UI can list everything under automatic crawling.
    """
    message = '{};{}'.format(url, mode)
    queue_key = '{}_crawler_priority_queue'.format(service_type)
    r_serv_onion.sadd(queue_key, message)
    # add auto crawled url for user UI
    if mode == 'auto':
        r_serv_onion.sadd('auto_crawler_url:{}'.format(service_type), url)
# ============= ROUTES ==============
@hiddenServices.route("/hiddenServices/2", methods=['GET'])
@ -160,7 +175,7 @@ def crawler_splash_onion():
statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue')
list_onion = get_last_crawled_domains_metadata(last_onions, date, type='onion')
crawler_metadata = get_crawler_splash_status('automatic', 'onion')
crawler_metadata = get_crawler_splash_status('onion')
date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
return render_template("Crawler_Splash_onion.html", last_onions=list_onion, statDomains=statDomains,
@ -267,6 +282,81 @@ def unblacklist_onion():
else:
return redirect(url_for('hiddenServices.blacklisted_onion', page=page, unblacklist_onion=0))
@hiddenServices.route("/crawlers/create_spider_splash", methods=['POST'])
def create_spider_splash():
    """Handle the 'Crawl a Domain' form (manual or automatic crawl).

    Validates the submitted URL and options, builds a crawler config dict,
    persists it, queues the URL for crawling, then redirects back to the
    manual crawler page. Returns a short error string on invalid input.
    """
    url = request.form.get('url_to_crawl')
    automatic = request.form.get('crawler_type')
    crawler_time = request.form.get('crawler_epoch')
    #html = request.form.get('html_content_id')
    screenshot = request.form.get('screenshot')
    har = request.form.get('har')
    depth_limit = request.form.get('depth_limit')
    max_pages = request.form.get('max_pages')

    # validate url
    if url is None or url=='' or url=='\n':
        return 'incorrect url'

    crawler_config = {}

    # the checkbox is only present in the form data when checked
    automatic = bool(automatic)

    # screenshot/har are on by default; only record an explicit opt-out
    if not screenshot:
        crawler_config['png'] = 0
    if not har:
        crawler_config['har'] = 0

    # verify user input (bare except narrowed to int() conversion errors)
    if depth_limit:
        try:
            depth_limit = int(depth_limit)
        except (TypeError, ValueError):
            return 'incorrect depth_limit'
        if depth_limit < 0:
            return 'incorrect depth_limit'
        crawler_config['depth_limit'] = depth_limit

    if max_pages:
        try:
            max_pages = int(max_pages)
        except (TypeError, ValueError):
            return 'incorrect max_pages'
        if max_pages < 1:
            return 'incorrect max_pages'
        crawler_config['closespider_pagecount'] = max_pages

    # get service_type from the url's TLD
    faup.decode(url)
    unpack_url = faup.get()
    domain = unpack_url['domain'].decode()
    if unpack_url['tld'] == b'onion':
        service_type = 'onion'
    else:
        service_type = 'regular'

    if automatic:
        mode = 'auto'
        try:
            crawler_time = int(crawler_time)
        except (TypeError, ValueError):
            return 'incorrect epoch'
        if crawler_time < 0:
            return 'incorrect epoch'
        # seconds between two automatic crawls of this domain
        crawler_config['time'] = crawler_time
    else:
        mode = 'manual'
        # (removed dead 'epoch = None' assignment — never read)

    create_crawler_config(mode, service_type, crawler_config, domain)
    send_url_to_crawl_in_queue(mode, service_type, url)
    return redirect(url_for('hiddenServices.manual'))
@hiddenServices.route("/hiddenServices/", methods=['GET'])
def hiddenServices_page():
last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)

View File

@ -0,0 +1,180 @@
<!DOCTYPE html>
<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/daterangepicker.min.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/popper.min.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
<script language="javascript" src="{{ url_for('static', filename='js/moment.min.js') }}"></script>
<script language="javascript" src="{{ url_for('static', filename='js/jquery.daterangepicker.min.js') }}"></script>
<script language="javascript" src="{{ url_for('static', filename='js/d3.min.js') }}"></script>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
<div class="col-12 col-lg-2 p-0 bg-light border-right">
<nav class="navbar navbar-expand navbar-light bg-light flex-md-column flex-row align-items-start py-2">
<h5 class="d-flex text-muted w-100">
<span>Splash Crawlers </span>
<a class="ml-auto" href="#">
<i class="fas fa-plus-circle ml-auto"></i>
</a>
</h5>
<ul class="nav flex-md-column flex-row navbar-nav justify-content-between w-100"> <!--nav-pills-->
<li class="nav-item">
<a class="nav-link" href="#">
<i class="fas fa-search"></i>
<span>Dashboard</span>
</a>
</li>
<li class="nav-item">
<a class="nav-link active" href="#">
<i class="fas fa-sync"></i>
Automatic Onion Crawler
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="#">
<i class="fas fa-clock"></i>
Manual Splash Crawler
</a>
</li>
</ul>
</nav>
</div>
<div class="col-12 col-lg-10">
<div class="card text-white bg-dark mb-3 mt-1">
<div class="card-header">
<h5 class="card-title">Crawl a Domain</h5>
</div>
<div class="card-body">
<p class="card-text">Enter a domain and choose what kind of data you want.</p>
<form action="{{ url_for('hiddenServices.create_spider_splash') }}" method='post'>
<div class="row">
<div class="col-12 col-lg-6">
<div class="input-group" id="date-range-from">
<input type="text" class="form-control" id="url_to_crawl" name="url_to_crawl" placeholder="Address or Domain">
</div>
<div class="d-flex mt-1">
<i class="fas fa-user-ninja mt-1"></i> &nbsp;Manual&nbsp;&nbsp;
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="crawler_type" value="True" id="crawler_type">
<label class="custom-control-label" for="crawler_type">
<i class="fas fa-clock"></i> &nbsp;Automatic
</label>
</div>
</div>
<div class="input-group mt-2 mb-2" id="crawler_epoch_input">
<div class="input-group-prepend">
<span class="input-group-text bg-light"><i class="fas fa-clock"></i>&nbsp;</span>
</div>
<input class="form-control" type="number" id="crawler_epoch" value="3600" name="crawler_epoch" required>
<div class="input-group-append">
<span class="input-group-text">Time (seconds) between each crawling</span>
</div>
</div>
</div>
<div class="col-12 col-lg-6 mt-2 mt-lg-0">
<div class="row">
<div class="col-12 col-xl-6">
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="html_content" value="True" id="html_content_id" checked disabled>
<label class="custom-control-label" for="html_content_id">
<i class="fab fa-html5"></i> &nbsp;HTML
</label>
</div>
<div class="custom-control custom-switch mt-1">
<input class="custom-control-input" type="checkbox" name="screenshot" value="True" id="screenshot_id">
<label class="custom-control-label" for="screenshot_id">
<i class="fas fa-image"></i> Screenshot
</label>
</div>
<div class="custom-control custom-switch mt-1">
<input class="custom-control-input" type="checkbox" name="har" value="True" id="har_id">
<label class="custom-control-label" for="har_id">
<i class="fas fa-file"></i> &nbsp;HAR
</label>
</div>
</div>
<div class="col-12 col-xl-6">
<div class="input-group form-group mb-0">
<div class="input-group-prepend">
<span class="input-group-text bg-light"><i class="fas fa-water"></i></span>
</div>
<input class="form-control" type="number" id="depth_limit" name="depth_limit" value="0" required>
<div class="input-group-append">
<span class="input-group-text">Depth Limit</span>
</div>
</div>
<div class="input-group mt-2">
<div class="input-group-prepend">
<span class="input-group-text bg-light"><i class="fas fa-copy"></i>&nbsp;</span>
</div>
<input class="form-control" type="number" id="max_pages" name="max_pages" value="1" required>
<div class="input-group-append">
<span class="input-group-text">Max Pages</span>
</div>
</div>
</div>
</div>
</div>
</div>
<button class="btn btn-primary mt-2">
<i class="fas fa-spider"></i> Send to Spider
</button>
</form>
</div>
</div>
</div>
</div>
</div>
</body>
<script>
var chart = {};
$(document).ready(function(){
$("#page-Crawler").addClass("active");
manual_crawler_input_controler();
$('#crawler_type').change(function () {
manual_crawler_input_controler();
});
});
// Show the re-crawl interval input only when automatic mode is selected.
function manual_crawler_input_controler() {
  var is_automatic = $('#crawler_type').is(':checked');
  var epoch_input = $("#crawler_epoch_input");
  if (is_automatic) {
    epoch_input.show();
  } else {
    epoch_input.hide();
  }
}
</script>