From c0d72e7d2a909c6790a960292b71b58217d9ab10 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 26 Feb 2019 14:50:48 +0100
Subject: [PATCH] chg: [Crawler UI] Crawler major refactor (end) + basic UI for manual crawler

---
 bin/Crawler.py                              |  49 +++--
 bin/LAUNCH.sh                               |   2 +-
 .../hiddenServices/Flask_hiddenServices.py  | 102 +++++++++-
 .../templates/Crawler_Splash_manual.html    | 180 ++++++++++++++++++
 4 files changed, 310 insertions(+), 23 deletions(-)
 create mode 100644 var/www/modules/hiddenServices/templates/Crawler_Splash_manual.html

diff --git a/bin/Crawler.py b/bin/Crawler.py
index 14fd0b3a..e8ff8eb1 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -30,6 +30,14 @@ def load_blacklist(service_type):
     except Exception:
         pass
 
+def update_auto_crawler():
+    current_epoch = int(time.time())
+    list_to_crawl = redis_crawler.zrangebyscore('crawler_auto_queue', '-inf', current_epoch)
+    for elem_to_crawl in list_to_crawl:
+        mess, type = elem_to_crawl.rsplit(';', 1)
+        redis_crawler.sadd('{}_crawler_priority_queue'.format(type), mess)
+        redis_crawler.zrem('crawler_auto_queue', elem_to_crawl)
+
 # Extract info form url (url, domain, domain url, ...)
 def unpack_url(url):
     to_crawl = {}
@@ -76,14 +84,14 @@ def get_elem_to_crawl(rotation_mode):
     for service_type in rotation_mode:
         message = redis_crawler.spop('{}_crawler_priority_queue'.format(service_type))
         if message is not None:
-            domain_service_type = type_service
+            domain_service_type = service_type
             break
     #load_normal_queue
     if message is None:
         for service_type in rotation_mode:
             message = redis_crawler.spop('{}_crawler_queue'.format(service_type))
             if message is not None:
-                domain_service_type = type_service
+                domain_service_type = service_type
                 break
 
     if message:
@@ -109,6 +117,10 @@ def get_crawler_config(redis_server, mode, service_type, domain):
             crawler_options[option] = config[option]
         else:
             crawler_options[option] = default_crawler_config[option]
+    if mode == 'auto':
+        crawler_options['time'] = int(config['time'])
+    elif mode == 'manual':
+        redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
     return crawler_options
 
 def load_crawler_config(service_type, domain, paste, date):
@@ -239,12 +251,12 @@ def search_potential_source_domain(type_service, domain):
 
 if __name__ == '__main__':
 
-    if len(sys.argv) != 3:
-        print('usage:', 'Crawler.py', 'mode', 'splash_port')
+    if len(sys.argv) != 2:
+        print('usage:', 'Crawler.py', 'splash_port')
         exit(1)
 ##################################################
     #mode = sys.argv[1]
-    splash_port = sys.argv[2]
+    splash_port = sys.argv[1]
 
     rotation_mode = ['onion', 'regular']
     default_proto_map = {'http': 80, 'https': 443}
@@ -303,13 +315,11 @@ if __name__ == '__main__':
 
     while True:
 
+        update_auto_crawler()
+
         to_crawl = get_elem_to_crawl(rotation_mode)
         if to_crawl:
-            print(to_crawl)
-            print(to_crawl['url'])
             url_data = unpack_url(to_crawl['url'])
-            print('url')
-            print(url_data)
             # remove domain from queue
             redis_crawler.srem('{}_domain_crawler_queue'.format(to_crawl['type_service']), url_data['domain'])
 
@@ -328,14 +338,15 @@ if __name__ == '__main__':
                         'date_month': datetime.datetime.now().strftime("%Y%m"),
                         'epoch': int(time.time())}
 
+                # Update crawler status type
+                r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_port)
 
                 crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], date)
-                print(crawler_config)
                 # check if default crawler
-                #if not crawler_config['requested']:
-                #    # Auto crawl only if service not up this month
-                #    if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']):
-                #        continue
+                if not crawler_config['requested']:
+                    # Auto crawl only if service not up this month
+                    if redis_crawler.sismember('month_{}_up:{}'.format(to_crawl['type_service'], date['date_month']), url_data['domain']):
+                        continue
 
                 set_crawled_domain_metadata(to_crawl['type_service'], date, url_data['domain'], to_crawl['paste'])
 
@@ -379,14 +390,20 @@ if __name__ == '__main__':
                 ############################
 
                 # update list, last crawled domains
-                redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), url_data['domain'])
+                redis_crawler.lpush('last_{}'.format(to_crawl['type_service']), '{};{}'.format(url_data['domain'], date['epoch']))
                 redis_crawler.ltrim('last_{}'.format(to_crawl['type_service']), 0, 15)
 
                 #update crawler status
                 r_cache.hset('metadata_crawler:{}'.format(splash_port), 'status', 'Waiting')
                 r_cache.hdel('metadata_crawler:{}'.format(splash_port), 'crawling_domain')
-                time.sleep(60)
+
+                # Update crawler status type
+                r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_port)
+
+                # add next auto Crawling in queue:
+                if to_crawl['paste'] == 'auto':
+                    redis_crawler.zadd('crawler_auto_queue', int(time.time()+crawler_config['crawler_options']['time']) , '{};{}'.format(to_crawl['original_message'], to_crawl['type_service']))
+
         else:
             print(' Blacklisted Domain')
            print()
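The new `crawler_auto_queue` is a Redis sorted set used as a small time-based scheduler: each member is `message;service_type`, scored with the epoch at which the domain is due again. `update_auto_crawler()` moves every due member back into the matching `*_crawler_priority_queue`, and the main loop re-schedules auto crawls with `zadd` after each run. A minimal standalone sketch of the same pattern (local Redis assumed, illustrative function names, and the mapping form of `zadd()` from redis-py >= 3.0 rather than the positional form used in the patch):

    import time
    import redis

    # local Redis assumed; key names mirror the patch, function names are illustrative
    r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)

    def schedule_auto_crawl(message, service_type, delay_seconds):
        # score = epoch at which the crawl becomes due again
        member = '{};{}'.format(message, service_type)
        r.zadd('crawler_auto_queue', {member: int(time.time()) + delay_seconds})

    def release_due_crawls():
        # move every member whose score is <= now back into its priority queue
        now = int(time.time())
        for member in r.zrangebyscore('crawler_auto_queue', '-inf', now):
            message, service_type = member.rsplit(';', 1)
            r.sadd('{}_crawler_priority_queue'.format(service_type), message)
            r.zrem('crawler_auto_queue', member)
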
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index dd5a0517..3d48ae3c 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -237,7 +237,7 @@ function launching_crawler {
     sleep 0.1
 
     for ((i=first_port;i<=last_port;i++)); do
-        screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py onion $i; read x"
+        screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c "cd ${AIL_BIN}; ${ENV_PY} ./Crawler.py $i; read x"
         sleep 0.1
     done
 
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 484056e0..dee511dc 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -8,6 +8,7 @@ import redis
 import datetime
 import sys
 import os
+import json
 
 from pyfaup.faup import Faup
 from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for
@@ -94,13 +95,16 @@ def get_domain_type(domain):
 
 def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
     list_crawled_metadata = []
-    for domain in list_domains_crawled:
+    for domain_epoch in list_domains_crawled:
+        domain, epoch = domain_epoch.rsplit(';', 1)
         metadata_domain = {}
         # get Domain type
         if type is None:
             type = get_domain_type(domain)
 
         metadata_domain['domain'] = domain
+        metadata_domain['epoch'] = epoch
+        print(epoch)
         metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type, domain), 'last_check')
         if metadata_domain['last_check'] is None:
             metadata_domain['last_check'] = '********'
@@ -118,9 +122,9 @@ def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None):
         list_crawled_metadata.append(metadata_domain)
     return list_crawled_metadata
 
-def get_crawler_splash_status(mode, type):
+def get_crawler_splash_status(type):
     crawler_metadata = []
-    all_crawlers = r_cache.smembers('all_crawler:{}:{}'.format(mode, type))
+    all_crawlers = r_cache.smembers('{}_crawlers'.format(type))
     for crawler in all_crawlers:
         crawling_domain = r_cache.hget('metadata_crawler:{}'.format(crawler), 'crawling_domain')
         started_time = r_cache.hget('metadata_crawler:{}'.format(crawler), 'started_time')
@@ -132,10 +136,21 @@ def get_crawler_splash_status(mode, type):
             status=False
         crawler_metadata.append({'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status})
 
-    crawler_metadata.append({'crawler_info': '8050 - 2019/02/18 - 16:49.54', 'crawling_domain': 'test', 'status_info': 'Crawling', 'status': True})
-    crawler_metadata.append({'crawler_info': '8051 - 2019/02/18 - 16:49.54', 'crawling_domain': 'test', 'status_info': 'Crawling', 'status': True})
     return crawler_metadata
 
+def create_crawler_config(mode, service_type, crawler_config, domain):
+    print(crawler_config)
+    if mode == 'manual':
+        r_cache.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
+    elif mode == 'auto':
+        r_serv_onion.set('crawler_config:{}:{}:{}'.format(mode, service_type, domain), json.dumps(crawler_config))
+
+def send_url_to_crawl_in_queue(mode, service_type, url):
+    r_serv_onion.sadd('{}_crawler_priority_queue'.format(service_type), '{};{}'.format(url, mode))
+    # add auto crawled url for user UI
+    if mode == 'auto':
+        r_serv_onion.sadd('auto_crawler_url:{}'.format(service_type), url)
+
 # ============= ROUTES ==============
 
 @hiddenServices.route("/hiddenServices/2", methods=['GET'])
@@ -160,7 +175,7 @@ def crawler_splash_onion():
     statDomains['domains_queue'] = r_serv_onion.scard('onion_domain_crawler_queue')
 
     list_onion = get_last_crawled_domains_metadata(last_onions, date, type='onion')
-    crawler_metadata = get_crawler_splash_status('automatic', 'onion')
+    crawler_metadata = get_crawler_splash_status('onion')
 
     date_string = '{}-{}-{}'.format(date[0:4], date[4:6], date[6:8])
     return render_template("Crawler_Splash_onion.html", last_onions=list_onion, statDomains=statDomains,
@@ -267,6 +282,81 @@ def unblacklist_onion():
     else:
         return redirect(url_for('hiddenServices.blacklisted_onion', page=page, unblacklist_onion=0))
 
+@hiddenServices.route("/crawlers/create_spider_splash", methods=['POST'])
+def create_spider_splash():
+    url = request.form.get('url_to_crawl')
+    automatic = request.form.get('crawler_type')
+    crawler_time = request.form.get('crawler_epoch')
+    #html = request.form.get('html_content_id')
+    screenshot = request.form.get('screenshot')
+    har = request.form.get('har')
+    depth_limit = request.form.get('depth_limit')
+    max_pages = request.form.get('max_pages')
+
+    # validate url
+    if url is None or url=='' or url=='\n':
+        return 'incorrect url'
+
+    crawler_config = {}
+
+    # verify user input
+    if automatic:
+        automatic = True
+    else:
+        automatic = False
+    if not screenshot:
+        crawler_config['png'] = 0
+    if not har:
+        crawler_config['har'] = 0
+
+    # verify user input
+    if depth_limit:
+        try:
+            depth_limit = int(depth_limit)
+            if depth_limit < 0:
+                return 'incorrect depth_limit'
+            else:
+                crawler_config['depth_limit'] = depth_limit
+        except:
+            return 'incorrect depth_limit'
+    if max_pages:
+        try:
+            max_pages = int(max_pages)
+            if max_pages < 1:
+                return 'incorrect max_pages'
+            else:
+                crawler_config['closespider_pagecount'] = max_pages
+        except:
+            return 'incorrect max_pages'
+
+    # get service_type
+    faup.decode(url)
+    unpack_url = faup.get()
+    domain = unpack_url['domain'].decode()
+    if unpack_url['tld'] == b'onion':
+        service_type = 'onion'
+    else:
+        service_type = 'regular'
+
+    if automatic:
+        mode = 'auto'
+        try:
+            crawler_time = int(crawler_time)
+            if crawler_time < 0:
+                return 'incorrect epoch'
+            else:
+                crawler_config['time'] = crawler_time
+        except:
+            return 'incorrect epoch'
+    else:
+        mode = 'manual'
+        epoch = None
+
+    create_crawler_config(mode, service_type, crawler_config, domain)
+    send_url_to_crawl_in_queue(mode, service_type, url)
+
+    return redirect(url_for('hiddenServices.manual'))
+
 @hiddenServices.route("/hiddenServices/", methods=['GET'])
 def hiddenServices_page():
     last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
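On the Flask side, a crawl request now boils down to two Redis writes: the per-domain options are stored as JSON under `crawler_config:<mode>:<service_type>:<domain>` (in `r_cache` for one-shot manual crawls, in `r_serv_onion` for recurring auto crawls), and the URL is pushed to `<service_type>_crawler_priority_queue` as `url;mode`, which is what `Crawler.py` pops. A condensed sketch of that contract, with hypothetical connection objects and an illustrative helper name standing in for the module-level `r_cache`/`r_serv_onion` and `create_crawler_config`/`send_url_to_crawl_in_queue`:

    import json
    import redis

    # hypothetical connections standing in for the module-level r_cache / r_serv_onion
    r_cache = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
    r_serv_onion = redis.StrictRedis(host='localhost', port=6379, db=1, decode_responses=True)

    def enqueue_crawl(url, service_type, domain, options, mode='manual'):
        """Store per-domain options as JSON and queue the URL as 'url;mode'."""
        config_key = 'crawler_config:{}:{}:{}'.format(mode, service_type, domain)
        if mode == 'manual':
            r_cache.set(config_key, json.dumps(options))       # read once by the crawler, then deleted
        else:
            r_serv_onion.set(config_key, json.dumps(options))   # kept for recurring auto crawls
        r_serv_onion.sadd('{}_crawler_priority_queue'.format(service_type), '{};{}'.format(url, mode))

    # example mirroring the fields exposed by the new form
    enqueue_crawl('http://example.onion', 'onion', 'example.onion',
                  {'depth_limit': 1, 'closespider_pagecount': 50, 'png': 1, 'har': 1})
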
diff --git a/var/www/modules/hiddenServices/templates/Crawler_Splash_manual.html b/var/www/modules/hiddenServices/templates/Crawler_Splash_manual.html
new file mode 100644
index 00000000..8c9a9bad
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/Crawler_Splash_manual.html
@@ -0,0 +1,180 @@
[The 180 added template lines are not recoverable from this extract: the HTML markup was stripped. What survives: a page titled "AIL-Framework" that includes 'nav_bar.html' and a "Crawl a Domain" card ("Enter a domain and choose what kind of data you want.") with a URL input, a Manual/Automatic selector, a "Time (seconds) between each crawling" field for automatic crawls, screenshot and HAR options, "Depth Limit" and "Max Pages" inputs, and a submit button posting the fields read by create_spider_splash above.]
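Since the template only POSTs those form fields to `/crawlers/create_spider_splash`, the route can also be exercised without the UI. A sketch using `requests`; the host, port and absence of authentication are assumptions to adapt to the local deployment:

    import requests

    form = {
        'url_to_crawl': 'http://example.onion',
        'crawler_type': '',        # empty -> manual; any non-empty value -> automatic
        'crawler_epoch': '3600',   # seconds between crawls, only read in automatic mode
        'screenshot': 'on',
        'har': 'on',
        'depth_limit': '1',
        'max_pages': '50',
    }
    # assumed local AIL Flask instance with a self-signed certificate
    r = requests.post('https://localhost:7000/crawlers/create_spider_splash',
                      data=form, verify=False)
    print(r.status_code)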