diff --git a/bin/Crawler.py b/bin/Crawler.py
index 34406574..6a61a0ba 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -349,7 +349,7 @@ if __name__ == '__main__':
                       'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
 
     # Track launched crawler
-    r_cache.sadd('all_crawler', splash_url)
+    r_cache.sadd('all_splash_crawlers', splash_url)
     r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
     r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
@@ -385,7 +385,7 @@ if __name__ == '__main__':
                            'epoch': int(time.time())}
 
             # Update crawler status type
-            r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url)
+            r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', to_crawl['type_service'])
 
             crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
             # check if default crawler
@@ -437,7 +437,7 @@ if __name__ == '__main__':
             r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain')
 
             # Update crawler status type
-            r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url)
+            r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'type')
 
             # add next auto Crawling in queue:
             if to_crawl['paste'] == 'auto':
diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py
index 6f1e3cf7..8b43be99 100755
--- a/bin/core/Crawler_manager.py
+++ b/bin/core/Crawler_manager.py
@@ -35,6 +35,8 @@ def launch_crawlers():
             print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name))
             nb_crawlers = len(all_crawler_urls)
 
+    crawlers.reset_all_splash_crawler_status()
+
     for i in range(0, int(nb_crawlers)):
         splash_url = all_crawler_urls[i]
         print(all_crawler_urls[i])
@@ -59,6 +61,8 @@ if __name__ == '__main__':
 
     while True:
 
+        # # TODO: avoid multiple ping
+
         # check if manager is connected
         if int(time.time()) - last_check > 60:
             is_manager_connected = crawlers.is_splash_manager_connected()
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index e4643601..6448843d 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -46,9 +46,20 @@
 config_loader = None
 
 faup = Faup()
 
+# # # # # # # #
+#             #
+#   COMMON    #
+#             #
+# # # # # # # #
+
 def generate_uuid():
     return str(uuid.uuid4()).replace('-', '')
 
+def get_current_date():
+    return datetime.now().strftime("%Y%m%d")
+
+##-- COMMON --##
+
 ################################################################################
 
 # # TODO: handle prefix cookies
@@ -377,6 +388,55 @@ def api_create_cookie(user_id, cookiejar_uuid, cookie_dict):
 
 #### ####
 
+# # # # # # # #
+#             #
+#   CRAWLER   #
+#             #
+# # # # # # # #
+
+#### CRAWLER GLOBAL ####
+
+def get_all_splash_crawler_status():
+    crawler_metadata = []
+    all_crawlers = r_cache.smembers('all_splash_crawlers')
+    for crawler in all_crawlers:
+        crawler_metadata.append(get_splash_crawler_status(crawler))
+    return crawler_metadata
+
+def reset_all_splash_crawler_status():
+    r_cache.delete('all_splash_crawlers')
+
+def get_splash_crawler_status(splash_url):
+    crawler_type = r_cache.hget('metadata_crawler:{}'.format(splash_url), 'type')
+    crawling_domain = r_cache.hget('metadata_crawler:{}'.format(splash_url), 'crawling_domain')
+    started_time = r_cache.hget('metadata_crawler:{}'.format(splash_url), 'started_time')
+    status_info = r_cache.hget('metadata_crawler:{}'.format(splash_url), 'status')
+    crawler_info = '{} - {}'.format(splash_url, started_time)
+    if status_info=='Waiting' or status_info=='Crawling':
+        status=True
+    else:
+        status=False
+    return {'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status, 'type': crawler_type}
+
+def get_stats_last_crawled_domains(crawler_types, date):
+    statDomains = {}
+    for crawler_type in crawler_types:
+        stat_type = {}
+        stat_type['domains_up'] = r_serv_onion.scard('{}_up:{}'.format(crawler_type, date))
+        stat_type['domains_down'] = r_serv_onion.scard('{}_down:{}'.format(crawler_type, date))
+        stat_type['total'] = stat_type['domains_up'] + stat_type['domains_down']
+        stat_type['domains_queue'] = get_nb_elem_to_crawl_by_type(crawler_type)
+        statDomains[crawler_type] = stat_type
+    return statDomains
+
+# # TODO: handle custom proxy
+def get_splash_crawler_latest_stats():
+    now = datetime.now()
+    date = now.strftime("%Y%m%d")
+    return get_stats_last_crawled_domains(['onion', 'regular'], date)
+
+##-- CRAWLER GLOBAL --##
+
 #### CRAWLER TASK ####
 
 def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None):
@@ -587,10 +647,20 @@ def get_elem_to_crawl_by_queue_type(l_queue_type):
             return {'url': url, 'paste': item_id, 'type_service': queue_type, 'original_message': message}
     return None
 
+def get_nb_elem_to_crawl_by_type(queue_type):
+    nb = r_serv_onion.scard('{}_crawler_priority_queue'.format(queue_type))
+    nb += r_serv_onion.scard('{}_crawler_discovery_queue'.format(queue_type))
+    nb += r_serv_onion.scard('{}_crawler_queue'.format(queue_type))
+    return nb
+
 #### ---- ####
 
+# # # # # # # # # # # #
+#                     #
+#   SPLASH MANAGER    #
+#                     #
+# # # # # # # # # # # #
 
-#### SPLASH MANAGER ####
 def get_splash_manager_url(reload=False): # TODO: add in db config
     return splash_manager_url
@@ -636,6 +706,8 @@ def ping_splash_manager():
             return True
         else:
             print(req.json())
+            update_splash_manager_connection_status(False)
+            return False
     except requests.exceptions.ConnectionError:
         pass
     # splash manager unreachable
diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index 7d006c3d..5d9324ed 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -48,6 +48,31 @@ def create_json_response(data, status_code):
     return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code
 
 # ============= ROUTES ==============
+@crawler_splash.route("/crawlers/dashboard", methods=['GET'])
+@login_required
+@login_read_only
+def crawlers_dashboard():
+    # # TODO: get splash manager status
+    crawler_enabled = crawlers.ping_splash_manager()
+    all_splash_crawler_status = crawlers.get_all_splash_crawler_status()
+    splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats()
+    date = crawlers.get_current_date()
+
+    return render_template("dashboard_splash_crawler.html", all_splash_crawler_status = all_splash_crawler_status,
+                                crawler_enabled=crawler_enabled, date=date,
+                                splash_crawlers_latest_stats=splash_crawlers_latest_stats)
+
+@crawler_splash.route("/crawlers/crawler_dashboard_json", methods=['GET'])
+@login_required
+@login_read_only
+def crawler_dashboard_json():
+
+    all_splash_crawler_status = crawlers.get_all_splash_crawler_status()
+    splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats()
+
+    return jsonify({'all_splash_crawler_status': all_splash_crawler_status,
+                        'splash_crawlers_latest_stats':splash_crawlers_latest_stats})
+
@crawler_splash.route("/crawlers/manual", methods=['GET']) @login_required @login_read_only @@ -403,4 +428,11 @@ def crawler_cookiejar_cookie_json_add_post(): return redirect(url_for('crawler_splash.crawler_cookiejar_cookie_add', cookiejar_uuid=cookiejar_uuid)) +@crawler_splash.route('/crawler/cookiejar/cookie/json_add_post', methods=['GET']) +@login_required +@login_analyst +def crawler_splash_setings(): + + return render_template("settings_splash_crawler.html", cookiejar_uuid=True, cookie_uuid=False) + ## - - ## diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index bab5553a..bf9a0ec8 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -231,22 +231,22 @@ def delete_auto_crawler(url): # ============= ROUTES ============== -@hiddenServices.route("/crawlers/", methods=['GET']) -@login_required -@login_read_only -def dashboard(): - crawler_metadata_onion = get_crawler_splash_status('onion') - crawler_metadata_regular = get_crawler_splash_status('regular') - - now = datetime.datetime.now() - date = now.strftime("%Y%m%d") - statDomains_onion = get_stats_last_crawled_domains('onion', date) - statDomains_regular = get_stats_last_crawled_domains('regular', date) - - return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion, - crawler_enabled=crawler_enabled, date=date, - crawler_metadata_regular=crawler_metadata_regular, - statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular) +# @hiddenServices.route("/crawlers/", methods=['GET']) +# @login_required +# @login_read_only +# def dashboard(): +# crawler_metadata_onion = get_crawler_splash_status('onion') +# crawler_metadata_regular = get_crawler_splash_status('regular') +# +# now = datetime.datetime.now() +# date = now.strftime("%Y%m%d") +# statDomains_onion = get_stats_last_crawled_domains('onion', date) +# statDomains_regular = get_stats_last_crawled_domains('regular', date) +# +# return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion, +# crawler_enabled=crawler_enabled, date=date, +# crawler_metadata_regular=crawler_metadata_regular, +# statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular) @hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET']) @login_required @@ -439,23 +439,6 @@ def remove_auto_crawler(): delete_auto_crawler(url) return redirect(url_for('hiddenServices.auto_crawler', page=page)) -@hiddenServices.route("/crawlers/crawler_dashboard_json", methods=['GET']) -@login_required -@login_read_only -def crawler_dashboard_json(): - - crawler_metadata_onion = get_crawler_splash_status('onion') - crawler_metadata_regular = get_crawler_splash_status('regular') - - now = datetime.datetime.now() - date = now.strftime("%Y%m%d") - - statDomains_onion = get_stats_last_crawled_domains('onion', date) - statDomains_regular = get_stats_last_crawled_domains('regular', date) - - return jsonify({'statDomains_onion': statDomains_onion, 'statDomains_regular': statDomains_regular, - 'crawler_metadata_onion':crawler_metadata_onion, 'crawler_metadata_regular':crawler_metadata_regular}) - # # TODO: refractor @hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET']) @login_required diff --git a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html b/var/www/modules/hiddenServices/templates/Crawler_dashboard.html deleted 
deleted file mode 100644
index 9c0e1933..00000000
--- a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html
+++ /dev/null
@@ -1,267 +0,0 @@
[267 deleted lines elided: the old per-type dashboard markup, with an "Onions Crawlers" card and a "Regular Crawlers" card, each showing its {{ statDomains_onion/statDomains_regular }} crawled/queue counters and a status table over crawler_metadata_onion / crawler_metadata_regular ({{crawler['crawler_info']}}, {{crawler['crawling_domain']}}, {{crawler['status_info']}}), plus a "Show Domain" lookup form, the 'tags/block_obj_tags_search.html' include, and the page scripts.]
diff --git a/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html b/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html
new file mode 100644
index 00000000..5b059e23
--- /dev/null
+++ b/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html
@@ -0,0 +1,235 @@
[235 added lines elided: the new unified dashboard markup, built around one status table looping over all_splash_crawler_status that shows, per crawler, {{splash_crawler['crawler_info']}}, an onion or regular icon chosen from splash_crawler['type'], {{splash_crawler['crawling_domain']}} and {{splash_crawler['status_info']}}, followed by the same "Show Domain" lookup form, the 'tags/block_obj_tags_search.html' include, and the page scripts.]
diff --git a/var/www/templates/crawler/menu_sidebar.html b/var/www/templates/crawler/menu_sidebar.html
index c14abbbe..66a5f4f5 100644
--- a/var/www/templates/crawler/menu_sidebar.html
+++ b/var/www/templates/crawler/menu_sidebar.html
@@ -14,7 +14,7 @@
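
Taken together, the Crawler.py and crawlers.py hunks replace the old per-service Redis sets ('all_crawler' and '{type}_crawlers') with a single 'all_splash_crawlers' set plus a 'type' field on each crawler's 'metadata_crawler:<splash_url>' hash. The sketch below replays that lifecycle against a throwaway Redis to show the resulting keyspace; it is illustrative only — the key names come from the diff, but the connection settings and the example Splash URL are assumptions (AIL actually reads its Redis settings from core.cfg).

import datetime
import redis

# Assumed local cache instance, standing in for AIL's r_cache
r_cache = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

splash_url = 'http://127.0.0.1:8050'  # hypothetical Splash docker URL

# Crawler launch (bin/Crawler.py): register in the single global set
# and seed the per-crawler metadata hash.
r_cache.sadd('all_splash_crawlers', splash_url)
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time',
             datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))

# Task pickup: the service type is now a hash field on the crawler,
# replacing membership in a per-type '{type}_crawlers' set.
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', 'onion')

# Task completion: drop the field instead of srem-ing a set.
r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'type')

# Dashboard side (bin/lib/crawlers.py): one set to enumerate, then one
# hash read per crawler, as get_splash_crawler_status() does field by field.
for url in r_cache.smembers('all_splash_crawlers'):
    print(url, r_cache.hgetall('metadata_crawler:{}'.format(url)))

One consequence of this layout is that the dashboard no longer needs a query per service type: enumerating 'all_splash_crawlers' and reading each hash yields both onion and regular crawlers in one pass, which is what lets the two old tables collapse into the single table in dashboard_splash_crawler.html.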
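
The relocated crawler_dashboard_json route can also be polled directly, which is a quick way to exercise the refactor without the template. A rough client sketch, assuming a default AIL instance on https://127.0.0.1:7000 with its self-signed certificate and an already-authenticated Flask session cookie — both assumptions, not part of the diff:

import requests

# Assumed AIL base URL; adjust host, port and authentication to your setup.
BASE_URL = 'https://127.0.0.1:7000'

session = requests.Session()
session.verify = False  # assumed self-signed certificate
# The route sits behind @login_required: reuse a logged-in browser cookie.
session.cookies.set('session', 'PASTE-YOUR-SESSION-COOKIE')  # placeholder

resp = session.get('{}/crawlers/crawler_dashboard_json'.format(BASE_URL))
resp.raise_for_status()
data = resp.json()

# Keys mirror the jsonify() payload added in crawler_splash.py
for crawler in data['all_splash_crawler_status']:
    print(crawler['crawler_info'], crawler['type'],
          crawler['crawling_domain'], crawler['status_info'])
for service, stats in data['splash_crawlers_latest_stats'].items():
    print(service, stats['domains_up'], 'up /', stats['domains_down'],
          'down /', stats['domains_queue'], 'queued')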