diff --git a/bin/Crawler.py b/bin/Crawler.py
index 34406574..6a61a0ba 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -349,7 +349,7 @@ if __name__ == '__main__':
'user_agent': p.config.get("Crawler", "default_crawler_user_agent")}
# Track launched crawler
- r_cache.sadd('all_crawler', splash_url)
+ r_cache.sadd('all_splash_crawlers', splash_url)
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
@@ -385,7 +385,7 @@ if __name__ == '__main__':
'epoch': int(time.time())}
# Update crawler status type
- r_cache.sadd('{}_crawlers'.format(to_crawl['type_service']), splash_url)
+ r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', to_crawl['type_service'])
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
# check if default crawler
@@ -437,7 +437,7 @@ if __name__ == '__main__':
r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'crawling_domain')
# Update crawler status type
- r_cache.srem('{}_crawlers'.format(to_crawl['type_service']), splash_url)
+        r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'type')
# add next auto Crawling in queue:
if to_crawl['paste'] == 'auto':
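
The three hunks above move crawler state out of the per-service-type Redis sets ('{type}_crawlers') and into a single 'type' field on each crawler's metadata hash. A minimal sketch of the resulting Redis layout, assuming a redis-py client named r_cache as in the module (the URL and field values are illustrative):

    import redis

    r_cache = redis.Redis(host='localhost', port=6379, decode_responses=True)
    splash_url = 'http://127.0.0.1:8050'  # hypothetical Splash instance

    # one global set tracks every launched Splash crawler
    r_cache.sadd('all_splash_crawlers', splash_url)

    # per-crawler state lives in a single metadata hash
    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', 'Waiting')
    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', 'onion')

    # when a crawl ends, only the hash field has to be cleared
    r_cache.hdel('metadata_crawler:{}'.format(splash_url), 'type')

This keeps all of a crawler's metadata under one key, so the dashboard can read it with a handful of HGET calls instead of scanning one set per service type.
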
diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py
index 6f1e3cf7..8b43be99 100755
--- a/bin/core/Crawler_manager.py
+++ b/bin/core/Crawler_manager.py
@@ -35,6 +35,8 @@ def launch_crawlers():
print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name))
nb_crawlers = len(all_crawler_urls)
+    crawlers.reset_all_splash_crawler_status()
+
for i in range(0, int(nb_crawlers)):
splash_url = all_crawler_urls[i]
print(all_crawler_urls[i])
@@ -59,6 +61,8 @@ if __name__ == '__main__':
while True:
+        # # TODO: avoid multiple pings
+
# check if manager is connected
if int(time.time()) - last_check > 60:
is_manager_connected = crawlers.is_splash_manager_connected()
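
The surrounding loop re-checks connectivity at most once per minute, which is what the TODO above is pointing at. A minimal sketch of that throttle, assuming crawlers.py is importable from bin/lib as elsewhere in this diff (the loop period is illustrative):

    import os
    import sys
    import time

    sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))  # assumption: crawlers.py lives in bin/lib
    import crawlers

    last_check = 0
    while True:
        # re-ping the Splash manager at most once per 60-second window
        if int(time.time()) - last_check > 60:
            is_manager_connected = crawlers.is_splash_manager_connected()
            last_check = int(time.time())
        time.sleep(1)  # illustrative loop period
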
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index e4643601..6448843d 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -46,9 +46,20 @@ config_loader = None
faup = Faup()
+# # # # # # # #
+# #
+# COMMON #
+# #
+# # # # # # # #
+
def generate_uuid():
return str(uuid.uuid4()).replace('-', '')
+def get_current_date():
+ return datetime.now().strftime("%Y%m%d")
+
+##-- COMMON --#
+
################################################################################
# # TODO: handle prefix cookies
@@ -377,6 +388,55 @@ def api_create_cookie(user_id, cookiejar_uuid, cookie_dict):
#### ####
+# # # # # # # #
+# #
+# CRAWLER #
+# #
+# # # # # # # #
+
+#### CRAWLER GLOBAL ####
+
+def get_all_splash_crawler_status():
+ crawler_metadata = []
+ all_crawlers = r_cache.smembers('all_splash_crawlers')
+ for crawler in all_crawlers:
+ crawler_metadata.append(get_splash_crawler_status(crawler))
+ return crawler_metadata
+
+def reset_all_splash_crawler_status():
+ r_cache.delete('all_splash_crawlers')
+
+def get_splash_crawler_status(splash_url):
+    crawler_type = r_cache.hget('metadata_crawler:{}'.format(splash_url), 'type')
+    crawling_domain = r_cache.hget('metadata_crawler:{}'.format(splash_url), 'crawling_domain')
+    started_time = r_cache.hget('metadata_crawler:{}'.format(splash_url), 'started_time')
+    status_info = r_cache.hget('metadata_crawler:{}'.format(splash_url), 'status')
+    crawler_info = '{} - {}'.format(splash_url, started_time)
+    # a crawler counts as up while it is waiting or crawling
+    status = status_info == 'Waiting' or status_info == 'Crawling'
+    return {'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status, 'type': crawler_type}
+
+def get_stats_last_crawled_domains(crawler_types, date):
+ statDomains = {}
+ for crawler_type in crawler_types:
+ stat_type = {}
+ stat_type['domains_up'] = r_serv_onion.scard('{}_up:{}'.format(crawler_type, date))
+ stat_type['domains_down'] = r_serv_onion.scard('{}_down:{}'.format(crawler_type, date))
+ stat_type['total'] = stat_type['domains_up'] + stat_type['domains_down']
+ stat_type['domains_queue'] = get_nb_elem_to_crawl_by_type(crawler_type)
+ statDomains[crawler_type] = stat_type
+ return statDomains
+
+# # TODO: handle custom proxy
+def get_splash_crawler_latest_stats():
+    date = get_current_date()
+    return get_stats_last_crawled_domains(['onion', 'regular'], date)
+
+##-- CRAWLER GLOBAL --##
+
#### CRAWLER TASK ####
def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None):
@@ -587,10 +647,20 @@ def get_elem_to_crawl_by_queue_type(l_queue_type):
return {'url': url, 'paste': item_id, 'type_service': queue_type, 'original_message': message}
return None
+def get_nb_elem_to_crawl_by_type(queue_type):
+ nb = r_serv_onion.scard('{}_crawler_priority_queue'.format(queue_type))
+ nb += r_serv_onion.scard('{}_crawler_discovery_queue'.format(queue_type))
+ nb += r_serv_onion.scard('{}_crawler_queue'.format(queue_type))
+ return nb
+
#### ---- ####
+# # # # # # # # # # # #
+# #
+# SPLASH MANAGER #
+# #
+# # # # # # # # # # # #
-#### SPLASH MANAGER ####
def get_splash_manager_url(reload=False): # TODO: add in db config
return splash_manager_url
@@ -636,6 +706,8 @@ def ping_splash_manager():
return True
else:
print(req.json())
+ update_splash_manager_connection_status(False)
+ return False
except requests.exceptions.ConnectionError:
pass
# splash manager unreachable
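
Taken together, the new helpers give the dashboard one call per concern. A sketch of how they compose, based on the definitions above and assuming bin/lib is on sys.path as in other AIL scripts (all values illustrative):

    import os
    import sys

    sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
    import crawlers

    # one dict per launched Splash instance, e.g.:
    # {'crawler_info': 'http://127.0.0.1:8050 - 2020/11/01 - 10:05.00',
    #  'crawling_domain': 'example.onion', 'status_info': 'Crawling',
    #  'status': True, 'type': 'onion'}
    statuses = crawlers.get_all_splash_crawler_status()

    # per-type counters for the current date, e.g.:
    # {'onion': {'domains_up': 4, 'domains_down': 1, 'total': 5, 'domains_queue': 12},
    #  'regular': {...}}
    stats = crawlers.get_splash_crawler_latest_stats()
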
diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index 7d006c3d..5d9324ed 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -48,6 +48,31 @@ def create_json_response(data, status_code):
return Response(json.dumps(data, indent=2, sort_keys=True), mimetype='application/json'), status_code
# ============= ROUTES ==============
+@crawler_splash.route("/crawlers/dashboard", methods=['GET'])
+@login_required
+@login_read_only
+def crawlers_dashboard():
+ # # TODO: get splash manager status
+ crawler_enabled = crawlers.ping_splash_manager()
+    all_splash_crawler_status = crawlers.get_all_splash_crawler_status()
+ splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats()
+ date = crawlers.get_current_date()
+
+    return render_template("dashboard_splash_crawler.html", all_splash_crawler_status=all_splash_crawler_status,
+ crawler_enabled=crawler_enabled, date=date,
+ splash_crawlers_latest_stats=splash_crawlers_latest_stats)
+
+@crawler_splash.route("/crawlers/crawler_dashboard_json", methods=['GET'])
+@login_required
+@login_read_only
+def crawler_dashboard_json():
+
+    all_splash_crawler_status = crawlers.get_all_splash_crawler_status()
+ splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats()
+
+ return jsonify({'all_splash_crawler_status': all_splash_crawler_status,
+                    'splash_crawlers_latest_stats': splash_crawlers_latest_stats})
+
@crawler_splash.route("/crawlers/manual", methods=['GET'])
@login_required
@login_read_only
@@ -403,4 +428,11 @@ def crawler_cookiejar_cookie_json_add_post():
return redirect(url_for('crawler_splash.crawler_cookiejar_cookie_add', cookiejar_uuid=cookiejar_uuid))
+@crawler_splash.route('/crawler/settings', methods=['GET'])
+@login_required
+@login_analyst
+def crawler_splash_settings():
+    return render_template("settings_splash_crawler.html")
+
## - - ##
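
For reference, the new crawler_dashboard_json route returns the two structures side by side. An illustrative response body for GET /crawlers/crawler_dashboard_json, assembled from the helper shapes above (values are examples, not real output):

    # illustrative response for GET /crawlers/crawler_dashboard_json
    {
        'all_splash_crawler_status': [
            {'crawler_info': 'http://127.0.0.1:8050 - 2020/11/01 - 10:05.00',
             'crawling_domain': 'example.onion',
             'status': True, 'status_info': 'Crawling', 'type': 'onion'}
        ],
        'splash_crawlers_latest_stats': {
            'onion': {'domains_up': 4, 'domains_down': 1, 'total': 5, 'domains_queue': 12},
            'regular': {'domains_up': 10, 'domains_down': 2, 'total': 12, 'domains_queue': 3}
        }
    }
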
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index bab5553a..bf9a0ec8 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -231,22 +231,22 @@ def delete_auto_crawler(url):
# ============= ROUTES ==============
-@hiddenServices.route("/crawlers/", methods=['GET'])
-@login_required
-@login_read_only
-def dashboard():
- crawler_metadata_onion = get_crawler_splash_status('onion')
- crawler_metadata_regular = get_crawler_splash_status('regular')
-
- now = datetime.datetime.now()
- date = now.strftime("%Y%m%d")
- statDomains_onion = get_stats_last_crawled_domains('onion', date)
- statDomains_regular = get_stats_last_crawled_domains('regular', date)
-
- return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion,
- crawler_enabled=crawler_enabled, date=date,
- crawler_metadata_regular=crawler_metadata_regular,
- statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular)
+# @hiddenServices.route("/crawlers/", methods=['GET'])
+# @login_required
+# @login_read_only
+# def dashboard():
+# crawler_metadata_onion = get_crawler_splash_status('onion')
+# crawler_metadata_regular = get_crawler_splash_status('regular')
+#
+# now = datetime.datetime.now()
+# date = now.strftime("%Y%m%d")
+# statDomains_onion = get_stats_last_crawled_domains('onion', date)
+# statDomains_regular = get_stats_last_crawled_domains('regular', date)
+#
+# return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion,
+# crawler_enabled=crawler_enabled, date=date,
+# crawler_metadata_regular=crawler_metadata_regular,
+# statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular)
@hiddenServices.route("/crawlers/crawler_splash_onion", methods=['GET'])
@login_required
@@ -439,23 +439,6 @@ def remove_auto_crawler():
delete_auto_crawler(url)
return redirect(url_for('hiddenServices.auto_crawler', page=page))
-@hiddenServices.route("/crawlers/crawler_dashboard_json", methods=['GET'])
-@login_required
-@login_read_only
-def crawler_dashboard_json():
-
- crawler_metadata_onion = get_crawler_splash_status('onion')
- crawler_metadata_regular = get_crawler_splash_status('regular')
-
- now = datetime.datetime.now()
- date = now.strftime("%Y%m%d")
-
- statDomains_onion = get_stats_last_crawled_domains('onion', date)
- statDomains_regular = get_stats_last_crawled_domains('regular', date)
-
- return jsonify({'statDomains_onion': statDomains_onion, 'statDomains_regular': statDomains_regular,
- 'crawler_metadata_onion':crawler_metadata_onion, 'crawler_metadata_regular':crawler_metadata_regular})
-
# # TODO: refactor
@hiddenServices.route("/hiddenServices/last_crawled_domains_with_stats_json", methods=['GET'])
@login_required
diff --git a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html b/var/www/modules/hiddenServices/templates/Crawler_dashboard.html
deleted file mode 100644
index 9c0e1933..00000000
--- a/var/www/modules/hiddenServices/templates/Crawler_dashboard.html
+++ /dev/null
@@ -1,267 +0,0 @@
-    [Crawler_dashboard.html body: AIL-Framework page head; 'nav_bar.html',
-     'crawler/menu_sidebar.html' and 'crawler/crawler_disabled.html' includes;
-     two status tables looping over crawler_metadata_onion and
-     crawler_metadata_regular (crawler_info / crawling_domain / status_info
-     per row); 'tags/block_obj_tags_search.html' include with
-     object_type='domain'; page scripts]
diff --git a/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html b/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html
new file mode 100644
index 00000000..5b059e23
--- /dev/null
+++ b/var/www/templates/crawler/crawler_splash/dashboard_splash_crawler.html
@@ -0,0 +1,235 @@
+    [page head: AIL-Framework title, CSS/JS includes]
+
+    {% include 'nav_bar.html' %}
+
+    {% include 'crawler/menu_sidebar.html' %}
+
+    {% include 'crawler/crawler_disabled.html' %}
+
+    [Splash crawlers status table]
+    {% for splash_crawler in all_splash_crawler_status %}
+    <tr>
+        <td>{{splash_crawler['crawler_info']}}</td>
+        <td>
+            {%if splash_crawler['type']=='onion'%}
+                [onion icon]
+            {%else%}
+                [regular icon]
+            {%endif%}
+        </td>
+        <td>{{splash_crawler['crawling_domain']}}</td>
+        <td>{{splash_crawler['status_info']}}</td>
+    </tr>
+    {% endfor %}
+
+    [latest crawled-domains statistics panels]
+
+    {% with object_type='domain' %}
+        {% include 'tags/block_obj_tags_search.html' %}
+    {% endwith %}
+
+    [page scripts]
diff --git a/var/www/templates/crawler/menu_sidebar.html b/var/www/templates/crawler/menu_sidebar.html
index c14abbbe..66a5f4f5 100644
--- a/var/www/templates/crawler/menu_sidebar.html
+++ b/var/www/templates/crawler/menu_sidebar.html
@@ -14,7 +14,7 @@
-      [nav link: url_for('hiddenServices.dashboard')]
+      [nav link: url_for('crawler_splash.crawlers_dashboard')]
Dashboard
@@ -43,6 +43,12 @@
Automatic Crawler
+      [nav item: link to the new crawler settings page]
+        Settings
diff --git a/var/www/templates/nav_bar.html b/var/www/templates/nav_bar.html
index 44be94b5..257988cc 100644
--- a/var/www/templates/nav_bar.html
+++ b/var/www/templates/nav_bar.html
@@ -22,7 +22,7 @@
Leaks Hunter
-          [nav link: url_for('hiddenServices.dashboard')] Crawlers
+          [nav link: url_for('crawler_splash.crawlers_dashboard')] Crawlers
Objects