diff --git a/bin/DB_KVROCKS_MIGRATION.py b/bin/DB_KVROCKS_MIGRATION.py index 85c448c9..3de267ed 100755 --- a/bin/DB_KVROCKS_MIGRATION.py +++ b/bin/DB_KVROCKS_MIGRATION.py @@ -466,24 +466,34 @@ def crawler_migration(): # print(domain, port, epoch) # #crawlers.add_last_crawled_domain(domain_type, domain, port, epoch) - for cookiejar_uuid in get_all_cookiejar(): - meta = get_cookiejar_metadata(cookiejar_uuid) - if meta: - # print(meta) - cookiejar = crawlers.Cookiejar(meta['uuid']) - if not cookiejar.exists(): - crawlers.create_cookiejar(meta['user'], description=meta['description'], level=meta['level'], - cookiejar_uuid=meta['uuid']) - cookiejar._set_date(meta['date']) - - for cookie_uuid in get_cookiejar_cookies_uuid(meta['uuid']): - cookie_dict = get_cookie_dict(cookie_uuid) - if cookie_dict: - # print(cookie_dict) - crawlers.api_create_cookie(meta['user'], cookiejar_uuid, cookie_dict) + # for cookiejar_uuid in get_all_cookiejar(): + # meta = get_cookiejar_metadata(cookiejar_uuid) + # if meta: + # # print(meta) + # cookiejar = crawlers.Cookiejar(meta['uuid']) + # if not cookiejar.exists(): + # crawlers.create_cookiejar(meta['user'], description=meta['description'], level=meta['level'], + # cookiejar_uuid=meta['uuid']) + # cookiejar._set_date(meta['date']) + # + # for cookie_uuid in get_cookiejar_cookies_uuid(meta['uuid']): + # cookie_dict = get_cookie_dict(cookie_uuid) + # if cookie_dict: + # # print(cookie_dict) + # crawlers.api_create_cookie(meta['user'], cookiejar_uuid, cookie_dict) # TODO: auto crawler -> to Fix / change + auto_crawler_web = r_crawler.smembers('auto_crawler_url:regular') + auto_crawler_onion = r_crawler.smembers('auto_crawler_url:onion') + if auto_crawler_onion or auto_crawler_web: + with open('old_auto_crawler_domains.txt', 'w') as f: + f.write('OLD Crawler Scheduler:\n\n') + for domain in auto_crawler_onion: + f.write(f'{domain}\n') + for domain in auto_crawler_web: + f.write(f'{domain}\n') + # TODO: crawlers queues ############################### @@ -919,11 +929,11 @@ if __name__ == '__main__': # user_migration() #tags_migration() # items_migration() - # crawler_migration() + crawler_migration() # domain_migration() # TO TEST ########################### # decodeds_migration() # screenshots_migration() - subtypes_obj_migration() + # subtypes_obj_migration() # ail_2_ail_migration() # trackers_migration() # investigations_migration() diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 2ce75949..aca1b478 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -91,6 +91,12 @@ def is_valid_onion_domain(domain): return True return False +def is_valid_domain(domain): + faup.decode(domain) + url_unpack = faup.get() + unpack_domain = url_unpack['domain'].lower() + return domain == unpack_domain + def get_faup(): return faup @@ -676,6 +682,9 @@ def is_blacklisted_domain(domain): def blacklist_domain(domain): return r_crawler.sadd('blacklist:domain', domain) +def unblacklist_domain(domain): + return r_crawler.srem('blacklist:domain', domain) + def load_blacklist(): try: with open(os.path.join(os.environ['AIL_BIN'], 'crawlers/blacklist.txt'), 'r') as f: @@ -687,6 +696,22 @@ def load_blacklist(): except Exception as e: print(e) +def api_blacklist_domain(data): + domain = str(data.get('domain', '')).lower() + if not is_valid_domain(domain): + return {'error': 'invalid domain'}, 400 + if is_blacklisted_domain(domain): + return {'error': 'domain already blacklisted'}, 400 + return blacklist_domain(domain), 200 + +def api_unblacklist_domain(data): + domain = 
str(data.get('domain', '')).lower() + if not is_valid_domain(domain): + return {'error': 'invalid domain'}, 400 + if not is_blacklisted_domain(domain): + return {'error': 'domain not blacklisted'}, 404 + return unblacklist_domain(domain), 200 + #### CRAWLER Scheduler #### @unique @@ -1667,12 +1692,6 @@ def test_ail_crawlers(): # TODO CHECK MIGRATION - Rest API -# def add_auto_crawler_in_queue(domain, domain_type, port, epoch, delta, message): -# r_serv_onion.zadd('crawler_auto_queue', int(time.time() + delta) , f'{message};{domain_type}') -# # update list, last auto crawled domains -# r_serv_onion.lpush('last_auto_crawled', f'{domain}:{port};{epoch}') -# r_serv_onion.ltrim('last_auto_crawled', 0, 9) - # TODO MIGRATE ME # def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None): # # validate url diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py index baefb11d..aca2515b 100644 --- a/var/www/blueprints/crawler_splash.py +++ b/var/www/blueprints/crawler_splash.py @@ -190,6 +190,39 @@ def schedule_delete(): return create_json_response(res[0], res[1]) return redirect(url_for('crawler_splash.scheduler_dashboard')) +@crawler_splash.route("/crawlers/blacklist", methods=['GET']) +@login_required +@login_analyst +def crawler_blacklist(): + domain = request.args.get('domain') + if domain: + res = crawlers.api_blacklist_domain({'domain': domain}) + if res[1] != 200: + if res[0].get('error') == 'domain already blacklisted': + error_code = 2 + else: + error_code = 1 + else: + error_code = 0 + domain = None + else: + domain = None + error_code = None + blacklist = crawlers.get_blacklist() + return render_template("crawler_blacklist.html", blacklist=blacklist, + domain=domain, error_code=error_code, + is_manager_connected=crawlers.get_lacus_connection_metadata()) + +@crawler_splash.route("/crawlers/blacklist/delete", methods=['GET']) +@login_required +@login_analyst +def crawler_blacklist_delete(): + domain = request.args.get('domain') + res = crawlers.api_unblacklist_domain({'domain': domain}) + if res[1] != 200: + return create_json_response(res[0], res[1]) + return redirect(url_for('crawler_splash.crawler_blacklist')) + @crawler_splash.route("/crawlers/last/domains", methods=['GET']) @login_required diff --git a/var/www/modules/Flask_config.py b/var/www/modules/Flask_config.py index 2524ddcb..3d54215c 100644 --- a/var/www/modules/Flask_config.py +++ b/var/www/modules/Flask_config.py @@ -30,9 +30,6 @@ r_cache = config_loader.get_redis_conn("Redis_Cache") r_serv_log = config_loader.get_redis_conn("Redis_Log") r_serv_log_submit = config_loader.get_redis_conn("Redis_Log_submit") -r_serv_charts = config_loader.get_redis_conn("ARDB_Trending") # -> TODO MIGRATE Stats Graphs -r_serv_onion = config_loader.get_redis_conn("ARDB_Onion") # -> TODO MIGRATE AUTO CRAWLER - # # # # # # # r_serv_db = config_loader.get_db_conn("Kvrocks_DB") # TODO remove redis call from blueprint r_serv_tags = config_loader.get_db_conn("Kvrocks_Tags") # TODO remove redis call from blueprint diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py deleted file mode 100644 index 8d19cc24..00000000 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ /dev/null @@ -1,292 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -''' - Flask functions and routes for the trending modules page 
-''' -import datetime -import sys -import os -import time -from pyfaup.faup import Faup -from flask import Flask, render_template, jsonify, request, send_file, Blueprint, redirect, url_for - -from Role_Manager import login_admin, login_analyst, login_read_only, no_cache -from flask_login import login_required - -sys.path.append(os.environ['AIL_BIN']) -################################## -# Import Project packages -################################## -from lib import crawlers - -# ============ VARIABLES ============ -import Flask_config - -app = Flask_config.app -baseUrl = Flask_config.baseUrl -r_cache = Flask_config.r_cache -r_serv_onion = Flask_config.r_serv_onion -bootstrap_label = Flask_config.bootstrap_label - -hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates') - -faup = Faup() -list_types=['onion', 'regular'] -dic_type_name={'onion':'Onion', 'regular':'Website'} - -# ============ FUNCTIONS ============ - - -def is_valid_domain(domain): - faup.decode(domain) - domain_unpack = faup.get() - if domain_unpack['tld'] is not None and domain_unpack['scheme'] is None and domain_unpack['port'] is None and domain_unpack['query_string'] is None: - return True - else: - return False - -def get_type_domain(domain): - if domain is None: - type = 'regular' - else: - if domain.rsplit('.', 1)[1] == 'onion': - type = 'onion' - else: - type = 'regular' - return type - -def get_domain_from_url(url): - faup.decode(url) - unpack_url = faup.get() - domain = unpack_url['domain'] - ## TODO: FIXME remove me - try: - domain = domain.decode() - except: - pass - return domain - -def get_last_domains_crawled(type): # DONE - return r_serv_onion.lrange('last_{}'.format(type), 0 ,-1) - - -def get_last_crawled_domains_metadata(list_domains_crawled, date, type=None, auto_mode=False): - list_crawled_metadata = [] - for domain_epoch in list_domains_crawled: - if not auto_mode: - domain, epoch = domain_epoch.rsplit(';', 1) - else: - url = domain_epoch - domain = domain_epoch - domain = domain.split(':') - if len(domain) == 1: - port = 80 - domain = domain[0] - else: - port = domain[1] - domain = domain[0] - metadata_domain = {} - # get Domain type - if type is None: - type_domain = get_type_domain(domain) - else: - type_domain = type - if auto_mode: - metadata_domain['url'] = url - epoch = r_serv_onion.zscore('crawler_auto_queue', '{};auto;{}'.format(domain, type_domain)) - #domain in priority queue - if epoch is None: - epoch = 'In Queue' - else: - epoch = datetime.datetime.fromtimestamp(float(epoch)).strftime('%Y-%m-%d %H:%M:%S') - - metadata_domain['domain'] = domain - if len(domain) > 45: - domain_name, tld_domain = domain.rsplit('.', 1) - metadata_domain['domain_name'] = '{}[...].{}'.format(domain_name[:40], tld_domain) - else: - metadata_domain['domain_name'] = domain - metadata_domain['port'] = port - metadata_domain['epoch'] = epoch - metadata_domain['last_check'] = r_serv_onion.hget('{}_metadata:{}'.format(type_domain, domain), 'last_check') - if metadata_domain['last_check'] is None: - metadata_domain['last_check'] = '********' - metadata_domain['first_seen'] = r_serv_onion.hget('{}_metadata:{}'.format(type_domain, domain), 'first_seen') - if metadata_domain['first_seen'] is None: - metadata_domain['first_seen'] = '********' - if r_serv_onion.sismember('{}_up:{}'.format(type_domain, metadata_domain['last_check']) , domain): - metadata_domain['status_text'] = 'UP' - metadata_domain['status_color'] = 'Green' - metadata_domain['status_icon'] = 'fa-check-circle' - else: - 
metadata_domain['status_text'] = 'DOWN' - metadata_domain['status_color'] = 'Red' - metadata_domain['status_icon'] = 'fa-times-circle' - list_crawled_metadata.append(metadata_domain) - return list_crawled_metadata - -def delete_auto_crawler(url): - domain = get_domain_from_url(url) - type = get_type_domain(domain) - # remove from set - r_serv_onion.srem('auto_crawler_url:{}'.format(type), url) - # remove config - r_serv_onion.delete('crawler_config:auto:{}:{}:{}'.format(type, domain, url)) - # remove from queue - r_serv_onion.srem('{}_crawler_priority_queue'.format(type), '{};auto'.format(url)) - # remove from crawler_auto_queue - r_serv_onion.zrem('crawler_auto_queue'.format(type), '{};auto;{}'.format(url, type)) - -# ============= ROUTES ============== - -@hiddenServices.route("/crawlers/blacklisted_domains", methods=['GET']) -@login_required -@login_read_only -def blacklisted_domains(): - blacklist_domain = request.args.get('blacklist_domain') - unblacklist_domain = request.args.get('unblacklist_domain') - type = request.args.get('type') - if type in list_types: - type_name = dic_type_name[type] - if blacklist_domain is not None: - blacklist_domain = int(blacklist_domain) - if unblacklist_domain is not None: - unblacklist_domain = int(unblacklist_domain) - try: - page = int(request.args.get('page')) - except: - page = 1 - if page <= 0: - page = 1 - nb_page_max = r_serv_onion.scard('blacklist_{}'.format(type))/(1000) - if isinstance(nb_page_max, float): - nb_page_max = int(nb_page_max)+1 - if page > nb_page_max: - page = nb_page_max - start = 1000*(page -1) - stop = 1000*page - - list_blacklisted = list(r_serv_onion.smembers('blacklist_{}'.format(type))) - list_blacklisted_1 = list_blacklisted[start:stop] - list_blacklisted_2 = list_blacklisted[stop:stop+1000] - return render_template("blacklisted_domains.html", list_blacklisted_1=list_blacklisted_1, list_blacklisted_2=list_blacklisted_2, - type=type, type_name=type_name, page=page, nb_page_max=nb_page_max, - blacklist_domain=blacklist_domain, unblacklist_domain=unblacklist_domain) - else: - return 'Incorrect Type' - -@hiddenServices.route("/crawler/blacklist_domain", methods=['GET']) -@login_required -@login_analyst -def blacklist_domain(): - domain = request.args.get('domain') - type = request.args.get('type') - try: - page = int(request.args.get('page')) - except: - page = 1 - if type in list_types: - if is_valid_domain(domain): - res = r_serv_onion.sadd('blacklist_{}'.format(type), domain) - if page: - if res == 0: - return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, blacklist_domain=2)) - else: - return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, blacklist_domain=1)) - else: - return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, blacklist_domain=0)) - else: - return 'Incorrect type' - -@hiddenServices.route("/crawler/unblacklist_domain", methods=['GET']) -@login_required -@login_analyst -def unblacklist_domain(): - domain = request.args.get('domain') - type = request.args.get('type') - try: - page = int(request.args.get('page')) - except: - page = 1 - if type in list_types: - if is_valid_domain(domain): - res = r_serv_onion.srem('blacklist_{}'.format(type), domain) - if page: - if res == 0: - return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, unblacklist_domain=2)) - else: - return redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, unblacklist_domain=1)) - else: - return 
redirect(url_for('hiddenServices.blacklisted_domains', page=page, type=type, unblacklist_domain=0)) - else: - return 'Incorrect type' - -@hiddenServices.route("/crawlers/auto_crawler", methods=['GET']) -@login_required -@login_read_only -def auto_crawler(): - nb_element_to_display = 100 - try: - page = int(request.args.get('page')) - except: - page = 1 - if page <= 0: - page = 1 - - nb_auto_onion = r_serv_onion.scard('auto_crawler_url:onion') - nb_auto_regular = r_serv_onion.scard('auto_crawler_url:regular') - - if nb_auto_onion > nb_auto_regular: - nb_max = nb_auto_onion - else: - nb_max = nb_auto_regular - - nb_page_max = nb_max/(nb_element_to_display) - if isinstance(nb_page_max, float): - nb_page_max = int(nb_page_max)+1 - if page > nb_page_max: - page = nb_page_max - start = nb_element_to_display*(page -1) - stop = nb_element_to_display*page - - last_auto_crawled = get_last_domains_crawled('auto_crawled') - last_domains = get_last_crawled_domains_metadata(last_auto_crawled, '') - - if start > nb_auto_onion: - auto_crawler_domain_onions = [] - elif stop > nb_auto_onion: - auto_crawler_domain_onions = list(r_serv_onion.smembers('auto_crawler_url:onion'))[start:nb_auto_onion] - else: - auto_crawler_domain_onions = list(r_serv_onion.smembers('auto_crawler_url:onion'))[start:stop] - - if start > nb_auto_regular: - auto_crawler_domain_regular = [] - elif stop > nb_auto_regular: - auto_crawler_domain_regular = list(r_serv_onion.smembers('auto_crawler_url:regular'))[start:nb_auto_regular] - else: - auto_crawler_domain_regular = list(r_serv_onion.smembers('auto_crawler_url:regular'))[start:stop] - - auto_crawler_domain_onions_metadata = get_last_crawled_domains_metadata(auto_crawler_domain_onions, '', type='onion', auto_mode=True) - auto_crawler_domain_regular_metadata = get_last_crawled_domains_metadata(auto_crawler_domain_regular, '', type='regular', auto_mode=True) - - return render_template("Crawler_auto.html", page=page, nb_page_max=nb_page_max, - last_domains=last_domains, - is_manager_connected=crawlers.get_lacus_connection_metadata(), - auto_crawler_domain_onions_metadata=auto_crawler_domain_onions_metadata, - auto_crawler_domain_regular_metadata=auto_crawler_domain_regular_metadata) - -@hiddenServices.route("/crawlers/remove_auto_crawler", methods=['GET']) -@login_required -@login_analyst -def remove_auto_crawler(): - url = request.args.get('url') - page = request.args.get('page') - - if url: - delete_auto_crawler(url) - return redirect(url_for('hiddenServices.auto_crawler', page=page)) - - -# ========= REGISTRATION ========= -app.register_blueprint(hiddenServices, url_prefix=baseUrl) diff --git a/var/www/modules/hiddenServices/templates/Crawler_Splash_last_by_type.html b/var/www/modules/hiddenServices/templates/Crawler_Splash_last_by_type.html deleted file mode 100644 index 248600e4..00000000 --- a/var/www/modules/hiddenServices/templates/Crawler_Splash_last_by_type.html +++ /dev/null @@ -1,327 +0,0 @@ - - - -
[Deleted hiddenServices template markup, abridged: HTML tables of last crawled domains (domain name, first seen, last check, UP/DOWN status), auto crawler onion and regular URL queues (url, next check, status), tagged domain listings, crawled-domain statistics (Domains UP, Domains DOWN, Crawled Domains, Domains in Queue), and running crawler status rows (crawler info, crawling domain, status info).]
[New template crawler_blacklist.html (rendered by the crawler_blacklist view), abridged: a "Blacklisted Domains" table listing each blacklisted domain {{dom}} with per-row action cells.]
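Not part of the patch: a minimal usage sketch of the two blacklist helpers this diff adds to bin/lib/crawlers.py, showing the status codes the new blueprint routes rely on. It assumes AIL_BIN points at the AIL bin/ directory and the Kvrocks crawler database is reachable; the example domain and the standalone-script context are illustrative only.

# Minimal usage sketch of the blacklist helpers added in this diff (illustrative,
# not part of the patch). Assumes AIL_BIN is set and the Kvrocks crawler
# database is reachable; 'example.com' is a placeholder domain.
import os
import sys

sys.path.append(os.environ['AIL_BIN'])
from lib import crawlers

# api_blacklist_domain() lowercases the input, validates it with faup
# (is_valid_domain) and refuses domains that are malformed or already listed.
res, status = crawlers.api_blacklist_domain({'domain': 'EXAMPLE.com'})
print(status)  # 200 on success, 400 with {'error': ...} otherwise

# api_unblacklist_domain() mirrors it: 404 if the domain is not blacklisted.
res, status = crawlers.api_unblacklist_domain({'domain': 'example.com'})
print(status)  # 200 on success, 404 if the domain was not in the blacklist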