diff --git a/bin/Crawler.py b/bin/Crawler.py index 6a61a0ba..ec59243d 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -141,7 +141,7 @@ def get_crawler_config(redis_server, mode, service_type, domain, url=None): redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain)) return crawler_options -def load_crawler_config(service_type, domain, paste, url, date): +def load_crawler_config(queue_type, service_type, domain, paste, url, date): crawler_config = {} crawler_config['splash_url'] = f'http://{splash_url}' crawler_config['item'] = paste @@ -149,6 +149,9 @@ def load_crawler_config(service_type, domain, paste, url, date): crawler_config['domain'] = domain crawler_config['date'] = date + if queue_type and queue_type != 'tor': + service_type = queue_type + # Auto and Manual Crawling # Auto ################################################# create new entry, next crawling => here or when ended ? if paste == 'auto': @@ -282,13 +285,15 @@ if __name__ == '__main__': splash_url = sys.argv[1] splash_name = crawlers.get_splash_name_by_url(splash_url) - crawler_type = crawlers.get_splash_crawler_type(splash_name) + proxy_type = crawlers.get_splash_proxy(splash_name) print(splash_name) - print(crawler_type) + print(proxy_type) #rotation_mode = deque(['onion', 'regular']) - rotation_mode = deque(crawlers.get_crawler_queue_type_by_proxy(splash_name, crawler_type)) + all_crawler_queues = crawlers.get_crawler_queue_types_by_splash_name(splash_name) + rotation_mode = deque(all_crawler_queues) + print(rotation_mode) default_proto_map = {'http': 80, 'https': 443} ######################################################## add ftp ??? @@ -387,7 +392,7 @@ if __name__ == '__main__': # Update crawler status type r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', to_crawl['type_service']) - crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date) + crawler_config = load_crawler_config(to_crawl['queue_type'], to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date) # check if default crawler if not crawler_config['requested']: # Auto crawl only if service not up this month diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py index 8b43be99..3a95e706 100755 --- a/bin/core/Crawler_manager.py +++ b/bin/core/Crawler_manager.py @@ -13,36 +13,9 @@ config_loader = ConfigLoader.ConfigLoader() r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata") config_loader = None -config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg') -SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url') -api_key = config_loader.get_config_str('Splash_Manager', 'api_key') -crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers') -config_loader = None - -import screen - # # TODO: lauch me in core screen # # TODO: check if already launched in tor screen -def launch_crawlers(): - for crawler_splash in crawlers_to_launch: - splash_name = crawler_splash[0] - nb_crawlers = int(crawler_splash[1]) - - all_crawler_urls = crawlers.get_splash_all_url(crawler_splash[0], r_list=True) - if nb_crawlers > len(all_crawler_urls): - print('Error, can\'t launch all Splash Dockers') - print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name)) - nb_crawlers = len(all_crawler_urls) - - crawlers.reset_all_spash_crawler_status() - - for i in range(0, int(nb_crawlers)): - splash_url = 
all_crawler_urls[i] - print(all_crawler_urls[i]) - - crawlers.launch_ail_splash_crawler(splash_url, script_options='{}'.format(splash_url)) - # # TODO: handle mutltiple splash_manager if __name__ == '__main__': @@ -56,7 +29,7 @@ if __name__ == '__main__': is_manager_connected = crawlers.reload_splash_and_proxies_list() print(is_manager_connected) if is_manager_connected: - launch_crawlers() + crawlers.relaunch_crawlers() last_check = int(time.time()) while True: @@ -72,7 +45,7 @@ if __name__ == '__main__': is_manager_connected = crawlers.reload_splash_and_proxies_list() if is_manager_connected: print('reload proxies and splash list') - launch_crawlers() + crawlers.relaunch_crawlers() session_uuid = current_session_uuid if not is_manager_connected: print('Error, Can\'t connect to Splash manager') diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py index 13c8759a..9cc7933f 100755 --- a/bin/lib/crawlers.py +++ b/bin/lib/crawlers.py @@ -38,12 +38,6 @@ r_cache = config_loader.get_redis_conn("Redis_Cache") PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes")) config_loader = None -# load crawler config -config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg') -splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url') -splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key') -config_loader = None - faup = Faup() # # # # # # # # @@ -435,10 +429,80 @@ def get_splash_crawler_latest_stats(): date = now.strftime("%Y%m%d") return get_stats_last_crawled_domains(['onion', 'regular'], date) +def get_nb_crawlers_to_launch_by_splash_name(splash_name): + res = r_serv_onion.hget('all_crawlers_to_launch', splash_name) + if res: + return int(res) + else: + return 0 + +def get_all_crawlers_to_launch_splash_name(): + return r_serv_onion.hkeys('all_crawlers_to_launch') + +def get_nb_crawlers_to_launch(): + nb_crawlers_to_launch = r_serv_onion.hgetall('all_crawlers_to_launch') + for splash_name in nb_crawlers_to_launch: + nb_crawlers_to_launch[splash_name] = int(nb_crawlers_to_launch[splash_name]) + return nb_crawlers_to_launch + +def get_nb_crawlers_to_launch_ui(): + nb_crawlers_to_launch = get_nb_crawlers_to_launch() + for splash_name in get_all_splash(): + if splash_name not in nb_crawlers_to_launch: + nb_crawlers_to_launch[splash_name] = 0 + return nb_crawlers_to_launch + +def set_nb_crawlers_to_launch(dict_splash_name): + r_serv_onion.delete('all_crawlers_to_launch') + for splash_name in dict_splash_name: + r_serv_onion.hset('all_crawlers_to_launch', splash_name, int(dict_splash_name[splash_name])) + relaunch_crawlers() + +def relaunch_crawlers(): + all_crawlers_to_launch = get_nb_crawlers_to_launch() + for splash_name in all_crawlers_to_launch: + nb_crawlers = int(all_crawlers_to_launch[splash_name]) + + all_crawler_urls = get_splash_all_url(splash_name, r_list=True) + if nb_crawlers > len(all_crawler_urls): + print('Error, can\'t launch all Splash Dockers') + print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name)) + nb_crawlers = len(all_crawler_urls) + + reset_all_spash_crawler_status() + + for i in range(0, int(nb_crawlers)): + splash_url = all_crawler_urls[i] + print(all_crawler_urls[i]) + + launch_ail_splash_crawler(splash_url, script_options='{}'.format(splash_url)) + +def api_set_nb_crawlers_to_launch(dict_splash_name): + # TODO: check if is dict + dict_crawlers_to_launch = {} + all_splash = get_all_splash() + crawlers_to_launch = 
list(all_splash & set(dict_splash_name.keys())) + for splash_name in crawlers_to_launch: + try: + nb_to_launch = int(dict_splash_name.get(splash_name, 0)) + if nb_to_launch < 0: + return ({'error':'The number of crawlers to launch is negative'}, 400) + except: + return ({'error':'invalid number of crawlers to launch'}, 400) + if nb_to_launch > 0: + dict_crawlers_to_launch[splash_name] = nb_to_launch + + if dict_crawlers_to_launch: + set_nb_crawlers_to_launch(dict_crawlers_to_launch) + return (dict_crawlers_to_launch, 200) + else: + return ({'error':'invalid input'}, 400) + + ##-- CRAWLER GLOBAL --## #### CRAWLER TASK #### -def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None): +def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None): crawler_config = {} crawler_config['depth_limit'] = depth_limit @@ -478,10 +542,18 @@ def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages tld = unpack_url['tld'].decode() except: tld = unpack_url['tld'] - if tld == 'onion': - crawler_type = 'onion' + + if crawler_type=='None': + crawler_type = None + + if crawler_type: + if crawler_type=='tor': + crawler_type = 'onion' else: - crawler_type = 'regular' + if tld == 'onion': + crawler_type = 'onion' + else: + crawler_type = 'regular' save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=url) send_url_to_crawl_in_queue(crawler_mode, crawler_type, url) @@ -493,6 +565,7 @@ def save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url= r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(crawler_mode, crawler_type, domain, url), json.dumps(crawler_config)) def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url): + print('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode)) r_serv_onion.sadd('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode)) # add auto crawled url for user UI if crawler_mode == 'auto': @@ -500,7 +573,7 @@ def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url): #### #### #### CRAWLER TASK API #### -def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None): +def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None): # validate url if url is None or url=='' or url=='\n': return ({'error':'invalid depth limit'}, 400) @@ -537,7 +610,10 @@ def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit if cookie_owner != user_id: return ({'error': 'The access to this cookiejar is restricted'}, 403) + # # TODO: verify splash name/crawler type + create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages, + crawler_type=crawler_type, auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent) return None #### #### @@ -608,21 +684,41 @@ def save_har(har_dir, item_id, har_content): f.write(json.dumps(har_content)) #### CRAWLER QUEUES #### -def get_crawler_queue_type_by_proxy(splash_name, proxy_type): - all_domain_type = [] - if splash_name != 'default_splash' and 
splash_name != 'default_splash_tor': - all_domain_type.append(splash_name) - # check if can be used for discovery - if not is_splash_used_in_discovery(splash_name): - return all_domain_type - if proxy_type == 'tor': +def get_all_crawlers_queues_types(): + all_queues_types = set() + all_splash_name = get_all_crawlers_to_launch_splash_name() + for splash_name in all_splash_name: + all_queues_types.add(get_splash_crawler_type(splash_name)) + all_splash_name = list() + return all_queues_types + +def get_crawler_queue_types_by_splash_name(splash_name): + all_domain_type = [splash_name] + crawler_type = get_splash_crawler_type(splash_name) + #if not is_splash_used_in_discovery(splash_name) + if crawler_type == 'tor': all_domain_type.append('onion') all_domain_type.append('regular') - # proxy_type = web else: all_domain_type.append('regular') return all_domain_type +def get_crawler_type_by_url(url): + faup.decode(url) + unpack_url = faup.get() + ## TODO: # FIXME: remove me + try: + tld = unpack_url['tld'].decode() + except: + tld = unpack_url['tld'] + + if tld == 'onion': + crawler_type = 'onion' + else: + crawler_type = 'regular' + return crawler_type + + def get_elem_to_crawl_by_queue_type(l_queue_type): ## queues priority: # 1 - priority queue @@ -644,7 +740,8 @@ def get_elem_to_crawl_by_queue_type(l_queue_type): # # TODO: to check/refractor item_id = None url = message - return {'url': url, 'paste': item_id, 'type_service': queue_type, 'original_message': message} + crawler_type = get_crawler_type_by_url(url) + return {'url': url, 'paste': item_id, 'type_service': crawler_type, 'queue_type': queue_type, 'original_message': message} return None def get_nb_elem_to_crawl_by_type(queue_type): @@ -662,29 +759,37 @@ def get_nb_elem_to_crawl_by_type(queue_type): # # # # # # # # # # # # def get_splash_manager_url(reload=False): # TODO: add in db config - return splash_manager_url + return r_serv_onion.get('crawler:splash:manager:url') def get_splash_api_key(reload=False): # TODO: add in db config - return splash_api_key + return r_serv_onion.get('crawler:splash:manager:key') def get_hidden_splash_api_key(): # TODO: add in db config key = get_splash_api_key() - if len(key)==41: - return f'{key[:4]}*********************************{key[-4:]}' - else: - return None + if key: + if len(key)==41: + return f'{key[:4]}*********************************{key[-4:]}' + +def is_valid_api_key(api_key, search=re.compile(r'[^a-zA-Z0-9_-]').search): + if len(api_key) != 41: + return False + return not bool(search(api_key)) + +def save_splash_manager_url_api(url, api_key): + r_serv_onion.set('crawler:splash:manager:url', url) + r_serv_onion.set('crawler:splash:manager:key', api_key) def get_splash_url_from_manager_url(splash_manager_url, splash_port): url = urlparse(splash_manager_url) host = url.netloc.split(':', 1)[0] return '{}:{}'.format(host, splash_port) -def is_splash_used_in_discovery(splash_name): - res = r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'discovery_queue') - if res == 'True': - return True - else: - return False +# def is_splash_used_in_discovery(splash_name): +# res = r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'discovery_queue') +# if res == 'True': +# return True +# else: +# return False def restart_splash_docker(splash_url): splash_port = splash_url.split(':')[-1] @@ -700,25 +805,50 @@ def is_splash_manager_connected(delta_check=30): res = r_cache.hget('crawler:splash:manager', 'connected') return res == 'True' -def 
update_splash_manager_connection_status(is_connected): +def update_splash_manager_connection_status(is_connected, req_error=None): r_cache.hset('crawler:splash:manager', 'connected', is_connected) r_cache.hset('crawler:splash:manager', 'last_check', int(time.time())) + if not req_error: + r_cache.hdel('crawler:splash:manager', 'error') + else: + r_cache.hset('crawler:splash:manager', 'status_code', req_error['status_code']) + r_cache.hset('crawler:splash:manager', 'error', req_error['error']) + +def get_splash_manager_connection_metadata(force_ping=False): + dict_manager={} + if force_ping: + dict_manager['status'] = ping_splash_manager() + else: + dict_manager['status'] = is_splash_manager_connected() + if not dict_manager['status']: + dict_manager['status_code'] = r_cache.hget('crawler:splash:manager', 'status_code') + dict_manager['error'] = r_cache.hget('crawler:splash:manager', 'error') + return dict_manager ## API ## def ping_splash_manager(): + splash_manager_url = get_splash_manager_url() + if not splash_manager_url: + return False try: - req = requests.get('{}/api/v1/ping'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) + req = requests.get('{}/api/v1/ping'.format(splash_manager_url), headers={"Authorization": get_splash_api_key()}, verify=False) if req.status_code == 200: update_splash_manager_connection_status(True) return True else: - print(req.json()) - update_splash_manager_connection_status(False) + res = req.json() + if 'reason' in res: + req_error = {'status_code': req.status_code, 'error': res['reason']} + else: + print(req.json()) + req_error = {'status_code': req.status_code, 'error': json.dumps(req.json())} + update_splash_manager_connection_status(False, req_error=req_error) return False except requests.exceptions.ConnectionError: pass # splash manager unreachable - update_splash_manager_connection_status(False) + req_error = {'status_code': 500, 'error': 'splash manager unreachable'} + update_splash_manager_connection_status(False, req_error=req_error) return False def get_splash_manager_session_uuid(): @@ -734,6 +864,18 @@ def get_splash_manager_session_uuid(): # splash manager unreachable update_splash_manager_connection_status(False) +def get_splash_manager_version(): + splash_manager_url = get_splash_manager_url() + if splash_manager_url: + try: + req = requests.get('{}/api/v1/version'.format(splash_manager_url), headers={"Authorization": get_splash_api_key()}, verify=False) + if req.status_code == 200: + return req.json()['message'] + else: + print(req.json()) + except requests.exceptions.ConnectionError: + pass + def get_all_splash_manager_containers_name(): req = requests.get('{}/api/v1/get/splash/name/all'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False) if req.status_code == 200: @@ -755,6 +897,27 @@ def _restart_splash_docker(splash_port): return req.json() else: print(req.json()) + +def api_save_splash_manager_url_api(data): + # unpack json + manager_url = data.get('url', None) + api_key = data.get('api_key', None) + if not manager_url or not api_key: + return ({'status': 'error', 'reason': 'No url or API key supplied'}, 400) + # check if is valid url + try: + result = urlparse(manager_url) + if not all([result.scheme, result.netloc]): + return ({'status': 'error', 'reason': 'Invalid url'}, 400) + except: + return ({'status': 'error', 'reason': 'Invalid url'}, 400) + + # check if is valid key + if not is_valid_api_key(api_key): + return ({'status': 'error', 
'reason': 'Invalid API key'}, 400) + + save_splash_manager_url_api(manager_url, api_key) + return ({'url': manager_url, 'api_key': get_hidden_splash_api_key()}, 200) ## -- ## ## SPLASH ## @@ -869,13 +1032,13 @@ def get_all_proxies_metadata(): all_proxy_dict[proxy_name] = get_proxy_metadata(proxy_name) return all_proxy_dict -def set_proxy_used_in_discovery(proxy_name, value): - r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'discovery_queue', value) +# def set_proxy_used_in_discovery(proxy_name, value): +# r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'discovery_queue', value) def delete_proxy(proxy_name): # # TODO: force delete (delete all proxy) proxy_splash = get_all_splash_by_proxy(proxy_name) - if proxy_splash: - print('error, a splash container is using this proxy') + #if proxy_splash: + # print('error, a splash container is using this proxy') r_serv_onion.delete('proxy:metadata:{}'.format(proxy_name)) r_serv_onion.srem('all_proxy', proxy_name) ## -- ## @@ -948,3 +1111,7 @@ def launch_ail_splash_crawler(splash_url, script_options=''): #### CRAWLER PROXY #### #### ---- #### + +if __name__ == '__main__': + res = get_splash_manager_version() + print(res) diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py index 7e7cac39..5756a895 100644 --- a/var/www/blueprints/crawler_splash.py +++ b/var/www/blueprints/crawler_splash.py @@ -54,13 +54,13 @@ def create_json_response(data, status_code): @login_read_only def crawlers_dashboard(): # # TODO: get splash manager status - crawler_enabled = crawlers.ping_splash_manager() + is_manager_connected = crawlers.get_splash_manager_connection_metadata() all_splash_crawler_status = crawlers.get_all_spash_crawler_status() splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats() date = crawlers.get_current_date() return render_template("dashboard_splash_crawler.html", all_splash_crawler_status = all_splash_crawler_status, - crawler_enabled=crawler_enabled, date=date, + is_manager_connected=is_manager_connected, date=date, splash_crawlers_latest_stats=splash_crawlers_latest_stats) @crawler_splash.route("/crawlers/crawler_dashboard_json", methods=['GET']) @@ -80,7 +80,13 @@ def crawler_dashboard_json(): def manual(): user_id = current_user.get_id() l_cookiejar = crawlers.api_get_cookies_list_select(user_id) - return render_template("crawler_manual.html", crawler_enabled=True, l_cookiejar=l_cookiejar) + all_crawlers_types = crawlers.get_all_crawlers_queues_types() + all_splash_name = crawlers.get_all_crawlers_to_launch_splash_name() + return render_template("crawler_manual.html", + is_manager_connected=crawlers.get_splash_manager_connection_metadata(), + all_crawlers_types=all_crawlers_types, + all_splash_name=all_splash_name, + l_cookiejar=l_cookiejar) @crawler_splash.route("/crawlers/send_to_spider", methods=['POST']) @login_required @@ -90,6 +96,8 @@ def send_to_spider(): # POST val url = request.form.get('url_to_crawl') + crawler_type = request.form.get('crawler_queue_type') + splash_name = request.form.get('splash_name') auto_crawler = request.form.get('crawler_type') crawler_delta = request.form.get('crawler_epoch') screenshot = request.form.get('screenshot') @@ -98,6 +106,9 @@ def send_to_spider(): max_pages = request.form.get('max_pages') cookiejar_uuid = request.form.get('cookiejar') + if splash_name: + crawler_type = splash_name + if cookiejar_uuid: if cookiejar_uuid == 'None': cookiejar_uuid = None @@ -106,6 +117,7 @@ def send_to_spider(): cookiejar_uuid = 
cookiejar_uuid[-1].replace(' ', '') res = crawlers.api_create_crawler_task(user_id, url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages, + crawler_type=crawler_type, auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid) if res: return create_json_response(res[0], res[1]) @@ -435,16 +447,55 @@ def crawler_cookiejar_cookie_json_add_post(): def crawler_splash_setings(): all_proxies = crawlers.get_all_proxies_metadata() all_splash = crawlers.get_all_splash_crawler_metadata() + nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch() splash_manager_url = crawlers.get_splash_manager_url() api_key = crawlers.get_hidden_splash_api_key() - is_manager_connected = crawlers.ping_splash_manager() + is_manager_connected = crawlers.get_splash_manager_connection_metadata(force_ping=True) crawler_full_config = Config_DB.get_full_config_by_section('crawler') return render_template("settings_splash_crawler.html", is_manager_connected=is_manager_connected, splash_manager_url=splash_manager_url, api_key=api_key, + nb_crawlers_to_launch=nb_crawlers_to_launch, all_splash=all_splash, all_proxies=all_proxies, crawler_full_config=crawler_full_config) +@crawler_splash.route('/crawler/settings/crawler_manager', methods=['GET', 'POST']) +@login_required +@login_admin +def crawler_splash_setings_crawler_manager(): + if request.method == 'POST': + splash_manager_url = request.form.get('splash_manager_url') + api_key = request.form.get('api_key') + + res = crawlers.api_save_splash_manager_url_api({'url':splash_manager_url, 'api_key':api_key}) + if res[1] != 200: + return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1] + else: + return redirect(url_for('crawler_splash.crawler_splash_setings')) + else: + splash_manager_url = crawlers.get_splash_manager_url() + api_key = crawlers.get_splash_api_key() + return render_template("settings_edit_splash_crawler_manager.html", + splash_manager_url=splash_manager_url, api_key=api_key) + +@crawler_splash.route('/crawler/settings/crawlers_to_lauch', methods=['GET', 'POST']) +@login_required +@login_admin +def crawler_splash_setings_crawlers_to_lauch(): + if request.method == 'POST': + dict_splash_name = {} + for crawler_name in list(request.form): + dict_splash_name[crawler_name]= request.form.get(crawler_name) + res = crawlers.api_set_nb_crawlers_to_launch(dict_splash_name) + if res[1] != 200: + return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1] + else: + return redirect(url_for('crawler_splash.crawler_splash_setings')) + else: + nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch_ui() + return render_template("settings_edit_crawlers_to_launch.html", + nb_crawlers_to_launch=nb_crawlers_to_launch) + ## - - ## diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py index bf9a0ec8..55a7abe4 100644 --- a/var/www/modules/hiddenServices/Flask_hiddenServices.py +++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py @@ -18,6 +18,7 @@ from flask_login import login_required from Date import Date from HiddenServices import HiddenServices +import crawlers # ============ VARIABLES ============ import Flask_config @@ -27,7 +28,6 @@ baseUrl = Flask_config.baseUrl r_cache = Flask_config.r_cache r_serv_onion = Flask_config.r_serv_onion r_serv_metadata = Flask_config.r_serv_metadata -crawler_enabled = Flask_config.crawler_enabled bootstrap_label = 
Flask_config.bootstrap_label sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib')) @@ -244,7 +244,7 @@ def delete_auto_crawler(url): # statDomains_regular = get_stats_last_crawled_domains('regular', date) # # return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion, -# crawler_enabled=crawler_enabled, date=date, +# date=date, # crawler_metadata_regular=crawler_metadata_regular, # statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular) @@ -288,7 +288,7 @@ def Crawler_Splash_last_by_type(): crawler_metadata = get_crawler_splash_status(type) return render_template("Crawler_Splash_last_by_type.html", type=type, type_name=type_name, - crawler_enabled=crawler_enabled, + is_manager_connected=crawlers.get_splash_manager_connection_metadata(), last_domains=list_domains, statDomains=statDomains, crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string) @@ -424,7 +424,7 @@ def auto_crawler(): return render_template("Crawler_auto.html", page=page, nb_page_max=nb_page_max, last_domains=last_domains, - crawler_enabled=crawler_enabled, + is_manager_connected=crawlers.get_splash_manager_connection_metadata(), auto_crawler_domain_onions_metadata=auto_crawler_domain_onions_metadata, auto_crawler_domain_regular_metadata=auto_crawler_domain_regular_metadata) diff --git a/var/www/templates/crawler/crawler_disabled.html b/var/www/templates/crawler/crawler_disabled.html index 455f350e..69808cf5 100644 --- a/var/www/templates/crawler/crawler_disabled.html +++ b/var/www/templates/crawler/crawler_disabled.html @@ -1,6 +1,14 @@ -{% if not crawler_enabled %} +{%if not is_manager_connected['status']%} -{% endif %} +{%endif%} diff --git a/var/www/templates/crawler/crawler_splash/crawler_manual.html b/var/www/templates/crawler/crawler_splash/crawler_manual.html index 1072920b..510099a4 100644 --- a/var/www/templates/crawler/crawler_splash/crawler_manual.html +++ b/var/www/templates/crawler/crawler_splash/crawler_manual.html @@ -44,7 +44,31 @@
[crawler_manual.html: this hunk inserts a "Crawler Type" selector block above the existing "Manual" options: a queue-type select inside #div_crawler_queue_type (posted as crawler_queue_type), a splash-name select inside #div_splash_name (posted as splash_name), and the #queue_type_selector checkbox that switches between them, matching the JavaScript controller added below and the form fields read in crawler_splash.py]
@@ -143,11 +167,16 @@ var chart = {}; $(document).ready(function(){ $("#page-Crawler").addClass("active"); $("#nav_manual_crawler").addClass("active"); + queue_type_selector_input_controler() manual_crawler_input_controler(); $('#crawler_type').on("change", function () { manual_crawler_input_controler(); }); + + $('#queue_type_selector').on("change", function () { + queue_type_selector_input_controler(); + }); }); function toggle_sidebar(){ @@ -172,4 +201,14 @@ function manual_crawler_input_controler() { } } +function queue_type_selector_input_controler() { + if($('#queue_type_selector').is(':checked')){ + $("#div_crawler_queue_type").hide(); + $("#div_splash_name").show(); + }else{ + $("#div_crawler_queue_type").show(); + $("#div_splash_name").hide(); + } +} + diff --git a/var/www/templates/crawler/crawler_splash/settings_edit_crawlers_to_launch.html b/var/www/templates/crawler/crawler_splash/settings_edit_crawlers_to_launch.html new file mode 100644 index 00000000..a9653820 --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/settings_edit_crawlers_to_launch.html @@ -0,0 +1,60 @@ + + + + + AIL-Framework + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+  {% include 'crawler/menu_sidebar.html' %}
[settings_edit_crawlers_to_launch.html (new file): a "Number of Crawlers to Launch:" form that loops {%for crawler_name in nb_crawlers_to_launch%} and renders one numeric input per {{crawler_name}}, submitted to crawler_splash.crawler_splash_setings_crawlers_to_lauch]
+ + + + + diff --git a/var/www/templates/crawler/crawler_splash/settings_edit_splash_crawler_manager.html b/var/www/templates/crawler/crawler_splash/settings_edit_splash_crawler_manager.html new file mode 100644 index 00000000..9b154697 --- /dev/null +++ b/var/www/templates/crawler/crawler_splash/settings_edit_splash_crawler_manager.html @@ -0,0 +1,55 @@ + + + + + AIL-Framework + + + + + + + + + + + + + + + {% include 'nav_bar.html' %} + +
+  {% include 'crawler/menu_sidebar.html' %}
[settings_edit_splash_crawler_manager.html (new file): a form with a Splash Manager URL field and an API key field, posted as splash_manager_url and api_key to crawler_splash.crawler_splash_setings_crawler_manager]
+ + + + + diff --git a/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html b/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html index 9c66211d..8b1dae72 100644 --- a/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html +++ b/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html @@ -26,10 +26,6 @@
-      {%if not is_manager_connected%}
-        {% include 'crawler/crawler_disabled.html' %}
-      {%endif%}
-
@@ -45,7 +41,7 @@
-          {% if is_manager_connected %}
+          {% if is_manager_connected['status'] %}
              Connected
@@ -61,6 +57,10 @@
+      {%if not is_manager_connected['status']%}
+        {% include 'crawler/crawler_disabled.html' %}
+      {%endif%}
+
@@ -77,12 +77,42 @@
              {{api_key}}
[settings_splash_crawler.html: the rest of this hunk adds markup around the API key row and a read-only "Number of Crawlers to Launch:" table listing {{nb_crawlers_to_launch[crawler]}} for each {{crawler}}]
@@ -202,55 +232,55 @@
[settings_splash_crawler.html: the existing "Crawlers Settings" card, a Key / Description / Value table built from {% for config_field in crawler_full_config %} with {{config_field}}, {{crawler_full_config[config_field]['info']}} and {{crawler_full_config[config_field]['value']}}, is re-indented; the removed and re-added sides of this hunk contain the same content]
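
For reference, a minimal sketch of the storage layout these changes rely on, assuming a reachable Redis/ARDB instance. The key names (crawler:splash:manager:url, crawler:splash:manager:key, all_crawlers_to_launch) are taken from bin/lib/crawlers.py in this diff; the connection parameters and sample values are illustrative only.

import redis

# Connection parameters are assumptions for this sketch; AIL builds this
# connection from its ARDB_Onion config section.
r_serv_onion = redis.StrictRedis(host='localhost', port=6382, db=0, decode_responses=True)

# Splash Manager URL and API key, as stored by save_splash_manager_url_api()
r_serv_onion.set('crawler:splash:manager:url', 'https://splash-manager.example.org:7002')  # example URL
r_serv_onion.set('crawler:splash:manager:key', 'A' * 41)  # keys must be 41 chars of [a-zA-Z0-9_-]

# Per-splash crawler counts, the hash read by get_nb_crawlers_to_launch()
r_serv_onion.hset('all_crawlers_to_launch', 'default_splash_tor', 3)
r_serv_onion.hset('all_crawlers_to_launch', 'default_splash', 1)

# Equivalent of get_nb_crawlers_to_launch(): field -> int
print({name: int(nb) for name, nb in r_serv_onion.hgetall('all_crawlers_to_launch').items()})

relaunch_crawlers() then walks this hash and launches one Splash crawler per available docker for each name, so the configured counts are capped by the number of Splash URLs known for that splash.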