chg: [crawler_manager] UI edit config + fix crawler queues

pull/559/head
Terrtia 2020-08-24 22:31:41 +02:00
parent 8633d6460c
commit d8b7ab4de5
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
10 changed files with 526 additions and 138 deletions

View File

@ -141,7 +141,7 @@ def get_crawler_config(redis_server, mode, service_type, domain, url=None):
redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
return crawler_options
def load_crawler_config(service_type, domain, paste, url, date):
def load_crawler_config(queue_type, service_type, domain, paste, url, date):
crawler_config = {}
crawler_config['splash_url'] = f'http://{splash_url}'
crawler_config['item'] = paste
@ -149,6 +149,9 @@ def load_crawler_config(service_type, domain, paste, url, date):
crawler_config['domain'] = domain
crawler_config['date'] = date
if queue_type and queue_type != 'tor':
service_type = queue_type
# Auto and Manual Crawling
# Auto ################################################# create new entry, next crawling => here or when ended ?
if paste == 'auto':
@ -282,13 +285,15 @@ if __name__ == '__main__':
splash_url = sys.argv[1]
splash_name = crawlers.get_splash_name_by_url(splash_url)
crawler_type = crawlers.get_splash_crawler_type(splash_name)
proxy_type = crawlers.get_splash_proxy(splash_name)
print(splash_name)
print(crawler_type)
print(proxy_type)
#rotation_mode = deque(['onion', 'regular'])
rotation_mode = deque(crawlers.get_crawler_queue_type_by_proxy(splash_name, crawler_type))
all_crawler_queues = crawlers.get_crawler_queue_types_by_splash_name(splash_name)
rotation_mode = deque(all_crawler_queues)
print(rotation_mode)
default_proto_map = {'http': 80, 'https': 443}
######################################################## add ftp ???
@ -387,7 +392,7 @@ if __name__ == '__main__':
# Update crawler status type
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', to_crawl['type_service'])
crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
crawler_config = load_crawler_config(to_crawl['queue_type'], to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
# check if default crawler
if not crawler_config['requested']:
# Auto crawl only if service not up this month

View File

@ -13,36 +13,9 @@ config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
config_loader = None
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url')
api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers')
config_loader = None
import screen
# # TODO: lauch me in core screen
# # TODO: check if already launched in tor screen
def launch_crawlers():
for crawler_splash in crawlers_to_launch:
splash_name = crawler_splash[0]
nb_crawlers = int(crawler_splash[1])
all_crawler_urls = crawlers.get_splash_all_url(crawler_splash[0], r_list=True)
if nb_crawlers > len(all_crawler_urls):
print('Error, can\'t launch all Splash Dockers')
print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name))
nb_crawlers = len(all_crawler_urls)
crawlers.reset_all_spash_crawler_status()
for i in range(0, int(nb_crawlers)):
splash_url = all_crawler_urls[i]
print(all_crawler_urls[i])
crawlers.launch_ail_splash_crawler(splash_url, script_options='{}'.format(splash_url))
# # TODO: handle mutltiple splash_manager
if __name__ == '__main__':
@ -56,7 +29,7 @@ if __name__ == '__main__':
is_manager_connected = crawlers.reload_splash_and_proxies_list()
print(is_manager_connected)
if is_manager_connected:
launch_crawlers()
crawlers.relaunch_crawlers()
last_check = int(time.time())
while True:
@ -72,7 +45,7 @@ if __name__ == '__main__':
is_manager_connected = crawlers.reload_splash_and_proxies_list()
if is_manager_connected:
print('reload proxies and splash list')
launch_crawlers()
crawlers.relaunch_crawlers()
session_uuid = current_session_uuid
if not is_manager_connected:
print('Error, Can\'t connect to Splash manager')

View File

@ -38,12 +38,6 @@ r_cache = config_loader.get_redis_conn("Redis_Cache")
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes"))
config_loader = None
# load crawler config
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
config_loader = None
faup = Faup()
# # # # # # # #
@ -435,10 +429,80 @@ def get_splash_crawler_latest_stats():
date = now.strftime("%Y%m%d")
return get_stats_last_crawled_domains(['onion', 'regular'], date)
def get_nb_crawlers_to_launch_by_splash_name(splash_name):
    """Return the configured number of crawlers for *splash_name* (0 if unset)."""
    nb = r_serv_onion.hget('all_crawlers_to_launch', splash_name)
    # redis returns None (or an empty string) when the field is missing
    return int(nb) if nb else 0
def get_all_crawlers_to_launch_splash_name():
    # All splash names that have a configured "number of crawlers to launch" entry.
    return r_serv_onion.hkeys('all_crawlers_to_launch')
def get_nb_crawlers_to_launch():
    """Return a dict {splash_name: nb_to_launch} with counts cast to int."""
    raw_config = r_serv_onion.hgetall('all_crawlers_to_launch')
    return {splash_name: int(nb) for splash_name, nb in raw_config.items()}
def get_nb_crawlers_to_launch_ui():
    """Same as get_nb_crawlers_to_launch(), padded with 0 for every known splash.

    Used by the settings UI so each splash name gets an input field even when
    it has no configured count yet.
    """
    nb_crawlers_to_launch = get_nb_crawlers_to_launch()
    for splash_name in get_all_splash():
        nb_crawlers_to_launch.setdefault(splash_name, 0)
    return nb_crawlers_to_launch
def set_nb_crawlers_to_launch(dict_splash_name):
    # Replace the whole config hash with *dict_splash_name*, then restart
    # the crawlers so the new counts take effect immediately.
    r_serv_onion.delete('all_crawlers_to_launch')
    for splash_name in dict_splash_name:
        r_serv_onion.hset('all_crawlers_to_launch', splash_name, int(dict_splash_name[splash_name]))
    relaunch_crawlers()
def relaunch_crawlers():
    # (Re)launch the configured number of splash crawlers for each splash name.
    all_crawlers_to_launch = get_nb_crawlers_to_launch()
    for splash_name in all_crawlers_to_launch:
        nb_crawlers = int(all_crawlers_to_launch[splash_name])

        all_crawler_urls = get_splash_all_url(splash_name, r_list=True)
        # Cap the count if fewer splash dockers exist than were requested.
        if nb_crawlers > len(all_crawler_urls):
            print('Error, can\'t launch all Splash Dockers')
            print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name))
            nb_crawlers = len(all_crawler_urls)

        # NOTE(review): status is reset once per splash name, inside the loop —
        # later iterations may wipe status written for crawlers launched by
        # earlier iterations. Confirm whether this should be hoisted above the loop.
        reset_all_spash_crawler_status()

        for i in range(0, int(nb_crawlers)):
            splash_url = all_crawler_urls[i]
            print(all_crawler_urls[i])

            launch_ail_splash_crawler(splash_url, script_options='{}'.format(splash_url))
def api_set_nb_crawlers_to_launch(dict_splash_name):
    """Validate a {splash_name: nb} mapping and save it.

    Only splash names known to the manager are kept; counts must be
    non-negative integers and at least one must be > 0.

    :param dict_splash_name: mapping of splash name -> requested crawler count
    :return: tuple (json-serializable body, HTTP status code)
    """
    # TODO: check if is dict
    dict_crawlers_to_launch = {}
    all_splash = get_all_splash()
    # ignore names that are not known splash instances
    crawlers_to_launch = list(all_splash & set(dict_splash_name.keys()))

    for splash_name in crawlers_to_launch:
        try:
            nb_to_launch = int(dict_splash_name.get(splash_name, 0))
            if nb_to_launch < 0:
                return ({'error': 'The number of crawlers to launch is negative'}, 400)
        except (TypeError, ValueError):
            # narrowed from a bare `except:` — only a failed int() conversion
            # means bad user input; anything else should propagate
            return ({'error': 'invalid number of crawlers to launch'}, 400)
        if nb_to_launch > 0:
            dict_crawlers_to_launch[splash_name] = nb_to_launch

    if dict_crawlers_to_launch:
        set_nb_crawlers_to_launch(dict_crawlers_to_launch)
        return (dict_crawlers_to_launch, 200)
    else:
        return ({'error': 'invalid input'}, 400)
##-- CRAWLER GLOBAL --##
#### CRAWLER TASK ####
def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None):
def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
crawler_config = {}
crawler_config['depth_limit'] = depth_limit
@ -478,10 +542,18 @@ def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages
tld = unpack_url['tld'].decode()
except:
tld = unpack_url['tld']
if tld == 'onion':
crawler_type = 'onion'
if crawler_type=='None':
crawler_type = None
if crawler_type:
if crawler_type=='tor':
crawler_type = 'onion'
else:
crawler_type = 'regular'
if tld == 'onion':
crawler_type = 'onion'
else:
crawler_type = 'regular'
save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=url)
send_url_to_crawl_in_queue(crawler_mode, crawler_type, url)
@ -493,6 +565,7 @@ def save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=
r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(crawler_mode, crawler_type, domain, url), json.dumps(crawler_config))
def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url):
print('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode))
r_serv_onion.sadd('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode))
# add auto crawled url for user UI
if crawler_mode == 'auto':
@ -500,7 +573,7 @@ def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url):
#### ####
#### CRAWLER TASK API ####
def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None):
def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
# validate url
if url is None or url=='' or url=='\n':
return ({'error':'invalid depth limit'}, 400)
@ -537,7 +610,10 @@ def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit
if cookie_owner != user_id:
return ({'error': 'The access to this cookiejar is restricted'}, 403)
# # TODO: verify splash name/crawler type
create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages,
crawler_type=crawler_type,
auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent)
return None
#### ####
@ -608,21 +684,41 @@ def save_har(har_dir, item_id, har_content):
f.write(json.dumps(har_content))
#### CRAWLER QUEUES ####
def get_crawler_queue_type_by_proxy(splash_name, proxy_type):
all_domain_type = []
if splash_name != 'default_splash' and splash_name != 'default_splash_tor':
all_domain_type.append(splash_name)
# check if can be used for discovery
if not is_splash_used_in_discovery(splash_name):
return all_domain_type
if proxy_type == 'tor':
def get_all_crawlers_queues_types():
    """Return the set of crawler types used by the configured splash crawlers.

    One entry per distinct value of get_splash_crawler_type() across all
    splash names that have a launch count configured.
    """
    # NOTE: the original also did `all_splash_name = list()` right before
    # returning — a dead reassignment of a local, removed here.
    return {get_splash_crawler_type(splash_name)
            for splash_name in get_all_crawlers_to_launch_splash_name()}
def get_crawler_queue_types_by_splash_name(splash_name):
    """Return the queue types this splash instance should consume.

    Always includes its own dedicated queue (the splash name itself),
    plus the shared discovery queues allowed for its crawler type.
    """
    crawler_type = get_splash_crawler_type(splash_name)
    #if not is_splash_used_in_discovery(splash_name)
    if crawler_type == 'tor':
        # tor-capable crawlers can also serve the clearnet queue
        queue_types = [splash_name, 'onion', 'regular']
    else:
        # proxy_type = web
        queue_types = [splash_name, 'regular']
    return queue_types
def get_crawler_type_by_url(url):
    """Return 'onion' when *url*'s TLD is .onion, otherwise 'regular'."""
    faup.decode(url)
    unpack_url = faup.get()
    ## TODO: # FIXME: remove me
    try:
        # faup may hand the tld back as bytes; decode when it does
        tld = unpack_url['tld'].decode()
    except AttributeError:
        # narrowed from a bare `except:` — only a str/None tld lacks .decode()
        tld = unpack_url['tld']

    return 'onion' if tld == 'onion' else 'regular'
def get_elem_to_crawl_by_queue_type(l_queue_type):
## queues priority:
# 1 - priority queue
@ -644,7 +740,8 @@ def get_elem_to_crawl_by_queue_type(l_queue_type):
# # TODO: to check/refractor
item_id = None
url = message
return {'url': url, 'paste': item_id, 'type_service': queue_type, 'original_message': message}
crawler_type = get_crawler_type_by_url(url)
return {'url': url, 'paste': item_id, 'type_service': crawler_type, 'queue_type': queue_type, 'original_message': message}
return None
def get_nb_elem_to_crawl_by_type(queue_type):
@ -662,29 +759,37 @@ def get_nb_elem_to_crawl_by_type(queue_type):
# # # # # # # # # # # #
def get_splash_manager_url(reload=False): # TODO: add in db config
return splash_manager_url
return r_serv_onion.get('crawler:splash:manager:url')
def get_splash_api_key(reload=False): # TODO: add in db config
return splash_api_key
return r_serv_onion.get('crawler:splash:manager:key')
def get_hidden_splash_api_key(): # TODO: add in db config
key = get_splash_api_key()
if len(key)==41:
return f'{key[:4]}*********************************{key[-4:]}'
else:
return None
if key:
if len(key)==41:
return f'{key[:4]}*********************************{key[-4:]}'
def is_valid_api_key(api_key, search=re.compile(r'[^a-zA-Z0-9_-]').search):
    """Check that *api_key* is exactly 41 chars from the set [a-zA-Z0-9_-]."""
    if len(api_key) != 41:
        return False
    # valid iff no forbidden character is found anywhere in the key
    return search(api_key) is None
def save_splash_manager_url_api(url, api_key):
    # Persist the manager endpoint; read back by get_splash_manager_url()
    # and get_splash_api_key().
    r_serv_onion.set('crawler:splash:manager:url', url)
    r_serv_onion.set('crawler:splash:manager:key', api_key)
def get_splash_url_from_manager_url(splash_manager_url, splash_port):
    """Build 'host:port' for a splash docker from the manager's URL host."""
    parsed = urlparse(splash_manager_url)
    # drop any port the manager URL itself carries, keep only the host
    host = parsed.netloc.partition(':')[0]
    return f'{host}:{splash_port}'
def is_splash_used_in_discovery(splash_name):
res = r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'discovery_queue')
if res == 'True':
return True
else:
return False
# def is_splash_used_in_discovery(splash_name):
# res = r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'discovery_queue')
# if res == 'True':
# return True
# else:
# return False
def restart_splash_docker(splash_url):
splash_port = splash_url.split(':')[-1]
@ -700,25 +805,50 @@ def is_splash_manager_connected(delta_check=30):
res = r_cache.hget('crawler:splash:manager', 'connected')
return res == 'True'
def update_splash_manager_connection_status(is_connected):
def update_splash_manager_connection_status(is_connected, req_error=None):
r_cache.hset('crawler:splash:manager', 'connected', is_connected)
r_cache.hset('crawler:splash:manager', 'last_check', int(time.time()))
if not req_error:
r_cache.hdel('crawler:splash:manager', 'error')
else:
r_cache.hset('crawler:splash:manager', 'status_code', req_error['status_code'])
r_cache.hset('crawler:splash:manager', 'error', req_error['error'])
def get_splash_manager_connection_metadata(force_ping=False):
    """Return {'status': bool} plus 'status_code'/'error' fields when down.

    With force_ping=True the manager is pinged live; otherwise the cached
    connection flag is used.
    """
    metadata = {}
    metadata['status'] = ping_splash_manager() if force_ping else is_splash_manager_connected()
    if not metadata['status']:
        # expose the last recorded failure details for the UI
        metadata['status_code'] = r_cache.hget('crawler:splash:manager', 'status_code')
        metadata['error'] = r_cache.hget('crawler:splash:manager', 'error')
    return metadata
## API ##
def ping_splash_manager():
splash_manager_url = get_splash_manager_url()
if not splash_manager_url:
return False
try:
req = requests.get('{}/api/v1/ping'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False)
req = requests.get('{}/api/v1/ping'.format(splash_manager_url), headers={"Authorization": get_splash_api_key()}, verify=False)
if req.status_code == 200:
update_splash_manager_connection_status(True)
return True
else:
print(req.json())
update_splash_manager_connection_status(False)
res = req.json()
if 'reason' in res:
req_error = {'status_code': req.status_code, 'error': res['reason']}
else:
print(req.json())
req_error = {'status_code': req.status_code, 'error': json.dumps(req.json())}
update_splash_manager_connection_status(False, req_error=req_error)
return False
except requests.exceptions.ConnectionError:
pass
# splash manager unreachable
update_splash_manager_connection_status(False)
req_error = {'status_code': 500, 'error': 'splash manager unreachable'}
update_splash_manager_connection_status(False, req_error=req_error)
return False
def get_splash_manager_session_uuid():
@ -734,6 +864,18 @@ def get_splash_manager_session_uuid():
# splash manager unreachable
update_splash_manager_connection_status(False)
def get_splash_manager_version():
    # Ask the splash manager for its version string; returns None when the
    # manager URL is unset, the request fails, or the status is not 200.
    splash_manager_url = get_splash_manager_url()
    if splash_manager_url:
        try:
            req = requests.get('{}/api/v1/version'.format(splash_manager_url), headers={"Authorization": get_splash_api_key()}, verify=False)
            if req.status_code == 200:
                return req.json()['message']
            else:
                print(req.json())
        except requests.exceptions.ConnectionError:
            pass
def get_all_splash_manager_containers_name():
req = requests.get('{}/api/v1/get/splash/name/all'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False)
if req.status_code == 200:
@ -755,6 +897,27 @@ def _restart_splash_docker(splash_port):
return req.json()
else:
print(req.json())
def api_save_splash_manager_url_api(data):
    """Validate and persist the splash manager URL and API key.

    :param data: dict expected to carry 'url' and 'api_key'
    :return: tuple (json-serializable body, HTTP status code); on success the
             api_key is echoed back masked, not in full
    """
    # unpack json
    manager_url = data.get('url', None)
    api_key = data.get('api_key', None)
    if not manager_url or not api_key:
        return ({'status': 'error', 'reason': 'No url or API key supplied'}, 400)
    # check if is valid url
    try:
        result = urlparse(manager_url)
        if not all([result.scheme, result.netloc]):
            return ({'status': 'error', 'reason': 'Invalid url'}, 400)
    except ValueError:
        # narrowed from a bare `except:` — urlparse raises ValueError on
        # malformed input (e.g. invalid port); anything else should propagate
        return ({'status': 'error', 'reason': 'Invalid url'}, 400)

    # check if is valid key
    if not is_valid_api_key(api_key):
        return ({'status': 'error', 'reason': 'Invalid API key'}, 400)

    save_splash_manager_url_api(manager_url, api_key)
    return ({'url': manager_url, 'api_key': get_hidden_splash_api_key()}, 200)
## -- ##
## SPLASH ##
@ -869,13 +1032,13 @@ def get_all_proxies_metadata():
all_proxy_dict[proxy_name] = get_proxy_metadata(proxy_name)
return all_proxy_dict
def set_proxy_used_in_discovery(proxy_name, value):
r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'discovery_queue', value)
# def set_proxy_used_in_discovery(proxy_name, value):
# r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'discovery_queue', value)
def delete_proxy(proxy_name): # # TODO: force delete (delete all proxy)
proxy_splash = get_all_splash_by_proxy(proxy_name)
if proxy_splash:
print('error, a splash container is using this proxy')
#if proxy_splash:
# print('error, a splash container is using this proxy')
r_serv_onion.delete('proxy:metadata:{}'.format(proxy_name))
r_serv_onion.srem('all_proxy', proxy_name)
## -- ##
@ -948,3 +1111,7 @@ def launch_ail_splash_crawler(splash_url, script_options=''):
#### CRAWLER PROXY ####
#### ---- ####
if __name__ == '__main__':
    # Manual smoke test: print the splash manager's reported version.
    res = get_splash_manager_version()
    print(res)

View File

@ -54,13 +54,13 @@ def create_json_response(data, status_code):
@login_read_only
def crawlers_dashboard():
# # TODO: get splash manager status
crawler_enabled = crawlers.ping_splash_manager()
is_manager_connected = crawlers.get_splash_manager_connection_metadata()
all_splash_crawler_status = crawlers.get_all_spash_crawler_status()
splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats()
date = crawlers.get_current_date()
return render_template("dashboard_splash_crawler.html", all_splash_crawler_status = all_splash_crawler_status,
crawler_enabled=crawler_enabled, date=date,
is_manager_connected=is_manager_connected, date=date,
splash_crawlers_latest_stats=splash_crawlers_latest_stats)
@crawler_splash.route("/crawlers/crawler_dashboard_json", methods=['GET'])
@ -80,7 +80,13 @@ def crawler_dashboard_json():
def manual():
user_id = current_user.get_id()
l_cookiejar = crawlers.api_get_cookies_list_select(user_id)
return render_template("crawler_manual.html", crawler_enabled=True, l_cookiejar=l_cookiejar)
all_crawlers_types = crawlers.get_all_crawlers_queues_types()
all_splash_name = crawlers.get_all_crawlers_to_launch_splash_name()
return render_template("crawler_manual.html",
is_manager_connected=crawlers.get_splash_manager_connection_metadata(),
all_crawlers_types=all_crawlers_types,
all_splash_name=all_splash_name,
l_cookiejar=l_cookiejar)
@crawler_splash.route("/crawlers/send_to_spider", methods=['POST'])
@login_required
@ -90,6 +96,8 @@ def send_to_spider():
# POST val
url = request.form.get('url_to_crawl')
crawler_type = request.form.get('crawler_queue_type')
splash_name = request.form.get('splash_name')
auto_crawler = request.form.get('crawler_type')
crawler_delta = request.form.get('crawler_epoch')
screenshot = request.form.get('screenshot')
@ -98,6 +106,9 @@ def send_to_spider():
max_pages = request.form.get('max_pages')
cookiejar_uuid = request.form.get('cookiejar')
if splash_name:
crawler_type = splash_name
if cookiejar_uuid:
if cookiejar_uuid == 'None':
cookiejar_uuid = None
@ -106,6 +117,7 @@ def send_to_spider():
cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')
res = crawlers.api_create_crawler_task(user_id, url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages,
crawler_type=crawler_type,
auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid)
if res:
return create_json_response(res[0], res[1])
@ -435,16 +447,55 @@ def crawler_cookiejar_cookie_json_add_post():
def crawler_splash_setings():
all_proxies = crawlers.get_all_proxies_metadata()
all_splash = crawlers.get_all_splash_crawler_metadata()
nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch()
splash_manager_url = crawlers.get_splash_manager_url()
api_key = crawlers.get_hidden_splash_api_key()
is_manager_connected = crawlers.ping_splash_manager()
is_manager_connected = crawlers.get_splash_manager_connection_metadata(force_ping=True)
crawler_full_config = Config_DB.get_full_config_by_section('crawler')
return render_template("settings_splash_crawler.html",
is_manager_connected=is_manager_connected,
splash_manager_url=splash_manager_url, api_key=api_key,
nb_crawlers_to_launch=nb_crawlers_to_launch,
all_splash=all_splash, all_proxies=all_proxies,
crawler_full_config=crawler_full_config)
@crawler_splash.route('/crawler/settings/crawler_manager', methods=['GET', 'POST'])
@login_required
@login_admin
def crawler_splash_setings_crawler_manager():
    # POST: validate+save the manager url/api key, redirect to settings on
    # success, otherwise return the API error as JSON.
    # GET: render the edit form pre-filled with the current values.
    if request.method == 'POST':
        splash_manager_url = request.form.get('splash_manager_url')
        api_key = request.form.get('api_key')

        res = crawlers.api_save_splash_manager_url_api({'url':splash_manager_url, 'api_key':api_key})
        if res[1] != 200:
            return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
        else:
            return redirect(url_for('crawler_splash.crawler_splash_setings'))
    else:
        splash_manager_url = crawlers.get_splash_manager_url()
        api_key = crawlers.get_splash_api_key()
        return render_template("settings_edit_splash_crawler_manager.html",
                                splash_manager_url=splash_manager_url, api_key=api_key)
@crawler_splash.route('/crawler/settings/crawlers_to_lauch', methods=['GET', 'POST'])
@login_required
@login_admin
def crawler_splash_setings_crawlers_to_lauch():
    # POST: every form field is treated as a splash_name -> count pair and
    # forwarded to the API; redirect on success, JSON error otherwise.
    # GET: render the edit form with all splash names (0 default).
    if request.method == 'POST':
        dict_splash_name = {}
        for crawler_name in list(request.form):
            dict_splash_name[crawler_name]= request.form.get(crawler_name)

        res = crawlers.api_set_nb_crawlers_to_launch(dict_splash_name)
        if res[1] != 200:
            return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
        else:
            return redirect(url_for('crawler_splash.crawler_splash_setings'))
    else:
        nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch_ui()
        return render_template("settings_edit_crawlers_to_launch.html",
                                nb_crawlers_to_launch=nb_crawlers_to_launch)
## - - ##

View File

@ -18,6 +18,7 @@ from flask_login import login_required
from Date import Date
from HiddenServices import HiddenServices
import crawlers
# ============ VARIABLES ============
import Flask_config
@ -27,7 +28,6 @@ baseUrl = Flask_config.baseUrl
r_cache = Flask_config.r_cache
r_serv_onion = Flask_config.r_serv_onion
r_serv_metadata = Flask_config.r_serv_metadata
crawler_enabled = Flask_config.crawler_enabled
bootstrap_label = Flask_config.bootstrap_label
sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
@ -244,7 +244,7 @@ def delete_auto_crawler(url):
# statDomains_regular = get_stats_last_crawled_domains('regular', date)
#
# return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion,
# crawler_enabled=crawler_enabled, date=date,
# date=date,
# crawler_metadata_regular=crawler_metadata_regular,
# statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular)
@ -288,7 +288,7 @@ def Crawler_Splash_last_by_type():
crawler_metadata = get_crawler_splash_status(type)
return render_template("Crawler_Splash_last_by_type.html", type=type, type_name=type_name,
crawler_enabled=crawler_enabled,
is_manager_connected=crawlers.get_splash_manager_connection_metadata(),
last_domains=list_domains, statDomains=statDomains,
crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)
@ -424,7 +424,7 @@ def auto_crawler():
return render_template("Crawler_auto.html", page=page, nb_page_max=nb_page_max,
last_domains=last_domains,
crawler_enabled=crawler_enabled,
is_manager_connected=crawlers.get_splash_manager_connection_metadata(),
auto_crawler_domain_onions_metadata=auto_crawler_domain_onions_metadata,
auto_crawler_domain_regular_metadata=auto_crawler_domain_regular_metadata)

View File

@ -1,6 +1,14 @@
{% if not crawler_enabled %}
{%if not is_manager_connected['status']%}
<div class="alert alert-secondary text-center my-2" role="alert">
<h1><i class="fas fa-times-circle text-danger"></i> Crawler Disabled</h1>
<p>...</p>
<p>
{%if 'error' in is_manager_connected%}
<b>{{is_manager_connected['status_code']}}</b>
<br>
<b>Error:</b> {{is_manager_connected['error']}}
{%else%}
<b>Error:</b> core/Crawler_manager not launched
{%endif%}
</p>
</div>
{% endif %}
{%endif%}

View File

@ -44,7 +44,31 @@
<div class="input-group" id="date-range-from">
<input type="text" class="form-control" id="url_to_crawl" name="url_to_crawl" placeholder="Address or Domain">
</div>
<div class="d-flex mt-1">
<div class="d-flex mt-2">
<i class="fas fa-spider mt-1"></i> &nbsp;Crawler Type&nbsp;&nbsp;
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="queue_type_selector" value="True" id="queue_type_selector">
<label class="custom-control-label" for="queue_type_selector">
<i class="fas fa-splotch"></i> &nbsp;Splash Name
</label>
</div>
</div>
<div id="div_crawler_queue_type">
<select class="custom-select form-control" name="crawler_queue_type" id="crawler_queue_type">
{%for crawler_type in all_crawlers_types%}
<option value="{{crawler_type}}" {%if crawler_type=='tor'%}selected{%endif%}>{{crawler_type}}</option>
{%endfor%}
</select>
</div>
<div id="div_splash_name">
<select class="custom-select form-control" name="splash_name" id="splash_name">
<option value="None" selected>Don't use a special splash crawler</option>
{%for splash_name in all_splash_name%}
<option value="{{splash_name}}">{{splash_name}}</option>
{%endfor%}
</select>
</div>
<div class="d-flex mt-3">
<i class="fas fa-user-ninja mt-1"></i> &nbsp;Manual&nbsp;&nbsp;
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="crawler_type" value="True" id="crawler_type">
@ -143,11 +167,16 @@ var chart = {};
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_manual_crawler").addClass("active");
queue_type_selector_input_controler()
manual_crawler_input_controler();
$('#crawler_type').on("change", function () {
manual_crawler_input_controler();
});
$('#queue_type_selector').on("change", function () {
queue_type_selector_input_controler();
});
});
function toggle_sidebar(){
@ -172,4 +201,14 @@ function manual_crawler_input_controler() {
}
}
function queue_type_selector_input_controler() {
if($('#queue_type_selector').is(':checked')){
$("#div_crawler_queue_type").hide();
$("#div_splash_name").show();
}else{
$("#div_crawler_queue_type").show();
$("#div_splash_name").hide();
}
}
</script>

View File

@ -0,0 +1,60 @@
<!DOCTYPE html>
<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
{% include 'crawler/menu_sidebar.html' %}
<div class="col-12 col-lg-10" id="core_content">
<form action="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}" method="post" enctype="multipart/form-data">
<h5 class="card-title">Number of Crawlers to Launch:</h5>
<table class="table table-sm">
<tbody>
{%for crawler_name in nb_crawlers_to_launch%}
<tr>
<td>{{crawler_name}}</td>
<td>
<input class="form-control" type="number" id="{{crawler_name}}" value="{{nb_crawlers_to_launch[crawler_name]}}" min="0" name="{{crawler_name}}" required>
</td>
</tr>
{%endfor%}
</tbody>
</table>
<button type="submit" class="btn btn-primary">Edit <i class="fas fa-pencil-alt"></i></button>
</form>
</div>
</div>
</div>
</body>
<script>
var to_refresh = false
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_settings").addClass("active");
});
</script>

View File

@ -0,0 +1,55 @@
<!DOCTYPE html>
<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">
<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>
</head>
<body>
{% include 'nav_bar.html' %}
<div class="container-fluid">
<div class="row">
{% include 'crawler/menu_sidebar.html' %}
<div class="col-12 col-lg-10" id="core_content">
<form action="{{ url_for('crawler_splash.crawler_splash_setings_crawler_manager') }}" method="post" enctype="multipart/form-data">
<div class="form-group">
<label for="splash_manager_url">Splash Manager URL</label>
<input type="text" class="form-control" id="splash_manager_url" placeholder="http://splash_manager_url" name="splash_manager_url" {%if splash_manager_url%}value="{{splash_manager_url}}"{%endif%}>
</div>
<div class="form-group">
<label for="api_key">API Key</label>
<input type="text" class="form-control" id="api_key" placeholder="API Key" name="api_key" {%if api_key%}value="{{api_key}}"{%endif%}>
</div>
<button type="submit" class="btn btn-primary">Edit <i class="fas fa-pencil-alt"></i></button>
</form>
</div>
</div>
</div>
</body>
<script>
var to_refresh = false
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_settings").addClass("active");
});
</script>

View File

@ -26,10 +26,6 @@
<div class="col-12 col-lg-10" id="core_content">
{%if not is_manager_connected%}
{% include 'crawler/crawler_disabled.html' %}
{%endif%}
<div class="row">
<div class="col-xl-6">
@ -45,7 +41,7 @@
<div class="card mb-3 mt-1">
<div class="card-header bg-dark text-white">
<span class="badge badge-pill badge-light flex-row-reverse float-right">
{% if is_manager_connected %}
{% if is_manager_connected['status'] %}
<div style="color:Green;">
<i class="fas fa-check-circle fa-2x"></i>
Connected
@ -61,6 +57,10 @@
</div>
<div class="card-body">
{%if not is_manager_connected['status']%}
{% include 'crawler/crawler_disabled.html' %}
{%endif%}
<div class="row mb-3 justify-content-center">
<div class="col-xl-6">
<div class="card text-center border-secondary">
@ -77,12 +77,42 @@
{{api_key}}
<!-- <a class="ml-3" href="/settings/new_token"><i class="fa fa-random"></i></a> -->
</td>
<td>
<a href="{{ url_for('crawler_splash.crawler_splash_setings_crawler_manager') }}">
<button type="button" class="btn btn-info">
Edit <i class="fas fa-pencil-alt"></i>
</button>
</a>
</td>
</tr>
</tbody>
</table>
</div>
</div>
</div>
</div>
<div {%if not is_manager_connected%}class="hidden"{%endif%}>
<div class="card border-secondary mb-4">
<div class="card-body text-dark">
<h5 class="card-title">Number of Crawlers to Launch:</h5>
<table class="table table-sm">
<tbody>
{%for crawler in nb_crawlers_to_launch%}
<tr>
<td>{{crawler}}</td>
<td>{{nb_crawlers_to_launch[crawler]}}</td>
</tr>
{%endfor%}
</tbody>
</table>
<a href="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}">
<button type="button" class="btn btn-info">
Edit number of crawlers to launch <i class="fas fa-pencil-alt"></i>
</button>
</a>
</div>
</div>
<div class="card border-secondary mb-4">
@ -202,55 +232,55 @@
</table>
</div>
</div>
</div>
</div>
</div>
<div class="card mb-3 mt-1">
<div class="card-header bg-dark text-white">
<h4>Crawlers Settings</h4>
</div>
<div class="card-body">
<table class="table table-striped table-hover">
<thead class="bg-info text-white">
<th>
Key
</th>
<th>
Description
</th>
<th>
Value
</th>
<th></th>
</thead>
<tbody>
{% for config_field in crawler_full_config %}
<tr>
<td>
{{config_field}}
</td>
<td>
{{crawler_full_config[config_field]['info']}}
</td>
<td>
{{crawler_full_config[config_field]['value']}}
</td>
<td>
<div class="d-flex justify-content-end">
<button class="btn btn-outline-dark px-1 py-0">
<i class="fas fa-pencil-alt"></i>
</button>
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
<div class="card mb-3 mt-1">
<div class="card-header bg-dark text-white">
<h4>Crawlers Settings</h4>
</div>
<div class="card-body">
<table class="table table-striped table-hover">
<thead class="bg-info text-white">
<th>
Key
</th>
<th>
Description
</th>
<th>
Value
</th>
<th></th>
</thead>
<tbody>
{% for config_field in crawler_full_config %}
<tr>
<td>
{{config_field}}
</td>
<td>
{{crawler_full_config[config_field]['info']}}
</td>
<td>
{{crawler_full_config[config_field]['value']}}
</td>
<td>
<div class="d-flex justify-content-end">
<button class="btn btn-outline-dark px-1 py-0">
<i class="fas fa-pencil-alt"></i>
</button>
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</div>
</div>
</div>