mirror of https://github.com/CIRCL/AIL-framework
chg: [crawler_manager] UI edit config + fix crawler queues
parent 8633d6460c
commit d8b7ab4de5
@@ -141,7 +141,7 @@ def get_crawler_config(redis_server, mode, service_type, domain, url=None):
redis_server.delete('crawler_config:{}:{}:{}'.format(mode, service_type, domain))
return crawler_options

def load_crawler_config(service_type, domain, paste, url, date):
def load_crawler_config(queue_type, service_type, domain, paste, url, date):
crawler_config = {}
crawler_config['splash_url'] = f'http://{splash_url}'
crawler_config['item'] = paste

@@ -149,6 +149,9 @@ def load_crawler_config(service_type, domain, paste, url, date):
crawler_config['domain'] = domain
crawler_config['date'] = date

if queue_type and queue_type != 'tor':
service_type = queue_type

# Auto and Manual Crawling
# Auto ################################################# create new entry, next crawling => here or when ended ?
if paste == 'auto':

@@ -282,13 +285,15 @@ if __name__ == '__main__':
splash_url = sys.argv[1]

splash_name = crawlers.get_splash_name_by_url(splash_url)
crawler_type = crawlers.get_splash_crawler_type(splash_name)
proxy_type = crawlers.get_splash_proxy(splash_name)

print(splash_name)
print(crawler_type)
print(proxy_type)

#rotation_mode = deque(['onion', 'regular'])
rotation_mode = deque(crawlers.get_crawler_queue_type_by_proxy(splash_name, crawler_type))
all_crawler_queues = crawlers.get_crawler_queue_types_by_splash_name(splash_name)
rotation_mode = deque(all_crawler_queues)
print(rotation_mode)

default_proto_map = {'http': 80, 'https': 443}
######################################################## add ftp ???

@@ -387,7 +392,7 @@ if __name__ == '__main__':
# Update crawler status type
r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', to_crawl['type_service'])

crawler_config = load_crawler_config(to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
crawler_config = load_crawler_config(to_crawl['queue_type'], to_crawl['type_service'], url_data['domain'], to_crawl['paste'], to_crawl['url'], date)
# check if default crawler
if not crawler_config['requested']:
# Auto crawl only if service not up this month
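The crawler now builds its rotation deque from every queue type attached to its splash container (get_crawler_queue_types_by_splash_name) instead of only the proxy-derived types. A minimal standalone sketch of round-robin rotation over such a deque, with hypothetical queue names and not tied to a running AIL instance:

    from collections import deque

    # hypothetical result of get_crawler_queue_types_by_splash_name() for a tor splash
    rotation_mode = deque(['my_splash_tor', 'onion', 'regular'])

    for _ in range(6):
        rotation_mode.rotate()            # shift the deque one step
        queue_type = rotation_mode[-1]    # queue type polled on this iteration
        print(queue_type)                 # cycles through the three queue types in turn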
@@ -13,36 +13,9 @@ config_loader = ConfigLoader.ConfigLoader()
r_serv_metadata = config_loader.get_redis_conn("ARDB_Metadata")
config_loader = None

config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
SPLASH_MANAGER_URL = config_loader.get_config_str('Splash_Manager', 'splash_url')
api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
crawlers_to_launch = config_loader.get_all_keys_values_from_section('Splash_Crawlers')
config_loader = None

import screen

# # TODO: lauch me in core screen
# # TODO: check if already launched in tor screen

def launch_crawlers():
for crawler_splash in crawlers_to_launch:
splash_name = crawler_splash[0]
nb_crawlers = int(crawler_splash[1])

all_crawler_urls = crawlers.get_splash_all_url(crawler_splash[0], r_list=True)
if nb_crawlers > len(all_crawler_urls):
print('Error, can\'t launch all Splash Dockers')
print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name))
nb_crawlers = len(all_crawler_urls)

crawlers.reset_all_spash_crawler_status()

for i in range(0, int(nb_crawlers)):
splash_url = all_crawler_urls[i]
print(all_crawler_urls[i])

crawlers.launch_ail_splash_crawler(splash_url, script_options='{}'.format(splash_url))

# # TODO: handle mutltiple splash_manager
if __name__ == '__main__':

@@ -56,7 +29,7 @@ if __name__ == '__main__':
is_manager_connected = crawlers.reload_splash_and_proxies_list()
print(is_manager_connected)
if is_manager_connected:
launch_crawlers()
crawlers.relaunch_crawlers()
last_check = int(time.time())

while True:

@@ -72,7 +45,7 @@ if __name__ == '__main__':
is_manager_connected = crawlers.reload_splash_and_proxies_list()
if is_manager_connected:
print('reload proxies and splash list')
launch_crawlers()
crawlers.relaunch_crawlers()
session_uuid = current_session_uuid
if not is_manager_connected:
print('Error, Can\'t connect to Splash manager')
@@ -38,12 +38,6 @@ r_cache = config_loader.get_redis_conn("Redis_Cache")
PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], config_loader.get_config_str("Directories", "pastes"))
config_loader = None

# load crawler config
config_loader = ConfigLoader.ConfigLoader(config_file='crawlers.cfg')
splash_manager_url = config_loader.get_config_str('Splash_Manager', 'splash_url')
splash_api_key = config_loader.get_config_str('Splash_Manager', 'api_key')
config_loader = None

faup = Faup()

# # # # # # # #
@@ -435,10 +429,80 @@ def get_splash_crawler_latest_stats():
date = now.strftime("%Y%m%d")
return get_stats_last_crawled_domains(['onion', 'regular'], date)

def get_nb_crawlers_to_launch_by_splash_name(splash_name):
res = r_serv_onion.hget('all_crawlers_to_launch', splash_name)
if res:
return int(res)
else:
return 0

def get_all_crawlers_to_launch_splash_name():
return r_serv_onion.hkeys('all_crawlers_to_launch')

def get_nb_crawlers_to_launch():
nb_crawlers_to_launch = r_serv_onion.hgetall('all_crawlers_to_launch')
for splash_name in nb_crawlers_to_launch:
nb_crawlers_to_launch[splash_name] = int(nb_crawlers_to_launch[splash_name])
return nb_crawlers_to_launch

def get_nb_crawlers_to_launch_ui():
nb_crawlers_to_launch = get_nb_crawlers_to_launch()
for splash_name in get_all_splash():
if splash_name not in nb_crawlers_to_launch:
nb_crawlers_to_launch[splash_name] = 0
return nb_crawlers_to_launch

def set_nb_crawlers_to_launch(dict_splash_name):
r_serv_onion.delete('all_crawlers_to_launch')
for splash_name in dict_splash_name:
r_serv_onion.hset('all_crawlers_to_launch', splash_name, int(dict_splash_name[splash_name]))
relaunch_crawlers()

def relaunch_crawlers():
all_crawlers_to_launch = get_nb_crawlers_to_launch()
for splash_name in all_crawlers_to_launch:
nb_crawlers = int(all_crawlers_to_launch[splash_name])

all_crawler_urls = get_splash_all_url(splash_name, r_list=True)
if nb_crawlers > len(all_crawler_urls):
print('Error, can\'t launch all Splash Dockers')
print('Please launch {} additional {} Dockers'.format( nb_crawlers - len(all_crawler_urls), splash_name))
nb_crawlers = len(all_crawler_urls)

reset_all_spash_crawler_status()

for i in range(0, int(nb_crawlers)):
splash_url = all_crawler_urls[i]
print(all_crawler_urls[i])

launch_ail_splash_crawler(splash_url, script_options='{}'.format(splash_url))

def api_set_nb_crawlers_to_launch(dict_splash_name):
# TODO: check if is dict
dict_crawlers_to_launch = {}
all_splash = get_all_splash()
crawlers_to_launch = list(all_splash & set(dict_splash_name.keys()))
for splash_name in crawlers_to_launch:
try:
nb_to_launch = int(dict_splash_name.get(splash_name, 0))
if nb_to_launch < 0:
return ({'error':'The number of crawlers to launch is negative'}, 400)
except:
return ({'error':'invalid number of crawlers to launch'}, 400)
if nb_to_launch > 0:
dict_crawlers_to_launch[splash_name] = nb_to_launch

if dict_crawlers_to_launch:
set_nb_crawlers_to_launch(dict_crawlers_to_launch)
return (dict_crawlers_to_launch, 200)
else:
return ({'error':'invalid input'}, 400)
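A standalone sketch of the validation performed by api_set_nb_crawlers_to_launch() above: unknown splash names are dropped, submitted values are cast to int, and only strictly positive counts are kept. The splash names below are made up:

    # hypothetical set of registered splash containers (stand-in for get_all_splash())
    all_splash = {'default_splash', 'default_splash_tor', 'my_proxy_splash'}
    submitted = {'default_splash_tor': '2', 'unknown_splash': '1', 'my_proxy_splash': '0'}

    dict_crawlers_to_launch = {}
    for splash_name in all_splash & set(submitted):
        nb_to_launch = int(submitted.get(splash_name, 0))
        if nb_to_launch > 0:
            dict_crawlers_to_launch[splash_name] = nb_to_launch

    print(dict_crawlers_to_launch)    # {'default_splash_tor': 2}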
##-- CRAWLER GLOBAL --##

#### CRAWLER TASK ####
def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None):
def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):

crawler_config = {}
crawler_config['depth_limit'] = depth_limit

@@ -478,6 +542,14 @@ def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages
tld = unpack_url['tld'].decode()
except:
tld = unpack_url['tld']

if crawler_type=='None':
crawler_type = None

if crawler_type:
if crawler_type=='tor':
crawler_type = 'onion'
else:
if tld == 'onion':
crawler_type = 'onion'
else:

@@ -493,6 +565,7 @@ def save_crawler_config(crawler_mode, crawler_type, crawler_config, domain, url=
r_serv_onion.set('crawler_config:{}:{}:{}:{}'.format(crawler_mode, crawler_type, domain, url), json.dumps(crawler_config))

def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url):
print('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode))
r_serv_onion.sadd('{}_crawler_priority_queue'.format(crawler_type), '{};{}'.format(url, crawler_mode))
# add auto crawled url for user UI
if crawler_mode == 'auto':

@@ -500,7 +573,7 @@ def send_url_to_crawl_in_queue(crawler_mode, crawler_type, url):

#### ####
#### CRAWLER TASK API ####
def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, cookiejar_uuid=None, user_agent=None):
def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
# validate url
if url is None or url=='' or url=='\n':
return ({'error':'invalid depth limit'}, 400)

@@ -537,7 +610,10 @@ def api_create_crawler_task(user_id, url, screenshot=True, har=True, depth_limit
if cookie_owner != user_id:
return ({'error': 'The access to this cookiejar is restricted'}, 403)

# # TODO: verify splash name/crawler type

create_crawler_task(url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages,
crawler_type=crawler_type,
auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid, user_agent=user_agent)
return None
#### ####

@@ -608,21 +684,41 @@ def save_har(har_dir, item_id, har_content):
f.write(json.dumps(har_content))

#### CRAWLER QUEUES ####
def get_crawler_queue_type_by_proxy(splash_name, proxy_type):
all_domain_type = []
if splash_name != 'default_splash' and splash_name != 'default_splash_tor':
all_domain_type.append(splash_name)
# check if can be used for discovery
if not is_splash_used_in_discovery(splash_name):
return all_domain_type
if proxy_type == 'tor':
def get_all_crawlers_queues_types():
all_queues_types = set()
all_splash_name = get_all_crawlers_to_launch_splash_name()
for splash_name in all_splash_name:
all_queues_types.add(get_splash_crawler_type(splash_name))
all_splash_name = list()
return all_queues_types

def get_crawler_queue_types_by_splash_name(splash_name):
all_domain_type = [splash_name]
crawler_type = get_splash_crawler_type(splash_name)
#if not is_splash_used_in_discovery(splash_name)
if crawler_type == 'tor':
all_domain_type.append('onion')
all_domain_type.append('regular')
# proxy_type = web
else:
all_domain_type.append('regular')
return all_domain_type

def get_crawler_type_by_url(url):
faup.decode(url)
unpack_url = faup.get()
## TODO: # FIXME: remove me
try:
tld = unpack_url['tld'].decode()
except:
tld = unpack_url['tld']

if tld == 'onion':
crawler_type = 'onion'
else:
crawler_type = 'regular'
return crawler_type
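A simplified stand-in for get_crawler_type_by_url() above: the real helper decodes the URL with Faup, while this sketch only inspects the hostname with urllib; the URLs are examples.

    from urllib.parse import urlparse

    def guess_crawler_type(url):
        host = urlparse(url).hostname or ''
        return 'onion' if host.endswith('.onion') else 'regular'

    print(guess_crawler_type('http://example.onion/page'))   # onion
    print(guess_crawler_type('https://www.circl.lu'))        # regular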
def get_elem_to_crawl_by_queue_type(l_queue_type):
## queues priority:
# 1 - priority queue

@@ -644,7 +740,8 @@ def get_elem_to_crawl_by_queue_type(l_queue_type):
# # TODO: to check/refractor
item_id = None
url = message
return {'url': url, 'paste': item_id, 'type_service': queue_type, 'original_message': message}
crawler_type = get_crawler_type_by_url(url)
return {'url': url, 'paste': item_id, 'type_service': crawler_type, 'queue_type': queue_type, 'original_message': message}
return None

def get_nb_elem_to_crawl_by_type(queue_type):

@@ -662,29 +759,37 @@ def get_nb_elem_to_crawl_by_type(queue_type):
# # # # # # # # # # # #

def get_splash_manager_url(reload=False): # TODO: add in db config
return splash_manager_url
return r_serv_onion.get('crawler:splash:manager:url')

def get_splash_api_key(reload=False): # TODO: add in db config
return splash_api_key
return r_serv_onion.get('crawler:splash:manager:key')

def get_hidden_splash_api_key(): # TODO: add in db config
key = get_splash_api_key()
if key:
if len(key)==41:
return f'{key[:4]}*********************************{key[-4:]}'
else:
return None

def is_valid_api_key(api_key, search=re.compile(r'[^a-zA-Z0-9_-]').search):
if len(api_key) != 41:
return False
return not bool(search(api_key))
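Both helpers above assume a 41-character API key made of [a-zA-Z0-9_-]; a standalone sketch of the validation and masking, using a made-up key:

    import re

    def is_valid_api_key(api_key, search=re.compile(r'[^a-zA-Z0-9_-]').search):
        if len(api_key) != 41:
            return False
        return not bool(search(api_key))

    key = 'a' * 41                                        # hypothetical API key
    print(is_valid_api_key(key))                          # True
    masked = key[:4] + '*' * (len(key) - 8) + key[-4:]    # same idea as get_hidden_splash_api_key()
    print(masked)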
def save_splash_manager_url_api(url, api_key):
r_serv_onion.set('crawler:splash:manager:url', url)
r_serv_onion.set('crawler:splash:manager:key', api_key)

def get_splash_url_from_manager_url(splash_manager_url, splash_port):
url = urlparse(splash_manager_url)
host = url.netloc.split(':', 1)[0]
return '{}:{}'.format(host, splash_port)
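An illustrative use of get_splash_url_from_manager_url() above, with a made-up manager host and splash port:

    from urllib.parse import urlparse

    url = urlparse('http://splash-manager.local:6768')   # hypothetical manager URL
    host = url.netloc.split(':', 1)[0]
    print('{}:{}'.format(host, 8050))                    # splash-manager.local:8050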
def is_splash_used_in_discovery(splash_name):
res = r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'discovery_queue')
if res == 'True':
return True
else:
return False
# def is_splash_used_in_discovery(splash_name):
# res = r_serv_onion.hget('splash:metadata:{}'.format(splash_name), 'discovery_queue')
# if res == 'True':
# return True
# else:
# return False

def restart_splash_docker(splash_url):
splash_port = splash_url.split(':')[-1]

@@ -700,25 +805,50 @@ def is_splash_manager_connected(delta_check=30):
res = r_cache.hget('crawler:splash:manager', 'connected')
return res == 'True'

def update_splash_manager_connection_status(is_connected):
def update_splash_manager_connection_status(is_connected, req_error=None):
r_cache.hset('crawler:splash:manager', 'connected', is_connected)
r_cache.hset('crawler:splash:manager', 'last_check', int(time.time()))
if not req_error:
r_cache.hdel('crawler:splash:manager', 'error')
else:
r_cache.hset('crawler:splash:manager', 'status_code', req_error['status_code'])
r_cache.hset('crawler:splash:manager', 'error', req_error['error'])

def get_splash_manager_connection_metadata(force_ping=False):
dict_manager={}
if force_ping:
dict_manager['status'] = ping_splash_manager()
else:
dict_manager['status'] = is_splash_manager_connected()
if not dict_manager['status']:
dict_manager['status_code'] = r_cache.hget('crawler:splash:manager', 'status_code')
dict_manager['error'] = r_cache.hget('crawler:splash:manager', 'error')
return dict_manager
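For reference, the dict built above is what the crawler templates read: 'status' is always present, while 'status_code' and 'error' are only filled in when the manager is down. Illustrative values only:

    # illustrative shape of get_splash_manager_connection_metadata() when the manager is unreachable
    dict_manager = {'status': False, 'status_code': 500, 'error': 'splash manager unreachable'}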
## API ##
def ping_splash_manager():
splash_manager_url = get_splash_manager_url()
if not splash_manager_url:
return False
try:
req = requests.get('{}/api/v1/ping'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False)
req = requests.get('{}/api/v1/ping'.format(splash_manager_url), headers={"Authorization": get_splash_api_key()}, verify=False)
if req.status_code == 200:
update_splash_manager_connection_status(True)
return True
else:
res = req.json()
if 'reason' in res:
req_error = {'status_code': req.status_code, 'error': res['reason']}
else:
print(req.json())
update_splash_manager_connection_status(False)
req_error = {'status_code': req.status_code, 'error': json.dumps(req.json())}
update_splash_manager_connection_status(False, req_error=req_error)
return False
except requests.exceptions.ConnectionError:
pass
# splash manager unreachable
update_splash_manager_connection_status(False)
req_error = {'status_code': 500, 'error': 'splash manager unreachable'}
update_splash_manager_connection_status(False, req_error=req_error)
return False

def get_splash_manager_session_uuid():

@@ -734,6 +864,18 @@ def get_splash_manager_session_uuid():
# splash manager unreachable
update_splash_manager_connection_status(False)

def get_splash_manager_version():
splash_manager_url = get_splash_manager_url()
if splash_manager_url:
try:
req = requests.get('{}/api/v1/version'.format(splash_manager_url), headers={"Authorization": get_splash_api_key()}, verify=False)
if req.status_code == 200:
return req.json()['message']
else:
print(req.json())
except requests.exceptions.ConnectionError:
pass

def get_all_splash_manager_containers_name():
req = requests.get('{}/api/v1/get/splash/name/all'.format(get_splash_manager_url()), headers={"Authorization": get_splash_api_key()}, verify=False)
if req.status_code == 200:

@@ -755,6 +897,27 @@ def _restart_splash_docker(splash_port):
return req.json()
else:
print(req.json())

def api_save_splash_manager_url_api(data):
# unpack json
manager_url = data.get('url', None)
api_key = data.get('api_key', None)
if not manager_url or not api_key:
return ({'status': 'error', 'reason': 'No url or API key supplied'}, 400)
# check if is valid url
try:
result = urlparse(manager_url)
if not all([result.scheme, result.netloc]):
return ({'status': 'error', 'reason': 'Invalid url'}, 400)
except:
return ({'status': 'error', 'reason': 'Invalid url'}, 400)

# check if is valid key
if not is_valid_api_key(api_key):
return ({'status': 'error', 'reason': 'Invalid API key'}, 400)

save_splash_manager_url_api(manager_url, api_key)
return ({'url': manager_url, 'api_key': get_hidden_splash_api_key()}, 200)
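A standalone sketch of the URL check used just above (urlparse must yield both a scheme and a netloc); the URLs are made up:

    from urllib.parse import urlparse

    def is_valid_manager_url(manager_url):
        try:
            result = urlparse(manager_url)
            return all([result.scheme, result.netloc])
        except Exception:
            return False

    print(is_valid_manager_url('http://127.0.0.1:6768'))   # True
    print(is_valid_manager_url('splash-manager'))          # False -> ({'status': 'error', 'reason': 'Invalid url'}, 400)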
## -- ##

## SPLASH ##

@@ -869,13 +1032,13 @@ def get_all_proxies_metadata():
all_proxy_dict[proxy_name] = get_proxy_metadata(proxy_name)
return all_proxy_dict

def set_proxy_used_in_discovery(proxy_name, value):
r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'discovery_queue', value)
# def set_proxy_used_in_discovery(proxy_name, value):
# r_serv_onion.hset('splash:metadata:{}'.format(splash_name), 'discovery_queue', value)

def delete_proxy(proxy_name): # # TODO: force delete (delete all proxy)
proxy_splash = get_all_splash_by_proxy(proxy_name)
if proxy_splash:
print('error, a splash container is using this proxy')
#if proxy_splash:
# print('error, a splash container is using this proxy')
r_serv_onion.delete('proxy:metadata:{}'.format(proxy_name))
r_serv_onion.srem('all_proxy', proxy_name)
## -- ##

@@ -948,3 +1111,7 @@ def launch_ail_splash_crawler(splash_url, script_options=''):
#### CRAWLER PROXY ####

#### ---- ####

if __name__ == '__main__':
res = get_splash_manager_version()
print(res)
@@ -54,13 +54,13 @@ def create_json_response(data, status_code):
@login_read_only
def crawlers_dashboard():
# # TODO: get splash manager status
crawler_enabled = crawlers.ping_splash_manager()
is_manager_connected = crawlers.get_splash_manager_connection_metadata()
all_splash_crawler_status = crawlers.get_all_spash_crawler_status()
splash_crawlers_latest_stats = crawlers.get_splash_crawler_latest_stats()
date = crawlers.get_current_date()

return render_template("dashboard_splash_crawler.html", all_splash_crawler_status = all_splash_crawler_status,
crawler_enabled=crawler_enabled, date=date,
is_manager_connected=is_manager_connected, date=date,
splash_crawlers_latest_stats=splash_crawlers_latest_stats)

@crawler_splash.route("/crawlers/crawler_dashboard_json", methods=['GET'])

@@ -80,7 +80,13 @@ def crawler_dashboard_json():
def manual():
user_id = current_user.get_id()
l_cookiejar = crawlers.api_get_cookies_list_select(user_id)
return render_template("crawler_manual.html", crawler_enabled=True, l_cookiejar=l_cookiejar)
all_crawlers_types = crawlers.get_all_crawlers_queues_types()
all_splash_name = crawlers.get_all_crawlers_to_launch_splash_name()
return render_template("crawler_manual.html",
is_manager_connected=crawlers.get_splash_manager_connection_metadata(),
all_crawlers_types=all_crawlers_types,
all_splash_name=all_splash_name,
l_cookiejar=l_cookiejar)

@crawler_splash.route("/crawlers/send_to_spider", methods=['POST'])
@login_required

@@ -90,6 +96,8 @@ def send_to_spider():

# POST val
url = request.form.get('url_to_crawl')
crawler_type = request.form.get('crawler_queue_type')
splash_name = request.form.get('splash_name')
auto_crawler = request.form.get('crawler_type')
crawler_delta = request.form.get('crawler_epoch')
screenshot = request.form.get('screenshot')

@@ -98,6 +106,9 @@ def send_to_spider():
max_pages = request.form.get('max_pages')
cookiejar_uuid = request.form.get('cookiejar')

if splash_name:
crawler_type = splash_name

if cookiejar_uuid:
if cookiejar_uuid == 'None':
cookiejar_uuid = None

@@ -106,6 +117,7 @@ def send_to_spider():
cookiejar_uuid = cookiejar_uuid[-1].replace(' ', '')

res = crawlers.api_create_crawler_task(user_id, url, screenshot=screenshot, har=har, depth_limit=depth_limit, max_pages=max_pages,
crawler_type=crawler_type,
auto_crawler=auto_crawler, crawler_delta=crawler_delta, cookiejar_uuid=cookiejar_uuid)
if res:
return create_json_response(res[0], res[1])

@@ -435,16 +447,55 @@ def crawler_cookiejar_cookie_json_add_post():
def crawler_splash_setings():
all_proxies = crawlers.get_all_proxies_metadata()
all_splash = crawlers.get_all_splash_crawler_metadata()
nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch()

splash_manager_url = crawlers.get_splash_manager_url()
api_key = crawlers.get_hidden_splash_api_key()
is_manager_connected = crawlers.ping_splash_manager()
is_manager_connected = crawlers.get_splash_manager_connection_metadata(force_ping=True)
crawler_full_config = Config_DB.get_full_config_by_section('crawler')

return render_template("settings_splash_crawler.html",
is_manager_connected=is_manager_connected,
splash_manager_url=splash_manager_url, api_key=api_key,
nb_crawlers_to_launch=nb_crawlers_to_launch,
all_splash=all_splash, all_proxies=all_proxies,
crawler_full_config=crawler_full_config)

@crawler_splash.route('/crawler/settings/crawler_manager', methods=['GET', 'POST'])
@login_required
@login_admin
def crawler_splash_setings_crawler_manager():
if request.method == 'POST':
splash_manager_url = request.form.get('splash_manager_url')
api_key = request.form.get('api_key')

res = crawlers.api_save_splash_manager_url_api({'url':splash_manager_url, 'api_key':api_key})
if res[1] != 200:
return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
else:
return redirect(url_for('crawler_splash.crawler_splash_setings'))
else:
splash_manager_url = crawlers.get_splash_manager_url()
api_key = crawlers.get_splash_api_key()
return render_template("settings_edit_splash_crawler_manager.html",
splash_manager_url=splash_manager_url, api_key=api_key)

@crawler_splash.route('/crawler/settings/crawlers_to_lauch', methods=['GET', 'POST'])
@login_required
@login_admin
def crawler_splash_setings_crawlers_to_lauch():
if request.method == 'POST':
dict_splash_name = {}
for crawler_name in list(request.form):
dict_splash_name[crawler_name]= request.form.get(crawler_name)
res = crawlers.api_set_nb_crawlers_to_launch(dict_splash_name)
if res[1] != 200:
return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
else:
return redirect(url_for('crawler_splash.crawler_splash_setings'))
else:
nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch_ui()
return render_template("settings_edit_crawlers_to_launch.html",
nb_crawlers_to_launch=nb_crawlers_to_launch)

## - - ##
@@ -18,6 +18,7 @@ from flask_login import login_required

from Date import Date
from HiddenServices import HiddenServices
import crawlers

# ============ VARIABLES ============
import Flask_config

@@ -27,7 +28,6 @@ baseUrl = Flask_config.baseUrl
r_cache = Flask_config.r_cache
r_serv_onion = Flask_config.r_serv_onion
r_serv_metadata = Flask_config.r_serv_metadata
crawler_enabled = Flask_config.crawler_enabled
bootstrap_label = Flask_config.bootstrap_label

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))

@@ -244,7 +244,7 @@ def delete_auto_crawler(url):
# statDomains_regular = get_stats_last_crawled_domains('regular', date)
#
# return render_template("Crawler_dashboard.html", crawler_metadata_onion = crawler_metadata_onion,
# crawler_enabled=crawler_enabled, date=date,
# date=date,
# crawler_metadata_regular=crawler_metadata_regular,
# statDomains_onion=statDomains_onion, statDomains_regular=statDomains_regular)

@@ -288,7 +288,7 @@ def Crawler_Splash_last_by_type():
crawler_metadata = get_crawler_splash_status(type)

return render_template("Crawler_Splash_last_by_type.html", type=type, type_name=type_name,
crawler_enabled=crawler_enabled,
is_manager_connected=crawlers.get_splash_manager_connection_metadata(),
last_domains=list_domains, statDomains=statDomains,
crawler_metadata=crawler_metadata, date_from=date_string, date_to=date_string)

@@ -424,7 +424,7 @@ def auto_crawler():

return render_template("Crawler_auto.html", page=page, nb_page_max=nb_page_max,
last_domains=last_domains,
crawler_enabled=crawler_enabled,
is_manager_connected=crawlers.get_splash_manager_connection_metadata(),
auto_crawler_domain_onions_metadata=auto_crawler_domain_onions_metadata,
auto_crawler_domain_regular_metadata=auto_crawler_domain_regular_metadata)
@@ -1,6 +1,14 @@
{% if not crawler_enabled %}
{%if not is_manager_connected['status']%}
<div class="alert alert-secondary text-center my-2" role="alert">
<h1><i class="fas fa-times-circle text-danger"></i> Crawler Disabled</h1>
<p>...</p>
<p>
{%if 'error' in is_manager_connected%}
<b>{{is_manager_connected['status_code']}}</b>
<br>
<b>Error:</b> {{is_manager_connected['error']}}
{%else%}
<b>Error:</b> core/Crawler_manager not launched
{%endif%}
</p>
</div>
{% endif %}
{%endif%}
@@ -44,7 +44,31 @@
<div class="input-group" id="date-range-from">
<input type="text" class="form-control" id="url_to_crawl" name="url_to_crawl" placeholder="Address or Domain">
</div>
<div class="d-flex mt-1">
<div class="d-flex mt-2">
<i class="fas fa-spider mt-1"></i> Crawler Type
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="queue_type_selector" value="True" id="queue_type_selector">
<label class="custom-control-label" for="queue_type_selector">
<i class="fas fa-splotch"></i> Splash Name
</label>
</div>
</div>
<div id="div_crawler_queue_type">
<select class="custom-select form-control" name="crawler_queue_type" id="crawler_queue_type">
{%for crawler_type in all_crawlers_types%}
<option value="{{crawler_type}}" {%if crawler_type=='tor'%}selected{%endif%}>{{crawler_type}}</option>
{%endfor%}
</select>
</div>
<div id="div_splash_name">
<select class="custom-select form-control" name="splash_name" id="splash_name">
<option value="None" selected>Don't use a special splash crawler</option>
{%for splash_name in all_splash_name%}
<option value="{{splash_name}}">{{splash_name}}</option>
{%endfor%}
</select>
</div>
<div class="d-flex mt-3">
<i class="fas fa-user-ninja mt-1"></i> Manual
<div class="custom-control custom-switch">
<input class="custom-control-input" type="checkbox" name="crawler_type" value="True" id="crawler_type">

@@ -143,11 +167,16 @@ var chart = {};
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_manual_crawler").addClass("active");
queue_type_selector_input_controler()
manual_crawler_input_controler();

$('#crawler_type').on("change", function () {
manual_crawler_input_controler();
});

$('#queue_type_selector').on("change", function () {
queue_type_selector_input_controler();
});
});

function toggle_sidebar(){

@@ -172,4 +201,14 @@ function manual_crawler_input_controler() {
}
}

function queue_type_selector_input_controler() {
if($('#queue_type_selector').is(':checked')){
$("#div_crawler_queue_type").hide();
$("#div_splash_name").show();
}else{
$("#div_crawler_queue_type").show();
$("#div_splash_name").hide();
}
}

</script>
@@ -0,0 +1,60 @@
<!DOCTYPE html>

<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">

<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>

</head>

<body>

{% include 'nav_bar.html' %}

<div class="container-fluid">
<div class="row">

{% include 'crawler/menu_sidebar.html' %}

<div class="col-12 col-lg-10" id="core_content">

<form action="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}" method="post" enctype="multipart/form-data">
<h5 class="card-title">Number of Crawlers to Launch:</h5>
<table class="table table-sm">
<tbody>
{%for crawler_name in nb_crawlers_to_launch%}
<tr>
<td>{{crawler_name}}</td>
<td>
<input class="form-control" type="number" id="{{crawler_name}}" value="{{nb_crawlers_to_launch[crawler_name]}}" min="0" name="{{crawler_name}}" required>
</td>
</tr>
{%endfor%}
</tbody>
</table>
<button type="submit" class="btn btn-primary">Edit <i class="fas fa-pencil-alt"></i></button>
</form>

</div>
</div>
</div>

</body>

<script>
var to_refresh = false
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_settings").addClass("active");
});

</script>
@@ -0,0 +1,55 @@
<!DOCTYPE html>

<html>
<head>
<title>AIL-Framework</title>
<link rel="icon" href="{{ url_for('static', filename='image/ail-icon.png')}}">
<!-- Core CSS -->
<link href="{{ url_for('static', filename='css/bootstrap4.min.css') }}" rel="stylesheet">
<link href="{{ url_for('static', filename='css/font-awesome.min.css') }}" rel="stylesheet">

<!-- JS -->
<script src="{{ url_for('static', filename='js/jquery.js')}}"></script>
<script src="{{ url_for('static', filename='js/bootstrap4.min.js')}}"></script>

</head>

<body>

{% include 'nav_bar.html' %}

<div class="container-fluid">
<div class="row">

{% include 'crawler/menu_sidebar.html' %}

<div class="col-12 col-lg-10" id="core_content">

<form action="{{ url_for('crawler_splash.crawler_splash_setings_crawler_manager') }}" method="post" enctype="multipart/form-data">
<div class="form-group">
<label for="splash_manager_url">Splash Manager URL</label>
<input type="text" class="form-control" id="splash_manager_url" placeholder="http://splash_manager_url" name="splash_manager_url" {%if splash_manager_url%}value="{{splash_manager_url}}"{%endif%}>
</div>
<div class="form-group">
<label for="api_key">API Key</label>
<input type="text" class="form-control" id="api_key" placeholder="API Key" name="api_key" {%if api_key%}value="{{api_key}}"{%endif%}>
</div>
<button type="submit" class="btn btn-primary">Edit <i class="fas fa-pencil-alt"></i></button>
</form>

</div>
</div>
</div>

</body>

<script>
var to_refresh = false
$(document).ready(function(){
$("#page-Crawler").addClass("active");
$("#nav_settings").addClass("active");
});

</script>
@@ -26,10 +26,6 @@

<div class="col-12 col-lg-10" id="core_content">

{%if not is_manager_connected%}
{% include 'crawler/crawler_disabled.html' %}
{%endif%}

<div class="row">
<div class="col-xl-6">

@@ -45,7 +41,7 @@
<div class="card mb-3 mt-1">
<div class="card-header bg-dark text-white">
<span class="badge badge-pill badge-light flex-row-reverse float-right">
{% if is_manager_connected %}
{% if is_manager_connected['status'] %}
<div style="color:Green;">
<i class="fas fa-check-circle fa-2x"></i>
Connected

@@ -61,6 +57,10 @@
</div>
<div class="card-body">

{%if not is_manager_connected['status']%}
{% include 'crawler/crawler_disabled.html' %}
{%endif%}

<div class="row mb-3 justify-content-center">
<div class="col-xl-6">
<div class="card text-center border-secondary">

@@ -77,6 +77,13 @@
{{api_key}}
<!-- <a class="ml-3" href="/settings/new_token"><i class="fa fa-random"></i></a> -->
</td>
<td>
<a href="{{ url_for('crawler_splash.crawler_splash_setings_crawler_manager') }}">
<button type="button" class="btn btn-info">
Edit <i class="fas fa-pencil-alt"></i>
</button>
</a>
</td>
</tr>
</tbody>
</table>

@@ -85,6 +92,29 @@
</div>
</div>

<div {%if not is_manager_connected%}class="hidden"{%endif%}>

<div class="card border-secondary mb-4">
<div class="card-body text-dark">
<h5 class="card-title">Number of Crawlers to Launch:</h5>
<table class="table table-sm">
<tbody>
{%for crawler in nb_crawlers_to_launch%}
<tr>
<td>{{crawler}}</td>
<td>{{nb_crawlers_to_launch[crawler]}}</td>
</tr>
{%endfor%}
</tbody>
</table>
<a href="{{ url_for('crawler_splash.crawler_splash_setings_crawlers_to_lauch') }}">
<button type="button" class="btn btn-info">
Edit number of crawlers to launch <i class="fas fa-pencil-alt"></i>
</button>
</a>
</div>
</div>

<div class="card border-secondary mb-4">
<div class="card-body text-dark">
<h5 class="card-title">All Splash Crawlers:</h5>

@@ -202,7 +232,7 @@
</table>
</div>
</div>

</div>
</div>
</div>