mirror of https://github.com/CIRCL/AIL-framework
chg: [crawler] add auto crawler functions
parent
57a69fa1f4
commit
57fbacc49c
|
@ -479,6 +479,15 @@ def is_crawler_activated():
|
||||||
def get_crawler_all_types():
    """Return the names of every crawler type handled by this module.

    A new list is built on each call, so callers may mutate the result.
    """
    crawler_types = ['onion', 'regular']
    return crawler_types
|
||||||
|
|
||||||
|
def sanitize_crawler_types(l_crawler_types):
    """Validate a selection of crawler types.

    Args:
        l_crawler_types: iterable of crawler type names, or a falsy value.

    Returns:
        ``l_crawler_types`` itself when it is non-empty and every entry is a
        type listed by ``get_crawler_all_types()``; otherwise the full list
        of crawler types.
    """
    known_types = get_crawler_all_types()
    # Empty selection or any unknown entry -> fall back to every type
    if l_crawler_types and all(c_type in known_types for c_type in l_crawler_types):
        return l_crawler_types
    return known_types
|
||||||
|
|
||||||
def get_all_spash_crawler_status():
|
def get_all_spash_crawler_status():
|
||||||
crawler_metadata = []
|
crawler_metadata = []
|
||||||
all_crawlers = r_cache.smembers('all_splash_crawlers')
|
all_crawlers = r_cache.smembers('all_splash_crawlers')
|
||||||
|
@ -600,9 +609,40 @@ def api_set_nb_crawlers_to_launch(dict_splash_name):
|
||||||
else:
|
else:
|
||||||
return ({'error':'invalid input'}, 400)
|
return ({'error':'invalid input'}, 400)
|
||||||
|
|
||||||
|
|
||||||
##-- CRAWLER GLOBAL --##
|
##-- CRAWLER GLOBAL --##
|
||||||
|
|
||||||
|
#### AUTOMATIC CRAWLER ####
|
||||||
|
|
||||||
|
def get_auto_crawler_all_domain(l_crawler_types=None):
    """Return the members of the ``auto_crawler_url:<type>`` Redis sets.

    Args:
        l_crawler_types: iterable of crawler type names. The value is first
            passed through ``sanitize_crawler_types``; the default (``None``)
            is falsy and is handed to that helper unchanged.

    Returns:
        The ``SMEMBERS`` result when exactly one type is selected, otherwise
        the ``SUNION`` result over one key per selected type.
    """
    l_crawler_types = sanitize_crawler_types(l_crawler_types)
    l_keys_name = [f'auto_crawler_url:{crawler_type}' for crawler_type in l_crawler_types]
    if len(l_keys_name) == 1:
        # The key is built from the selected type itself; the previous code
        # indexed an unbound loop variable here and raised NameError.
        return r_serv_onion.smembers(l_keys_name[0])
    return r_serv_onion.sunion(l_keys_name[0], *l_keys_name[1:])
|
||||||
|
|
||||||
|
def add_auto_crawler_in_queue(domain, domain_type, port, epoch, delta, message):
    """Schedule a crawl and record it in the recent auto-crawl list.

    Adds ``'<message>;<domain_type>'`` to the ``crawler_auto_queue`` sorted
    set with a score of ``int(time.time() + delta)``, then pushes
    ``'<domain>:<port>;<epoch>'`` onto the head of ``last_auto_crawled`` and
    trims that list to its first ten entries.

    Args:
        domain: domain written into the ``last_auto_crawled`` entry.
        domain_type: suffix appended to the queued member after ``;``.
        port: port written into the ``last_auto_crawled`` entry.
        epoch: value written after ``;`` in the ``last_auto_crawled`` entry.
        delta: offset added to the current time to compute the score.
        message: payload stored in the sorted-set member.
    """
    next_crawl_epoch = int(time.time() + delta)
    queue_member = f'{message};{domain_type}'
    # NOTE(review): positional (name, score, member) zadd form; newer redis-py
    # releases expect a mapping instead -- confirm the pinned client version.
    r_serv_onion.zadd('crawler_auto_queue', next_crawl_epoch, queue_member)

    # update list, last auto crawled domains
    last_crawled_entry = f'{domain}:{port};{epoch}'
    r_serv_onion.lpush('last_auto_crawled', last_crawled_entry)
    r_serv_onion.ltrim('last_auto_crawled', 0, 9)
|
||||||
|
|
||||||
|
def update_auto_crawler_queue():
    """Push every due auto-crawler entry onto its priority queue.

    Reads the members of the ``crawler_auto_queue`` sorted set whose score
    lies between 0 and the current Unix time, splits each member on its last
    ``;`` into ``(message, domain_type)`` and adds the message to the
    ``<domain_type>_crawler_priority_queue`` set.

    Members are not removed from ``crawler_auto_queue`` by this function.

    Returns:
        None.
    """
    # Use the real clock as cut-off. A hard-coded debug timestamp used to
    # overwrite this value, so later-scheduled entries were never selected.
    current_epoch = int(time.time())
    # check if current_epoch > domain_next_epoch
    l_queue = r_serv_onion.zrangebyscore('crawler_auto_queue', 0, current_epoch)
    for elem in l_queue:
        # assumes the client returns str members (decode_responses) -- TODO confirm
        mess, domain_type = elem.rsplit(';', 1)
        r_serv_onion.sadd(f'{domain_type}_crawler_priority_queue', mess)
|
||||||
|
|
||||||
|
|
||||||
|
##-- AUTOMATIC CRAWLER --##
|
||||||
|
|
||||||
#### CRAWLER TASK ####
|
#### CRAWLER TASK ####
|
||||||
def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
|
def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
|
||||||
|
|
||||||
|
@ -1448,10 +1488,14 @@ def test_ail_crawlers():
|
||||||
|
|
||||||
#### ---- ####
|
#### ---- ####
|
||||||
|
|
||||||
if __name__ == '__main__':
    # Manual scratch entry point: earlier experiments are kept commented out.
    # res = get_splash_manager_version()
    # res = test_ail_crawlers()
    # res = is_test_ail_crawlers_successful()
    # print(res)
    # print(get_test_ail_crawlers_message())
    #print(get_all_queues_stats())

    #res = get_auto_crawler_all_domain()
    # Run one queue update pass and show its return value
    print(update_auto_crawler_queue())
|
||||||
|
|
Loading…
Reference in New Issue