From 57fbacc49cb82f9578bd58d54e998634aec131dd Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Thu, 14 Oct 2021 14:23:11 +0200
Subject: [PATCH] chg: [crawler] add auto crawler functions

---
 bin/lib/crawlers.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 23ba0ae1..c035b8bd 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -479,6 +479,15 @@ def is_crawler_activated():
 def get_crawler_all_types():
     return ['onion', 'regular']
 
+def sanitize_crawler_types(l_crawler_types):
+    all_crawler_types = get_crawler_all_types()
+    if not l_crawler_types:
+        return all_crawler_types
+    for crawler_type in l_crawler_types:
+        if crawler_type not in all_crawler_types:
+            return all_crawler_types
+    return l_crawler_types
+
 def get_all_spash_crawler_status():
     crawler_metadata = []
     all_crawlers = r_cache.smembers('all_splash_crawlers')
@@ -600,9 +609,39 @@ def api_set_nb_crawlers_to_launch(dict_splash_name):
     else:
         return ({'error':'invalid input'}, 400)
 
-
 ##-- CRAWLER GLOBAL --##
 
+#### AUTOMATIC CRAWLER ####
+
+def get_auto_crawler_all_domain(l_crawler_types=[]):
+    l_crawler_types = sanitize_crawler_types(l_crawler_types)
+    if len(l_crawler_types) == 1:
+        return r_serv_onion.smembers(f'auto_crawler_url:{l_crawler_types[0]}')
+    else:
+        l_keys_name = []
+        for crawler_type in l_crawler_types:
+            l_keys_name.append(f'auto_crawler_url:{crawler_type}')
+        return r_serv_onion.sunion(*l_keys_name)
+
+def add_auto_crawler_in_queue(domain, domain_type, port, epoch, delta, message):
+    # schedule the next crawl: score = epoch of the next run
+    r_serv_onion.zadd('crawler_auto_queue', int(time.time() + delta), f'{message};{domain_type}')
+    # update the list of last auto crawled domains (keep the 10 most recent)
+    r_serv_onion.lpush('last_auto_crawled', f'{domain}:{port};{epoch}')
+    r_serv_onion.ltrim('last_auto_crawled', 0, 9)
+
+def update_auto_crawler_queue():
+    current_epoch = int(time.time())
+    # requeue all domains with domain_next_epoch <= current_epoch
+    l_queue = r_serv_onion.zrangebyscore('crawler_auto_queue', 0, current_epoch)
+    for elem in l_queue:
+        mess, domain_type = elem.rsplit(';', 1)
+        r_serv_onion.sadd(f'{domain_type}_crawler_priority_queue', mess)
+    return l_queue
+
+
+##-- AUTOMATIC CRAWLER --##
+
 #### CRAWLER TASK ####
 
 def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
@@ -1448,10 +1487,14 @@ def test_ail_crawlers():
 
 #### ---- ####
 
-#if __name__ == '__main__':
+if __name__ == '__main__':
 #    res = get_splash_manager_version()
 #    res = test_ail_crawlers()
 #    res = is_test_ail_crawlers_successful()
 #    print(res)
 #    print(get_test_ail_crawlers_message())
     #print(get_all_queues_stats())
+
+    #res = get_auto_crawler_all_domain()
+    res = update_auto_crawler_queue()
+    print(res)
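
Note on the new helpers: crawler_auto_queue is used as a standard Redis delay
queue. add_auto_crawler_in_queue() stores each pending domain in a sorted set
whose score is the epoch of its next scheduled crawl, and
update_auto_crawler_queue() moves every member whose score is already in the
past into the matching {domain_type}_crawler_priority_queue set. The sketch
below replays that round trip in isolation; it is a minimal illustration, not
AIL API: it assumes a reachable local Redis and redis-py >= 3.0 (whose zadd()
takes a {member: score} mapping rather than the positional score/member form
used in the patch), and the names schedule_crawl and requeue_due are invented
for the example.

    import time

    import redis

    # decode_responses=True returns members as str, as rsplit(';', 1) expects
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)

    def schedule_crawl(message, domain_type, delta):
        # score = epoch of the next run; the member encodes 'message;domain_type'
        r.zadd('crawler_auto_queue',
               {f'{message};{domain_type}': int(time.time() + delta)})

    def requeue_due():
        # every member whose scheduled epoch is already in the past is due
        due = r.zrangebyscore('crawler_auto_queue', 0, int(time.time()))
        for elem in due:
            mess, domain_type = elem.rsplit(';', 1)
            r.sadd(f'{domain_type}_crawler_priority_queue', mess)
        return due

    schedule_crawl('http://example.onion', 'onion', delta=0)
    print(requeue_due())  # ['http://example.onion;onion']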
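
Two behaviours worth noting, inferred from the code rather than stated
anywhere: update_auto_crawler_queue() never removes due members from the
sorted set, so a domain keeps matching zrangebyscore() until the crawler
re-schedules it through add_auto_crawler_in_queue(), which refreshes the
member's score; the repeated SADD calls in the meantime are harmless because
set membership is idempotent. The LPUSH/LTRIM pair on last_auto_crawled is
the usual Redis idiom for a capped recent-history list, here limited to the
10 most recent auto crawled domains.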