From 57fbacc49cb82f9578bd58d54e998634aec131dd Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Thu, 14 Oct 2021 14:23:11 +0200
Subject: [PATCH] chg: [crawler] add auto crawler functions

---
 bin/lib/crawlers.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 45 insertions(+), 2 deletions(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 23ba0ae1..c035b8bd 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -479,6 +479,15 @@ def is_crawler_activated():
 def get_crawler_all_types():
     return ['onion', 'regular']
 
+def sanitize_crawler_types(l_crawler_types):
+    all_crawler_types = get_crawler_all_types()
+    if not l_crawler_types:
+        return all_crawler_types
+    for crawler_type in l_crawler_types:
+        if crawler_type not in all_crawler_types:
+            return all_crawler_types
+    return l_crawler_types
+
 def get_all_spash_crawler_status():
     crawler_metadata = []
     all_crawlers = r_cache.smembers('all_splash_crawlers')
@@ -600,9 +609,39 @@ def api_set_nb_crawlers_to_launch(dict_splash_name):
     else:
         return ({'error':'invalid input'}, 400)
 
-
 ##-- CRAWLER GLOBAL --##
 
+#### AUTOMATIC CRAWLER ####
+
+def get_auto_crawler_all_domain(l_crawler_types=[]):
+    l_crawler_types = sanitize_crawler_types(l_crawler_types)
+    if len(l_crawler_types) == 1:
+        return r_serv_onion.smembers(f'auto_crawler_url:{l_crawler_types[0]}')
+    else:
+        l_keys_name = []
+        for crawler_type in l_crawler_types:
+            l_keys_name.append(f'auto_crawler_url:{crawler_type}')
+        return r_serv_onion.sunion(*l_keys_name)
+
+def add_auto_crawler_in_queue(domain, domain_type, port, epoch, delta, message):
+    # schedule the next crawl: score = epoch of the next run
+    r_serv_onion.zadd('crawler_auto_queue', int(time.time() + delta), f'{message};{domain_type}')
+    # update the list of last auto crawled domains (keep the 10 most recent)
+    r_serv_onion.lpush('last_auto_crawled', f'{domain}:{port};{epoch}')
+    r_serv_onion.ltrim('last_auto_crawled', 0, 9)
+
+def update_auto_crawler_queue():
+    current_epoch = int(time.time())
+    # requeue all domains with domain_next_epoch <= current_epoch
+    l_queue = r_serv_onion.zrangebyscore('crawler_auto_queue', 0, current_epoch)
+    for elem in l_queue:
+        mess, domain_type = elem.rsplit(';', 1)
+        r_serv_onion.sadd(f'{domain_type}_crawler_priority_queue', mess)
+    return l_queue
+
+
+##-- AUTOMATIC CRAWLER --##
+
 #### CRAWLER TASK ####
 
 def create_crawler_task(url, screenshot=True, har=True, depth_limit=1, max_pages=100, auto_crawler=False, crawler_delta=3600, crawler_type=None, cookiejar_uuid=None, user_agent=None):
@@ -1448,10 +1487,14 @@ def test_ail_crawlers():
 
 #### ---- ####
 
-#if __name__ == '__main__':
+if __name__ == '__main__':
 #    res = get_splash_manager_version()
 #    res = test_ail_crawlers()
 #    res = is_test_ail_crawlers_successful()
 #    print(res)
 #    print(get_test_ail_crawlers_message())
     #print(get_all_queues_stats())
+
+    #res = get_auto_crawler_all_domain()
+    res = update_auto_crawler_queue()
+    print(res)
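
Note on the new helpers: crawler_auto_queue is used as a standard Redis delay
queue. add_auto_crawler_in_queue() stores each pending domain in a sorted set
whose score is the epoch of its next scheduled crawl, and
update_auto_crawler_queue() moves every member whose score is already in the
past into the matching {domain_type}_crawler_priority_queue set. The sketch
below replays that round trip in isolation; it is a minimal illustration, not
AIL API: it assumes a reachable local Redis and redis-py >= 3.0 (whose zadd()
takes a {member: score} mapping rather than the positional score/member form
used in the patch), and the names schedule_crawl and requeue_due are invented
for the example.

    import time

    import redis

    # decode_responses=True returns members as str, as rsplit(';', 1) expects
    r = redis.Redis(host='localhost', port=6379, decode_responses=True)

    def schedule_crawl(message, domain_type, delta):
        # score = epoch of the next run; the member encodes 'message;domain_type'
        r.zadd('crawler_auto_queue',
               {f'{message};{domain_type}': int(time.time() + delta)})

    def requeue_due():
        # every member whose scheduled epoch is already in the past is due
        due = r.zrangebyscore('crawler_auto_queue', 0, int(time.time()))
        for elem in due:
            mess, domain_type = elem.rsplit(';', 1)
            r.sadd(f'{domain_type}_crawler_priority_queue', mess)
        return due

    schedule_crawl('http://example.onion', 'onion', delta=0)
    print(requeue_due())  # ['http://example.onion;onion']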
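
Two behaviours worth noting, inferred from the code rather than stated
anywhere: update_auto_crawler_queue() never removes due members from the
sorted set, so a domain keeps matching zrangebyscore() until the crawler
re-schedules it through add_auto_crawler_in_queue(), which refreshes the
member's score; the repeated SADD calls in the meantime are harmless because
set membership is idempotent. The LPUSH/LTRIM pair on last_auto_crawled is
the usual Redis idiom for a capped recent-history list, here limited to the
10 most recent auto crawled domains.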