From 759d241b758a7a9c3087ee158ab19d1338e5262c Mon Sep 17 00:00:00 2001
From: terrtia
Date: Tue, 17 Sep 2024 16:52:36 +0200
Subject: [PATCH] fix: [crawler] fix crawler queue stats

---
 bin/lib/crawlers.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 3a3b8bba..803c08dd 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -1020,13 +1020,11 @@ def get_crawlers_stats(domain_type=None):
 
 def reload_crawlers_stats():
     for domain_type in get_crawler_all_types():
-        to_remove = []
-        for task_uuid in r_crawler.smembers(f'crawler:queue:type:{domain_type}'):
+        tasks = r_crawler.smembers(f'crawler:queue:type:{domain_type}')
+        for task_uuid in tasks:
             task = CrawlerTask(task_uuid)
-            if not task.exists():
-                to_remove.append(task_uuid)
-        for task_uuid in to_remove:
-            r_crawler.srem(f'crawler:queue:type:{domain_type}', task_uuid)
+            if not task.is_in_queue() and task.get_status() is None:
+                task.delete()
 
 
 #### Blocklist ####
@@ -1533,6 +1531,12 @@ class CrawlerTask:
     def exists(self):
         return r_crawler.exists(f'crawler:task:{self.uuid}')
 
+    def is_in_queue(self):
+        if r_crawler.zscore('crawler:queue', self.uuid) is not None:
+            return True
+        else:
+            return False
+
     def get_url(self):
         return r_crawler.hget(f'crawler:task:{self.uuid}', 'url')
 