fix: [crawler] fix crawler queue stats

pull/607/merge
terrtia 2024-09-17 16:52:36 +02:00
parent a20b6054e8
commit 759d241b75
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
1 changed file with 10 additions and 6 deletions

View File

@@ -1020,13 +1020,11 @@ def get_crawlers_stats(domain_type=None):
def reload_crawlers_stats():
    """Delete crawler tasks that are neither queued nor have a status.

    For every domain type returned by ``get_crawler_all_types()``, read the
    task UUIDs stored in the Redis set ``crawler:queue:type:<domain_type>``
    and call ``delete()`` on each task for which ``is_in_queue()`` is false
    and ``get_status()`` is ``None``.
    """
    for domain_type in get_crawler_all_types():
        # Fetch the members before the loop: tasks may be deleted while
        # walking them.
        tasks = r_crawler.smembers(f'crawler:queue:type:{domain_type}')
        for task_uuid in tasks:
            task = CrawlerTask(task_uuid)
            # NOTE(review): presumably task.delete() also drops the UUID from
            # the per-type set read above - confirm in CrawlerTask.delete().
            if not task.is_in_queue() and task.get_status() is None:
                task.delete()
#### Blocklist ####
@@ -1533,6 +1531,12 @@ class CrawlerTask:
def exists(self):
    """Return the result of the Redis EXISTS check on this task's key."""
    task_key = f'crawler:task:{self.uuid}'
    return r_crawler.exists(task_key)
def is_in_queue(self):
    """Return True when this task's UUID has a score in ``crawler:queue``.

    The lookup is a ZSCORE on the ``crawler:queue`` sorted set; a ``None``
    reply means the UUID is not a member, so the task is not queued.
    """
    return r_crawler.zscore('crawler:queue', self.uuid) is not None
def get_url(self):
    """Return the ``url`` field stored in this task's Redis hash."""
    url = r_crawler.hget(f'crawler:task:{self.uuid}', 'url')
    return url