fix: [crawler] fix crawler queue stats

pull/607/merge
terrtia 2024-09-17 15:36:15 +02:00
parent cc7e67d5ed
commit a20b6054e8
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
2 changed files with 12 additions and 0 deletions

View File

@ -61,6 +61,8 @@ class Crawler(AbstractModule):
crawlers.load_blacklist()
# update captures cache
crawlers.reload_crawler_captures()
# update crawler queue stats
crawlers.reload_crawlers_stats()
self.crawler_scheduler = crawlers.CrawlerScheduler()

View File

@ -1018,6 +1018,16 @@ def get_crawlers_stats(domain_type=None):
stats[domain_type] = {'queue': queue, 'up': up, 'down': down, 'crawled': crawled}
return stats
def reload_crawlers_stats():
    """Purge stale task UUIDs from every crawler queue-type set.

    For each domain type, drop queue entries whose CrawlerTask no longer
    exists, so the per-type queue stats only count live tasks.

    Returns:
        None. Side effect: removes dead members from the Redis sets
        ``crawler:queue:type:<domain_type>``.
    """
    for domain_type in get_crawler_all_types():
        queue_key = f'crawler:queue:type:{domain_type}'
        # smembers() returns a full snapshot, so we can safely collect the
        # stale UUIDs in one pass without mutating the set while iterating.
        to_remove = [task_uuid
                     for task_uuid in r_crawler.smembers(queue_key)
                     if not CrawlerTask(task_uuid).exists()]
        if to_remove:
            # Single variadic SREM: one round trip instead of one per UUID.
            r_crawler.srem(queue_key, *to_remove)
#### Blocklist ####
def get_blacklist():