From 235539ea421f7acac78df9f732eb96903e9d9ab7 Mon Sep 17 00:00:00 2001
From: terrtia
Date: Mon, 11 Dec 2023 09:30:09 +0100
Subject: [PATCH] fix: [crawler] fix capture start time

---
 bin/crawlers/Crawler.py | 13 +++++++++++--
 bin/lib/crawlers.py     |  3 +--
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 06ebe982..fd86da8a 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -121,7 +121,9 @@ class Crawler(AbstractModule):
         if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
             task_row = crawlers.add_task_to_lacus_queue()
             if task_row:
-                task_uuid, priority = task_row
+                task, priority = task_row
+                task.start()
+                task_uuid = task.uuid
                 try:
                     self.enqueue_capture(task_uuid, priority)
                 except ConnectionError:
@@ -195,10 +197,17 @@ class Crawler(AbstractModule):
             print(task.uuid, capture_uuid, 'launched')
 
             if self.ail_to_push_discovery:
+
                 if task.get_depth() == 1 and priority < 10 and task.get_domain().endswith('.onion'):
                     har = task.get_har()
                     screenshot = task.get_screenshot()
-                    self.ail_to_push_discovery.add_crawler_capture(task_uuid, capture_uuid, url, har=har,
+                    # parent_id = task.get_parent()
+                    # if parent_id != 'manual' and parent_id != 'auto':
+                    #     parent = parent_id[19:-36]
+                    # else:
+                    #     parent = 'AIL_capture'
+
+                    self.ail_to_push_discovery.add_crawler_capture(task_uuid, capture_uuid, url, har=har,  # parent=parent,
                                                                    screenshot=screenshot, depth_limit=1, proxy='force_tor')
                     print(task.uuid, capture_uuid, 'Added to ail_to_push_discovery')
             return capture_uuid
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 101c8a33..d6ec4f1e 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -1642,8 +1642,7 @@ def add_task_to_lacus_queue():
         return None
     task_uuid, priority = task_uuid[0]
     task = CrawlerTask(task_uuid)
-    task.start()
-    return task.uuid, priority
+    return task, priority
 
 # PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
 def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,
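
In short, add_task_to_lacus_queue() now returns the CrawlerTask object itself (with its priority) instead of starting the task inside the library call, and the Crawler module calls task.start() right before enqueueing the capture, so the recorded start time matches the moment the capture is actually launched. A minimal sketch of the consumer-side flow after this patch, condensed from the hunks above (the try/except ConnectionError handling is omitted here):

    # Sketch only; names match the diff above, error handling left out.
    task_row = crawlers.add_task_to_lacus_queue()   # now yields (CrawlerTask, priority)
    if task_row:
        task, priority = task_row
        task.start()                                # start time set just before the capture is enqueued
        self.enqueue_capture(task.uuid, priority)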