fix: [crawler] fix capture start time

dev
terrtia 2023-12-11 09:30:09 +01:00
parent 5fc9b1403f
commit 235539ea42
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
2 changed files with 12 additions and 4 deletions

View File

@@ -121,7 +121,9 @@ class Crawler(AbstractModule):
if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures(): if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
task_row = crawlers.add_task_to_lacus_queue() task_row = crawlers.add_task_to_lacus_queue()
if task_row: if task_row:
task_uuid, priority = task_row task, priority = task_row
task.start()
task_uuid = task.uuid
try: try:
self.enqueue_capture(task_uuid, priority) self.enqueue_capture(task_uuid, priority)
except ConnectionError: except ConnectionError:
@@ -195,10 +197,17 @@ class Crawler(AbstractModule):
print(task.uuid, capture_uuid, 'launched') print(task.uuid, capture_uuid, 'launched')
if self.ail_to_push_discovery: if self.ail_to_push_discovery:
if task.get_depth() == 1 and priority < 10 and task.get_domain().endswith('.onion'): if task.get_depth() == 1 and priority < 10 and task.get_domain().endswith('.onion'):
har = task.get_har() har = task.get_har()
screenshot = task.get_screenshot() screenshot = task.get_screenshot()
self.ail_to_push_discovery.add_crawler_capture(task_uuid, capture_uuid, url, har=har, # parent_id = task.get_parent()
# if parent_id != 'manual' and parent_id != 'auto':
# parent = parent_id[19:-36]
# else:
# parent = 'AIL_capture'
self.ail_to_push_discovery.add_crawler_capture(task_uuid, capture_uuid, url, har=har, # parent=parent,
screenshot=screenshot, depth_limit=1, proxy='force_tor') screenshot=screenshot, depth_limit=1, proxy='force_tor')
print(task.uuid, capture_uuid, 'Added to ail_to_push_discovery') print(task.uuid, capture_uuid, 'Added to ail_to_push_discovery')
return capture_uuid return capture_uuid

View File

@@ -1642,8 +1642,7 @@ def add_task_to_lacus_queue():
return None return None
task_uuid, priority = task_uuid[0] task_uuid, priority = task_uuid[0]
task = CrawlerTask(task_uuid) task = CrawlerTask(task_uuid)
task.start() return task, priority
return task.uuid, priority
# PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100 # PRIORITY: discovery = 0/10, feeder = 10, manual = 50, auto = 40, test = 100
def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None, def create_task(url, depth=1, har=True, screenshot=True, header=None, cookiejar=None, proxy=None,