mirror of https://github.com/CIRCL/AIL-framework
fix: [crawler] crawler capture with empty task
parent
907f370b29
commit
7b66ff6a8c
|
@ -430,7 +430,6 @@ def _reprocess_all_hars_hhhashs():
|
||||||
extract_hhhash_by_id(har_id, domain, date)
|
extract_hhhash_by_id(har_id, domain, date)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _gzip_har(har_id):
|
def _gzip_har(har_id):
|
||||||
har_path = os.path.join(HAR_DIR, har_id)
|
har_path = os.path.join(HAR_DIR, har_id)
|
||||||
new_id = f'{har_path}.gz'
|
new_id = f'{har_path}.gz'
|
||||||
|
@ -1436,8 +1435,8 @@ class CrawlerCapture:
|
||||||
launch_time = int(time.time())
|
launch_time = int(time.time())
|
||||||
r_crawler.hset(f'crawler:task:{task_uuid}', 'capture', self.uuid)
|
r_crawler.hset(f'crawler:task:{task_uuid}', 'capture', self.uuid)
|
||||||
r_crawler.hset('crawler:captures:tasks', self.uuid, task_uuid)
|
r_crawler.hset('crawler:captures:tasks', self.uuid, task_uuid)
|
||||||
r_crawler.zadd('crawler:captures', {self.uuid: launch_time})
|
|
||||||
r_cache.hset(f'crawler:capture:{self.uuid}', 'launch_time', launch_time)
|
r_cache.hset(f'crawler:capture:{self.uuid}', 'launch_time', launch_time)
|
||||||
|
r_crawler.zadd('crawler:captures', {self.uuid: launch_time})
|
||||||
r_cache.zadd('crawler:captures', {self.uuid: launch_time})
|
r_cache.zadd('crawler:captures', {self.uuid: launch_time})
|
||||||
|
|
||||||
def update(self, status):
|
def update(self, status):
|
||||||
|
@ -1481,15 +1480,24 @@ def get_captures_status():
|
||||||
for capture_uuid in get_crawler_captures():
|
for capture_uuid in get_crawler_captures():
|
||||||
capture = CrawlerCapture(capture_uuid)
|
capture = CrawlerCapture(capture_uuid)
|
||||||
task = capture.get_task()
|
task = capture.get_task()
|
||||||
domain = task.get_domain()
|
if not task:
|
||||||
dom = Domain(domain)
|
meta = {
|
||||||
meta = {
|
'uuid': 'UNKNOWN',
|
||||||
'uuid': task.uuid,
|
'domain': 'UNKNOWN',
|
||||||
'domain': dom.get_id(),
|
'type': 'UNKNOWN',
|
||||||
'type': dom.get_domain_type(),
|
'start_time': capture.get_start_time(),
|
||||||
'start_time': capture.get_start_time(),
|
'status': capture.get_status(),
|
||||||
'status': capture.get_status(),
|
}
|
||||||
}
|
else:
|
||||||
|
domain = task.get_domain()
|
||||||
|
dom = Domain(domain)
|
||||||
|
meta = {
|
||||||
|
'uuid': task.uuid,
|
||||||
|
'domain': dom.get_id(),
|
||||||
|
'type': dom.get_domain_type(),
|
||||||
|
'start_time': capture.get_start_time(),
|
||||||
|
'status': capture.get_status(),
|
||||||
|
}
|
||||||
capture_status = capture.get_status()
|
capture_status = capture.get_status()
|
||||||
if capture_status:
|
if capture_status:
|
||||||
capture_status = CaptureStatus(int(capture_status)).name
|
capture_status = CaptureStatus(int(capture_status)).name
|
||||||
|
@ -1502,7 +1510,7 @@ def delete_captures():
|
||||||
capture = CrawlerCapture(capture_uuid)
|
capture = CrawlerCapture(capture_uuid)
|
||||||
capture.delete()
|
capture.delete()
|
||||||
|
|
||||||
##-- CRAWLER STATE --##
|
## --CRAWLER STATE-- ##
|
||||||
|
|
||||||
|
|
||||||
#### CRAWLER TASK ####
|
#### CRAWLER TASK ####
|
||||||
|
@ -1848,13 +1856,13 @@ def api_add_crawler_task(data, user_org, user_id=None):
|
||||||
if frequency:
|
if frequency:
|
||||||
# TODO verify user
|
# TODO verify user
|
||||||
task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
task_uuid = create_schedule(frequency, user_id, url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||||
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
|
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags)
|
||||||
else:
|
else:
|
||||||
# TODO HEADERS
|
# TODO HEADERS
|
||||||
# TODO USER AGENT
|
# TODO USER AGENT
|
||||||
task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
task_uuid = create_task(url, depth=depth_limit, har=har, screenshot=screenshot, header=None,
|
||||||
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
|
cookiejar=cookiejar_uuid, proxy=proxy, user_agent=None, tags=tags,
|
||||||
parent='manual', priority=90)
|
parent='manual', priority=90)
|
||||||
|
|
||||||
return {'uuid': task_uuid}, 200
|
return {'uuid': task_uuid}, 200
|
||||||
|
|
||||||
|
@ -1919,10 +1927,10 @@ def create_item_id(item_dir, domain):
|
||||||
# remove /
|
# remove /
|
||||||
domain = domain.replace('/', '_')
|
domain = domain.replace('/', '_')
|
||||||
if len(domain) > 215:
|
if len(domain) > 215:
|
||||||
UUID = domain[-215:]+str(uuid.uuid4())
|
n_uuid = domain[-215:]+str(uuid.uuid4())
|
||||||
else:
|
else:
|
||||||
UUID = domain+str(uuid.uuid4())
|
n_uuid = domain+str(uuid.uuid4())
|
||||||
return os.path.join(item_dir, UUID)
|
return os.path.join(item_dir, n_uuid)
|
||||||
|
|
||||||
# # # # # # # # # # # #
|
# # # # # # # # # # # #
|
||||||
# #
|
# #
|
||||||
|
@ -2160,15 +2168,15 @@ def test_ail_crawlers():
|
||||||
load_blacklist()
|
load_blacklist()
|
||||||
|
|
||||||
# if __name__ == '__main__':
|
# if __name__ == '__main__':
|
||||||
# delete_captures()
|
# delete_captures()
|
||||||
|
#
|
||||||
# item_id = 'crawled/2023/02/20/data.gz'
|
# item_id = 'crawled/2023/02/20/data.gz'
|
||||||
# item = Item(item_id)
|
# item = Item(item_id)
|
||||||
# content = item.get_content()
|
# content = item.get_content()
|
||||||
# temp_url = ''
|
# temp_url = ''
|
||||||
# r = extract_favicon_from_html(content, temp_url)
|
# r = extract_favicon_from_html(content, temp_url)
|
||||||
# print(r)
|
# print(r)
|
||||||
# _reprocess_all_hars_cookie_name()
|
# _reprocess_all_hars_cookie_name()
|
||||||
# _reprocess_all_hars_etag()
|
# _reprocess_all_hars_etag()
|
||||||
# _gzip_all_hars()
|
# _gzip_all_hars()
|
||||||
# _reprocess_all_hars_hhhashs()
|
# _reprocess_all_hars_hhhashs()
|
||||||
|
|
Loading…
Reference in New Issue