mirror of https://github.com/CIRCL/AIL-framework
fix: [crawler] add timeout to Unknown captures
parent
8f0e7f1434
commit
c719990125
|
@ -99,7 +99,7 @@ class Crawler(AbstractModule):
|
||||||
self.crawler_scheduler.update_queue()
|
self.crawler_scheduler.update_queue()
|
||||||
self.crawler_scheduler.process_queue()
|
self.crawler_scheduler.process_queue()
|
||||||
|
|
||||||
self.refresh_lacus_status() # TODO LOG ERROR
|
self.refresh_lacus_status() # TODO LOG ERROR
|
||||||
if not self.is_lacus_up:
|
if not self.is_lacus_up:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -122,11 +122,19 @@ class Crawler(AbstractModule):
|
||||||
if capture:
|
if capture:
|
||||||
try:
|
try:
|
||||||
status = self.lacus.get_capture_status(capture.uuid)
|
status = self.lacus.get_capture_status(capture.uuid)
|
||||||
if status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time ### print start time
|
if status == crawlers.CaptureStatus.DONE:
|
||||||
|
return capture
|
||||||
|
elif status == crawlers.CaptureStatus.UNKNOWN:
|
||||||
|
capture_start = capture.get_start_time(r_str=False)
|
||||||
|
if int(time.time()) - capture_start > 600: # TODO ADD in new crawler config
|
||||||
|
task = capture.get_task()
|
||||||
|
task.reset()
|
||||||
|
capture.delete()
|
||||||
|
else:
|
||||||
|
capture.update(status)
|
||||||
|
else:
|
||||||
capture.update(status)
|
capture.update(status)
|
||||||
print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
|
print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
|
||||||
else:
|
|
||||||
return capture
|
|
||||||
|
|
||||||
except ConnectionError:
|
except ConnectionError:
|
||||||
print(capture.uuid)
|
print(capture.uuid)
|
||||||
|
|
|
@ -134,7 +134,7 @@ def unpack_url(url):
|
||||||
# # # # # # # # TODO CREATE NEW OBJECT
|
# # # # # # # # TODO CREATE NEW OBJECT
|
||||||
|
|
||||||
def get_favicon_from_html(html, domain, url):
|
def get_favicon_from_html(html, domain, url):
|
||||||
favicon_urls = extract_favicon_from_html(html, url)
|
favicon_urls, favicons = extract_favicon_from_html(html, url)
|
||||||
# add root favicon
|
# add root favicon
|
||||||
if not favicon_urls:
|
if not favicon_urls:
|
||||||
favicon_urls.add(f'{urlparse(url).scheme}://{domain}/favicon.ico')
|
favicon_urls.add(f'{urlparse(url).scheme}://{domain}/favicon.ico')
|
||||||
|
@ -162,7 +162,6 @@ def extract_favicon_from_html(html, url):
|
||||||
# - <meta name="msapplication-TileColor" content="#aaaaaa"> <meta name="theme-color" content="#ffffff">
|
# - <meta name="msapplication-TileColor" content="#aaaaaa"> <meta name="theme-color" content="#ffffff">
|
||||||
# - <meta name="msapplication-config" content="/icons/browserconfig.xml">
|
# - <meta name="msapplication-config" content="/icons/browserconfig.xml">
|
||||||
|
|
||||||
|
|
||||||
# Root Favicon
|
# Root Favicon
|
||||||
f = get_faup()
|
f = get_faup()
|
||||||
f.decode(url)
|
f.decode(url)
|
||||||
|
@ -244,13 +243,6 @@ def extract_description_from_html(html):
|
||||||
return description['content']
|
return description['content']
|
||||||
return ''
|
return ''
|
||||||
|
|
||||||
def extract_description_from_html(html):
|
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
|
||||||
description = soup.find('meta', attrs={'name': 'description'})
|
|
||||||
if description:
|
|
||||||
return description['content']
|
|
||||||
return ''
|
|
||||||
|
|
||||||
def extract_keywords_from_html(html):
|
def extract_keywords_from_html(html):
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
keywords = soup.find('meta', attrs={'name': 'keywords'})
|
keywords = soup.find('meta', attrs={'name': 'keywords'})
|
||||||
|
@ -347,7 +339,7 @@ def _reprocess_all_hars_cookie_name():
|
||||||
cookie = CookiesNames.create(cookie_name)
|
cookie = CookiesNames.create(cookie_name)
|
||||||
cookie.add(date, domain)
|
cookie.add(date, domain)
|
||||||
|
|
||||||
def extract_etag_from_har(har): # TODO check response url
|
def extract_etag_from_har(har): # TODO check response url
|
||||||
etags = set()
|
etags = set()
|
||||||
for entrie in har.get('log', {}).get('entries', []):
|
for entrie in har.get('log', {}).get('entries', []):
|
||||||
for header in entrie.get('response', {}).get('headers', []):
|
for header in entrie.get('response', {}).get('headers', []):
|
||||||
|
@ -686,8 +678,7 @@ class Cookie:
|
||||||
meta[field] = value
|
meta[field] = value
|
||||||
if r_json:
|
if r_json:
|
||||||
data = json.dumps(meta, indent=4, sort_keys=True)
|
data = json.dumps(meta, indent=4, sort_keys=True)
|
||||||
meta = {'data': data}
|
meta = {'data': data, 'uuid': self.uuid}
|
||||||
meta['uuid'] = self.uuid
|
|
||||||
return meta
|
return meta
|
||||||
|
|
||||||
def edit(self, cookie_dict):
|
def edit(self, cookie_dict):
|
||||||
|
@ -799,7 +790,7 @@ def unpack_imported_json_cookie(json_cookie):
|
||||||
|
|
||||||
## - - ##
|
## - - ##
|
||||||
#### COOKIEJAR API ####
|
#### COOKIEJAR API ####
|
||||||
def api_import_cookies_from_json(user_id, cookiejar_uuid, json_cookies_str): # # TODO: add catch
|
def api_import_cookies_from_json(user_id, cookiejar_uuid, json_cookies_str): # # TODO: add catch
|
||||||
resp = api_verify_cookiejar_acl(cookiejar_uuid, user_id)
|
resp = api_verify_cookiejar_acl(cookiejar_uuid, user_id)
|
||||||
if resp:
|
if resp:
|
||||||
return resp
|
return resp
|
||||||
|
@ -968,8 +959,8 @@ class CrawlerScheduler:
|
||||||
minutes = 0
|
minutes = 0
|
||||||
current_time = datetime.now().timestamp()
|
current_time = datetime.now().timestamp()
|
||||||
time_next_run = (datetime.now() + relativedelta(months=int(months), weeks=int(weeks),
|
time_next_run = (datetime.now() + relativedelta(months=int(months), weeks=int(weeks),
|
||||||
days=int(days), hours=int(hours),
|
days=int(days), hours=int(hours),
|
||||||
minutes=int(minutes))).timestamp()
|
minutes=int(minutes))).timestamp()
|
||||||
# Make sure the next capture is not scheduled at too short an interval
|
# Make sure the next capture is not scheduled at too short an interval
|
||||||
interval_next_capture = time_next_run - current_time
|
interval_next_capture = time_next_run - current_time
|
||||||
if interval_next_capture < self.min_frequency:
|
if interval_next_capture < self.min_frequency:
|
||||||
|
@ -1249,8 +1240,13 @@ class CrawlerCapture:
|
||||||
if task_uuid:
|
if task_uuid:
|
||||||
return CrawlerTask(task_uuid)
|
return CrawlerTask(task_uuid)
|
||||||
|
|
||||||
def get_start_time(self):
|
def get_start_time(self, r_str=True):
|
||||||
return self.get_task().get_start_time()
|
start_time = self.get_task().get_start_time()
|
||||||
|
if r_str:
|
||||||
|
return start_time
|
||||||
|
else:
|
||||||
|
start_time = datetime.strptime(start_time, "%Y/%m/%d - %H:%M.%S").timestamp()
|
||||||
|
return int(start_time)
|
||||||
|
|
||||||
def get_status(self):
|
def get_status(self):
|
||||||
status = r_cache.hget(f'crawler:capture:{self.uuid}', 'status')
|
status = r_cache.hget(f'crawler:capture:{self.uuid}', 'status')
|
||||||
|
@ -1517,6 +1513,11 @@ class CrawlerTask:
|
||||||
def start(self):
|
def start(self):
|
||||||
self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
priority = 49
|
||||||
|
r_crawler.hdel(f'crawler:task:{self.uuid}', 'start_time')
|
||||||
|
self.add_to_db_crawler_queue(priority)
|
||||||
|
|
||||||
# Crawler
|
# Crawler
|
||||||
def remove(self): # zrem cache + DB
|
def remove(self): # zrem cache + DB
|
||||||
capture_uuid = self.get_capture()
|
capture_uuid = self.get_capture()
|
||||||
|
@ -1727,13 +1728,13 @@ class CrawlerProxy:
|
||||||
self.uuid = proxy_uuid
|
self.uuid = proxy_uuid
|
||||||
|
|
||||||
def get_description(self):
|
def get_description(self):
|
||||||
return r_crawler.hgrt(f'crawler:proxy:{self.uuif}', 'description')
|
return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'description')
|
||||||
|
|
||||||
# Host
|
# Host
|
||||||
# Port
|
# Port
|
||||||
# Type -> need test
|
# Type -> need test
|
||||||
def get_url(self):
|
def get_url(self):
|
||||||
return r_crawler.hgrt(f'crawler:proxy:{self.uuif}', 'url')
|
return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'url')
|
||||||
|
|
||||||
#### CRAWLER LACUS ####
|
#### CRAWLER LACUS ####
|
||||||
|
|
||||||
|
@ -1816,7 +1817,7 @@ def api_save_lacus_url_key(data):
|
||||||
# unpack json
|
# unpack json
|
||||||
manager_url = data.get('url', None)
|
manager_url = data.get('url', None)
|
||||||
api_key = data.get('api_key', None)
|
api_key = data.get('api_key', None)
|
||||||
if not manager_url: # or not api_key:
|
if not manager_url: # or not api_key:
|
||||||
return {'status': 'error', 'reason': 'No url or API key supplied'}, 400
|
return {'status': 'error', 'reason': 'No url or API key supplied'}, 400
|
||||||
# check if is valid url
|
# check if is valid url
|
||||||
try:
|
try:
|
||||||
|
@ -1859,7 +1860,7 @@ def api_set_crawler_max_captures(data):
|
||||||
save_nb_max_captures(nb_captures)
|
save_nb_max_captures(nb_captures)
|
||||||
return nb_captures, 200
|
return nb_captures, 200
|
||||||
|
|
||||||
## TEST ##
|
## TEST ##
|
||||||
|
|
||||||
def is_test_ail_crawlers_successful():
|
def is_test_ail_crawlers_successful():
|
||||||
return r_db.hget('crawler:tor:test', 'success') == 'True'
|
return r_db.hget('crawler:tor:test', 'success') == 'True'
|
||||||
|
|
Loading…
Reference in New Issue