From c71999012556a8f88020e856641d8686e8f32698 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 10 Jul 2023 11:23:44 +0200
Subject: [PATCH] fix: [crawler] add timeout to Unknown captures

---
 bin/crawlers/Crawler.py | 16 ++++++++++++----
 bin/lib/crawlers.py     | 43 ++++++++++++++++++++++---------------------
 2 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/bin/crawlers/Crawler.py b/bin/crawlers/Crawler.py
index 2b2f35c6..db1fc0fe 100755
--- a/bin/crawlers/Crawler.py
+++ b/bin/crawlers/Crawler.py
@@ -99,7 +99,7 @@ class Crawler(AbstractModule):
         self.crawler_scheduler.update_queue()
         self.crawler_scheduler.process_queue()
 
-        self.refresh_lacus_status() # TODO LOG ERROR
+        self.refresh_lacus_status()  # TODO LOG ERROR
         if not self.is_lacus_up:
             return None
 
@@ -122,11 +122,19 @@ class Crawler(AbstractModule):
         if capture:
             try:
                 status = self.lacus.get_capture_status(capture.uuid)
-                if status != crawlers.CaptureStatus.DONE: # TODO ADD GLOBAL TIMEOUT-> Save start time ### print start time
+                if status == crawlers.CaptureStatus.DONE:
+                    return capture
+                elif status == crawlers.CaptureStatus.UNKNOWN:
+                    capture_start = capture.get_start_time(r_str=False)
+                    if int(time.time()) - capture_start > 600: # TODO ADD in new crawler config
+                        task = capture.get_task()
+                        task.reset()
+                        capture.delete()
+                    else:
+                        capture.update(status)
+                else:
                     capture.update(status)
                     print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
-                else:
-                    return capture
             except ConnectionError:
                 print(capture.uuid)
 
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 90e18fe4..406d9ccf 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -134,7 +134,7 @@ def unpack_url(url):
 
 # # # # # # # # TODO CREATE NEW OBJECT
 def get_favicon_from_html(html, domain, url):
-    favicon_urls = extract_favicon_from_html(html, url)
+    favicon_urls, favicons = extract_favicon_from_html(html, url)
     # add root favicon
     if not favicon_urls:
         favicon_urls.add(f'{urlparse(url).scheme}://{domain}/favicon.ico')
@@ -162,7 +162,6 @@ def extract_favicon_from_html(html, url):
 
     # -
    # -
-
     # Root Favicon
     f = get_faup()
     f.decode(url)
@@ -244,13 +243,6 @@ def extract_description_from_html(html):
         return description['content']
     return ''
 
-def extract_description_from_html(html):
-    soup = BeautifulSoup(html, 'html.parser')
-    description = soup.find('meta', attrs={'name': 'description'})
-    if description:
-        return description['content']
-    return ''
-
 def extract_keywords_from_html(html):
     soup = BeautifulSoup(html, 'html.parser')
     keywords = soup.find('meta', attrs={'name': 'keywords'})
@@ -347,7 +339,7 @@ def _reprocess_all_hars_cookie_name():
             cookie = CookiesNames.create(cookie_name)
             cookie.add(date, domain)
 
-def extract_etag_from_har(har): # TODO check response url
+def extract_etag_from_har(har):  # TODO check response url
     etags = set()
     for entrie in har.get('log', {}).get('entries', []):
         for header in entrie.get('response', {}).get('headers', []):
@@ -686,8 +678,7 @@ class Cookie:
                 meta[field] = value
         if r_json:
             data = json.dumps(meta, indent=4, sort_keys=True)
-            meta = {'data': data}
-            meta['uuid'] = self.uuid
+            meta = {'data': data, 'uuid': self.uuid}
         return meta
 
     def edit(self, cookie_dict):
@@ -799,7 +790,7 @@ def unpack_imported_json_cookie(json_cookie):
     ## - - ##
 
 #### COOKIEJAR API ####
-def api_import_cookies_from_json(user_id, cookiejar_uuid, json_cookies_str): # # TODO: add catch
+def api_import_cookies_from_json(user_id, cookiejar_uuid, json_cookies_str):  # # TODO: add catch
     resp = api_verify_cookiejar_acl(cookiejar_uuid, user_id)
     if resp:
         return resp
@@ -968,8 +959,8 @@ class CrawlerScheduler:
                 minutes = 0
             current_time = datetime.now().timestamp()
             time_next_run = (datetime.now() + relativedelta(months=int(months), weeks=int(weeks),
-                                                        days=int(days), hours=int(hours),
-                                                        minutes=int(minutes))).timestamp()
+                                                            days=int(days), hours=int(hours),
+                                                            minutes=int(minutes))).timestamp()
             # Make sure the next capture is not scheduled for in a too short interval
             interval_next_capture = time_next_run - current_time
             if interval_next_capture < self.min_frequency:
@@ -1249,8 +1240,13 @@ class CrawlerCapture:
         if task_uuid:
             return CrawlerTask(task_uuid)
 
-    def get_start_time(self):
-        return self.get_task().get_start_time()
+    def get_start_time(self, r_str=True):
+        start_time = self.get_task().get_start_time()
+        if r_str:
+            return start_time
+        else:
+            start_time = datetime.strptime(start_time, "%Y/%m/%d - %H:%M.%S").timestamp()
+            return int(start_time)
 
     def get_status(self):
         status = r_cache.hget(f'crawler:capture:{self.uuid}', 'status')
@@ -1517,6 +1513,11 @@ class CrawlerTask:
     def start(self):
         self._set_field('start_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
 
+    def reset(self):
+        priority = 49
+        r_crawler.hdel(f'crawler:task:{self.uuid}', 'start_time')
+        self.add_to_db_crawler_queue(priority)
+
     # Crawler
     def remove(self):  # zrem cache + DB
         capture_uuid = self.get_capture()
@@ -1727,13 +1728,13 @@ class CrawlerProxy:
         self.uuid = proxy_uuid
 
     def get_description(self):
-        return r_crawler.hgrt(f'crawler:proxy:{self.uuif}', 'description')
+        return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'description')
 
     # Host
     # Port
     # Type -> need test
     def get_url(self):
-        return r_crawler.hgrt(f'crawler:proxy:{self.uuif}', 'url')
+        return r_crawler.hget(f'crawler:proxy:{self.uuid}', 'url')
 
 
 #### CRAWLER LACUS ####
@@ -1816,7 +1817,7 @@ def api_save_lacus_url_key(data):
     # unpack json
     manager_url = data.get('url', None)
    api_key = data.get('api_key', None)
-    if not manager_url: # or not api_key:
+    if not manager_url:  # or not api_key:
        return {'status': 'error', 'reason': 'No url or API key supplied'}, 400
     # check if is valid url
     try:
@@ -1859,7 +1860,7 @@ def api_set_crawler_max_captures(data):
     save_nb_max_captures(nb_captures)
     return nb_captures, 200
 
-    ## TEST ##
+## TEST ##
 
 def is_test_ail_crawlers_successful():
     return r_db.hget('crawler:tor:test', 'success') == 'True'
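
Note (not part of the patch): the sketch below restates the Unknown-capture timeout added to bin/crawlers/Crawler.py as a standalone function, under stated assumptions. A capture that Lacus keeps reporting as UNKNOWN for more than 600 seconds gets its task reset and the capture deleted; any other non-DONE status only refreshes the cached state. The CaptureStatus values and the duck-typed capture object here are illustrative stand-ins, not the AIL implementation.

# Illustrative sketch only; mirrors the control flow added to Crawler.py.
import time
from enum import IntEnum


class CaptureStatus(IntEnum):
    # Assumed values, standing in for crawlers.CaptureStatus
    UNKNOWN = -1
    QUEUED = 0
    DONE = 1
    ONGOING = 2


UNKNOWN_TIMEOUT = 600  # seconds; the patch hard-codes 600 with a TODO to move it to the crawler config


def check_capture(capture, status):
    """Return the capture if DONE, expire it if stuck in UNKNOWN,
    otherwise refresh its cached status and return None.

    `capture` is any object exposing get_start_time(r_str=False),
    get_task(), delete() and update(status), as in the patch."""
    if status == CaptureStatus.DONE:
        return capture
    if status == CaptureStatus.UNKNOWN:
        if int(time.time()) - capture.get_start_time(r_str=False) > UNKNOWN_TIMEOUT:
            capture.get_task().reset()  # requeue the task; reset() clears its start_time
            capture.delete()            # drop the stale capture
            return None
    capture.update(status)              # UNKNOWN within the timeout, or any other pending status
    return None

The point of the reset() path is that a task whose capture Lacus no longer knows about is re-queued (the patch re-adds it at priority 49) instead of sitting in the pending set forever.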
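
A second illustrative sketch: the string-to-epoch conversion that the new get_start_time(r_str=False) branch performs in bin/lib/crawlers.py, using the same "%Y/%m/%d - %H:%M.%S" format that CrawlerTask.start() writes. The helper name start_time_to_epoch is hypothetical.

# Illustrative sketch only; same conversion as get_start_time(r_str=False) in the patch.
from datetime import datetime

START_TIME_FORMAT = "%Y/%m/%d - %H:%M.%S"  # format written by CrawlerTask.start()


def start_time_to_epoch(start_time_str):
    """Parse a stored start-time string and return it as an int epoch timestamp."""
    return int(datetime.strptime(start_time_str, START_TIME_FORMAT).timestamp())


if __name__ == '__main__':
    stored = datetime.now().strftime(START_TIME_FORMAT)  # e.g. '2023/07/10 - 11:23.44'
    print(stored, '->', start_time_to_epoch(stored))

Both strftime (in start()) and strptime here work with naive local time, so comparing the result against int(time.time()) in Crawler.py stays consistent as long as writer and reader run in the same timezone.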