From c0be210d2cb839edf941a41b7f59c120e7720f76 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 29 Mar 2021 20:27:20 +0200
Subject: [PATCH] chg: [crawler] add test + relaunch crawlers + major fixes

---
 bin/core/Crawler_manager.py                     |   6 +-
 bin/lib/crawlers.py                             | 135 +++++++-
 bin/torcrawler/TorSplashCrawler.py              |  44 ++-
 bin/torcrawler/tor_crawler.py                   |   8 +-
 requirements.txt                                |   1 +
 var/www/blueprints/crawler_splash.py            |  24 +-
 .../settings_splash_crawler.html                | 313 ++++++++++--------
 7 files changed, 379 insertions(+), 152 deletions(-)

diff --git a/bin/core/Crawler_manager.py b/bin/core/Crawler_manager.py
index 3a95e706..bacaaa71 100755
--- a/bin/core/Crawler_manager.py
+++ b/bin/core/Crawler_manager.py
@@ -29,7 +29,8 @@ if __name__ == '__main__':
     is_manager_connected = crawlers.reload_splash_and_proxies_list()
     print(is_manager_connected)
     if is_manager_connected:
-        crawlers.relaunch_crawlers()
+        if crawlers.test_ail_crawlers():
+            crawlers.relaunch_crawlers()
 
     last_check = int(time.time())
 
     while True:
@@ -45,7 +46,8 @@
             is_manager_connected = crawlers.reload_splash_and_proxies_list()
             if is_manager_connected:
                 print('reload proxies and splash list')
-                crawlers.relaunch_crawlers()
+                if crawlers.test_ail_crawlers():
+                    crawlers.relaunch_crawlers()
             session_uuid = current_session_uuid
         if not is_manager_connected:
             print('Error, Can\'t connect to Splash manager')
diff --git a/bin/lib/crawlers.py b/bin/lib/crawlers.py
index 64aa0e7a..69cce642 100755
--- a/bin/lib/crawlers.py
+++ b/bin/lib/crawlers.py
@@ -16,6 +16,8 @@ import sys
 import time
 import uuid
 
+import subprocess
+
 from datetime import datetime, timedelta
 from urllib.parse import urlparse
 
@@ -25,6 +27,9 @@ from pyfaup.faup import Faup
 import requests
 requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
 
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
+import git_status
+
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
 import ConfigLoader
 
@@ -429,6 +434,19 @@ def get_splash_crawler_status(spash_url):
         status=False
     return {'crawler_info': crawler_info, 'crawling_domain': crawling_domain, 'status_info': status_info, 'status': status, 'type': crawler_type}
 
+def set_current_crawler_status(splash_url, status, started_time=False, crawled_domain=None, crawler_type=None):
+    # TODO: get crawler type if None
+    # Status: ['Waiting', 'Error', ...]
+    r_cache.hset('metadata_crawler:{}'.format(splash_url), 'status', status)
+    if started_time:
+        r_cache.hset('metadata_crawler:{}'.format(splash_url), 'started_time', datetime.now().strftime("%Y/%m/%d - %H:%M.%S"))
+    if crawler_type:
+        r_cache.hset('metadata_crawler:{}'.format(splash_url), 'type', crawler_type)
+    if crawled_domain:
+        r_cache.hset('metadata_crawler:{}'.format(splash_url), 'crawling_domain', crawled_domain)
+
+    #r_cache.sadd('all_splash_crawlers', splash_url) # # TODO: add me in fct: create_ail_crawler
+
 def get_stats_last_crawled_domains(crawler_types, date):
     statDomains = {}
     for crawler_type in crawler_types:
@@ -1014,6 +1032,20 @@ def get_all_splash_by_proxy(proxy_name, r_list=False):
     else:
         return []
 
+def get_all_splash_name_by_crawler_type(crawler_type):
+    l_splash_name = []
+    for splash_name in get_all_splash():
+        if get_splash_crawler_type(splash_name) == crawler_type:
+            l_splash_name.append(splash_name)
+    return l_splash_name
+
+def get_all_splash_url_by_crawler_type(crawler_type):
+    l_splash_url = []
+    for splash_name in get_all_splash_name_by_crawler_type(crawler_type):
+        for splash_url in get_splash_all_url(splash_name, r_list=True):
+            l_splash_url.append(splash_url)
+    return l_splash_url
+
 def delete_all_splash_containers():
     for splash_name in get_all_splash():
         delete_splash_container(splash_name)
@@ -1140,7 +1172,106 @@ def launch_ail_splash_crawler(splash_url, script_options=''):
     screen.create_screen(screen_name)
     screen.launch_uniq_windows_script(screen_name, splash_url, dir_project, script_location, script_name, script_options=script_options, kill_previous_windows=True)
 
+def is_test_ail_crawlers_successful():
+    return r_serv_onion.hget('crawler:tor:test', 'success') == 'True'
+
+def get_test_ail_crawlers_message():
+    return r_serv_onion.hget('crawler:tor:test', 'message')
+
+def save_test_ail_crawlers_result(test_success, message):
+    # stored as a string: is_test_ail_crawlers_successful() compares against 'True'
+    r_serv_onion.hset('crawler:tor:test', 'success', str(bool(test_success)))
+    r_serv_onion.hset('crawler:tor:test', 'message', message)
+
+def test_ail_crawlers():
+    # # TODO: test regular domain
+    if not ping_splash_manager():
+        manager_url = get_splash_manager_url()
+        error_message = f'Error: Can\'t connect to AIL Splash Manager, http://{manager_url}'
+        print(error_message)
+        save_test_ail_crawlers_result(False, error_message)
+        return False
+
+    splash_url = get_all_splash_url_by_crawler_type('tor')
+    if not splash_url:
+        error_message = 'Error: No Tor Splash launched'
+        print(error_message)
+        save_test_ail_crawlers_result(False, error_message)
+        return False
+    splash_url = splash_url[0]
+    commit_id = git_status.get_last_commit_id_from_local()
+    crawler_options = {'html': True,
+                       'har': False,
+                       'png': False,
+                       'depth_limit': 0,
+                       'closespider_pagecount': 100,
+                       'cookiejar_uuid': None,
+                       'user_agent': commit_id + '-AIL SPLASH CRAWLER'}
+    date = {'date_day': datetime.now().strftime("%Y%m%d"),
+            'date_month': datetime.now().strftime("%Y%m"),
+            'epoch': int(time.time())}
+    crawler_config = {'splash_url': f'http://{splash_url}',
+                      'service_type': 'onion',
+                      'url': 'http://eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion',
+                      'domain': 'eswpccgr5xyovsahffkehgleqthrasfpfdblwbs4lstd345dwq5qumqd.onion',
+                      'port': 80,
+                      'original_item': None,
+                      'item': None,
+                      'crawler_options': crawler_options,
+                      'date': date,
+                      'requested': 'test'}
+
+    ## CHECK IF SPLASH AVAILABLE ##
+    try:
+        requests.get(f'http://{splash_url}', timeout=30.0)
+    except Exception:
+        error_message = f'Error: Can\'t connect to Splash Docker, http://{splash_url}'
+        print(error_message)
+        save_test_ail_crawlers_result(False, error_message)
+        return False
+    ## -- ##
+
+    ## LAUNCH CRAWLER, TEST MODE ##
+    set_current_crawler_status(splash_url, 'CRAWLER TEST', started_time=True, crawled_domain='TEST DOMAIN', crawler_type='onion')
+    UUID = str(uuid.uuid4())
+    r_cache.set('crawler_request:{}'.format(UUID), json.dumps(crawler_config))
+
+    tor_crawler_script = os.path.join(os.environ['AIL_BIN'], 'torcrawler/tor_crawler.py')
+    process = subprocess.Popen(["python", tor_crawler_script, UUID],
+                               stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    # communicate() waits for the process and drains each pipe exactly once:
+    # a pipe can only be read once, and an undrained pipe can deadlock the child
+    stdout, stderr = process.communicate()
+    output = stdout.decode()
+    errors = stderr.decode()
+
+    if process.returncode == 0:
+        # error: splash:Connection to proxy refused
+        if 'Connection to proxy refused' in output:
+            print('{} SPLASH, PROXY DOWN OR BAD CONFIGURATION'.format(splash_url))
+            save_test_ail_crawlers_result(False, 'SPLASH, PROXY DOWN OR BAD CONFIGURATION')
+            set_current_crawler_status(splash_url, 'Error')
+            return False
+        else:
+            set_current_crawler_status(splash_url, 'Waiting')
+            return True
+    else:
+        # ERROR
+        error = f'-stderr-\n{errors}\n-stdout-\n{output}'
+        print(error)
+        save_test_ail_crawlers_result(False, error)
+        set_current_crawler_status(splash_url, 'Error')
+        return False
 ## -- ##
 
 #### ---- ####
@@ -1151,5 +1282,7 @@
 
 if __name__ == '__main__':
     res = get_splash_manager_version()
-    #res = restart_splash_docker('127.0.0.1:8050', 'default_splash_tor')
+    res = test_ail_crawlers()
+    res = is_test_ail_crawlers_successful()
     print(res)
+    print(get_test_ail_crawlers_message())
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 6742f613..42dffdb0 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -81,7 +81,7 @@ function main(splash, args)
         html = splash:html(),
         png = splash:png{render_all=true},
         cookies = splash:get_cookies(),
-        last_url = splash:url()
+        last_url = splash:url(),
     }
 end
 """
@@ -174,35 +174,54 @@ class TorSplashCrawler():
     def parse(self,response):
         #print(response.headers)
         #print(response.status)
+        #print(response.meta)
+        #print(response.data)
+        # # TODO: handle lua script error
+        #{'type': 'ScriptError', 'info': {'error': "'}' expected (to close '{' at line 47) near 'error_retry'",
+        #'message': '[string "..."]:53: \'}\' expected (to close \'{\' at line 47) near \'error_retry\'',
+        #'type': 'LUA_INIT_ERROR', 'source': '[string "..."]', 'line_number': 53},
+        #'error': 400, 'description': 'Error happened while executing Lua script'}
         if response.status == 504:
             # no response
             #print('504 detected')
             pass
 
-        # LUA ERROR # # TODO: print/display errors
+        # LUA ERROR # # TODO: logs errors
        elif 'error' in response.data:
             if(response.data['error'] == 'network99'):
                 ## splash restart ##
-                error_retry = request.meta.get('error_retry', 0)
+                error_retry = response.meta.get('error_retry', 0)
                 if error_retry < 3:
                     error_retry += 1
-                    url= request.meta['current_url']
-                    father = request.meta['father']
+                    url = response.data['last_url']
+                    father = response.meta['father']
 
                     self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url)
                    time.sleep(10)
+                    if 'cookies' in response.data:
+                        all_cookies = response.data['cookies'] # # TODO: use initial cookie ?????
+                    else:
+                        all_cookies = []
+                    l_cookies = self.build_request_arg(all_cookies)
                     yield SplashRequest(
                         url,
                         self.parse,
                         errback=self.errback_catcher,
                         endpoint='execute',
-                        cache_args=['lua_source'],
+                        dont_filter=True,
                         meta={'father': father, 'current_url': url, 'error_retry': error_retry},
-                        args=self.build_request_arg(response.cookiejar)
+                        args=l_cookies
                     )
                 else:
+                    if self.requested_mode == 'test':
+                        crawlers.save_test_ail_crawlers_result(False, 'Connection to proxy refused')
                     print('Connection to proxy refused')
+            elif response.data['error'] == 'network3':
+                if self.requested_mode == 'test':
+                    crawlers.save_test_ail_crawlers_result(False, 'HostNotFoundError: the remote host name was not found (invalid hostname)')
+                print('HostNotFoundError: the remote host name was not found (invalid hostname)')
             else:
+                if self.requested_mode == 'test':
+                    crawlers.save_test_ail_crawlers_result(False, response.data['error'])
                 print(response.data['error'])
 
         elif response.status != 200:
@@ -213,6 +232,17 @@ class TorSplashCrawler():
         #elif crawlers.is_redirection(self.domains[0], response.data['last_url']):
         #    pass # ignore response
         else:
+            ## TEST MODE ##
+            if self.requested_mode == 'test':
+                if 'It works!' in response.data['html']:
+                    print(response.data['html'])
+                    #print('success')
+                    crawlers.save_test_ail_crawlers_result(True, 'It works!')
+                else:
+                    print('TEST ERROR')
+                    crawlers.save_test_ail_crawlers_result(False, 'TEST ERROR')
+                return
+            ## -- ##
             item_id = crawlers.create_item_id(self.item_dir, self.domains[0])
             self.save_crawled_item(item_id, response.data['html'])
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 954eae0f..3f493b84 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -45,5 +45,9 @@ if __name__ == '__main__':
 
     redis_cache.delete('crawler_request:{}'.format(uuid))
 
-    crawler = TorSplashCrawler(splash_url, crawler_options)
-    crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
+    try:
+        crawler = TorSplashCrawler(splash_url, crawler_options)
+        crawler.crawl(splash_url, service_type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item)
+    except Exception as e:
+        # report the failure on both streams: stdout is piped to test_ail_crawlers(), stderr to the console
+        print(e)
+        print(e, file=sys.stderr)
diff --git a/requirements.txt b/requirements.txt
index 2a9347c4..d7b4b614 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 pymisp
+d4-pyclient
 
 thehive4py
 
diff --git a/var/www/blueprints/crawler_splash.py b/var/www/blueprints/crawler_splash.py
index a5f6d548..9d4d5120 100644
--- a/var/www/blueprints/crawler_splash.py
+++ b/var/www/blueprints/crawler_splash.py
@@ -504,18 +504,22 @@ def crawler_cookiejar_cookie_json_add_post():
 def crawler_splash_setings():
     all_proxies = crawlers.get_all_proxies_metadata()
     all_splash = crawlers.get_all_splash_crawler_metadata()
-    nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch()
-
     splash_manager_url = crawlers.get_splash_manager_url()
     api_key = crawlers.get_hidden_splash_api_key()
     is_manager_connected = crawlers.get_splash_manager_connection_metadata(force_ping=True)
+
+    nb_crawlers_to_launch = crawlers.get_nb_crawlers_to_launch()
     crawler_full_config = Config_DB.get_full_config_by_section('crawler')
+    is_crawler_working = crawlers.is_test_ail_crawlers_successful()
+    crawler_error_mess = crawlers.get_test_ail_crawlers_message()
 
     return render_template("settings_splash_crawler.html",
                            is_manager_connected=is_manager_connected,
                            splash_manager_url=splash_manager_url,
                            api_key=api_key,
-                           nb_crawlers_to_launch=nb_crawlers_to_launch,
                            all_splash=all_splash,
                            all_proxies=all_proxies,
+                           nb_crawlers_to_launch=nb_crawlers_to_launch,
+                           is_crawler_working=is_crawler_working,
+                           crawler_error_mess=crawler_error_mess,
                            crawler_full_config=crawler_full_config)
 
 @crawler_splash.route('/crawler/settings/crawler_manager', methods=['GET', 'POST'])
@@ -555,4 +559,18 @@ def crawler_splash_setings_crawlers_to_lauch():
     return render_template("settings_edit_crawlers_to_launch.html",
                            nb_crawlers_to_launch=nb_crawlers_to_launch)
 
+@crawler_splash.route('/crawler/settings/test_crawler', methods=['GET'])
+@login_required
+@login_admin
+def crawler_splash_setings_test_crawler():
+    crawlers.test_ail_crawlers()
+    return redirect(url_for('crawler_splash.crawler_splash_setings'))
+
+@crawler_splash.route('/crawler/settings/relaunch_crawler', methods=['GET'])
+@login_required
+@login_admin
+def crawler_splash_setings_relaunch_crawler():
+    crawlers.relaunch_crawlers()
+    return redirect(url_for('crawler_splash.crawler_splash_setings'))
+
 ## - - ##
diff --git a/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html b/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html
index 8b1dae72..090d9368 100644
--- a/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html
+++ b/var/www/templates/crawler/crawler_splash/settings_splash_crawler.html
@@ -90,11 +90,165 @@
[settings_splash_crawler.html hunk: the markup is unrecoverable in this copy of
the patch (the HTML tags were stripped during extraction); only the visible text
and the Jinja logic survive. What the +165/-11 change does, as far as it can be
reconstructed:
 - moves the "All Splash Crawlers:" table to the top of the page: one row per
   splash_name in all_splash, with columns Splash name / Proxy / Crawler type /
   Description and a tor-vs-web icon chosen by
   {%if all_splash[splash_name]['type']=='tor'%};
 - moves the "All Proxies:" table next to it: one row per proxy_name in
   all_proxies, with columns Proxy name / Host / Port / Type / Crawler Type /
   Description and the same tor-vs-web icon on crawler_type;
 - adds a crawler status card beside a "Crawlers" heading:
   {% if is_crawler_working %} renders {{crawler_error_mess}}, {% else %} an
   "Error" badge;
 - adds a "TOR CRAWLER TEST OUTPUT" block that prints {{crawler_error_mess}};
 - keeps the "Number of Crawlers to Launch:" table and its "Edit number of
   crawlers to launch" action;
 - moves the "Crawlers Settings:" table below the crawler tables: one row per
   config_field in crawler_full_config, with columns Key / Description / Value
   ({{config_field}}, {{crawler_full_config[config_field]['info']}},
   {{crawler_full_config[config_field]['value']}}).]
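--
Post-patch note: a minimal usage sketch, not part of the patch itself, showing
how the new test helpers in bin/lib/crawlers.py are meant to be driven. The
script below is hypothetical; it only calls functions introduced above
(test_ail_crawlers, is_test_ail_crawlers_successful,
get_test_ail_crawlers_message) and assumes the usual AIL environment: AIL_BIN
set, Redis running, and a reachable Splash manager.

    #!/usr/bin/env python3
    import os
    import sys

    # crawlers.py lives in AIL_BIN/lib and resolves its own dependencies
    sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
    import crawlers

    # Run a one-shot crawl of the test onion in 'test' mode; the outcome is
    # stored in the 'crawler:tor:test' hash by save_test_ail_crawlers_result()
    crawlers.test_ail_crawlers()

    if crawlers.is_test_ail_crawlers_successful():
        print('crawler OK:', crawlers.get_test_ail_crawlers_message())
    else:
        print('crawler KO:', crawlers.get_test_ail_crawlers_message())
        sys.exit(1)

The same flow backs the UI: GET /crawler/settings/test_crawler runs
test_ail_crawlers() and redirects to the settings page, which renders
is_crawler_working and crawler_error_mess.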