From e357dce59b0824ed09eeb1c5b376fc1d0ed6e56b Mon Sep 17 00:00:00 2001 From: Terrtia Date: Thu, 27 Sep 2018 15:43:03 +0200 Subject: [PATCH] fix: [Crawler] detect splash connection to proxy error --- HOWTO.md | 19 +++++++------ bin/Crawler.py | 37 +++++++++++++++++-------- bin/torcrawler/TorSplashCrawler.py | 8 ++++-- bin/torcrawler/launch_splash_crawler.sh | 1 + 4 files changed, 43 insertions(+), 22 deletions(-) diff --git a/HOWTO.md b/HOWTO.md index 50fad074..5b5bc92a 100644 --- a/HOWTO.md +++ b/HOWTO.md @@ -102,20 +102,23 @@ Crawler --------------------- In AIL, you can crawl hidden services. -two types of configutation [explaination for what]: - 1) use local Splash dockers (use the same host for Splash servers and AIL) - 2) use remote Splash servers +There are two types of installation. You can install a *local* or a *remote* Splash server. If you install a local Splash server, the Splash and AIL hosts are the same. -- (Splash host) Launch ``crawler_hidden_services_install.sh`` to install all requirement (type ``y`` if a localhost splah server is used) -- (Splash host) Setup your tor proxy[is already installed]: +Install/Configure and launch all crawler scripts: + +- *(Splash host)* Launch ``crawler_hidden_services_install.sh`` to install all requirements (type ``y`` if a localhost Splash server is used, or use the ``-y`` option) + +- *(Splash host)* Install/Setup your tor proxy: + - Install the tor proxy: ``sudo apt-get install tor -y`` + (The tor proxy is installed by default in AIL. If you use the same host for the Splash server, you don't need to install it) - Add the following line in ``/etc/tor/torrc: SOCKSPolicy accept 172.17.0.0/16`` (for a linux docker, the localhost IP is 172.17.0.1; Should be adapted for other platform) - Restart the tor proxy: ``sudo service tor restart`` -- (Splash host) Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f ] [-p ] [-n ]`` - all the Splash dockers are launched inside the ``Docker_Splash`` screen. 
You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status. +- *(Splash host)* Launch all Splash servers with: ``sudo ./bin/torcrawler/launch_splash_crawler.sh [-f ] [-p ] [-n ]`` + All Splash dockers are launched inside the ``Docker_Splash`` screen. You can use ``sudo screen -r Docker_Splash`` to connect to the screen session and check all Splash servers status. -- (AIL host) Edit the ``/bin/packages/config.cfg`` file: +- *(AIL host)* Edit the ``/bin/packages/config.cfg`` file: - In the crawler section, set ``activate_crawler`` to ``True`` - Change the IP address of Splash servers if needed (remote only) - Set ``splash_onion_port`` according to your Splash servers port numbers who are using the tor proxy. those ports numbers should be described as a single port (ex: 8050) or a port range (ex: 8050-8052 for 8050,8051,8052 ports). diff --git a/bin/Crawler.py b/bin/Crawler.py index 9642436c..30d3ffb2 100755 --- a/bin/Crawler.py +++ b/bin/Crawler.py @@ -18,6 +18,12 @@ from pubsublogger import publisher def signal_handler(sig, frame): sys.exit(0) +def on_error_send_message_back_in_queue(type_hidden_service, domain, message): + # send this msg back in the queue + if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain): + r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain) + r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message) + def crawl_onion(url, domain, date, date_month, message): #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain): @@ -30,15 +36,11 @@ def crawl_onion(url, domain, date, date_month, message): except Exception: ## FIXME: # TODO: relaunch docker or send error message - # send this msg back in the queue - if not r_onion.sismember('{}_domain_crawler_queue'.format(type_hidden_service), domain): - r_onion.sadd('{}_domain_crawler_queue'.format(type_hidden_service), domain) - 
r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message) - + on_error_send_message_back_in_queue(type_hidden_service, domain, message) print('--------------------------------------') print(' \033[91m DOCKER SPLASH DOWN\033[0m') print(' {} DOWN'.format(splash_url)) - exit(0) + exit(1) if r.status_code == 200: process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father], @@ -47,15 +49,26 @@ def crawl_onion(url, domain, date, date_month, message): time.sleep(1) if process.returncode == 0: - # onion up - print(process.stdout.read()) - + output = process.stdout.read().decode() + print(output) + # error: splash:Connection to proxy refused + if 'Connection to proxy refused' in output: + on_error_send_message_back_in_queue(type_hidden_service, domain, message) + print('------------------------------------------------------------------------') + print(' \033[91m SPLASH: Connection to proxy refused') + print('') + print(' PROXY DOWN OR BAD CONFIGURATION\033[0m'.format(splash_url)) + print('------------------------------------------------------------------------') + exit(-2) else: print(process.stdout.read()) - exit(0) + exit(-1) else: - ## FIXME: # TODO: relaunch docker - exit(0) + on_error_send_message_back_in_queue(type_hidden_service, domain, message) + print('--------------------------------------') + print(' \033[91m DOCKER SPLASH DOWN\033[0m') + print(' {} DOWN'.format(splash_url)) + exit(1) if __name__ == '__main__': diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py index 2c217474..59060ba3 100644 --- a/bin/torcrawler/TorSplashCrawler.py +++ b/bin/torcrawler/TorSplashCrawler.py @@ -110,12 +110,16 @@ class TorSplashCrawler(): def parse(self,response): #print(response.headers) #print(response.status) - print(' | ') if response.status == 504: # down ? 
print('504 detected') elif response.status != 200: - print('other: {}'.format(response.status)) + #print('other: {}'.format(response.status)) + #print(error_log) + #detect connection to proxy refused + error_log = (json.loads(response.body.decode())) + if(error_log['info']['text'] == 'Connection to proxy refused'): + print('Connection to proxy refused') else: UUID = self.domains[0]+str(uuid.uuid4()) diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh index e78656ab..5f3f9020 100755 --- a/bin/torcrawler/launch_splash_crawler.sh +++ b/bin/torcrawler/launch_splash_crawler.sh @@ -39,4 +39,5 @@ for ((i=0;i<=$((${n} - 1));i++)); do port_number=$((${p} + $i)) screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x' sleep 0.1 + echo " Splash server launched on port $port_number" done