From c49e871ba85b1d7464943c5a21b918b7a7cc3f79 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Wed, 26 Sep 2018 16:34:27 +0200
Subject: [PATCH] chg: [crawler] add infos

---
 bin/Crawler.py                          | 12 +++++-------
 bin/torcrawler/launch_splash_crawler.sh | 15 ++++++++++-----
 crawler_hidden_services_install.sh      |  9 +++++++--
 3 files changed, 22 insertions(+), 14 deletions(-)
 mode change 100644 => 100755 crawler_hidden_services_install.sh

diff --git a/bin/Crawler.py b/bin/Crawler.py
index fff85daf..9642436c 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -36,7 +36,8 @@ def crawl_onion(url, domain, date, date_month, message):
 
         r_onion.sadd('{}_crawler_queue'.format(type_hidden_service), message)
         print('--------------------------------------')
-        print('         DOCKER SPLASH DOWN')
+        print('        \033[91m DOCKER SPLASH DOWN\033[0m')
+        print('         {} DOWN'.format(splash_url))
         exit(0)
 
     if r.status_code == 200:
@@ -94,7 +95,6 @@ if __name__ == '__main__':
         print('incorrect crawler type: {}'.format(type_hidden_service))
         exit(0)
 
-    print('crawler type: {}'.format(type_hidden_service))
     print('splash url: {}'.format(splash_url))
 
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
@@ -132,13 +132,11 @@ if __name__ == '__main__':
         message = r_onion.spop('mess_onion')
 
         if message is not None:
-            print(message)
 
             splitted = message.split(';')
             if len(splitted) == 2:
                 url, paste = splitted
                 paste = paste.replace(PASTES_FOLDER+'/', '')
-                print(paste)
                 '''
                 if not '.onion' in url:
                     print('not onion')
@@ -157,9 +155,9 @@ if __name__ == '__main__':
                 domain_url = 'http://{}'.format(domain)
 
-                print('------------------START CRAWLER------------------')
-                print(type_hidden_service)
-                print('-------------------------------------------------')
+                print('\033[92m------------------START CRAWLER------------------\033[0m')
+                print('crawler type: {}'.format(type_hidden_service))
+                print('\033[92m-------------------------------------------------\033[0m')
                 print('url: {}'.format(url))
                 print('domain: {}'.format(domain))
                 print('domain_url: {}'.format(domain_url))
 
diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh
index 562c2eb4..37963e93 100755
--- a/bin/torcrawler/launch_splash_crawler.sh
+++ b/bin/torcrawler/launch_splash_crawler.sh
@@ -1,6 +1,14 @@
 #!/bin/bash
 
-#usage() { echo "Usage: sudo $0 [-f <proxy-profiles directory>] [-p <first port>] [-n <number of servers>]" 1>&2; exit 1; }
+usage() { echo "Usage: sudo $0 [-f <proxy-profiles directory>] [-p <first port>] [-n <number of servers>]" 1>&2;
+          echo "    -f: absolute path to the splash docker proxy-profiles directory (used for proxy configuration)"
+          echo "    -p: port number of the first splash server. This number is incremented for the other splash servers"
+          echo "    -n: number of splash servers to start"
+          echo ""
+          echo "example:"
+          echo "sudo ./launch_splash_crawler.sh -f /home/my_user/AIL-framework/configs/docker/splash_onion/etc/splash/proxy-profiles/ -p 8050 -n 3"
+          exit 1;
+        }
 
 while getopts ":p:f:n:" o; do
     case "${o}" in
@@ -25,14 +33,11 @@
 if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
     echo "usage"
 fi
 
-first_port=$p
-echo "usage0"
 screen -dmS "Docker_Splash"
-echo "usage1"
 sleep 0.1
 for ((i=0;i<=$((${n} - 1));i++)); do
     port_number=$((${p} + $i))
-    screen -S "Docker_Splash" -X screen -t "docker_splash:$i" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
+    screen -S "Docker_Splash" -X screen -t "docker_splash:$port_number" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
     sleep 0.1
 done
diff --git a/crawler_hidden_services_install.sh b/crawler_hidden_services_install.sh
old mode 100644
new mode 100755
index 2747ddb6..3cdb96bd
--- a/crawler_hidden_services_install.sh
+++ b/crawler_hidden_services_install.sh
@@ -1,7 +1,12 @@
 #!/bin/bash
 
-# install docker
-sudo apt install docker.io
+read -p "Do you want to install docker? (use local splash server) [y/n] " -n 1 -r
+echo # (optional) move to a new line
+if [[ $REPLY =~ ^[Yy]$ ]]
+then
+    # install docker
+    sudo apt install docker.io
+fi
 
 # pull splah docker
 sudo docker pull scrapinghub/splash
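
Note (illustration only, not part of the patch): with the example invocation from
usage(), "-p 8050 -n 3", the for loop in launch_splash_crawler.sh computes
port_number = 8050, 8051, 8052 and opens one screen window per port. Each window
runs a command equivalent to the following sketch, where PROXY_DIR is a
hypothetical stand-in for the directory passed via -f:

    # Host port changes for every splash server; the container port stays 8050.
    sudo docker run -p 8050:8050 --cpus=1 -v "$PROXY_DIR":/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash
    sudo docker run -p 8051:8050 --cpus=1 -v "$PROXY_DIR":/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash
    sudo docker run -p 8052:8050 --cpus=1 -v "$PROXY_DIR":/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash

The trailing "read x" in the screen command only keeps the window open after the
container exits; the splash servers themselves are reached on host ports 8050 to
8052, which is presumably what the splash_url printed by Crawler.py points at.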