From 50c81773e91f9718a63aabacc3a14c19bc960e15 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 24 Sep 2018 16:23:14 +0200
Subject: [PATCH] chg: [Crawler] add launcher and install

---
 bin/Crawler.py                                  | 42 +++++++-------
 bin/LAUNCH.sh                                   | 30 +++++++++-
 bin/Onion.py                                    | 56 +++++++++++--------
 bin/packages/config.cfg.sample                  |  7 +--
 bin/torcrawler/TorSplashCrawler.py              | 17 ++----
 bin/torcrawler/launch_splash_crawler.sh         | 38 +++++++++++++
 bin/torcrawler/tor_crawler.py                   | 18 +++---
 .../etc/splash/proxy-profiles/default.ini       |  4 ++
 crawler_hidden_services_install.sh              | 10 ++++
 crawler_requirements.txt                        |  2 +
 var/www/Flask_server.py                         |  5 ++
 11 files changed, 160 insertions(+), 69 deletions(-)
 create mode 100755 bin/torcrawler/launch_splash_crawler.sh
 create mode 100644 configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini
 create mode 100644 crawler_hidden_services_install.sh
 create mode 100644 crawler_requirements.txt

diff --git a/bin/Crawler.py b/bin/Crawler.py
index aeaf3ab3..1fdf0601 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -40,16 +40,13 @@ def crawl_onion(url, domain, date, date_month, message):
         exit(0)
 
     if r.status_code == 200:
-        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, http_proxy, type_hidden_service, url, domain, paste, super_father],
+        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
                                    stdout=subprocess.PIPE)
         while process.poll() is None:
             time.sleep(1)
 
         if process.returncode == 0:
-            if r_serv_metadata.exists('paste_children:'+paste):
-                msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
-                p.populate_set_out(msg, 'Tags')
-
+            # onion up
             print(process.stdout.read())
 
         else:
@@ -59,14 +56,18 @@ def crawl_onion(url, domain, date, date_month, message):
             ## FIXME: # TODO: relaunch docker
             exit(0)
 
+    time.sleep(60)
+
 if __name__ == '__main__':
 
-    if len(sys.argv) != 2:
-        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)')
+    if len(sys.argv) != 3:
+        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
+        print(sys.argv)
         exit(1)
 
     type_hidden_service = sys.argv[1]
+    splash_port = sys.argv[2]
 
     publisher.port = 6380
     publisher.channel = "Script"
@@ -85,21 +87,19 @@ if __name__ == '__main__':
 
     if type_hidden_service == 'onion':
         regex_hidden_service = url_onion
-        splash_url = p.config.get("Crawler", "splash_url_onion")
-        http_proxy = p.config.get("Crawler", "http_proxy_onion")
+        splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url_onion"), splash_port)
     elif type_hidden_service == 'i2p':
         regex_hidden_service = url_i2p
-        splash_url = p.config.get("Crawler", "splash_url_i2p")
-        http_proxy = p.config.get("Crawler", "http_proxy_i2p")
+        splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url_i2p"), splash_port)
     elif type_hidden_service == 'regular':
         regex_hidden_service = url_i2p
-        splash_url = p.config.get("Crawler", "splash_url_onion")
-        http_proxy = p.config.get("Crawler", "http_proxy_onion")
+        splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url_onion"), splash_port)
     else:
         print('incorrect crawler type: {}'.format(type_hidden_service))
         exit(0)
 
     print(type_hidden_service)
+    print(splash_url)
 
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
 
@@ -129,8 +129,6 @@ if __name__ == '__main__':
 
         # Recovering the streamed message informations. http://eepsites.i2p
         message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
-        #message = 'http://i2pwiki.i2p;test'
-        #message = 'http://i2host.i2p;test'
 
         # # FIXME: remove
         if message is None:
@@ -186,13 +184,16 @@ if __name__ == '__main__':
                 # save down onion
                 if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
                     r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
-                    r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
-                    r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
+                    #r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
+                    #r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
                     if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
                         r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
                     r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date)
                 else:
-                    r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
+                    #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
+                    if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
+                        msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
+                        p.populate_set_out(msg, 'Tags')
 
                 # last check
                 r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
@@ -226,12 +227,13 @@ if __name__ == '__main__':
                 r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
                 print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
 
+                # update list, last crawled onions
                 r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                 r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
                 #send all crawled domain past
-                msg = domain
-                p.populate_set_out(msg, 'DomainSubject')
+                #msg = domain
+                #p.populate_set_out(msg, 'DomainSubject')
 
                 #time.sleep(30)
 
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index c3bfd8cf..9da28a81 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -27,6 +27,7 @@
 islogged=`screen -ls | egrep '[0-9]+.Logging_AIL' | cut -d. -f1`
 isqueued=`screen -ls | egrep '[0-9]+.Queue_AIL' | cut -d. -f1`
 isscripted=`screen -ls | egrep '[0-9]+.Script_AIL' | cut -d. -f1`
 isflasked=`screen -ls | egrep '[0-9]+.Flask_AIL' | cut -d. -f1`
+iscrawler=`screen -ls | egrep '[0-9]+.Crawler_AIL' | cut -d. -f1`
 
 function helptext {
     echo -e $YELLOW"
@@ -198,6 +199,26 @@ function launching_scripts {
 }
 
+function launching_crawler {
+    CONFIG=$AIL_BIN/packages/config.cfg
+    lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
+    echo $lport
+
+    IFS='-' read -ra PORTS <<< "$lport"
+    first_port=${PORTS[0]}
+    last_port=${PORTS[1]}
+
+    screen -dmS "Crawler_AIL"
+    sleep 0.1
+
+    for ((i=first_port;i<=last_port;i++)); do
+        screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x'
+        sleep 0.1
+    done
+
+    echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
+}
+
 function shutting_down_redis {
     redis_dir=${AIL_HOME}/redis/src/
     bash -c $redis_dir'redis-cli -p 6379 SHUTDOWN'
@@ -406,6 +427,9 @@ function launch_all {
             Flask)
                 launch_flask;
                 ;;
+            Crawler)
+                launching_crawler;
+                ;;
             Killall)
                 killall;
                 ;;
@@ -427,13 +451,13 @@ function launch_all {
 
 while [ "$1" != "" ]; do
     case $1 in
-        -l | --launchAuto )     launch_all "automatic";
+        -l | --launchAuto )     launch_all "automatic"; launching_crawler
                                 ;;
         -k | --killAll )        killall;
                                 ;;
-        -c | --configUpdate )   checking_configuration "manual";
+        -t | --thirdpartyUpdate )   update_thirdparty;
                                 ;;
-        -t | --thirdpartyUpdate )   update_thirdparty;
+        -c | --crawler )        launching_crawler;
                                 ;;
         -h | --help )           helptext;
                                 exit
diff --git a/bin/Onion.py b/bin/Onion.py
index d77c010f..1f233fcf 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -113,6 +113,15 @@ if __name__ == "__main__":
     message = p.get_from_set()
     prec_filename = None
 
+    # send to crawler:
+    activate_crawler = p.config.get("Crawler", "activate_crawler")
+    if activate_crawler == 'True':
+        activate_crawler = True
+        print('Crawler enabled')
+    else:
+        activate_crawler = False
+        print('Crawler disabled')
+
     # Thanks to Faup project for this regex
     # https://github.com/stricaud/faup
     url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@@ -142,6 +151,7 @@ if __name__ == "__main__":
                     domains_list.append(domain)
                     urls.append(url)
 
+            '''
             for x in PST.get_regex(i2p_regex):
                 # Extracting url with regex
                 url, s, credential, subdomain, domain, host, port, \
@@ -156,6 +166,7 @@ if __name__ == "__main__":
                         r_onion.sadd('i2p_domain_crawler_queue', domain)
                         msg = '{};{}'.format(url,PST.p_path)
                         r_onion.sadd('i2p_crawler_queue', msg)
+            '''
 
             # Saving the list of extracted onion domains.
             PST.__setattr__(channel, domains_list)
@@ -176,32 +187,33 @@ if __name__ == "__main__":
             to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date, PST.p_name)
 
-            '''
-            for url in fetch(p, r_cache, urls, domains_list, path):
-                publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
-                p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
-                msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
-                p.populate_set_out(msg, 'Tags')
-            '''
+            if activate_crawler:
+                date_month = datetime.datetime.now().strftime("%Y%m")
+                date = datetime.datetime.now().strftime("%Y%m%d")
+                for url in urls:
-            date_month = datetime.datetime.now().strftime("%Y%m")
-            date = datetime.datetime.now().strftime("%Y%m%d")
-            for url in urls:
+                    domain = re.findall(url_regex, url)
+                    if len(domain) > 0:
+                        domain = domain[0][4]
+                    else:
+                        continue
-                domain = re.findall(url_regex, url)
-                if len(domain) > 0:
-                    domain = domain[0][4]
-                else:
-                    continue
 
+                    if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
+                        if not r_onion.sismember('onion_domain_crawler_queue', domain):
+                            print('send to onion crawler')
+                            r_onion.sadd('onion_domain_crawler_queue', domain)
+                            msg = '{};{}'.format(url,PST.p_path)
+                            r_onion.sadd('onion_crawler_queue', msg)
+                            #p.populate_set_out(msg, 'Crawler')
-                if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
-                    if not r_onion.sismember('onion_domain_crawler_queue', domain):
-                        print('send to onion crawler')
-                        r_onion.sadd('onion_domain_crawler_queue', domain)
-                        msg = '{};{}'.format(url,PST.p_path)
-                        r_onion.sadd('onion_crawler_queue', msg)
-                        #p.populate_set_out(msg, 'Crawler')
 
+            else:
+                for url in fetch(p, r_cache, urls, domains_list, path):
+                    publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
+                    p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
+
+                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
+                    p.populate_set_out(msg, 'Tags')
 
         else:
             publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index 85566654..5bb83d21 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -235,8 +235,7 @@ port = 6381
 db = 0
 
 [Crawler]
+activate_crawler = True
 crawler_depth_limit = 1
-splash_url_onion = http://127.0.0.1:8050
-splash_url_i2p = http://127.0.0.1:8050
-http_proxy_onion = http://127.0.0.1:9050
-http_proxy_i2p = http://127.0.0.1:9050
+splash_url_onion = http://127.0.0.1
+splash_onion_port = 8050-8050
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 6673436b..2c217474 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -26,7 +26,7 @@ from Helper import Process
 
 class TorSplashCrawler():
 
-    def __init__(self, splash_url, http_proxy, crawler_depth_limit):
+    def __init__(self, splash_url, crawler_depth_limit):
         self.process = CrawlerProcess({'LOG_ENABLED': False})
         self.crawler = Crawler(self.TorSplashSpider, {
             'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
@@ -114,7 +114,6 @@ class TorSplashCrawler():
             if response.status == 504:
                 # down ?
                 print('504 detected')
-            #elif response.status in in range(400, 600):
             elif response.status != 200:
                 print('other: {}'.format(response.status))
             else:
@@ -128,7 +127,7 @@ class TorSplashCrawler():
                 if self.save_crawled_paste(filename_paste, response.data['html']):
 
                     # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
-                    self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
+                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
 
                     self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
                     self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
@@ -157,21 +156,17 @@ class TorSplashCrawler():
                     with open(filename_screenshot, 'wb') as f:
                         f.write(base64.standard_b64decode(response.data['png'].encode()))
 
-                    #interest = response.data['har']['log']['entries'][0]['response']['header'][0]
                     with open(filename_screenshot+'har.txt', 'wb') as f:
                         f.write(json.dumps(response.data['har']).encode())
 
                     # save external links in set
-                    lext = LinkExtractor(deny_domains=self.domains, unique=True)
-                    for link in lext.extract_links(response):
-                        self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
-                        self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
+                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
+                    #for link in lext.extract_links(response):
+                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
+                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
 
-                    #le = LinkExtractor(unique=True)
                     le = LinkExtractor(allow_domains=self.domains, unique=True)
                     for link in le.extract_links(response):
-                        self.r_cache.setbit(link, 0, 0)
-                        self.r_cache.expire(link, 360000)
                         yield SplashRequest(
                             link.url,
                             self.parse,
diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh
new file mode 100755
index 00000000..562c2eb4
--- /dev/null
+++ b/bin/torcrawler/launch_splash_crawler.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+usage() { echo "Usage: sudo $0 [-f <proxy_profiles_dir>] [-p <first_port>] [-n <number_of_splash>]" 1>&2; exit 1; }
+
+while getopts ":p:f:n:" o; do
+    case "${o}" in
+        p)
+            p=${OPTARG}
+            ;;
+        f)
+            f=${OPTARG}
+            ;;
+        n)
+            n=${OPTARG}
+            ;;
+        *)
+            usage
+            ;;
+    esac
+done
+shift $((OPTIND-1))
+
+# -p, -f and -n are all required
+if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
+    usage
+fi
+
+screen -dmS "Docker_Splash"
+sleep 0.1
+
+for ((i=0;i<=$((${n} - 1));i++)); do
+    port_number=$((${p} + $i))
+    screen -S "Docker_Splash" -X screen -t "docker_splash:$i" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
+    sleep 0.1
+done
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 99eb18c8..7881177c 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -8,8 +8,9 @@ from TorSplashCrawler import TorSplashCrawler
 
 if __name__ == '__main__':
 
-    if len(sys.argv) != 8:
-        print('usage:', 'tor_crawler.py', 'splash_url', 'http_proxy', 'type', 'url', 'domain', 'paste', 'super_father')
+    if len(sys.argv) != 7:
+        print(sys.argv)
+        print('usage:', 'tor_crawler.py', 'splash_url', 'type', 'url', 'domain', 'paste', 'super_father')
         exit(1)
 
     configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@@ -22,14 +23,13 @@ if __name__ == '__main__':
     cfg.read(configfile)
 
     splash_url = sys.argv[1]
-    http_proxy = sys.argv[2]
-    type = sys.argv[3]
+    type = sys.argv[2]
     crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
 
-    url = sys.argv[4]
-    domain = sys.argv[5]
-    paste = sys.argv[6]
-    super_father = sys.argv[7]
+    url = sys.argv[3]
+    domain = sys.argv[4]
+    paste = sys.argv[5]
+    super_father = sys.argv[6]
 
-    crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
+    crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
     crawler.crawl(type, url, domain, paste, super_father)
diff --git a/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini b/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini
new file mode 100644
index 00000000..63217c2a
--- /dev/null
+++ b/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini
@@ -0,0 +1,4 @@
+[proxy]
+host=172.17.0.1
+port=9050
+type=SOCKS5
diff --git a/crawler_hidden_services_install.sh b/crawler_hidden_services_install.sh
new file mode 100644
index 00000000..2747ddb6
--- /dev/null
+++ b/crawler_hidden_services_install.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# install docker
+sudo apt install docker.io
+
+# pull the splash docker image
+sudo docker pull scrapinghub/splash
+
+. ./AILENV/bin/activate
+pip3 install -U -r crawler_requirements.txt
diff --git a/crawler_requirements.txt b/crawler_requirements.txt
new file mode 100644
index 00000000..b0c096ac
--- /dev/null
+++ b/crawler_requirements.txt
@@ -0,0 +1,2 @@
+scrapy
+scrapy-splash
diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py
index 068bee65..9b7a93be 100755
--- a/var/www/Flask_server.py
+++ b/var/www/Flask_server.py
@@ -44,6 +44,11 @@ except IOError:
     f = open('templates/ignored_modules.txt', 'w')
     f.close()
 
+activate_crawler = cfg.get("Crawler", "activate_crawler")
+if activate_crawler != 'True':
+    toIgnoreModule.add('hiddenServices')
+
+print(toIgnoreModule)
 # Dynamically import routes and functions from modules
 # Also, prepare header.html
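
Note (editor annotation, not part of the patch): a minimal launch sketch for the pieces this change adds, under the shipped defaults (splash_url_onion = http://127.0.0.1, splash_onion_port = 8050-8050). The $AIL_HOME-based -f path is an assumption; it must be the absolute path to the proxy-profiles directory created by this patch, so adjust it to your checkout. Splash containers must be up before the crawlers are started.

    # start one Splash container on port 8050 (-p 8050 -n 1), mounting the
    # SOCKS5 Tor proxy profile (default.ini) added by this patch
    sudo ./bin/torcrawler/launch_splash_crawler.sh \
        -f $AIL_HOME/configs/docker/splash_onion/etc/splash/proxy-profiles/ \
        -p 8050 -n 1

    # launch one Crawler.py screen window per port in the splash_onion_port
    # range read from config.cfg (8050-8050 gives a single crawler)
    ./bin/LAUNCH.sh -c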