From 50c81773e91f9718a63aabacc3a14c19bc960e15 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 24 Sep 2018 16:23:14 +0200
Subject: [PATCH] chg: [Crawler] add launcher and install

---
 bin/Crawler.py                                  | 42 +++++++-------
 bin/LAUNCH.sh                                   | 30 +++++++++-
 bin/Onion.py                                    | 56 +++++++++++--------
 bin/packages/config.cfg.sample                  |  7 +--
 bin/torcrawler/TorSplashCrawler.py              | 17 ++----
 bin/torcrawler/launch_splash_crawler.sh         | 38 +++++++++++++
 bin/torcrawler/tor_crawler.py                   | 18 +++---
 .../etc/splash/proxy-profiles/default.ini       |  4 ++
 crawler_hidden_services_install.sh              | 10 ++++
 crawler_requirements.txt                        |  2 +
 var/www/Flask_server.py                         |  5 ++
 11 files changed, 160 insertions(+), 69 deletions(-)
 create mode 100755 bin/torcrawler/launch_splash_crawler.sh
 create mode 100644 configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini
 create mode 100644 crawler_hidden_services_install.sh
 create mode 100644 crawler_requirements.txt

diff --git a/bin/Crawler.py b/bin/Crawler.py
index aeaf3ab3..1fdf0601 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -40,16 +40,13 @@ def crawl_onion(url, domain, date, date_month, message):
         exit(0)
 
     if r.status_code == 200:
-        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, http_proxy, type_hidden_service, url, domain, paste, super_father],
+        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, type_hidden_service, url, domain, paste, super_father],
                                    stdout=subprocess.PIPE)
         while process.poll() is None:
             time.sleep(1)
 
         if process.returncode == 0:
-            if r_serv_metadata.exists('paste_children:'+paste):
-                msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
-                p.populate_set_out(msg, 'Tags')
-
+            # onion up
             print(process.stdout.read())
 
         else:
@@ -59,14 +56,18 @@ def crawl_onion(url, domain, date, date_month, message):
             ## FIXME: # TODO: relaunch docker
             exit(0)
 
+    time.sleep(60)
+
 if __name__ == '__main__':
 
-    if len(sys.argv) != 2:
-        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)')
+    if len(sys.argv) != 3:
+        print('usage:', 'Crawler.py', 'type_hidden_service (onion or i2p or regular)', 'splash_port')
+        print(sys.argv)
         exit(1)
 
     type_hidden_service = sys.argv[1]
+    splash_port = sys.argv[2]
 
     publisher.port = 6380
     publisher.channel = "Script"
@@ -85,21 +87,19 @@ if __name__ == '__main__':
 
     if type_hidden_service == 'onion':
         regex_hidden_service = url_onion
-        splash_url = p.config.get("Crawler", "splash_url_onion")
-        http_proxy = p.config.get("Crawler", "http_proxy_onion")
+        splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url_onion"), splash_port)
     elif type_hidden_service == 'i2p':
         regex_hidden_service = url_i2p
-        splash_url = p.config.get("Crawler", "splash_url_i2p")
-        http_proxy = p.config.get("Crawler", "http_proxy_i2p")
+        splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url_i2p"), splash_port)
     elif type_hidden_service == 'regular':
         regex_hidden_service = url_i2p
-        splash_url = p.config.get("Crawler", "splash_url_onion")
-        http_proxy = p.config.get("Crawler", "http_proxy_onion")
+        splash_url = '{}:{}'.format(p.config.get("Crawler", "splash_url_onion"), splash_port)
     else:
         print('incorrect crawler type: {}'.format(type_hidden_service))
         exit(0)
 
     print(type_hidden_service)
+    print(splash_url)
 
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
 
@@ -129,8 +129,6 @@ if __name__ == '__main__':
 
         # Recovering the streamed message informations. http://eepsites.i2p
         message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
-        #message = 'http://i2pwiki.i2p;test'
-        #message = 'http://i2host.i2p;test'
 
         # # FIXME: remove
         if message is None:
@@ -186,13 +184,16 @@ if __name__ == '__main__':
                 # save down onion
                 if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
                     r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
-                    r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
-                    r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
+                    #r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
+                    #r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
                     if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
                         r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
                     r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date)
                 else:
-                    r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
+                    #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
+                    if r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and r_serv_metadata.exists('paste_children:'+paste):
+                        msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
+                        p.populate_set_out(msg, 'Tags')
 
                 # last check
                 r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
@@ -226,12 +227,13 @@ if __name__ == '__main__':
                 r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
                 print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
 
+                # update list, last crawled onions
                 r_onion.lpush('last_{}'.format(type_hidden_service), domain)
                 r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
                 #send all crawled domain past
-                msg = domain
-                p.populate_set_out(msg, 'DomainSubject')
+                #msg = domain
+                #p.populate_set_out(msg, 'DomainSubject')
 
                 #time.sleep(30)
 
diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh
index c3bfd8cf..9da28a81 100755
--- a/bin/LAUNCH.sh
+++ b/bin/LAUNCH.sh
@@ -27,6 +27,7 @@
 islogged=`screen -ls | egrep '[0-9]+.Logging_AIL' | cut -d. -f1`
 isqueued=`screen -ls | egrep '[0-9]+.Queue_AIL' | cut -d. -f1`
 isscripted=`screen -ls | egrep '[0-9]+.Script_AIL' | cut -d. -f1`
 isflasked=`screen -ls | egrep '[0-9]+.Flask_AIL' | cut -d. -f1`
+iscrawler=`screen -ls | egrep '[0-9]+.Crawler_AIL' | cut -d. -f1`
 
 function helptext {
     echo -e $YELLOW"
@@ -198,6 +199,26 @@ function launching_scripts {
 }
 
+function launching_crawler {
+    CONFIG=$AIL_BIN/packages/config.cfg
+    lport=$(awk '/^\[Crawler\]/{f=1} f==1&&/^splash_onion_port/{print $3;exit}' "${CONFIG}")
+    echo $lport
+
+    IFS='-' read -ra PORTS <<< "$lport"
+    first_port=${PORTS[0]}
+    last_port=${PORTS[1]}
+
+    screen -dmS "Crawler_AIL"
+    sleep 0.1
+
+    for ((i=first_port;i<=last_port;i++)); do
+        screen -S "Crawler_AIL" -X screen -t "onion_crawler:$i" bash -c 'cd '${AIL_BIN}'; ./Crawler.py onion '$i'; read x'
+        sleep 0.1
+    done
+
+    echo -e $GREEN"\t* Launching Crawler_AIL scripts"$DEFAULT
+}
+
 function shutting_down_redis {
     redis_dir=${AIL_HOME}/redis/src/
     bash -c $redis_dir'redis-cli -p 6379 SHUTDOWN'
@@ -406,6 +427,9 @@ function launch_all {
             Flask)
                 launch_flask;
                 ;;
+            Crawler)
+                launching_crawler;
+                ;;
             Killall)
                 killall;
                 ;;
@@ -427,13 +451,13 @@ function launch_all {
 
 while [ "$1" != "" ]; do
     case $1 in
-        -l | --launchAuto )     launch_all "automatic";
+        -l | --launchAuto )     launch_all "automatic"; launching_crawler
                                 ;;
         -k | --killAll )        killall;
                                 ;;
-        -c | --configUpdate )   checking_configuration "manual";
+        -t | --thirdpartyUpdate )   update_thirdparty;
                                 ;;
-        -t | --thirdpartyUpdate )   update_thirdparty;
+        -c | --crawler )        launching_crawler;
                                 ;;
         -h | --help )           helptext;
                                 exit
diff --git a/bin/Onion.py b/bin/Onion.py
index d77c010f..1f233fcf 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -113,6 +113,15 @@ if __name__ == "__main__":
     message = p.get_from_set()
     prec_filename = None
 
+    # send to crawler:
+    activate_crawler = p.config.get("Crawler", "activate_crawler")
+    if activate_crawler == 'True':
+        activate_crawler = True
+        print('Crawler enabled')
+    else:
+        activate_crawler = False
+        print('Crawler disabled')
+
     # Thanks to Faup project for this regex
     # https://github.com/stricaud/faup
     url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
@@ -142,6 +151,7 @@ if __name__ == "__main__":
                     domains_list.append(domain)
                     urls.append(url)
 
+            '''
             for x in PST.get_regex(i2p_regex):
                 # Extracting url with regex
                 url, s, credential, subdomain, domain, host, port, \
@@ -156,6 +166,7 @@ if __name__ == "__main__":
                         r_onion.sadd('i2p_domain_crawler_queue', domain)
                         msg = '{};{}'.format(url,PST.p_path)
                         r_onion.sadd('i2p_crawler_queue', msg)
+            '''
 
             # Saving the list of extracted onion domains.
             PST.__setattr__(channel, domains_list)
@@ -176,32 +187,33 @@ if __name__ == "__main__":
             to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date, PST.p_name)
 
-            '''
-            for url in fetch(p, r_cache, urls, domains_list, path):
-                publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
-                p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
-                msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
-                p.populate_set_out(msg, 'Tags')
-            '''
+            if activate_crawler:
+                date_month = datetime.datetime.now().strftime("%Y%m")
+                date = datetime.datetime.now().strftime("%Y%m%d")
+                for url in urls:
-            date_month = datetime.datetime.now().strftime("%Y%m")
-            date = datetime.datetime.now().strftime("%Y%m%d")
-            for url in urls:
+                    domain = re.findall(url_regex, url)
+                    if len(domain) > 0:
+                        domain = domain[0][4]
+                    else:
+                        continue
-                domain = re.findall(url_regex, url)
-                if len(domain) > 0:
-                    domain = domain[0][4]
-                else:
-                    continue
 
+                    if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
+                        if not r_onion.sismember('onion_domain_crawler_queue', domain):
+                            print('send to onion crawler')
+                            r_onion.sadd('onion_domain_crawler_queue', domain)
+                            msg = '{};{}'.format(url,PST.p_path)
+                            r_onion.sadd('onion_crawler_queue', msg)
+                            #p.populate_set_out(msg, 'Crawler')
-                if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
-                    if not r_onion.sismember('onion_domain_crawler_queue', domain):
-                        print('send to onion crawler')
-                        r_onion.sadd('onion_domain_crawler_queue', domain)
-                        msg = '{};{}'.format(url,PST.p_path)
-                        r_onion.sadd('onion_crawler_queue', msg)
-                        #p.populate_set_out(msg, 'Crawler')
 
+            else:
+                for url in fetch(p, r_cache, urls, domains_list, path):
+                    publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
+                    p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
+
+                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
+                    p.populate_set_out(msg, 'Tags')
 
         else:
             publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index 85566654..5bb83d21 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -235,8 +235,7 @@ port = 6381
 db = 0
 
 [Crawler]
+activate_crawler = True
 crawler_depth_limit = 1
-splash_url_onion = http://127.0.0.1:8050
-splash_url_i2p = http://127.0.0.1:8050
-http_proxy_onion = http://127.0.0.1:9050
-http_proxy_i2p = http://127.0.0.1:9050
+splash_url_onion = http://127.0.0.1
+splash_onion_port = 8050-8050
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 6673436b..2c217474 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -26,7 +26,7 @@ from Helper import Process
 
 class TorSplashCrawler():
 
-    def __init__(self, splash_url, http_proxy, crawler_depth_limit):
+    def __init__(self, splash_url, crawler_depth_limit):
         self.process = CrawlerProcess({'LOG_ENABLED': False})
         self.crawler = Crawler(self.TorSplashSpider, {
             'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
@@ -114,7 +114,6 @@ class TorSplashCrawler():
             if response.status == 504:
                 # down ?
                 print('504 detected')
-            #elif response.status in in range(400, 600):
             elif response.status != 200:
                 print('other: {}'.format(response.status))
             else:
@@ -128,7 +127,7 @@ class TorSplashCrawler():
                 if self.save_crawled_paste(filename_paste, response.data['html']):
 
                     # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ?
-                    self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
+                    #self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste)
 
                     self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
                     self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
@@ -157,21 +156,17 @@ class TorSplashCrawler():
                     with open(filename_screenshot, 'wb') as f:
                         f.write(base64.standard_b64decode(response.data['png'].encode()))
 
-                    #interest = response.data['har']['log']['entries'][0]['response']['header'][0]
                     with open(filename_screenshot+'har.txt', 'wb') as f:
                         f.write(json.dumps(response.data['har']).encode())
 
                     # save external links in set
-                    lext = LinkExtractor(deny_domains=self.domains, unique=True)
-                    for link in lext.extract_links(response):
-                        self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
-                        self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
+                    #lext = LinkExtractor(deny_domains=self.domains, unique=True)
+                    #for link in lext.extract_links(response):
+                    #    self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
+                    #    self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
 
-                    #le = LinkExtractor(unique=True)
                     le = LinkExtractor(allow_domains=self.domains, unique=True)
                     for link in le.extract_links(response):
-                        self.r_cache.setbit(link, 0, 0)
-                        self.r_cache.expire(link, 360000)
                         yield SplashRequest(
                             link.url,
                             self.parse,
diff --git a/bin/torcrawler/launch_splash_crawler.sh b/bin/torcrawler/launch_splash_crawler.sh
new file mode 100755
index 00000000..562c2eb4
--- /dev/null
+++ b/bin/torcrawler/launch_splash_crawler.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+usage() { echo "Usage: sudo $0 [-f <proxy_profiles_dir>] [-p <first_port>] [-n <number_of_splash>]" 1>&2; exit 1; }
+
+while getopts ":p:f:n:" o; do
+    case "${o}" in
+        p)
+            p=${OPTARG}
+            ;;
+        f)
+            f=${OPTARG}
+            ;;
+        n)
+            n=${OPTARG}
+            ;;
+        *)
+            usage
+            ;;
+    esac
+done
+shift $((OPTIND-1))
+
+# -p, -f and -n are all required
+if [ -z "${p}" ] || [ -z "${f}" ] || [ -z "${n}" ]; then
+    usage
+fi
+
+screen -dmS "Docker_Splash"
+sleep 0.1
+
+for ((i=0;i<=$((${n} - 1));i++)); do
+    port_number=$((${p} + $i))
+    screen -S "Docker_Splash" -X screen -t "docker_splash:$i" bash -c 'sudo docker run -p '$port_number':8050 --cpus=1 -v '$f':/etc/splash/proxy-profiles/ --net="bridge" scrapinghub/splash; read x'
+    sleep 0.1
+done
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 99eb18c8..7881177c 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -8,8 +8,9 @@ from TorSplashCrawler import TorSplashCrawler
 
 if __name__ == '__main__':
 
-    if len(sys.argv) != 8:
-        print('usage:', 'tor_crawler.py', 'splash_url', 'http_proxy', 'type', 'url', 'domain', 'paste', 'super_father')
+    if len(sys.argv) != 7:
+        print(sys.argv)
+        print('usage:', 'tor_crawler.py', 'splash_url', 'type', 'url', 'domain', 'paste', 'super_father')
         exit(1)
 
     configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@@ -22,14 +23,13 @@ if __name__ == '__main__':
     cfg.read(configfile)
 
     splash_url = sys.argv[1]
-    http_proxy = sys.argv[2]
-    type = sys.argv[3]
+    type = sys.argv[2]
     crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
 
-    url = sys.argv[4]
-    domain = sys.argv[5]
-    paste = sys.argv[6]
-    super_father = sys.argv[7]
+    url = sys.argv[3]
+    domain = sys.argv[4]
+    paste = sys.argv[5]
+    super_father = sys.argv[6]
 
-    crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
+    crawler = TorSplashCrawler(splash_url, crawler_depth_limit)
     crawler.crawl(type, url, domain, paste, super_father)
diff --git a/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini b/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini
new file mode 100644
index 00000000..63217c2a
--- /dev/null
+++ b/configs/docker/splash_onion/etc/splash/proxy-profiles/default.ini
@@ -0,0 +1,4 @@
+[proxy]
+host=172.17.0.1
+port=9050
+type=SOCKS5
diff --git a/crawler_hidden_services_install.sh b/crawler_hidden_services_install.sh
new file mode 100644
index 00000000..2747ddb6
--- /dev/null
+++ b/crawler_hidden_services_install.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+# install docker
+sudo apt install docker.io
+
+# pull the splash docker image
+sudo docker pull scrapinghub/splash
+
+. ./AILENV/bin/activate
+pip3 install -U -r crawler_requirements.txt
diff --git a/crawler_requirements.txt b/crawler_requirements.txt
new file mode 100644
index 00000000..b0c096ac
--- /dev/null
+++ b/crawler_requirements.txt
@@ -0,0 +1,2 @@
+scrapy
+scrapy-splash
diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py
index 068bee65..9b7a93be 100755
--- a/var/www/Flask_server.py
+++ b/var/www/Flask_server.py
@@ -44,6 +44,11 @@ except IOError:
     f = open('templates/ignored_modules.txt', 'w')
     f.close()
 
+activate_crawler = cfg.get("Crawler", "activate_crawler")
+if activate_crawler != 'True':
+    toIgnoreModule.add('hiddenServices')
+
+print(toIgnoreModule)
 # Dynamically import routes and functions from modules
 # Also, prepare header.html
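
Note (editor annotation, not part of the patch): a minimal launch sketch for the pieces this change adds, under the shipped defaults (splash_url_onion = http://127.0.0.1, splash_onion_port = 8050-8050). The $AIL_HOME-based -f path is an assumption; it must be the absolute path to the proxy-profiles directory created by this patch, so adjust it to your checkout. Splash containers must be up before the crawlers are started.

    # start one Splash container on port 8050 (-p 8050 -n 1), mounting the
    # SOCKS5 Tor proxy profile (default.ini) added by this patch
    sudo ./bin/torcrawler/launch_splash_crawler.sh \
        -f $AIL_HOME/configs/docker/splash_onion/etc/splash/proxy-profiles/ \
        -p 8050 -n 1

    # launch one Crawler.py screen window per port in the splash_onion_port
    # range read from config.cfg (8050-8050 gives a single crawler)
    ./bin/LAUNCH.sh -c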