chg: [Onion] add onion splash crawler

pull/260/head
Terrtia 2018-08-09 17:42:21 +02:00
parent 54cc4f3723
commit 8b1c10b38c
7 changed files with 319 additions and 2 deletions

bin/Crawler.py Executable file

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import redis
import datetime
import time
import subprocess

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process
from pubsublogger import publisher

def signal_handler(sig, frame):
    sys.exit(0)

if __name__ == '__main__':

    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script Crawler started")

    config_section = 'Crawler'

    # Setup the I/O queues
    p = Process(config_section)

    splash_url = p.config.get("Crawler", "splash_url")
    http_proxy = p.config.get("Crawler", "http_proxy")
    crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")

    #signal.signal(signal.SIGINT, signal_handler)

    r_serv_metadata = redis.StrictRedis(
        host=p.config.get("ARDB_Metadata", "host"),
        port=p.config.getint("ARDB_Metadata", "port"),
        db=p.config.getint("ARDB_Metadata", "db"),
        decode_responses=True)

    r_cache = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"),
        decode_responses=True)

    r_onion = redis.StrictRedis(
        host=p.config.get("ARDB_Onion", "host"),
        port=p.config.getint("ARDB_Onion", "port"),
        db=p.config.getint("ARDB_Onion", "db"),
        decode_responses=True)

    while True:

        message = p.get_from_set()
        # Recovering the streamed message information: "url;paste_path"
        if message is not None:
            splitted = message.split(';')
            if len(splitted) == 2:
                url, paste = splitted
                print(url)

                if not r_cache.exists(url):
                    super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
                    if super_father is None:
                        super_father = paste

                    # crawl the url in a subprocess and wait for it to finish
                    process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, paste, super_father],
                                               stdout=subprocess.PIPE)
                    while process.poll() is None:
                        time.sleep(1)

                    date = datetime.datetime.now().strftime("%Y%m%d")
                    print(date)
                    url_domain = url.replace('http://', '')
                    if process.returncode == 0:
                        # children pastes were saved: tag the parent paste and mark the domain as up
                        if r_serv_metadata.exists('paste_children:'+paste):
                            msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
                            p.populate_set_out(msg, 'Tags')
                        r_onion.sadd('onion_up:'+date, url_domain)
                    else:
                        r_onion.sadd('onion_down:'+date, url_domain)
                        print(process.stdout.read())
            else:
                continue
        else:
            time.sleep(1)


@@ -21,7 +21,6 @@ Requirements
*Need the ZMQ_Sub_Onion_Q Module running to be able to work properly.
"""
import pprint
import time
from packages import Paste
from pubsublogger import publisher
@@ -123,6 +122,7 @@ if __name__ == "__main__":
            PST = Paste.Paste(filename)

            for x in PST.get_regex(url_regex):
                print(x)
                # Extracting url with regex
                url, s, credential, subdomain, domain, host, port, \
                    resource_path, query_string, f1, f2, f3, f4 = x
@@ -149,12 +149,18 @@ if __name__ == "__main__":
                to_print = 'Onion;{};{};{};'.format(PST.p_source,
                                                    PST.p_date,
                                                    PST.p_name)

                '''
                for url in fetch(p, r_cache, urls, domains_list, path):
                    publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path))
                    p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler')
                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
                    p.populate_set_out(msg, 'Tags')
                '''
                for url in urls:
                    msg = '{};{}'.format(url, PST.p_path)
                    print('send to crawler')
                    p.populate_set_out(msg, 'Crawler')

            else:
                publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))


@@ -3,6 +3,8 @@ bloomfilters = Blooms
dicofilters = Dicos
pastes = PASTES
base64 = BASE64
crawled = crawled
crawled_screenshot = CRAWLED_SCREENSHOT
wordtrending_csv = var/www/static/csv/wordstrendingdata
wordsfile = files/wordfile
@@ -171,6 +173,11 @@ host = localhost
port = 6382
db = 8

[ARDB_Onion]
host = localhost
port = 6382
db = 9

[Url]
cc_critical = DE
@@ -215,3 +222,8 @@ channel = FetchedOnion
host = localhost
port = 6381
db = 0

[Crawler]
crawler_depth_limit = 1
splash_url = http://127.0.0.1:8050
http_proxy = http://127.0.0.1:9050
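These [Crawler] defaults assume a Splash instance listening on 127.0.0.1:8050 and a local Tor proxy on port 9050; the commit itself ships no deployment script. A minimal, hypothetical sanity check (not part of this commit; the URL is taken from the splash_url value above) that Splash answers before bin/Crawler.py is started:

    #!/usr/bin/env python3
    # check_splash.py -- hypothetical helper, not part of this commit
    from urllib.request import urlopen

    SPLASH_URL = 'http://127.0.0.1:8050'  # must match splash_url in config.cfg

    # Splash serves its web UI on the same port, so a plain GET is enough here
    with urlopen(SPLASH_URL, timeout=10) as response:
        print('Splash reachable, HTTP status:', response.status)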


@@ -61,7 +61,7 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_alertHandler,Redis_Tags

[Onion]
subscribe = Redis_Onion
-publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags
+publish = Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler,Redis_Tags,Redis_Crawler
#publish = Redis_Global,Redis_ValidOnion,ZMQ_FetchedOnion,Redis_alertHandler

[DumpValidOnion]
@@ -136,3 +136,8 @@ publish = Redis_Duplicate,Redis_alertHandler,Redis_Tags

[submit_paste]
subscribe = Redis
publish = Redis_Mixer

[Crawler]
subscribe = Redis_Crawler
publish = Redis_Mixer,Redis_Tags
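With this wiring, the Onion module publishes to the Redis_Crawler queue and the new Crawler module consumes it; as the code above shows, the payload is a single 'url;paste_path' string. A small illustration of that round trip (the URL and paste path values here are hypothetical):

    # hypothetical values, for illustration only
    url = 'http://example.onion'
    paste_path = 'PASTES/2018/08/09/example.gz'

    msg = '{};{}'.format(url, paste_path)  # built by Onion.py before populate_set_out(msg, 'Crawler')
    crawl_url, paste = msg.split(';')      # parsed back in bin/Crawler.py
    assert (crawl_url, paste) == (url, paste_path)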


@@ -0,0 +1,165 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import gzip
import base64
import uuid
import datetime
import redis

from urllib.parse import urlparse
from scrapy import Spider
from scrapy.linkextractors import LinkExtractor
from scrapy.crawler import CrawlerProcess, Crawler

from scrapy_splash import SplashRequest

sys.path.append(os.environ['AIL_BIN'])
from Helper import Process

class TorSplashCrawler():

    def __init__(self, splash_url, http_proxy, crawler_depth_limit):
        self.process = CrawlerProcess({'LOG_ENABLED': False})
        self.crawler = Crawler(self.TorSplashSpider, {
            'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
            'SPLASH_URL': splash_url,
            'HTTP_PROXY': http_proxy,
            'ROBOTSTXT_OBEY': False,
            'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                       'scrapy_splash.SplashMiddleware': 725,
                                       'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
                                       },
            'SPIDER_MIDDLEWARES': {'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,},
            'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
            'DEPTH_LIMIT': crawler_depth_limit
            })

    def crawl(self, url, original_paste, super_father):
        self.process.crawl(self.crawler, url=url, original_paste=original_paste, super_father=super_father)
        self.process.start()

    class TorSplashSpider(Spider):
        name = 'TorSplashSpider'

        def __init__(self, url, original_paste, super_father, *args, **kwargs):
            self.original_paste = original_paste
            self.super_father = super_father
            self.start_urls = url
            self.domains = [urlparse(url).netloc]
            date = datetime.datetime.now().strftime("%Y/%m/%d")

            config_section = 'Crawler'
            self.p = Process(config_section)

            self.r_cache = redis.StrictRedis(
                host=self.p.config.get("Redis_Cache", "host"),
                port=self.p.config.getint("Redis_Cache", "port"),
                db=self.p.config.getint("Redis_Cache", "db"),
                decode_responses=True)

            self.r_serv_log_submit = redis.StrictRedis(
                host=self.p.config.get("Redis_Log_submit", "host"),
                port=self.p.config.getint("Redis_Log_submit", "port"),
                db=self.p.config.getint("Redis_Log_submit", "db"),
                decode_responses=True)

            self.r_serv_metadata = redis.StrictRedis(
                host=self.p.config.get("ARDB_Metadata", "host"),
                port=self.p.config.getint("ARDB_Metadata", "port"),
                db=self.p.config.getint("ARDB_Metadata", "db"),
                decode_responses=True)

            self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"),
                                                       self.p.config.get("Directories", "crawled"), date )

            self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date )

        def start_requests(self):
            yield SplashRequest(
                self.start_urls,
                self.parse,
                endpoint='render.json',
                meta={'parent': self.original_paste},
                args={ 'html': 1,
                       'wait': 10,
                       'render_all': 1,
                       'png': 1}
                )

        def parse(self, response):
            print(response.headers)
            print(response.status)

            # mark this url as crawled in the cache
            self.r_cache.setbit(response.url, 0, 1)
            self.r_cache.expire(response.url, 360000)

            UUID = self.domains[0]+str(uuid.uuid4())
            filename_paste = os.path.join(self.crawled_paste_filemame, UUID)
            filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png')

            # save new paste on disk
            if self.save_crawled_paste(filename_paste, response.data['html']):

                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['parent'])
                self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url)

                self.r_serv_metadata.sadd('paste_children:'+response.meta['parent'], filename_paste)

                dirname = os.path.dirname(filename_screenshot)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)

                with open(filename_screenshot, 'wb') as f:
                    f.write(base64.standard_b64decode(response.data['png'].encode()))

                # save external links in set
                lext = LinkExtractor(deny_domains=self.domains, unique=True)
                for link in lext.extract_links(response):
                    self.r_serv_metadata.sadd('paste_crawler:'+filename_paste, link.url)

                # follow in-domain links through Splash
                #le = LinkExtractor(unique=True)
                le = LinkExtractor(allow_domains=self.domains, unique=True)
                for link in le.extract_links(response):
                    self.r_cache.setbit(link.url, 0, 0)
                    self.r_cache.expire(link.url, 360000)
                    yield SplashRequest(
                        link.url,
                        self.parse,
                        endpoint='render.json',
                        meta={'parent': UUID},
                        args={ 'html': 1,
                               'png': 1,
                               'render_all': 1,
                               'wait': 10}
                        )

        def save_crawled_paste(self, filename, content):

            print(filename)
            if os.path.isfile(filename):
                print('File: {} already exists in submitted pastes'.format(filename))
                return False

            try:
                gzipencoded = gzip.compress(content.encode())
                gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
            except:
                print("file error: {}".format(filename))
                return False

            # send paste to Global
            relay_message = "{0} {1}".format(filename, gzip64encoded)
            self.p.populate_set_out(relay_message, 'Mixer')

            # increase nb of paste by feeder name
            self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1)

            # tag crawled paste
            msg = 'infoleak:submission="crawler";{}'.format(filename)
            self.p.populate_set_out(msg, 'Tags')
            return True

bin/torcrawler/tor_crawler.py Executable file

@@ -0,0 +1,33 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*

import os
import sys
import configparser
from TorSplashCrawler import TorSplashCrawler

if __name__ == '__main__':

    if len(sys.argv) != 4:
        print('usage:', 'tor_crawler.py', 'url', 'paste', 'super_father')
        exit(1)

    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')

    cfg = configparser.ConfigParser()
    cfg.read(configfile)

    splash_url = cfg.get("Crawler", "splash_url")
    http_proxy = cfg.get("Crawler", "http_proxy")
    crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")

    url = sys.argv[1]
    paste = sys.argv[2]
    super_father = sys.argv[3]

    crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
    crawler.crawl(url, paste, super_father)


@@ -0,0 +1,4 @@
[proxy]
host=localhost
port=9050
type=SOCKS5
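This proxy profile points Splash at a local Tor SOCKS5 daemon on port 9050. A quick, hypothetical check (not part of this commit; host and port taken from the [proxy] profile above) that the Tor proxy is actually listening before crawls are launched:

    #!/usr/bin/env python3
    # check_tor_proxy.py -- hypothetical helper, not part of this commit
    import socket

    # a plain TCP connect only proves the port is open,
    # not that Tor itself is healthy
    try:
        with socket.create_connection(('localhost', 9050), timeout=5):
            print('Tor SOCKS proxy is listening on localhost:9050')
    except OSError as e:
        print('Tor SOCKS proxy unreachable:', e)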