diff --git a/bin/Crawler.py b/bin/Crawler.py
index 2e617959..240ae2a3 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -34,21 +34,21 @@ def crawl_onion(url, domain, date, date_month):
         exit(0)
 
     if r.status_code == 200:
-        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
+        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', splash_url, http_proxy, type_hidden_service, url, domain, paste, super_father],
                                    stdout=subprocess.PIPE)
         while process.poll() is None:
             time.sleep(1)
 
         if process.returncode == 0:
             if r_serv_metadata.exists('paste_children:'+paste):
-                msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+                msg = 'infoleak:automatic-detection="{}";{}'.format(type_hidden_service, paste)
                 p.populate_set_out(msg, 'Tags')
 
             print(process.stdout.read())
 
         else:
-            r_onion.sadd('onion_down:'+date , domain)
-            r_onion.sadd('onion_down_link:'+date , url)
+            r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
+            r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
             print(process.stdout.read())
    else:
        ## FIXME: # TODO: relaunch docker
@@ -67,8 +67,28 @@ if __name__ == '__main__':
 
     # Setup the I/O queues
     p = Process(config_section)
-    splash_url = p.config.get("Crawler", "splash_url")
-    http_proxy = p.config.get("Crawler", "http_proxy")
+    url_onion = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(url_onion)
+    url_i2p = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(url_i2p)
+
+    type_hidden_service = 'onion'
+    if type_hidden_service == 'onion':
+        regex_hidden_service = url_onion
+        splash_url = p.config.get("Crawler", "splash_url_onion")
+        http_proxy = p.config.get("Crawler", "http_proxy_onion")
+    elif type_hidden_service == 'i2p':
+        regex_hidden_service = url_i2p
+        splash_url = p.config.get("Crawler", "splash_url_i2p")
+        http_proxy = p.config.get("Crawler", "http_proxy_i2p")
+    elif type_hidden_service == 'regular':
+        regex_hidden_service = url_i2p
+        splash_url = p.config.get("Crawler", "splash_url_onion")
+        http_proxy = p.config.get("Crawler", "http_proxy_onion")
+    else:
+        print('incorrect crawler type: {}'.format(type_hidden_service))
+        exit(0)
+
     crawler_depth_limit = p.config.getint("Crawler", "crawler_depth_limit")
 
     #signal.signal(signal.SIGINT, signal_handler)
@@ -91,93 +111,94 @@ if __name__ == '__main__':
         db=p.config.getint("ARDB_Onion", "db"),
         decode_responses=True)
 
-    url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
-    re.compile(url_regex)
-
     while True:
 
-        message = p.get_from_set() # Recovering the streamed message informations.
-        #message = r_onion.spop('mess_onion')
-        print(message)
+        message = r_onion.spop('{}_crawler_queue'.format(type_hidden_service))
+        #message='https://www.myip.com/;/home/aurelien/git/python3/AIL-framework/PASTES/crawled/2018/08/10/onionsnjajzkhm5g.onion49eac19d-d71b-48b5-bc55-9a3c63e5b1e2'
+        # # FIXME: remove
         if message is None:
             print('get ardb message')
             message = r_onion.spop('mess_onion')
+        print(message)
+
         if message is not None:
 
             splitted = message.split(';')
             if len(splitted) == 2:
                 url, paste = splitted
+
                 if not '.onion' in url:
                     print('not onion')
                     continue
 
-                url_list = re.findall(url_regex, url)[0]
+                url_list = re.findall(regex_hidden_service, url)[0]
                 if url_list[1] == '':
                     url= 'http://{}'.format(url)
 
                 link, s, credential, subdomain, domain, host, port, \
                     resource_path, query_string, f1, f2, f3, f4 = url_list
                 domain = url_list[4]
+                r_onion.srem('onion_domain_crawler_queue', domain)
+                #domain = 'myip.com'
                 domain_url = 'http://{}'.format(domain)
 
-                print('------------------START ONION CRAWLER------------------')
+                print('------------------START CRAWLER------------------')
+                print(type_hidden_service)
+                print('-------------------------------------------------')
                 print('url: {}'.format(url))
                 print('domain: {}'.format(domain))
                 print('domain_url: {}'.format(domain_url))
 
-                '''if not r_onion.sismember('full_onion_up', domain):
-                    r_onion.sadd('mess_onion', message)
-                    print('added ..............')'''
-
-
-                if not r_onion.sismember('banned_onion', domain):
+                if not r_onion.sismember('banned_{}'.format(type_hidden_service), domain):
 
                     date = datetime.datetime.now().strftime("%Y%m%d")
                     date_month = datetime.datetime.now().strftime("%Y%m")
 
-                    if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
+                    if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and not r_onion.sismember('{}_down:{}'.format(type_hidden_service, date), domain):
 
                         crawl_onion(url, domain, date, date_month)
                         if url != domain_url:
                             crawl_onion(domain_url, domain, date, date_month)
 
                         # save down onion
-                        if not r_onion.sismember('onion_up:'+date , domain):
-                            r_onion.sadd('onion_down:'+date , domain)
-                            r_onion.sadd('onion_down_link:'+date , url)
-                            r_onion.hincrby('onion_link_down', url, 1)
-                            if not r_onion.exists('onion_metadata:{}'.format(domain)):
-                                r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
-                                r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
+                        if not r_onion.sismember('{}_up:{}'.format(type_hidden_service, date), domain):
+                            r_onion.sadd('{}_down:{}'.format(type_hidden_service, date), domain)
+                            r_onion.sadd('{}_down_link:{}'.format(type_hidden_service, date), url)
+                            r_onion.hincrby('{}_link_down'.format(type_hidden_service), url, 1)
+                            if not r_onion.exists('{}_metadata:{}'.format(type_hidden_service, domain)):
+                                r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'first_seen', date)
+                                r_onion.hset('{}_metadata:{}'.format(type_hidden_service,domain), 'last_seen', date)
                         else:
-                            r_onion.hincrby('onion_link_up', url, 1)
+                            r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1)
 
                         # last check
-                        r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
+                        r_onion.hset('{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date)
 
                         # check external onions links (full_scrawl)
                         external_domains = set()
-                        for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
-                            external_domain = re.findall(url_regex, link)
+                        for link in r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)):
+                            external_domain = re.findall(url_onion, link)
+                            external_domain.extend(re.findall(url_i2p, link))
                             if len(external_domain) > 0:
                                 external_domain = external_domain[0][4]
                             else:
                                 continue
-                            # # TODO: add i2p
                             if '.onion' in external_domain and external_domain != domain:
                                 external_domains.add(external_domain)
+                            elif '.i2p' in external_domain and external_domain != domain:
+                                external_domains.add(external_domain)
                         if len(external_domains) >= 10:
-                            r_onion.sadd('onion_potential_source', domain)
-                        r_onion.delete('domain_onion_external_links:{}'.format(domain))
-                        print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
+                            r_onion.sadd('{}_potential_source'.format(type_hidden_service), domain)
+                        r_onion.delete('domain_{}_external_links:{}'.format(type_hidden_service, domain))
+                        print(r_onion.smembers('domain_{}_external_links:{}'.format(type_hidden_service, domain)))
 
-                    r_onion.lpush('last_onions', domain)
-                    r_onion.ltrim('last_onions', 0, 15)
+                    r_onion.lpush('last_{}'.format(type_hidden_service), domain)
+                    r_onion.ltrim('last_{}'.format(type_hidden_service), 0, 15)
 
         else:
             continue
diff --git a/bin/Onion.py b/bin/Onion.py
index 23a81755..d77c010f 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -150,9 +150,12 @@ if __name__ == "__main__":
                 if '.i2p' in url:
                     print('add i2p')
                     print(domain)
-                    if not r_onion.sismember('i2p_domain', domain):
+                    if not r_onion.sismember('i2p_domain', domain) and not r_onion.sismember('i2p_domain_crawler_queue', domain):
                         r_onion.sadd('i2p_domain', domain)
                         r_onion.sadd('i2p_link', url)
+                        r_onion.sadd('i2p_domain_crawler_queue', domain)
+                        msg = '{};{}'.format(url,PST.p_path)
+                        r_onion.sadd('i2p_crawler_queue', msg)
 
             # Saving the list of extracted onion domains.
             PST.__setattr__(channel, domains_list)
@@ -193,9 +196,12 @@ if __name__ == "__main__":
                         continue
 
                     if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
-                        msg = '{};{}'.format(url,PST.p_path)
-                        print('send to crawler')
-                        p.populate_set_out(msg, 'Crawler')
+                        if not r_onion.sismember('onion_domain_crawler_queue', domain):
+                            print('send to onion crawler')
+                            r_onion.sadd('onion_domain_crawler_queue', domain)
+                            msg = '{};{}'.format(url,PST.p_path)
+                            r_onion.sadd('onion_crawler_queue', msg)
+                            #p.populate_set_out(msg, 'Crawler')
 
             else:
                 publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index c5280329..135ad0a7 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -9,6 +9,7 @@ import uuid
 import datetime
 import base64
 import redis
+import json
 
 from scrapy.spidermiddlewares.httperror import HttpError
 from twisted.internet.error import DNSLookupError
@@ -30,7 +31,6 @@ class TorSplashCrawler():
         self.crawler = Crawler(self.TorSplashSpider, {
             'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20100101 Firefox/24.0',
             'SPLASH_URL': splash_url,
-            'HTTP_PROXY': http_proxy,
             'ROBOTSTXT_OBEY': False,
             'DOWNLOADER_MIDDLEWARES': {'scrapy_splash.SplashCookiesMiddleware': 723,
                                        'scrapy_splash.SplashMiddleware': 725,
@@ -41,14 +41,15 @@ class TorSplashCrawler():
             'DEPTH_LIMIT': crawler_depth_limit
             })
 
-    def crawl(self, url, domain, original_paste, super_father):
-        self.process.crawl(self.crawler, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
+    def crawl(self, type, url, domain, original_paste, super_father):
+        self.process.crawl(self.crawler, type=type, url=url, domain=domain,original_paste=original_paste, super_father=super_father)
         self.process.start()
 
     class TorSplashSpider(Spider):
         name = 'TorSplashSpider'
 
-        def __init__(self, url, domain,original_paste, super_father, *args, **kwargs):
+        def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs):
+            self.type = type
             self.original_paste = original_paste
             self.super_father = super_father
             self.start_urls = url
@@ -100,12 +101,13 @@ class TorSplashCrawler():
                 args={  'html': 1,
                         'wait': 10,
                         'render_all': 1,
+                        'har': 1,
                         'png': 1}
             )
 
        def parse(self,response):
-            print(response.headers)
-            print(response.status)
+            #print(response.headers)
+            #print(response.status)
 
            # # TODO: # FIXME:
            self.r_cache.setbit(response.url, 0, 1)
@@ -119,17 +121,18 @@ class TorSplashCrawler():
                # save new paste on disk
                if self.save_crawled_paste(filename_paste, response.data['html']):
 
-                    self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0])
-                    self.r_serv_onion.sadd('full_onion_up', self.domains[0])
-                    self.r_serv_onion.sadd('month_onion_up:{}'.format(self.date_month), self.domains[0])
+                    self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0])
+                    self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0])
+                    self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0])
 
                    # create onion metadata
-                    if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
-                        self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'first_seen', self.full_date)
-                        self.r_serv_onion.hset('onion_metadata:{}'.format(self.domains[0]), 'last_seen', self.full_date)
+                    if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])):
+                        self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date)
+                        self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date)
+                        self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'paste_parent', self.original_paste)
 
                    # add onion screenshot history
-                    self.r_serv_onion.sadd('onion_history:{}'.format(self.domains[0]), self.full_date)
+                    self.r_serv_onion.sadd('{}_history:{}'.format(self.type, self.domains[0]), self.full_date)
 
                    #create paste metadata
                    self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father)
@@ -144,17 +147,20 @@ class TorSplashCrawler():
                        os.makedirs(dirname)
 
                    size_screenshot = (len(response.data['png'])*3) /4
-                    print(size_screenshot)
 
                    if size_screenshot < 5000000: #bytes
                        with open(filename_screenshot, 'wb') as f:
                            f.write(base64.standard_b64decode(response.data['png'].encode()))
 
+                    #interest = response.data['har']['log']['entries'][0]['response']['header'][0]
+                    with open(filename_screenshot+'har.txt', 'wb') as f:
+                        f.write(json.dumps(response.data['har']).encode())
+
                    # save external links in set
                    lext = LinkExtractor(deny_domains=self.domains, unique=True)
                    for link in lext.extract_links(response):
-                        self.r_serv_onion.sadd('domain_onion_external_links:{}'.format(self.domains[0]), link.url)
-                        self.r_serv_metadata.sadd('paste_onion_external_links:{}'.format(filename_paste), link.url)
+                        self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url)
+                        self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url)
 
                    #le = LinkExtractor(unique=True)
                    le = LinkExtractor(allow_domains=self.domains, unique=True)
@@ -169,6 +175,7 @@ class TorSplashCrawler():
                            args={  'html': 1,
                                    'png': 1,
                                    'render_all': 1,
+                                    'har': 1,
                                    'wait': 10}
                            #errback=self.errback_catcher
                        )
diff --git a/bin/torcrawler/tor_crawler.py b/bin/torcrawler/tor_crawler.py
index 57a77e76..99eb18c8 100755
--- a/bin/torcrawler/tor_crawler.py
+++ b/bin/torcrawler/tor_crawler.py
@@ -8,8 +8,8 @@ from TorSplashCrawler import TorSplashCrawler
 
 if __name__ == '__main__':
 
-    if len(sys.argv) != 5:
-        print('usage:', 'tor_crawler.py', 'url', 'domain', 'paste', 'super_father')
+    if len(sys.argv) != 8:
+        print('usage:', 'tor_crawler.py', 'splash_url', 'http_proxy', 'type', 'url', 'domain', 'paste', 'super_father')
         exit(1)
 
     configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@@ -21,14 +21,15 @@ if __name__ == '__main__':
     cfg = configparser.ConfigParser()
     cfg.read(configfile)
 
-    splash_url = cfg.get("Crawler", "splash_url")
-    http_proxy = cfg.get("Crawler", "http_proxy")
+    splash_url = sys.argv[1]
+    http_proxy = sys.argv[2]
+    type = sys.argv[3]
     crawler_depth_limit = cfg.getint("Crawler", "crawler_depth_limit")
 
-    url = sys.argv[1]
-    domain = sys.argv[2]
-    paste = sys.argv[3]
-    super_father = sys.argv[4]
+    url = sys.argv[4]
+    domain = sys.argv[5]
+    paste = sys.argv[6]
+    super_father = sys.argv[7]
 
     crawler = TorSplashCrawler(splash_url, http_proxy, crawler_depth_limit)
-    crawler.crawl(url, domain, paste, super_father)
+    crawler.crawl(type, url, domain, paste, super_father)
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
index 04740a93..6d01bbbb 100644
--- a/var/www/modules/hiddenServices/Flask_hiddenServices.py
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -43,7 +43,7 @@ def get_onion_status(domain, date):
 
 @hiddenServices.route("/hiddenServices/", methods=['GET'])
 def hiddenServices_page():
-    last_onions = r_serv_onion.lrange('last_onions', 0 ,-1)
+    last_onions = r_serv_onion.lrange('last_onion', 0 ,-1)
     list_onion = []
 
     for onion in last_onions:
@@ -72,9 +72,11 @@ def onion_domain():
 
     last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check')
     first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen')
+    domain_paste = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'paste_parent')
     date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))
 
-    return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen)
+    return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen,
+                            domain_paste=domain_paste)
 
 # ============= JSON ==============
 @hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
index 18cd79be..88942c73 100644
--- a/var/www/modules/hiddenServices/templates/showDomain.html
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -49,6 +49,12 @@