From e9580d6775981a6a7eeea882bd96ce77ea59cb32 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Tue, 21 Aug 2018 15:54:53 +0200
Subject: [PATCH] chg: [Crawler] change BDD, save i2p links

---
 bin/Crawler.py                                     | 140 +++++++------
 bin/Onion.py                                       |  45 ++++-
 bin/packages/HiddenServices.py                     |  79 ++++++++
 bin/torcrawler/TorSplashCrawler.py                 |   2 +
 files/Onion                                        |   1 +
 .../hiddenServices/Flask_hiddenServices.py         |  99 +++++++++
 .../templates/header_hiddenServices.html           |   1 +
 .../templates/hiddenServices.html                  | 188 ++++++++++++++++++
 .../hiddenServices/templates/showDomain.html       |  76 +++++++
 9 files changed, 567 insertions(+), 64 deletions(-)
 create mode 100755 bin/packages/HiddenServices.py
 create mode 100644 var/www/modules/hiddenServices/Flask_hiddenServices.py
 create mode 100644 var/www/modules/hiddenServices/templates/header_hiddenServices.html
 create mode 100644 var/www/modules/hiddenServices/templates/hiddenServices.html
 create mode 100644 var/www/modules/hiddenServices/templates/showDomain.html

diff --git a/bin/Crawler.py b/bin/Crawler.py
index df1e0117..2e617959 100755
--- a/bin/Crawler.py
+++ b/bin/Crawler.py
@@ -18,40 +18,41 @@ from pubsublogger import publisher
 def signal_handler(sig, frame):
     sys.exit(0)

-def crawl_onion(url, domain, date):
+def crawl_onion(url, domain, date, date_month):

-    if not r_onion.sismember('onion_up:'+date , domain) and not r_onion.sismember('onion_down:'+date , domain):
     #if not r_onion.sismember('full_onion_up', domain) and not r_onion.sismember('onion_down:'+date , domain):
-        super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
-        if super_father is None:
-            super_father=paste
+    super_father = r_serv_metadata.hget('paste_metadata:'+paste, 'super_father')
+    if super_father is None:
+        super_father=paste

-        try:
-            r = requests.get(splash_url , timeout=0.010)
-        except Exception:
-            ## FIXME: # TODO: relaunch docker
-            exit(0)
+    try:
+        r = requests.get(splash_url , timeout=30.0)
+    except Exception:
+        ## FIXME: # TODO: relaunch docker or send error message
+        print('--------------------------------------')
+        print('          DOCKER SPLASH DOWN')
+        exit(0)

-        if r.status_code == 200:
-            process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
-                                       stdout=subprocess.PIPE)
-            while process.poll() is None:
-                time.sleep(1)
+    if r.status_code == 200:
+        process = subprocess.Popen(["python", './torcrawler/tor_crawler.py', url, domain, paste, super_father],
+                                   stdout=subprocess.PIPE)
+        while process.poll() is None:
+            time.sleep(1)

-            if process.returncode == 0:
-                if r_serv_metadata.exists('paste_children:'+paste):
-                    msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
-                    p.populate_set_out(msg, 'Tags')
+        if process.returncode == 0:
+            if r_serv_metadata.exists('paste_children:'+paste):
+                msg = 'infoleak:automatic-detection="onion";{}'.format(paste)
+                p.populate_set_out(msg, 'Tags')

-                print(process.stdout.read())
+            print(process.stdout.read())

-            else:
-                r_onion.sadd('onion_down:'+date , domain)
-                r_onion.sadd('onion_down_link:'+date , url)
-                print(process.stdout.read())
-        else:
-            ## FIXME: # TODO: relaunch docker
-            exit(0)
+        else:
+            r_onion.sadd('onion_down:'+date , domain)
+            r_onion.sadd('onion_down_link:'+date , url)
+            print(process.stdout.read())
+    else:
+        ## FIXME: # TODO: relaunch docker
+        exit(0)

 if __name__ == '__main__':
@@ -97,11 +98,23 @@ if __name__ == '__main__':
         message = p.get_from_set()
         # Recovering the streamed message information.
+        #message = r_onion.spop('mess_onion')
+        print(message)
+
+        if message is None:
+            print('get ardb message')
+            message = r_onion.spop('mess_onion')
+
         if message is not None:
+
             splitted = message.split(';')
             if len(splitted) == 2:
                 url, paste = splitted

+                if not '.onion' in url:
+                    print('not onion')
+                    continue
+
                 url_list = re.findall(url_regex, url)[0]
                 if url_list[1] == '':
                     url= 'http://{}'.format(url)
@@ -117,46 +130,55 @@ if __name__ == '__main__':
             print('domain: {}'.format(domain))
             print('domain_url: {}'.format(domain_url))

+            '''if not r_onion.sismember('full_onion_up', domain):
+                r_onion.sadd('mess_onion', message)
+                print('added ..............')'''
+
+
             if not r_onion.sismember('banned_onion', domain):

                 date = datetime.datetime.now().strftime("%Y%m%d")
+                date_month = datetime.datetime.now().strftime("%Y%m")

-                crawl_onion(url, domain, date)
-                if url != domain_url:
-                    crawl_onion(domain_url, domain, date)
+                if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):

-                # save dowm onion
-                if not r_onion.sismember('onion_up:'+date , domain):
-                    r_onion.sadd('onion_down:'+date , domain)
-                    r_onion.sadd('onion_down_link:'+date , url)
-                    r_onion.hincrby('onion_link_down', url, 1)
-                    if not r_onion.exists('onion_metadata:{}'.format(domain)):
-                        r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
-                        r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
-                else:
-                    r_onion.hincrby('onion_link_up', url, 1)
+                    crawl_onion(url, domain, date, date_month)
+                    if url != domain_url:
+                        crawl_onion(domain_url, domain, date, date_month)

-                # last check
-                r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
-
-                # check external onions links (full_scrawl)
-                external_domains = set()
-                for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
-                    print(link)
-                    external_domain = re.findall(url_regex, link)
-                    print(external_domain)
-                    if len(external_domain) > 0:
-                        external_domain = external_domain[0][4]
+                    # save down onion
+                    if not r_onion.sismember('onion_up:'+date , domain):
+                        r_onion.sadd('onion_down:'+date , domain)
+                        r_onion.sadd('onion_down_link:'+date , url)
+                        r_onion.hincrby('onion_link_down', url, 1)
+                        if not r_onion.exists('onion_metadata:{}'.format(domain)):
+                            r_onion.hset('onion_metadata:{}'.format(domain), 'first_seen', date)
+                            r_onion.hset('onion_metadata:{}'.format(domain), 'last_seen', date)
                     else:
-                        continue
-                    print(external_domain)
-                    # # TODO: add i2p
-                    if '.onion' in external_domain and external_domain != domain:
-                        external_domains.add(external_domain)
-                if len(external_domains) >= 10:
-                    r_onion.sadd('onion_potential_source', domain)
-                r_onion.delete('domain_onion_external_links:{}'.format(domain))
-                print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
+                        r_onion.hincrby('onion_link_up', url, 1)
+
+                    # last check
+                    r_onion.hset('onion_metadata:{}'.format(domain), 'last_check', date)
+
+                    # check external onion links (full_crawl)
+                    external_domains = set()
+                    for link in r_onion.smembers('domain_onion_external_links:{}'.format(domain)):
+                        external_domain = re.findall(url_regex, link)
+                        if len(external_domain) > 0:
+                            external_domain = external_domain[0][4]
+                        else:
+                            continue
+                        # # TODO: add i2p
+                        if '.onion' in external_domain and external_domain != domain:
+                            external_domains.add(external_domain)
+                    if len(external_domains) >= 10:
+                        r_onion.sadd('onion_potential_source', domain)
+                    r_onion.delete('domain_onion_external_links:{}'.format(domain))
+                    print(r_onion.smembers('domain_onion_external_links:{}'.format(domain)))
+
+                    r_onion.lpush('last_onions', domain)
+                    r_onion.ltrim('last_onions', 0, 15)
+
+                else:
+                    continue
             else:
diff --git a/bin/Onion.py b/bin/Onion.py
index 1e2dff32..23a81755 100755
--- a/bin/Onion.py
+++ b/bin/Onion.py
@@ -29,6 +29,7 @@ import os
 import base64
 import subprocess
 import redis
+import re

 from Helper import Process

@@ -96,6 +97,12 @@ if __name__ == "__main__":
         db=p.config.getint("Redis_Cache", "db"),
         decode_responses=True)

+    r_onion = redis.StrictRedis(
+        host=p.config.get("ARDB_Onion", "host"),
+        port=p.config.getint("ARDB_Onion", "port"),
+        db=p.config.getint("ARDB_Onion", "db"),
+        decode_responses=True)
+
     # FUNCTIONS #
     publisher.info("Script subscribed to channel onion_categ")

@@ -109,6 +116,9 @@ if __name__ == "__main__":
     # Thanks to Faup project for this regex
     # https://github.com/stricaud/faup
     url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    i2p_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
+    re.compile(url_regex)
+
     while True:
         if message is not None:

@@ -127,8 +137,22 @@ if __name__ == "__main__":
                 url, s, credential, subdomain, domain, host, port, \
                     resource_path, query_string, f1, f2, f3, f4 = x

-                domains_list.append(domain)
-                urls.append(url)
+                if '.onion' in url:
+                    print(url)
+                    domains_list.append(domain)
+                    urls.append(url)
+
+            for x in PST.get_regex(i2p_regex):
+                # Extracting url with regex
+                url, s, credential, subdomain, domain, host, port, \
+                    resource_path, query_string, f1, f2, f3, f4 = x
+
+                if '.i2p' in url:
+                    print('add i2p')
+                    print(domain)
+                    if not r_onion.sismember('i2p_domain', domain):
+                        r_onion.sadd('i2p_domain', domain)
+                        r_onion.sadd('i2p_link', url)

             # Saving the list of extracted onion domains.
             PST.__setattr__(channel, domains_list)
@@ -157,10 +181,21 @@ if __name__ == "__main__":
                     msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path)
                     p.populate_set_out(msg, 'Tags')
                     '''
+
+                date_month = datetime.datetime.now().strftime("%Y%m")
+                date = datetime.datetime.now().strftime("%Y%m%d")
                 for url in urls:
-                    msg = '{};{}'.format(url,PST.p_path)
-                    print('send to crawler')
-                    p.populate_set_out(msg, 'Crawler')
+
+                    domain = re.findall(url_regex, url)
+                    if len(domain) > 0:
+                        domain = domain[0][4]
+                    else:
+                        continue
+
+                    if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain):
+                        msg = '{};{}'.format(url,PST.p_path)
+                        print('send to crawler')
+                        p.populate_set_out(msg, 'Crawler')
             else:
                 publisher.info('{}Onion related;{}'.format(to_print, PST.p_path))

diff --git a/bin/packages/HiddenServices.py b/bin/packages/HiddenServices.py
new file mode 100755
index 00000000..48f514fc
--- /dev/null
+++ b/bin/packages/HiddenServices.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python3
+
+"""
+The ``HiddenServices`` class
+============================
+
+Use it to create an object from an existing paste or other random file.
+
+Conditions to fulfill to be able to use this class correctly:
+-------------------------------------------------------------
+
+1/ The paste needs to be saved on disk somewhere (have an accessible path)
+2/ The paste needs to be gzipped.
+3/ The filepath needs to look something like this:
+   /directory/source/year/month/day/paste.gz
+
+"""
+
+import os
+import gzip
+import redis
+
+import configparser
+import sys
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
+from Date import Date
+
+class HiddenServices(object):
+    """
+    This class represents a hidden service as an object.
+    When created, the object will have by default some "main attributes"
+
+    :Example:
+
+    PST = HiddenServices("xxxxxxxx.onion", "onion")
+
+    """
+
+    def __init__(self, domain, type):
+
+        configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
+        if not os.path.exists(configfile):
+            raise Exception('Unable to find the configuration file. \
+                            Did you set environment variables? \
+                            Or activate the virtualenv.')
+
+        cfg = configparser.ConfigParser()
+        cfg.read(configfile)
+        self.r_serv_onion = redis.StrictRedis(
+            host=cfg.get("ARDB_Onion", "host"),
+            port=cfg.getint("ARDB_Onion", "port"),
+            db=cfg.getint("ARDB_Onion", "db"),
+            decode_responses=True)
+
+        self.domain = domain
+        self.type = type
+
+        if type == 'onion':
+            self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"), cfg.get("Directories", "crawled"))
+            self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+        elif type == 'i2p':
+            ## TODO: # FIXME: paste_directory currently points to the screenshot directory
+            self.paste_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+            self.screenshot_directory = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "crawled_screenshot"))
+        else:
+            ## TODO: # FIXME: add error
+            pass
+
+
+    def get_last_crawled_pastes(self):
+
+        last_check = self.r_serv_onion.hget('onion_metadata:{}'.format(self.domain), 'last_check')
+        return self.get_crawled_pastes_by_date(last_check)
+
+    def get_crawled_pastes_by_date(self, date):
+
+        pastes_path = os.path.join(self.paste_directory, date[0:4], date[4:6], date[6:8])
+        l_crawled_pastes = [f for f in os.listdir(pastes_path) if self.domain in f]
+        print(len(l_crawled_pastes))
+        print(l_crawled_pastes)
+        return l_crawled_pastes
diff --git a/bin/torcrawler/TorSplashCrawler.py b/bin/torcrawler/TorSplashCrawler.py
index 3d392b93..c5280329 100644
--- a/bin/torcrawler/TorSplashCrawler.py
+++ b/bin/torcrawler/TorSplashCrawler.py
@@ -55,6 +55,7 @@ class TorSplashCrawler():
             self.domains = [domain]
             date = datetime.datetime.now().strftime("%Y/%m/%d")
             self.full_date = datetime.datetime.now().strftime("%Y%m%d")
+            self.date_month = datetime.datetime.now().strftime("%Y%m")

             config_section = 'Crawler'
             self.p = Process(config_section)
@@ -120,6 +121,7 @@ class TorSplashCrawler():

                 self.r_serv_onion.sadd('onion_up:'+self.full_date , self.domains[0])
                 self.r_serv_onion.sadd('full_onion_up', self.domains[0])
+                self.r_serv_onion.sadd('month_onion_up:{}'.format(self.date_month), self.domains[0])

                 # create onion metadata
                 if not self.r_serv_onion.exists('onion_metadata:{}'.format(self.domains[0])):
diff --git a/files/Onion b/files/Onion
index 5c9980e2..69fcf878 100644
--- a/files/Onion
+++ b/files/Onion
@@ -1 +1,2 @@
 onion
+i2p
diff --git a/var/www/modules/hiddenServices/Flask_hiddenServices.py b/var/www/modules/hiddenServices/Flask_hiddenServices.py
new file mode 100644
index 00000000..04740a93
--- /dev/null
+++ b/var/www/modules/hiddenServices/Flask_hiddenServices.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+'''
+    Flask functions and routes for the hidden services pages
+'''
+import redis
+import datetime
+from flask import Flask, render_template, jsonify, request, Blueprint
+
+import HiddenServices
+from Date import Date
+
+# ============ VARIABLES ============
+import Flask_config
+
+app = Flask_config.app
+cfg = Flask_config.cfg
+r_serv_onion = Flask_config.r_serv_onion
+
+hiddenServices = Blueprint('hiddenServices', __name__, template_folder='templates')
+
+# ============ FUNCTIONS ============
+def one():
+    return 1
+
+def get_date_range(num_day):
+    curr_date = datetime.date.today()
+    date = Date( '{}{}{}'.format(str(curr_date.year), str(curr_date.month).zfill(2), str(curr_date.day).zfill(2)) )
+    date_list = []
+
+    for i in range(0, num_day):
+        date_list.append(date.substract_day(i))
+
+    return list(reversed(date_list))
+
+def get_onion_status(domain, date):
+    if r_serv_onion.sismember('onion_up:'+date , domain):
+        return True
+    else:
+        return False
+# ============= ROUTES ==============
+
+@hiddenServices.route("/hiddenServices/", methods=['GET'])
+def hiddenServices_page():
+    last_onions = r_serv_onion.lrange('last_onions', 0 ,-1)
+    list_onion = []
+
+    for onion in last_onions:
+        metadata_onion = {}
+        metadata_onion['domain'] = onion
+        metadata_onion['last_check'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'last_check')
+        metadata_onion['first_seen'] = r_serv_onion.hget('onion_metadata:{}'.format(onion), 'first_seen')
+        if get_onion_status(onion, metadata_onion['last_check']):
+            metadata_onion['status_text'] = 'UP'
+            metadata_onion['status_color'] = 'Green'
+            metadata_onion['status_icon'] = 'fa-check-circle'
+        else:
+            metadata_onion['status_text'] = 'DOWN'
+            metadata_onion['status_color'] = 'Red'
+            metadata_onion['status_icon'] = 'fa-times-circle'
+        list_onion.append(metadata_onion)
+
+    return render_template("hiddenServices.html", last_onions=list_onion)
+
+@hiddenServices.route("/hiddenServices/onion_domain", methods=['GET'])
+def onion_domain():
+    onion_domain = request.args.get('onion_domain')
+    if onion_domain is None or not r_serv_onion.exists('onion_metadata:{}'.format(onion_domain)):
+        pass
+        # # TODO: FIXME return 404
+
+    last_check = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'last_check')
+    first_seen = r_serv_onion.hget('onion_metadata:{}'.format(onion_domain), 'first_seen')
+    date_crawled = r_serv_onion.smembers('onion_history:{}'.format(onion_domain))
+
+    return render_template("showDomain.html", domain=onion_domain, last_check=last_check, first_seen=first_seen)
+
+# ============= JSON ==============
+@hiddenServices.route("/hiddenServices/domain_crawled_7days_json", methods=['GET'])
+def domain_crawled_7days_json():
+    type = 'onion'
+    ## TODO: # FIXME: 404 error
+
+    date_range = get_date_range(7)
+    json_domain_stats = []
+    #try:
+    for date in date_range:
+        nb_domain_up = r_serv_onion.scard('{}_up:{}'.format(type, date))
+        nb_domain_down = r_serv_onion.scard('{}_down:{}'.format(type, date))
+        date = date[0:4] + '-' + date[4:6] + '-' + date[6:8]
+        json_domain_stats.append({ 'date': date, 'value': int( nb_domain_up ), 'nb_domain_down': int( nb_domain_down )})
+    #except:
+        #return jsonify()
+
+    return jsonify(json_domain_stats)
+
+# ========= REGISTRATION =========
+app.register_blueprint(hiddenServices)
diff --git a/var/www/modules/hiddenServices/templates/header_hiddenServices.html b/var/www/modules/hiddenServices/templates/header_hiddenServices.html
new file mode 100644
index 00000000..5c77963c
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/header_hiddenServices.html
@@ -0,0 +1 @@
+<li><a href="/hiddenServices/">hidden Services</a></li>
diff --git a/var/www/modules/hiddenServices/templates/hiddenServices.html b/var/www/modules/hiddenServices/templates/hiddenServices.html
new file mode 100644
index 00000000..bbc66ace
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/hiddenServices.html
@@ -0,0 +1,188 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Hidden Service - AIL</title>
+<!-- CSS/JS includes lost in extraction -->
+</head>
+<body>
+
+{% include 'navbar.html' %}
+<div class="container-fluid">
+<div class="row">
+
+<div class="panel panel-default">
+<div class="panel-heading">ONION</div>
+<div class="panel-body">
+
+<table class="table table-striped">
+<thead>
+<tr>
+<th>Domain</th>
+<th>First Seen</th>
+<th>Last Check</th>
+<th>Status</th>
+</tr>
+</thead>
+<tbody>
+{% for metadata_onion in last_onions %}
+<tr>
+<td>{{ metadata_onion['domain'] }}</td>
+<td>{{'{}/{}/{}'.format(metadata_onion['first_seen'][0:4], metadata_onion['first_seen'][4:6], metadata_onion['first_seen'][6:8])}}</td>
+<td>{{'{}/{}/{}'.format(metadata_onion['last_check'][0:4], metadata_onion['last_check'][4:6], metadata_onion['last_check'][6:8])}}</td>
+<td><div style="color:{{ metadata_onion['status_color'] }};"><i class="fa {{ metadata_onion['status_icon'] }}"></i> {{ metadata_onion['status_text'] }}</div></td>
+</tr>
+{% endfor %}
+</tbody>
+</table>
+
+</div>
+</div>
+
+<!-- graph panel lost in extraction (fed by /hiddenServices/domain_crawled_7days_json) -->
+
+</div>
+</div>
+
+<!-- page scripts lost in extraction -->
+
+</body>
+</html>
diff --git a/var/www/modules/hiddenServices/templates/showDomain.html b/var/www/modules/hiddenServices/templates/showDomain.html
new file mode 100644
index 00000000..18cd79be
--- /dev/null
+++ b/var/www/modules/hiddenServices/templates/showDomain.html
@@ -0,0 +1,76 @@
+<!DOCTYPE html>
+<html>
+<head>
+<title>Show Domain - AIL</title>
+<!-- CSS/JS includes lost in extraction -->
+</head>
+<body>
+
+{% include 'navbar.html' %}
+<div class="container-fluid">
+<div class="row">
+
+<div class="panel panel-default">
+<div class="panel-heading">Graph</div>
+<div class="panel-body">
+
+<table class="table">
+<tbody>
+<tr>
+<td>Domain</td>
+<td>{{ domain }}</td>
+</tr>
+<tr>
+<td>First Seen</td>
+<td>{{ first_seen }}</td>
+</tr>
+<tr>
+<td>Last Check</td>
+<td>{{ last_check }}</td>
+</tr>
+</tbody>
+</table>
+
+</div>
+</div>
+
+</div>
+</div>
+
+<!-- page scripts lost in extraction -->
+
+</body>
+</html>
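
The retooled crawl_onion() above first health-checks the Splash docker with a plain HTTP GET (timeout raised from 10 ms to 30 s), then shells out to tor_crawler.py and records the domain as down when the child process fails. A minimal sketch of that control flow, assuming a local Splash on port 8050 and an ARDB/Redis instance on port 6382 (both placeholders for the values read from the AIL config):

import datetime
import subprocess
import time

import redis
import requests

SPLASH_URL = 'http://127.0.0.1:8050'  # placeholder for the configured splash_url
r_onion = redis.StrictRedis(host='localhost', port=6382, db=0,
                            decode_responses=True)  # placeholder ARDB_Onion

def crawl_onion(url, domain, date):
    # Health-check Splash before crawling; a dead docker would otherwise
    # make every domain look down.
    try:
        requests.get(SPLASH_URL, timeout=30.0)
    except requests.exceptions.RequestException:
        raise SystemExit('DOCKER SPLASH DOWN')

    # Run the scrapy-splash crawler in a child process and wait for it.
    process = subprocess.Popen(['python', './torcrawler/tor_crawler.py', url],
                               stdout=subprocess.PIPE)
    while process.poll() is None:
        time.sleep(1)

    if process.returncode != 0:
        # Crawl failed: remember both the domain and the exact link as down.
        r_onion.sadd('onion_down:' + date, domain)
        r_onion.sadd('onion_down_link:' + date, url)

crawl_onion('http://example.onion', 'example.onion',
            datetime.datetime.now().strftime('%Y%m%d'))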
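
The recrawl guard moves from a per-day onion_up:<YYYYMMDD> check to a per-month one: a domain goes back to the crawler only if it is neither in month_onion_up:<YYYYMM> (filled by TorSplashCrawler on every successful crawl) nor in onion_down:<YYYYMMDD>. Both Crawler.py and Onion.py apply the same test. A sketch of the gate, reusing the placeholder connection above:

import datetime

import redis

r_onion = redis.StrictRedis(host='localhost', port=6382, db=0,
                            decode_responses=True)  # placeholder ARDB_Onion

def should_crawl(domain):
    date = datetime.datetime.now().strftime('%Y%m%d')
    date_month = datetime.datetime.now().strftime('%Y%m')
    # Already seen up this month: nothing to do until next month.
    if r_onion.sismember('month_onion_up:{}'.format(date_month), domain):
        return False
    # Already found down today: do not retry it before tomorrow.
    if r_onion.sismember('onion_down:' + date, domain):
        return False
    return True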
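
Onion.py now runs a second, .i2p-specific variant of the Faup-derived regex over each paste and only records what it finds: the bare domain in the i2p_domain set, the full link in i2p_link. Nothing is crawled yet; the crawler itself still skips non-.onion URLs. A reduced sketch of that bookkeeping, with a deliberately simplified regex standing in for the full pattern:

import re

import redis

r_onion = redis.StrictRedis(host='localhost', port=6382, db=0,
                            decode_responses=True)  # placeholder ARDB_Onion

# Simplified stand-in for the i2p_regex defined in Onion.py.
i2p_regex = r'(?:https?://)?((?:[a-zA-Z0-9-]+\.)*[a-zA-Z0-9-]+\.i2p)(?:/\S*)?'

def save_i2p_links(text):
    for match in re.finditer(i2p_regex, text):
        url, domain = match.group(0), match.group(1)
        # Only collect i2p links for now; crawling them is a TODO in the patch.
        if not r_onion.sismember('i2p_domain', domain):
            r_onion.sadd('i2p_domain', domain)
            r_onion.sadd('i2p_link', url)

save_i2p_links('seen in a paste: http://example.i2p/forum')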
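
The new HiddenServices class ties a crawled domain back to its pastes: it reads the domain's last_check date from the onion_metadata:<domain> hash and lists the files stored under the crawled-pastes directory for YYYY/MM/DD whose names contain the domain. A usage sketch, assuming AIL_BIN and AIL_HOME are exported and the .onion name below (a placeholder) has already been crawled:

import os
import sys

sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
from HiddenServices import HiddenServices

# Build the object for an already-crawled hidden service.
h = HiddenServices('xxxxxxxxxxxxxxxx.onion', 'onion')

# Pastes crawled on the domain's last_check date...
for paste in h.get_last_crawled_pastes():
    print(paste)

# ...or on an explicit YYYYMMDD date.
print(h.get_crawled_pastes_by_date('20180821'))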
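
Finally, domain_crawled_7days_json turns the per-day onion_up:<date> and onion_down:<date> set cardinalities into the series behind the dashboard graph, one point per day over the last week. The same aggregation as a standalone sketch, without the Date helper and with the placeholder connection again:

import datetime

import redis

r_onion = redis.StrictRedis(host='localhost', port=6382, db=0,
                            decode_responses=True)  # placeholder ARDB_Onion

def domain_stats_last_7_days(service_type='onion'):
    stats = []
    today = datetime.date.today()
    for i in range(6, -1, -1):
        day = today - datetime.timedelta(days=i)
        date = day.strftime('%Y%m%d')
        stats.append({
            'date': day.strftime('%Y-%m-%d'),
            # scard() returns the set size, i.e. how many domains were
            # seen up (resp. down) on that day.
            'value': r_onion.scard('{}_up:{}'.format(service_type, date)),
            'nb_domain_down': r_onion.scard('{}_down:{}'.format(service_type, date)),
        })
    return stats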