mirror of https://github.com/CIRCL/AIL-framework
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The ZMQ_Sub_Onion Module
============================

This module consumes the Redis list created by the ZMQ_Sub_Onion_Q Module.

It tries to extract URLs from pastes and returns only the ones that are Tor
related (.onion).

    .. seealso:: Paste method (get_regex)

.. note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
the same Subscriber name in both of them.

Requirements
------------

* Need running Redis instances. (Redis)
* Need the ZMQ_Sub_Onion_Q Module running to be able to work properly.

"""
import time
from packages import Paste
from pubsublogger import publisher
import datetime
import os
import base64
import subprocess
import redis
import signal
import re

from Helper import Process

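# SIGALRM-based timeout: the alarm is armed before the regex scan of each paste
# and timeout_handler() raises TimeoutException if the scan runs longer than
# the configured max_execution_time.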
class TimeoutException(Exception):
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

signal.signal(signal.SIGALRM, timeout_handler)

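# fetch() is a generator: for each URL it shells out to ./tor_fetcher.py (the
# URL is passed base64-encoded), records the result in r_cache as a bit with a
# TTL, and yields the URLs that were downloaded successfully.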
def fetch(p, r_cache, urls, domains, path):
    failed = []
    downloaded = []
    print('{} Urls to fetch'.format(len(urls)))
    for url, domain in zip(urls, domains):
        if r_cache.exists(url) or url in failed:
            continue
        to_fetch = base64.standard_b64encode(url.encode('utf8'))
        print('fetching url: {}'.format(to_fetch))
        process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],
                                   stdout=subprocess.PIPE)
        while process.poll() is None:
            time.sleep(1)

        if process.returncode == 0:
            r_cache.setbit(url, 0, 1)
            r_cache.expire(url, 360000)
            downloaded.append(url)
            print('downloaded : {}'.format(downloaded))
            '''tempfile = process.stdout.read().strip()
            tempfile = tempfile.decode('utf8')
            #with open(tempfile, 'r') as f:
                filename = path + domain + '.gz'
                fetched = f.read()
                content = base64.standard_b64decode(fetched)
                save_path = os.path.join(os.environ['AIL_HOME'],
                                         p.config.get("Directories", "pastes"),
                                         filename)
                dirname = os.path.dirname(save_path)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                with open(save_path, 'w') as ff:
                    ff.write(content)
                p.populate_set_out(save_path, 'Global')
                p.populate_set_out(url, 'ValidOnion')
                p.populate_set_out(fetched, 'FetchedOnion')'''
            yield url
            #os.unlink(tempfile)
        else:
            r_cache.setbit(url, 0, 0)
            r_cache.expire(url, 3600)
            failed.append(url)
            print('Failed at downloading', url)
            print(process.stdout.read())
    print('Failed:', len(failed), 'Downloaded:', len(downloaded))


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    torclient_host = '127.0.0.1'
    torclient_port = 9050

    config_section = 'Onion'

    p = Process(config_section)
    r_cache = redis.StrictRedis(
        host=p.config.get("Redis_Cache", "host"),
        port=p.config.getint("Redis_Cache", "port"),
        db=p.config.getint("Redis_Cache", "db"),
        decode_responses=True)

    r_onion = redis.StrictRedis(
        host=p.config.get("ARDB_Onion", "host"),
        port=p.config.getint("ARDB_Onion", "port"),
        db=p.config.getint("ARDB_Onion", "db"),
        decode_responses=True)

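    # r_cache keeps short-lived per-URL fetch status (used by fetch() above);
    # r_onion (the ARDB_Onion backend) holds the onion crawler queues and
    # per-domain metadata used below.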
    # FUNCTIONS #
    publisher.info("Script subscribed to channel onion_categ")

    # FIXME For retro compatibility
    channel = 'onion_categ'

    # Getting the first message from redis.
    message = p.get_from_set()
    prec_filename = None

    max_execution_time = p.config.getint("Onion", "max_execution_time")

    # send to crawler:
    activate_crawler = p.config.get("Crawler", "activate_crawler")
    if activate_crawler == 'True':
        activate_crawler = True
        print('Crawler enabled')
    else:
        activate_crawler = False
        print('Crawler disabled')

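    # Two operating modes: with the crawler enabled, detected onion domains are
    # queued in r_onion for the crawler modules; otherwise the URLs are fetched
    # directly through fetch()/tor_fetcher.py.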
    # Thanks to Faup project for this regex
    # https://github.com/stricaud/faup
    url_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
    i2p_regex = "((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
    re.compile(url_regex)


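    # Each match from this regex is unpacked below as a 13-element tuple
    # (url, s, credential, subdomain, domain, host, port, resource_path,
    #  query_string, f1, f2, f3, f4); the domain is capture group index 4,
    # which is why the crawler branch uses domain[0][4].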
    while True:
        message = p.get_from_set()
        if message is not None:
            print(message)
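            # Queue entries have the form '<paste path> <score>'.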
            filename, score = message.split()

            # "For each new paste"
            if prec_filename is None or filename != prec_filename:
                domains_list = []
                urls = []
                PST = Paste.Paste(filename)

                # max execution time on regex
                signal.alarm(max_execution_time)
                try:
                    for x in PST.get_regex(url_regex):
                        print(x)
                        # Extracting url with regex
                        url, s, credential, subdomain, domain, host, port, \
                            resource_path, query_string, f1, f2, f3, f4 = x

                        if '.onion' in url:
                            print(url)
                            domains_list.append(domain)
                            urls.append(url)
                except TimeoutException:
                    encoded_list = []
                    p.incr_module_timeout_statistic()
                    print("{0} processing timeout".format(PST.p_rel_path))
                    continue

                signal.alarm(0)

                '''
                for x in PST.get_regex(i2p_regex):
                    # Extracting url with regex
                    url, s, credential, subdomain, domain, host, port, \
                        resource_path, query_string, f1, f2, f3, f4 = x

                    if '.i2p' in url:
                        print('add i2p')
                        print(domain)
                        if not r_onion.sismember('i2p_domain', domain) and not r_onion.sismember('i2p_domain_crawler_queue', domain):
                            r_onion.sadd('i2p_domain', domain)
                            r_onion.sadd('i2p_link', url)
                            r_onion.sadd('i2p_domain_crawler_queue', domain)
                            msg = '{};{}'.format(url,PST.p_rel_path)
                            r_onion.sadd('i2p_crawler_queue', msg)
                '''

                # Saving the list of extracted onion domains.
                PST.__setattr__(channel, domains_list)
                PST.save_attribute_redis(channel, domains_list)
                to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date,
                                                    PST.p_name)

                print(len(domains_list))
                if len(domains_list) > 0:

                    if not activate_crawler:
                        publisher.warning('{}Detected {} .onion(s);{}'.format(
                            to_print, len(domains_list),PST.p_rel_path))
                    else:
                        publisher.info('{}Detected {} .onion(s);{}'.format(
                            to_print, len(domains_list),PST.p_rel_path))
                    now = datetime.datetime.now()
                    path = os.path.join('onions', str(now.year).zfill(4),
                                        str(now.month).zfill(2),
                                        str(now.day).zfill(2),
                                        str(int(time.mktime(now.utctimetuple()))))
                    to_print = 'Onion;{};{};{};'.format(PST.p_source,
                                                        PST.p_date,
                                                        PST.p_name)

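                    # Crawler mode: queue each extracted domain unless it was
                    # already seen up this month or flagged down today; domains
                    # without 'first_seen' metadata go to the priority queue.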
                    if activate_crawler:
                        date_month = datetime.datetime.now().strftime("%Y%m")
                        date = datetime.datetime.now().strftime("%Y%m%d")
                        for url in urls:

                            domain = re.findall(url_regex, url)
                            if len(domain) > 0:
                                domain = domain[0][4]
                            else:
                                continue

                            # too many subdomains
                            if len(domain.split('.')) > 5:
                                continue

                            if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date, domain):
                                if not r_onion.sismember('onion_domain_crawler_queue', domain):
                                    print('send to onion crawler')
                                    r_onion.sadd('onion_domain_crawler_queue', domain)
                                    msg = '{};{}'.format(url,PST.p_rel_path)
                                    if not r_onion.hexists('onion_metadata:{}'.format(domain), 'first_seen'):
                                        r_onion.sadd('onion_crawler_priority_queue', msg)
                                        print('send to priority queue')
                                    else:
                                        r_onion.sadd('onion_crawler_queue', msg)
                                #p.populate_set_out(msg, 'Crawler')

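                    # No crawler: fetch the URLs directly and log the ones that
                    # could be downloaded.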
                    else:
                        for url in fetch(p, r_cache, urls, domains_list, path):
                            publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path))

                    # TAG Item
                    msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_rel_path)
                    p.populate_set_out(msg, 'Tags')
                else:
                    publisher.info('{}Onion related;{}'.format(to_print, PST.p_rel_path))

            prec_filename = filename
        else:
            publisher.debug("Script url is Idling 10s")
            #print('Sleeping')
            time.sleep(10)