mirror of https://github.com/CIRCL/AIL-framework
Python · Executable File · 171 lines · 7.0 KiB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
The Onion Module
============================

This module extracts URLs from items and keeps only the Tor-related ones
(.onion). All these URLs are sent to the crawler discovery queue.

Requirements
------------

* Requires running Redis instances.

"""
import time
import datetime
import os
import sys
import re

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import crawlers
from lib import regex_helper
from packages.Item import Item

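# NOTE: the commented-out block below is a legacy manual fetcher kept for
# reference. It shelled out to ./tor_fetcher.py against a local Tor client
# (127.0.0.1:9050) and cached results in Redis; the `time` and `datetime`
# imports above are only needed if it is re-enabled.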
## Manually fetch first page if crawler is disabled
# import base64
# import subprocess
#
# torclient_host = '127.0.0.1'
# torclient_port = 9050
#
# def fetch(p, r_cache, urls, domains):
#     now = datetime.datetime.now()
#     path = os.path.join('onions', str(now.year).zfill(4),
#                         str(now.month).zfill(2),
#                         str(now.day).zfill(2),
#                         str(int(time.mktime(now.utctimetuple()))))
#     failed = []
#     downloaded = []
#     print('{} Urls to fetch'.format(len(urls)))
#     for url, domain in zip(urls, domains):
#         if r_cache.exists(url) or url in failed:
#             continue
#         to_fetch = base64.standard_b64encode(url.encode('utf8'))
#         print('fetching url: {}'.format(to_fetch))
#         process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],
#                                    stdout=subprocess.PIPE)
#         while process.poll() is None:
#             time.sleep(1)
#
#         if process.returncode == 0:
#             r_cache.setbit(url, 0, 1)
#             r_cache.expire(url, 360000)
#             downloaded.append(url)
#             print('downloaded : {}'.format(downloaded))
#             '''tempfile = process.stdout.read().strip()
#             tempfile = tempfile.decode('utf8')
#             #with open(tempfile, 'r') as f:
#                 filename = path + domain + '.gz'
#                 fetched = f.read()
#                 content = base64.standard_b64decode(fetched)
#                 save_path = os.path.join(os.environ['AIL_HOME'],
#                                          p.config.get("Directories", "pastes"),
#                                          filename)
#                 dirname = os.path.dirname(save_path)
#                 if not os.path.exists(dirname):
#                     os.makedirs(dirname)
#                 with open(save_path, 'w') as ff:
#                     ff.write(content)
#                 p.populate_set_out(save_path, 'Global')
#                 p.populate_set_out(url, 'ValidOnion')
#                 p.populate_set_out(fetched, 'FetchedOnion')'''
#             yield url
#             #os.unlink(tempfile)
#         else:
#             r_cache.setbit(url, 0, 0)
#             r_cache.expire(url, 3600)
#             failed.append(url)
#             print('Failed at downloading', url)
#             print(process.stdout.read())
#     print('Failed:', len(failed), 'Downloaded:', len(downloaded))


class Onion(AbstractModule):
    """Extract .onion URLs from items and queue them for crawling."""

    def __init__(self):
        super(Onion, self).__init__()

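        # Configuration and Redis connections: the shared cache ("Redis_Cache")
        # and the onion store ("ARDB_Onion")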
        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
        self.r_onion = config_loader.get_redis_conn("ARDB_Onion")

        self.pending_seconds = config_loader.get_config_int("Onion", "max_execution_time")
        # regex timeout
        self.regex_timeout = 30

        self.faup = crawlers.get_faup()
        self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name)

        # activate_crawler = p.config.get("Crawler", "activate_crawler")

        self.url_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
        self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
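        # Compile both patterns once at start-up so an invalid pattern fails
        # fast; the compiled objects are discarded, since regex_helper applies
        # the raw pattern strings itself (with its own timeout handling)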
        re.compile(self.url_regex)
        re.compile(self.i2p_regex)

        self.redis_logger.info(f"Module: {self.module_name} Launched")

        # TEMP var: SAVE I2P Domain (future I2P crawler)
        self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p")

    def compute(self, message):
        # list of tuples: (url, subdomains, domain)
        urls_to_crawl = []

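        # Queue messages have the form '<item_id> <score>'; only the item id
        # is used here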
        item_id, score = message.split()
        item = Item(item_id)
        item_content = item.get_content()

        # max execution time on regex
        res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)
        for x in res:
            # regex_findall returns each match as a stringified tuple of
            # capture groups: convert it back into a list of fields
            x = x[2:-2].replace(" '", "").split("',")
            url = x[0]
            subdomain = x[4].lower()
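            # Let faup parse the URL and extract the registered domain
            # (faup may return bytes or str depending on its version, hence
            # the try/except below)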
            self.faup.decode(url)
            url_unpack = self.faup.get()
            try:    ## TODO: # FIXME: check faup version
                domain = url_unpack['domain'].decode().lower()
            except Exception:
                domain = url_unpack['domain'].lower()

            if crawlers.is_valid_onion_domain(domain):
                urls_to_crawl.append((url, subdomain, domain))

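        # Build the log prefix ('Onion;<source>;<date>;<basename>;') and stop
        # early if no valid .onion URL was found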
        to_print = f'Onion;{item.get_source()};{item.get_date()};{item.get_basename()};'
        if not urls_to_crawl:
            self.redis_logger.info(f'{to_print}Onion related;{item.get_id()}')
            return

        # TAG Item
        msg = f'infoleak:automatic-detection="onion";{item.get_id()}'
        self.send_message_to_queue(msg, 'Tags')

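        # If the crawler is enabled, push each (url, subdomain, domain) to the
        # onion discovery queue; otherwise only report the detections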
        if crawlers.is_crawler_activated():
            for to_crawl in urls_to_crawl:
                print(f'{to_crawl[2]} added to crawler queue: {to_crawl[0]}')
                crawlers.add_item_to_discovery_queue('onion', to_crawl[2], to_crawl[1], to_crawl[0], item.get_id())
        else:
            print(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
            self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
            # keep manual fetcher ????
            ## Manually fetch first page if crawler is disabled
            # for url in fetch(p, r_cache, urls, domains_list):
            #     publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path))

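# Run as a standalone AIL module: instantiate it and start the processing
# loop inherited from AbstractModule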
if __name__ == "__main__":

    module = Onion()
    module.run()