mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			
		
			
				
	
	
		
			145 lines
		
	
	
		
			5.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
			
		
		
	
	
			145 lines
		
	
	
		
			5.1 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
| #!/usr/bin/env python3
 | |
| # -*-coding:UTF-8 -*
 | |
| """
 | |
| The Pasties Module
 | |
| ======================
 | |
| This module spots domain-pasties services for further processing
 | |
| """
 | |
| 
 | |
| ##################################
 | |
| # Import External packages
 | |
| ##################################
 | |
| import os
 | |
| import sys
 | |
| import time
 | |
| 
 | |
| from pyfaup.faup import Faup
 | |
| 
 | |
| sys.path.append(os.environ['AIL_BIN'])
 | |
| ##################################
 | |
| # Import Project packages
 | |
| ##################################
 | |
| from modules.abstract_module import AbstractModule
 | |
| from lib.ConfigLoader import ConfigLoader
 | |
| from lib import crawlers
 | |
| 
 | |
| # TODO add url validator
 | |
| 
 | |
# Module-level containers. The Pasties class below keeps its own per-instance
# copies (self.urls_blocklist / self.pasties) and does not read these two;
# they are kept because other modules may import them -- TODO confirm.
pasties_blocklist_urls = set()
pasties_domains = dict()
 | |
| 
 | |
class Pasties(AbstractModule):
    """
    Pasties module for AIL framework

    Consumes messages of the form ``"<url> <item_id>"``. When the URL's host
    is one of the configured paste services, the URL is not blocklisted and
    it points below a configured path (or to any non-root path when the host
    has no configured paths), a crawler task is created for it.
    """

    def __init__(self):
        super(Pasties, self).__init__()
        self.faup = Faup()

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        # host -> set of path prefixes (each ending with '/'); an empty set
        # means "any non-root path on this host".
        self.pasties = {}
        # '<host><path><query>' strings (no scheme) that must never be crawled.
        self.urls_blocklist = set()
        self.load_pasties_domains()

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def _decode_url(self, url):
        """
        Parse *url* with Faup and return ``(host, path, query_string)``.

        ``path`` and ``query_string`` are normalised to ``''`` when the parser
        reports them as missing or falsy, so callers can safely use string
        methods on them. ``host`` is returned as reported by the parser.
        """
        self.faup.decode(url)
        decoded = self.faup.get()
        host = decoded['host']
        # NOTE(review): the parser appears to store None for absent components,
        # in which case a `.get(key, '')` default is never used -- hence `or ''`.
        path = decoded.get('resource_path') or ''
        query_string = decoded.get('query_string') or ''
        return host, path, query_string

    def load_pasties_domains(self):
        """
        (Re)load the paste-service domains and the URL blocklist.

        Reads ``$AIL_HOME/files/domains_pasties`` into ``self.pasties`` and
        ``$AIL_HOME/files/domains_pasties_blacklist`` into
        ``self.urls_blocklist``. A missing file leaves the corresponding
        container empty. Blank lines and lines without a parsable host are
        skipped.
        """
        self.pasties = {}
        self.urls_blocklist = set()

        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
        if os.path.exists(domains_pasties):
            with open(domains_pasties) as f:
                for line in f:
                    url = line.strip()
                    if not url:  # TODO validate line
                        continue
                    host, path, _ = self._decode_url(url)
                    if not host:
                        continue
                    # Register the host even when it has no specific path.
                    host_paths = self.pasties.setdefault(host, set())
                    if path and path != '/':
                        # Stored prefixes always end with '/', so that '/raw'
                        # only matches '/raw/...' and not '/rawfoo'.
                        if not path.endswith('/'):
                            path = f'{path}/'
                        host_paths.add(path)

        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
        if os.path.exists(url_blocklist):
            with open(url_blocklist) as f:
                for line in f:
                    url = line.strip()
                    if not url:
                        continue
                    host, path, query_string = self._decode_url(url)
                    if not host:
                        continue
                    # Stored without scheme; compute() strips the scheme
                    # before looking a URL up.
                    self.urls_blocklist.add(f'{host}{path}{query_string}')

    def send_to_crawler(self, url, obj_id):
        """
        Create a crawler task for *url* unless one was created in the last 24h.

        A cache key per URL, expiring after 86400 seconds, is used as the
        "already sent" marker. ``obj_id`` is passed as the task's parent.
        """
        cache_key = f'{self.module_name}:url:{url}'
        # Single SET with NX + EX: the marker is only written when absent and
        # always carries its TTL, instead of separate exists/set/expire calls.
        if self.r_cache.set(cache_key, int(time.time()), ex=86400, nx=True):
            crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)

    def _is_blocklisted(self, url):
        """Return True when *url*, with any http(s) scheme removed, is in the blocklist."""
        for scheme in ('http://', 'https://'):
            if url.startswith(scheme):
                return url[len(scheme):] in self.urls_blocklist
        return url in self.urls_blocklist

    def compute(self, message):
        """
        Process one ``"<url> <item_id>"`` message.

        Raises ``ValueError`` if the message does not split into exactly two
        whitespace-separated fields.
        """
        url, item_id = message.split()

        url_host, path, _ = self._decode_url(url)
        if url_host not in self.pasties:
            return None
        if self._is_blocklisted(url):
            return None

        host_paths = self.pasties[url_host]
        if not host_paths:
            # No path restriction for this host: anything but the root page.
            if path and path != '/':
                print('send to crawler', url_host, url)
                self.send_to_crawler(url, item_id)
            return None

        # Variant of the path with the trailing '/' toggled, so that a URL
        # pointing at the configured prefix itself is not treated as a paste.
        if path.endswith('/'):
            path_end = path[:-1]
        else:
            path_end = f'{path}/'
        for url_path in host_paths:
            if path.startswith(url_path):
                if url_path != path and url_path != path_end:
                    print('send to crawler', url_path, url)
                    self.send_to_crawler(url, item_id)
                    break
        return None
 | |
| 
 | |
| 
 | |
if __name__ == '__main__':
    # Script entry point: instantiate the module and hand control to its
    # run() method (inherited; not defined in this file).
    Pasties().run()
 |