mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			
		
			
				
	
	
		
			95 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
			
		
		
	
	
			95 lines
		
	
	
		
			3.0 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
| #!/usr/bin/env python3
 | |
| # -*-coding:UTF-8 -*
 | |
| 
 | |
| """
 | |
| The Urls Module
 | |
| ============================
 | |
| 
 | |
| This module extracts URLs from an item and sends them to other modules.
 | |
| 
 | |
| """
 | |
| 
 | |
| ##################################
 | |
| # Import External packages
 | |
| ##################################
 | |
| import os
 | |
| import sys
 | |
| 
 | |
| from pyfaup.faup import Faup
 | |
| 
 | |
| sys.path.append(os.environ['AIL_BIN'])
 | |
| ##################################
 | |
| # Import Project packages
 | |
| ##################################
 | |
| from modules.abstract_module import AbstractModule
 | |
| from lib.ConfigLoader import ConfigLoader
 | |
| from lib.objects.Items import Item
 | |
| 
 | |
| # # TODO: Faup packages: Add new binding: Check TLD
 | |
| 
 | |
class Urls(AbstractModule):
    """
    Urls module for AIL framework.

    Finds URLs in the content of the current object using a regex built from
    the configured protocols file, and sends every URL found to the 'Url' queue.
    """

    def __init__(self):
        """
        Init Urls: create the Faup parser and build the URL regex.

        Reads the protocols file whose path is given by the
        "Directories"/"protocolsfile" config entry (relative to AIL_HOME).

        Raises:
            KeyError: if the AIL_HOME environment variable is not set.
            OSError: if the protocols file cannot be opened.
        """
        super(Urls, self).__init__()

        config_loader = ConfigLoader()

        self.faup = Faup()

        # Protocol file path
        protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                                          config_loader.get_config_str("Directories", "protocolsfile"))
        # Alternation of every known URI scheme, e.g. "http|https|ftp"
        uri_scheme = "|".join(self._load_uri_schemes(protocolsfile_path))

        # Raw strings keep the regex escapes (\:, \., \$ ...) intact without
        # relying on Python passing unknown string escapes through unchanged.
        self.url_regex = (
            "((?i:" + uri_scheme +
            r")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*"
            r"(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])"
            r"\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
            r"\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)"
            r"\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"
            r"|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:[a-zA-Z]{2,15}))"
            r"(?:\:[0-9]+)*(?:/?(?:[a-zA-Z0-9\.\,\?'\+&%\$#\=~_\-]+))*)"
        )

        # Send module state to logs
        self.logger.info(f"Module {self.module_name} initialized")

    @staticmethod
    def _load_uri_schemes(protocolsfile_path):
        """
        Return the URI schemes listed in the protocols file, regex-escaped.

        One scheme is expected per line. Surrounding whitespace is stripped
        and blank lines are skipped, so a missing trailing newline does not
        truncate the last scheme and an empty line does not add an empty
        alternative to the URL regex.
        """
        import re  # local import: only needed while building the regex

        with open(protocolsfile_path, 'r', encoding='utf-8') as scheme_file:
            stripped_lines = (line.strip() for line in scheme_file)
            # Escape so that '.', '+' etc. in a scheme name match literally.
            return [re.escape(scheme) for scheme in stripped_lines if scheme]

    def compute(self, message):
        """
        Search for Web links in the current object and queue each one.

        Args:
            message: the queue message that triggered this call. It is not
                used here; the object to process is obtained via get_obj().
        """
        item = self.get_obj()
        item_content = item.get_content()

        # TODO Handle invalid URL
        l_urls = self.regex_findall(self.url_regex, item.get_id(), item_content)
        for url in l_urls:
            self.faup.decode(url)
            url_decoded = self.faup.get()
            # The parsed 'url' value may be bytes or already a str; a str has
            # no .decode(), hence the AttributeError fallback.
            try:
                url = url_decoded['url'].decode()
            except AttributeError:
                url = url_decoded['url']

            print(url, self.obj.get_global_id())
            self.add_message_to_queue(message=str(url), queue='Url')
            self.logger.debug(f"url_parsed: {url}")

        if l_urls:
            to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};'
            print(to_print)
 | |
| 
 | |
| 
 | |
# Script entry point: build the Urls module and start its processing loop.
if __name__ == '__main__':
    Urls().run()
 |