mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			chg: [Urls (Web) module] fix regex + rename
							parent
							
								
									fc11c1769b
								
							
						
					
					
						commit
						73da7ae4c5
					
				|  | @ -12,21 +12,21 @@ This module extract URLs from an item and send them to others modules. | |||
| ################################## | ||||
| # Import External packages | ||||
| ################################## | ||||
| import redis | ||||
| import pprint | ||||
| import time | ||||
| import os | ||||
| from pyfaup.faup import Faup | ||||
| import re | ||||
| import sys | ||||
| 
 | ||||
| from pyfaup.faup import Faup | ||||
| 
 | ||||
| sys.path.append(os.environ['AIL_BIN']) | ||||
| ################################## | ||||
| # Import Project packages | ||||
| ################################## | ||||
| from module.abstract_module import AbstractModule | ||||
| from modules.abstract_module import AbstractModule | ||||
| from packages.Item import Item | ||||
| from packages import lib_refine | ||||
| from Helper import Process | ||||
| from lib import regex_helper | ||||
| 
 | ||||
| # # TODO: Faup packages: Add new binding: Check TLD | ||||
| 
 | ||||
| class Urls(AbstractModule): | ||||
|     """ | ||||
|  | @ -39,8 +39,8 @@ class Urls(AbstractModule): | |||
|         """ | ||||
|         super(Urls, self).__init__() | ||||
| 
 | ||||
|         # FUNCTIONS # | ||||
|         self.faup = Faup() | ||||
|         self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name) | ||||
| 
 | ||||
|         # Protocol file path | ||||
|         protocolsfile_path = os.path.join(os.environ['AIL_HOME'], | ||||
|  | @ -53,7 +53,7 @@ class Urls(AbstractModule): | |||
|         uri_scheme = uri_scheme[:-1] | ||||
| 
 | ||||
|         self.url_regex = "((?i:"+uri_scheme + \ | ||||
|             ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" | ||||
|             ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:[a-zA-Z]{2,15}))(?:\:[0-9]+)*(?:/?(?:[a-zA-Z0-9\.\,\?'\\+&%\$#\=~_\-]+))*)" | ||||
| 
 | ||||
|         # Send module state to logs | ||||
|         self.redis_logger.info(f"Module {self.module_name} initialized") | ||||
|  | @ -67,19 +67,22 @@ class Urls(AbstractModule): | |||
|         id, score = message.split() | ||||
| 
 | ||||
|         item = Item(id) | ||||
|         item_content = item.get_content() | ||||
| 
 | ||||
|         l_urls = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item.get_content()) | ||||
|         if len(urls) > 0: | ||||
|             to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};' | ||||
|             self.redis_logger.info(f'{to_print}Detected {len(urls)} URL;{item.get_id()}') | ||||
| 
 | ||||
|         l_urls = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content) | ||||
|         for url in l_urls: | ||||
|             # # TODO: FIXME handle .foundation .dev onion? i2p? | ||||
|             self.faup.decode(url) | ||||
|             unpack_url = self.faup.get() | ||||
| 
 | ||||
|             to_send = f"{url} {item.get_id()}" | ||||
|             print(to_send) | ||||
|             self.send_message_to_queue(to_send, 'Url') | ||||
|             self.redis_logger.debug(f"url_parsed: {to_send}") | ||||
| 
 | ||||
|         if len(l_urls) > 0: | ||||
|             to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};' | ||||
|             self.redis_logger.info(f'{to_print}Detected {len(l_urls)} URL;{item.get_id()}') | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
| 
 | ||||
|     module = Urls() | ||||
|  | @ -51,7 +51,7 @@ subscribe = Redis_Global | |||
| 
 | ||||
| [Categ] | ||||
| subscribe = Redis_Global | ||||
| publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey | ||||
| publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey | ||||
| 
 | ||||
| [CreditCards] | ||||
| subscribe = Redis_CreditCards | ||||
|  | @ -74,14 +74,11 @@ publish = Redis_ValidOnion,Redis_Tags,Redis_Crawler | |||
| [DumpValidOnion] | ||||
| subscribe = Redis_ValidOnion | ||||
| 
 | ||||
| [Web] | ||||
| subscribe = Redis_Web | ||||
| [Urls] | ||||
| subscribe = Redis_Urls | ||||
| publish = Redis_Url | ||||
| #publish = Redis_Url,ZMQ_Url | ||||
| 
 | ||||
| [WebStats] | ||||
| subscribe = Redis_Url | ||||
| 
 | ||||
| [LibInjection] | ||||
| subscribe = Redis_Url | ||||
| publish = Redis_Duplicate,Redis_Tags | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Terrtia
						Terrtia