From 73da7ae4c5beecdb6112d62fade8f3c1212d95af Mon Sep 17 00:00:00 2001 From: Terrtia Date: Mon, 7 Jun 2021 15:58:21 +0200 Subject: [PATCH] chg: [Urls (Web) module] fix regex + rename --- bin/{ => modules}/Urls.py | 33 ++++++++++++++++++--------------- bin/packages/modules.cfg | 9 +++------ 2 files changed, 21 insertions(+), 21 deletions(-) rename bin/{ => modules}/Urls.py (74%) diff --git a/bin/Urls.py b/bin/modules/Urls.py similarity index 74% rename from bin/Urls.py rename to bin/modules/Urls.py index 60816089..57e53da1 100755 --- a/bin/Urls.py +++ b/bin/modules/Urls.py @@ -12,21 +12,21 @@ This module extract URLs from an item and send them to others modules. ################################## # Import External packages ################################## -import redis -import pprint -import time import os -from pyfaup.faup import Faup import re +import sys +from pyfaup.faup import Faup + +sys.path.append(os.environ['AIL_BIN']) ################################## # Import Project packages ################################## -from module.abstract_module import AbstractModule +from modules.abstract_module import AbstractModule from packages.Item import Item -from packages import lib_refine -from Helper import Process +from lib import regex_helper +# # TODO: Faup packages: Add new binding: Check TLD class Urls(AbstractModule): """ @@ -39,8 +39,8 @@ class Urls(AbstractModule): """ super(Urls, self).__init__() - # FUNCTIONS # self.faup = Faup() + self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name) # Protocol file path protocolsfile_path = os.path.join(os.environ['AIL_HOME'], @@ -53,7 +53,7 @@ class Urls(AbstractModule): uri_scheme = uri_scheme[:-1] self.url_regex = "((?i:"+uri_scheme + \ - ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)" + ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:[a-zA-Z]{2,15}))(?:\:[0-9]+)*(?:/?(?:[a-zA-Z0-9\.\,\?'\\+&%\$#\=~_\-]+))*)" # Send module state to logs self.redis_logger.info(f"Module {self.module_name} initialized") @@ -67,19 +67,22 @@ class Urls(AbstractModule): id, score = message.split() item = Item(id) + item_content = item.get_content() - l_urls = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item.get_content()) - if len(urls) > 0: - to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};' - self.redis_logger.info(f'{to_print}Detected {len(urls)} URL;{item.get_id()}') - + l_urls = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content) for url in l_urls: - # # TODO: FIXME handle .foundation .dev onion? i2p? + self.faup.decode(url) + unpack_url = self.faup.get() to_send = f"{url} {item.get_id()}" + print(to_send) self.send_message_to_queue(to_send, 'Url') self.redis_logger.debug(f"url_parsed: {to_send}") + if len(l_urls) > 0: + to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};' + self.redis_logger.info(f'{to_print}Detected {len(l_urls)} URL;{item.get_id()}') + if __name__ == '__main__': module = Urls() diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 542dc6dc..c8b681bd 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -51,7 +51,7 @@ subscribe = Redis_Global [Categ] subscribe = Redis_Global -publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Web,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey +publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,Redis_SourceCode,Redis_Cve,Redis_ApiKey [CreditCards] subscribe = Redis_CreditCards @@ -74,14 +74,11 @@ publish = Redis_ValidOnion,Redis_Tags,Redis_Crawler [DumpValidOnion] subscribe = Redis_ValidOnion -[Web] -subscribe = Redis_Web +[Urls] +subscribe = Redis_Urls publish = Redis_Url #publish = Redis_Url,ZMQ_Url -[WebStats] -subscribe = Redis_Url - [LibInjection] subscribe = Redis_Url publish = Redis_Duplicate,Redis_Tags