From 2db8587d03642e16cfa8d468e9b35f4835c0c58b Mon Sep 17 00:00:00 2001 From: terrtia Date: Tue, 30 Jan 2024 10:28:50 +0100 Subject: [PATCH] chg: [Hosts] improve perf + regex timeout + cache DNS results --- bin/modules/DomClassifier.py | 30 ++++++++++++++++++------------ bin/modules/Hosts.py | 36 +++++++++++++++++++----------------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/bin/modules/DomClassifier.py b/bin/modules/DomClassifier.py index 94cf53db..b4620ee2 100755 --- a/bin/modules/DomClassifier.py +++ b/bin/modules/DomClassifier.py @@ -41,7 +41,13 @@ class DomClassifier(AbstractModule): addr_dns = config_loader.get_config_str("DomClassifier", "dns") - self.c = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns]) + redis_host = config_loader.get_config_str('Redis_Cache', 'host') + redis_port = config_loader.get_config_int('Redis_Cache', 'port') + redis_db = config_loader.get_config_int('Redis_Cache', 'db') + self.dom_classifier = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns], + redis_host=redis_host, + redis_port=redis_port, redis_db=redis_db, + re_timeout=30) self.cc = config_loader.get_config_str("DomClassifier", "cc") self.cc_tld = config_loader.get_config_str("DomClassifier", "cc_tld") @@ -58,34 +64,34 @@ class DomClassifier(AbstractModule): item_source = item.get_source() try: - self.c.text(rawtext=host) - if not self.c.domain: + self.dom_classifier.text(rawtext=host) + if not self.dom_classifier.domain: return - print(self.c.domain) - self.c.validdomain(passive_dns=True, extended=False) - # self.logger.debug(self.c.vdomain) + print(self.dom_classifier.domain) + self.dom_classifier.validdomain(passive_dns=True, extended=False) + # self.logger.debug(self.dom_classifier.vdomain) - print(self.c.vdomain) + print(self.dom_classifier.vdomain) print() - if self.c.vdomain and d4.is_passive_dns_enabled(): - for dns_record in self.c.vdomain: + if self.dom_classifier.vdomain and d4.is_passive_dns_enabled(): + for dns_record in self.dom_classifier.vdomain: self.add_message_to_queue(obj=None, message=dns_record) if self.cc_tld: - localizeddomains = self.c.include(expression=self.cc_tld) + localizeddomains = self.dom_classifier.include(expression=self.cc_tld) if localizeddomains: print(localizeddomains) self.redis_logger.warning(f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {self.cc_tld};{item.get_id()}") if self.cc: - localizeddomains = self.c.localizedomain(cc=self.cc) + localizeddomains = self.dom_classifier.localizedomain(cc=self.cc) if localizeddomains: print(localizeddomains) self.redis_logger.warning(f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {self.cc};{item.get_id()}") if r_result: - return self.c.vdomain + return self.dom_classifier.vdomain except IOError as err: self.redis_logger.error(f"Duplicate;{item_source};{item_date};{item_basename};CRC Checksum Failed") diff --git a/bin/modules/Hosts.py b/bin/modules/Hosts.py index 488e7acf..55670777 100755 --- a/bin/modules/Hosts.py +++ b/bin/modules/Hosts.py @@ -18,13 +18,14 @@ import os import re import sys +import DomainClassifier.domainclassifier + sys.path.append(os.environ['AIL_BIN']) ################################## # Import Project packages ################################## from modules.abstract_module import AbstractModule from lib.ConfigLoader import ConfigLoader -from lib.objects.Items import Item class Hosts(AbstractModule): """ @@ -43,28 +44,29 @@ class Hosts(AbstractModule): # Waiting time in seconds between to message processed self.pending_seconds = 1 - self.host_regex = r'\b([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)\b' - re.compile(self.host_regex) - + redis_host = config_loader.get_config_str('Redis_Cache', 'host') + redis_port = config_loader.get_config_int('Redis_Cache', 'port') + redis_db = config_loader.get_config_int('Redis_Cache', 'db') + self.dom_classifier = DomainClassifier.domainclassifier.Extract(rawtext="", + redis_host=redis_host, + redis_port=redis_port, + redis_db=redis_db, + re_timeout=30) self.logger.info(f"Module: {self.module_name} Launched") def compute(self, message): - item = self.get_obj() + obj = self.get_obj() - # mimetype = item_basic.get_item_mimetype(item.get_id()) - # if mimetype.split('/')[0] == "text": - - content = item.get_content() - hosts = self.regex_findall(self.host_regex, item.get_id(), content, r_set=True) - if hosts: - print(f'{len(hosts)} host {item.get_id()}') - for host in hosts: - # print(host) - if not host.endswith('.onion'): - self.add_message_to_queue(message=str(host), queue='Host') + content = obj.get_content() + self.dom_classifier.text(content) + if self.dom_classifier.domain: + print(f'{len(self.dom_classifier.domain)} host {obj.get_id()}') + # print(self.dom_classifier.domain) + for domain in self.dom_classifier.domain: + if domain: + self.add_message_to_queue(message=domain, queue='Host') if __name__ == '__main__': - module = Hosts() module.run()