chg: [Hosts] improve perf + regex timeout + cache DNS results

dev
terrtia 2024-01-30 10:28:50 +01:00
parent a10119fb6a
commit 2db8587d03
No known key found for this signature in database
GPG Key ID: 1E1B1F50D84613D0
2 changed files with 37 additions and 29 deletions

View File

@ -41,7 +41,13 @@ class DomClassifier(AbstractModule):
addr_dns = config_loader.get_config_str("DomClassifier", "dns") addr_dns = config_loader.get_config_str("DomClassifier", "dns")
self.c = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns]) redis_host = config_loader.get_config_str('Redis_Cache', 'host')
redis_port = config_loader.get_config_int('Redis_Cache', 'port')
redis_db = config_loader.get_config_int('Redis_Cache', 'db')
self.dom_classifier = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns],
redis_host=redis_host,
redis_port=redis_port, redis_db=redis_db,
re_timeout=30)
self.cc = config_loader.get_config_str("DomClassifier", "cc") self.cc = config_loader.get_config_str("DomClassifier", "cc")
self.cc_tld = config_loader.get_config_str("DomClassifier", "cc_tld") self.cc_tld = config_loader.get_config_str("DomClassifier", "cc_tld")
@ -58,34 +64,34 @@ class DomClassifier(AbstractModule):
item_source = item.get_source() item_source = item.get_source()
try: try:
self.c.text(rawtext=host) self.dom_classifier.text(rawtext=host)
if not self.c.domain: if not self.dom_classifier.domain:
return return
print(self.c.domain) print(self.dom_classifier.domain)
self.c.validdomain(passive_dns=True, extended=False) self.dom_classifier.validdomain(passive_dns=True, extended=False)
# self.logger.debug(self.c.vdomain) # self.logger.debug(self.dom_classifier.vdomain)
print(self.c.vdomain) print(self.dom_classifier.vdomain)
print() print()
if self.c.vdomain and d4.is_passive_dns_enabled(): if self.dom_classifier.vdomain and d4.is_passive_dns_enabled():
for dns_record in self.c.vdomain: for dns_record in self.dom_classifier.vdomain:
self.add_message_to_queue(obj=None, message=dns_record) self.add_message_to_queue(obj=None, message=dns_record)
if self.cc_tld: if self.cc_tld:
localizeddomains = self.c.include(expression=self.cc_tld) localizeddomains = self.dom_classifier.include(expression=self.cc_tld)
if localizeddomains: if localizeddomains:
print(localizeddomains) print(localizeddomains)
self.redis_logger.warning(f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {self.cc_tld};{item.get_id()}") self.redis_logger.warning(f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {self.cc_tld};{item.get_id()}")
if self.cc: if self.cc:
localizeddomains = self.c.localizedomain(cc=self.cc) localizeddomains = self.dom_classifier.localizedomain(cc=self.cc)
if localizeddomains: if localizeddomains:
print(localizeddomains) print(localizeddomains)
self.redis_logger.warning(f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {self.cc};{item.get_id()}") self.redis_logger.warning(f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {self.cc};{item.get_id()}")
if r_result: if r_result:
return self.c.vdomain return self.dom_classifier.vdomain
except IOError as err: except IOError as err:
self.redis_logger.error(f"Duplicate;{item_source};{item_date};{item_basename};CRC Checksum Failed") self.redis_logger.error(f"Duplicate;{item_source};{item_date};{item_basename};CRC Checksum Failed")

View File

@ -18,13 +18,14 @@ import os
import re import re
import sys import sys
import DomainClassifier.domainclassifier
sys.path.append(os.environ['AIL_BIN']) sys.path.append(os.environ['AIL_BIN'])
################################## ##################################
# Import Project packages # Import Project packages
################################## ##################################
from modules.abstract_module import AbstractModule from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader from lib.ConfigLoader import ConfigLoader
from lib.objects.Items import Item
class Hosts(AbstractModule): class Hosts(AbstractModule):
""" """
@ -43,28 +44,29 @@ class Hosts(AbstractModule):
# Waiting time in seconds between two processed messages # Waiting time in seconds between two processed messages
self.pending_seconds = 1 self.pending_seconds = 1
self.host_regex = r'\b([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63})+)\b' redis_host = config_loader.get_config_str('Redis_Cache', 'host')
re.compile(self.host_regex) redis_port = config_loader.get_config_int('Redis_Cache', 'port')
redis_db = config_loader.get_config_int('Redis_Cache', 'db')
self.dom_classifier = DomainClassifier.domainclassifier.Extract(rawtext="",
redis_host=redis_host,
redis_port=redis_port,
redis_db=redis_db,
re_timeout=30)
self.logger.info(f"Module: {self.module_name} Launched") self.logger.info(f"Module: {self.module_name} Launched")
def compute(self, message): def compute(self, message):
item = self.get_obj() obj = self.get_obj()
# mimetype = item_basic.get_item_mimetype(item.get_id()) content = obj.get_content()
# if mimetype.split('/')[0] == "text": self.dom_classifier.text(content)
if self.dom_classifier.domain:
content = item.get_content() print(f'{len(self.dom_classifier.domain)} host {obj.get_id()}')
hosts = self.regex_findall(self.host_regex, item.get_id(), content, r_set=True) # print(self.dom_classifier.domain)
if hosts: for domain in self.dom_classifier.domain:
print(f'{len(hosts)} host {item.get_id()}') if domain:
for host in hosts: self.add_message_to_queue(message=domain, queue='Host')
# print(host)
if not host.endswith('.onion'):
self.add_message_to_queue(message=str(host), queue='Host')
if __name__ == '__main__': if __name__ == '__main__':
module = Hosts() module = Hosts()
module.run() module.run()