From 460100350951bed08c1825a1f95e038c323cc68f Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Wed, 20 May 2020 17:03:58 +0200
Subject: [PATCH] fix: [Mail module] regex timeout

---
 bin/Credential.py       | 144 +++++++++++++++-------------------
 bin/lib/regex_helper.py |  75 +++++++++++++++++++++
 bin/lib/statistics.py   |  18 +++++
 3 files changed, 147 insertions(+), 90 deletions(-)
 create mode 100755 bin/lib/regex_helper.py
 create mode 100755 bin/lib/statistics.py

diff --git a/bin/Credential.py b/bin/Credential.py
index 23f771a8..49d1ccfb 100755
--- a/bin/Credential.py
+++ b/bin/Credential.py
@@ -7,7 +7,7 @@ The Credential Module
 
 This module is consuming the Redis-list created by the Categ module.
 
-It apply credential regexes on paste content and warn if above a threshold.
+It applies credential regexes on item content and warns if above a threshold.
 
 It also split the username and store it into redis for searching purposes.
 
@@ -24,24 +24,37 @@ Redis organization:
 """
 
 import time
+import os
 import sys
-from packages import Paste
-from pubsublogger import publisher
-from Helper import Process
 import datetime
 import re
 import redis
 
 from pyfaup.faup import Faup
 
+from pubsublogger import publisher
+from Helper import Process
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
+import Item
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
+import ConfigLoader
+import regex_helper
+
+## LOAD CONFIG ##
+config_loader = ConfigLoader.ConfigLoader()
+server_cred = config_loader.get_redis_conn("ARDB_TermCred")
+server_statistics = config_loader.get_redis_conn("ARDB_Statistics")
+
+minimumLengthThreshold = config_loader.get_config_int("Credential", "minimumLengthThreshold")
+criticalNumberToAlert = config_loader.get_config_int("Credential", "criticalNumberToAlert")
+minTopPassList = config_loader.get_config_int("Credential", "minTopPassList")
+
+config_loader = None
+## -- ##
+
 import signal
 
-class TimeoutException(Exception):
-    pass
-
-def timeout_handler(signum, frame):
-    raise TimeoutException
-
-signal.signal(signal.SIGALRM, timeout_handler)
 
 max_execution_time = 30
 
 #split username with spec. char or with upper case, distinguish start with upper
@@ -58,107 +71,58 @@ if __name__ == "__main__":
     publisher.port = 6380
     publisher.channel = "Script"
     config_section = "Credential"
+    module_name = "Credential"
 
     p = Process(config_section)
     publisher.info("Find credentials")
 
-    minimumLengthThreshold = p.config.getint("Credential", "minimumLengthThreshold")
-
     faup = Faup()
 
-    server_cred = redis.StrictRedis(
-        host=p.config.get("ARDB_TermCred", "host"),
-        port=p.config.get("ARDB_TermCred", "port"),
-        db=p.config.get("ARDB_TermCred", "db"),
-        decode_responses=True)
-
-    server_statistics = redis.StrictRedis(
-        host=p.config.get("ARDB_Statistics", "host"),
-        port=p.config.getint("ARDB_Statistics", "port"),
-        db=p.config.getint("ARDB_Statistics", "db"),
-        decode_responses=True)
-
-    criticalNumberToAlert = p.config.getint("Credential", "criticalNumberToAlert")
-    minTopPassList = p.config.getint("Credential", "minTopPassList")
-
-    regex_web = "((?:https?:\/\/)[-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
+    regex_web = "((?:https?:\/\/)[-_\.0-9a-zA-Z]+\.[0-9a-zA-Z]+)"
     #regex_cred = "[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:[a-zA-Z0-9\_\-]+"
     regex_cred = "[a-zA-Z0-9\\._-]+@[a-zA-Z0-9\\.-]+\.[a-zA-Z]{2,6}[\\rn :\_\-]{1,10}[a-zA-Z0-9\_\-]+"
     regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:"
 
+    redis_cache_key = regex_helper.generate_redis_cache_key(module_name)
+
     while True:
         message = p.get_from_set()
+
         if message is None:
             publisher.debug("Script Credential is Idling 10s")
-            #print('sleeping 10s')
             time.sleep(10)
             continue
 
-        filepath, count = message.split(' ')
+        item_id, count = message.split()
 
-        paste = Paste.Paste(filepath)
-        content = paste.get_p_content()
+        item_content = Item.get_item_content(item_id)
 
-        item_id = filepath
+        # Extract all credentials
+        all_credentials = regex_helper.regex_findall(module_name, redis_cache_key, regex_cred, item_id, item_content, max_time=max_execution_time)
 
-        # max execution time on regex
-        signal.alarm(max_execution_time)
-        try:
-            creds = set(re.findall(regex_cred, content))
-        except TimeoutException:
-            p.incr_module_timeout_statistic() # add encoder type
-            err_mess = "Credential: processing timeout: {}".format(item_id)
-            print(err_mess)
-            publisher.info(err_mess)
-            continue
-        else:
-            signal.alarm(0)
-
-        if len(creds) == 0:
+        if not all_credentials:
             continue
 
-        signal.alarm(max_execution_time)
-        try:
-            sites = re.findall(regex_web, content) #Use to count occurences
-        except TimeoutException:
-            p.incr_module_timeout_statistic()
-            err_mess = "Credential: site, processing timeout: {}".format(item_id)
-            print(err_mess)
-            publisher.info(err_mess)
-            sites = []
-        else:
-            signal.alarm(0)
+        all_sites = regex_helper.regex_findall(module_name, redis_cache_key, regex_web, item_id, item_content, max_time=max_execution_time)
 
-        sites_set = set(sites)
+        message = 'Checked {} credentials found.'.format(len(all_credentials))
+        if all_sites:
+            message += ' Related websites: {}'.format( (', '.join(all_sites)) )
+        print(message)
 
-        message = 'Checked {} credentials found.'.format(len(creds))
-        if sites_set:
-            message += ' Related websites: {}'.format( (', '.join(sites_set)) )
+        to_print = 'Credential;{};{};{};{};{}'.format(Item.get_source(item_id), Item.get_item_date(item_id), Item.get_item_basename(item_id), message, item_id)
 
-        to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_rel_path)
-
-        print('\n '.join(creds))
 
         #num of creds above tresh, publish an alert
-        if len(creds) > criticalNumberToAlert:
-            print("========> Found more than 10 credentials in this file : {}".format( filepath ))
+        if len(all_credentials) > criticalNumberToAlert:
+            print("========> Found more than {} credentials in this file : {}".format(criticalNumberToAlert, item_id))
             publisher.warning(to_print)
             #Send to duplicate
-            p.populate_set_out(filepath, 'Duplicate')
+            p.populate_set_out(item_id, 'Duplicate')
 
-            msg = 'infoleak:automatic-detection="credential";{}'.format(filepath)
+            msg = 'infoleak:automatic-detection="credential";{}'.format(item_id)
             p.populate_set_out(msg, 'Tags')
 
-            #Put in form, count occurences, then send to moduleStats
-            signal.alarm(max_execution_time)
-            try:
-                site_occurence = re.findall(regex_site_for_stats, content)
-            except TimeoutException:
-                p.incr_module_timeout_statistic()
-                err_mess = "Credential: site occurence, processing timeout: {}".format(item_id)
-                print(err_mess)
-                publisher.info(err_mess)
-                site_occurence = []
-            else:
-                signal.alarm(0)
+            site_occurence = regex_helper.regex_findall(module_name, redis_cache_key, regex_site_for_stats, item_id, item_content, max_time=max_execution_time, r_set=False)
 
             creds_sites = {}
 
@@ -169,7 +133,7 @@ if __name__ == "__main__":
             else:
                 creds_sites[site_domain] = 1
 
-        for url in sites:
+        for url in all_sites:
             faup.decode(url)
             domain = faup.get()['domain']
             ## TODO: # FIXME: remove me
@@ -184,15 +148,15 @@ if __name__ == "__main__":
 
         for site, num in creds_sites.items():
             # Send for each different site to moduleStats
-            mssg = 'credential;{};{};{}'.format(num, site, paste.p_date)
+            mssg = 'credential;{};{};{}'.format(num, site, Item.get_item_date(item_id))
             print(mssg)
             p.populate_set_out(mssg, 'ModuleStats')
 
-        if sites_set:
-            print("=======> Probably on : {}".format(', '.join(sites_set)))
+        if all_sites:
+            print("=======> Probably on : {}".format(', '.join(all_sites)))
 
         date = datetime.datetime.now().strftime("%Y%m")
-        for cred in creds:
+        for cred in all_credentials:
             maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0]
             faup.decode(maildomains)
             tld = faup.get()['tld']
@@ -204,17 +168,17 @@ if __name__ == "__main__":
             server_statistics.hincrby('credential_by_tld:'+date, tld, 1)
         else:
             publisher.info(to_print)
-            print('found {} credentials'.format(len(creds)))
+            print('found {} credentials'.format(len(all_credentials)))
 
         #for searching credential in termFreq
-        for cred in creds:
+        for cred in all_credentials:
             cred = cred.split('@')[0] #Split to ignore mail address
 
             #unique number attached to unique path
             uniq_num_path = server_cred.incr(REDIS_KEY_NUM_PATH)
-            server_cred.hmset(REDIS_KEY_ALL_PATH_SET, {filepath: uniq_num_path})
-            server_cred.hmset(REDIS_KEY_ALL_PATH_SET_REV, {uniq_num_path: filepath})
+            server_cred.hmset(REDIS_KEY_ALL_PATH_SET, {item_id: uniq_num_path})
+            server_cred.hmset(REDIS_KEY_ALL_PATH_SET_REV, {uniq_num_path: item_id})
 
             #unique number attached to unique username
             uniq_num_cred = server_cred.hget(REDIS_KEY_ALL_CRED_SET, cred)
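
For reviewers, some background on why the alarm-based timeout had to go (illustrative note, not part of the patch): in practice CPython only runs Python-level signal handlers between bytecode instructions, and re.findall() is a single C-level call, so a SIGALRM raised while the regex engine is backtracking is not handled until the match attempt finishes — which for a pathological input is effectively never. Running the regex in a child process that can be terminate()d enforces a real wall-clock bound. A minimal demonstration of the failure mode (the pattern below is hypothetical, not one of the module's regexes):

    import re

    # Classic catastrophic backtracking: '(a+)+b' against a string of only
    # 'a's forces the engine to try exponentially many groupings before
    # failing. Each extra 'a' roughly doubles the running time; at 25
    # characters this already takes seconds, and signal.alarm() cannot
    # interrupt it mid-match.
    re.findall(r'(a+)+b', 'a' * 25)
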
diff --git a/bin/lib/regex_helper.py b/bin/lib/regex_helper.py
new file mode 100755
index 00000000..04555524
--- /dev/null
+++ b/bin/lib/regex_helper.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+"""
+Regex Helper
+"""
+
+import os
+import re
+import sys
+import uuid
+
+from multiprocessing import Process as Proc
+
+sys.path.append(os.environ['AIL_BIN'])
+from pubsublogger import publisher
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib'))
+import ConfigLoader
+import statistics
+
+## LOAD CONFIG ##
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_cache = config_loader.get_redis_conn("Redis_Cache")
+config_loader = None
+## -- ##
+
+publisher.port = 6380
+publisher.channel = "Script"
+
+def generate_redis_cache_key(module_name):
+    return '{}_extracted:{}'.format(module_name, str(uuid.uuid4()))
+
+def _regex_findall(redis_key, regex, item_content, r_set):
+    all_items = re.findall(regex, item_content)
+    if r_set:
+        if len(all_items) > 1:
+            r_serv_cache.sadd(redis_key, *all_items)
+            r_serv_cache.expire(redis_key, 360)
+        elif all_items:
+            r_serv_cache.sadd(redis_key, all_items[0])
+            r_serv_cache.expire(redis_key, 360)
+    else:
+        if len(all_items) > 1:
+            r_serv_cache.lpush(redis_key, *all_items)
+            r_serv_cache.expire(redis_key, 360)
+        elif all_items:
+            r_serv_cache.lpush(redis_key, all_items[0])
+            r_serv_cache.expire(redis_key, 360)
+
+def regex_findall(module_name, redis_key, regex, item_id, item_content, max_time=30, r_set=True):
+
+    proc = Proc(target=_regex_findall, args=(redis_key, regex, item_content, r_set))
+    try:
+        proc.start()
+        proc.join(max_time)
+        if proc.is_alive():
+            proc.terminate()
+            statistics.incr_module_timeout_statistic(module_name)
+            err_mess = "{}: processing timeout: {}".format(module_name, item_id)
+            print(err_mess)
+            publisher.info(err_mess)
+            return []
+        else:
+            if r_set:
+                all_items = r_serv_cache.smembers(redis_key)
+            else:
+                all_items = r_serv_cache.lrange(redis_key, 0, -1)
+            r_serv_cache.delete(redis_key)
+            proc.terminate()
+            return all_items
+    except KeyboardInterrupt:
+        print("Caught KeyboardInterrupt, terminating workers")
+        proc.terminate()
+        sys.exit(0)

diff --git a/bin/lib/statistics.py b/bin/lib/statistics.py
new file mode 100755
index 00000000..f0944754
--- /dev/null
+++ b/bin/lib/statistics.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import datetime
+import os
+import redis
+import sys
+
+sys.path.append(os.path.join(os.environ['AIL_BIN'], 'lib/'))
+import ConfigLoader
+
+config_loader = ConfigLoader.ConfigLoader()
+r_serv_statistics = config_loader.get_redis_conn("ARDB_Statistics")
+config_loader = None
+
+def incr_module_timeout_statistic(module_name):
+    curr_date = datetime.date.today()
+    r_serv_statistics.hincrby(curr_date.strftime("%Y%m%d"), 'paste_by_modules_timeout:{}'.format(module_name), 1)
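
For reviewers, a sketch of how a module is expected to consume the new helper (hypothetical caller in an AIL environment; regex_cred, item_id and item_content stand in for the values a module reads from its input queue, as in Credential.py above):

    import regex_helper

    module_name = 'Credential'
    # One UUID-suffixed key per module instance. Matches transit through
    # Redis_Cache because a child process that may be terminate()d cannot
    # hand a return value back to its parent.
    redis_cache_key = regex_helper.generate_redis_cache_key(module_name)

    # re.findall runs in a child process; if it is still alive after
    # max_time seconds it is terminated, the timeout is recorded via
    # statistics.incr_module_timeout_statistic(), and [] is returned.
    all_credentials = regex_helper.regex_findall(module_name, redis_cache_key,
                                                 regex_cred, item_id,
                                                 item_content, max_time=30)

The 360-second EXPIRE set by the child also bounds how long extracted matches can linger in Redis_Cache if the parent dies between the child's write and the parent's DELETE.
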