perf: use defined compiled regex

The re.compile(regex) definitions were created but never used.
Use compiled_regex.findall() directly instead of re.findall(regex).
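
In miniature, the change is the difference below: a compiled pattern object is parsed once, and calling findall() on it skips the pattern lookup that module-level re.findall() repeats on every call. A minimal before/after sketch (the sample regex, text, and timing harness are illustrative, not taken from the module):

    import re
    import timeit

    regex_hex = '[A-Fa-f0-9]{40,}'         # same shape as the module's hex regex
    cmp_regex_hex = re.compile(regex_hex)  # compiled once, up front

    content = ('a1' * 25 + ' some filler text ') * 200  # hypothetical item content

    # Before: the compiled pattern existed but was never used;
    # re.findall() re-resolves the pattern through re's internal cache on every call.
    before = timeit.timeit(lambda: re.findall(regex_hex, content), number=1000)

    # After: call findall() directly on the precompiled pattern object.
    after = timeit.timeit(lambda: cmp_regex_hex.findall(content), number=1000)

    print('re.findall      : {:.3f}s'.format(before))
    print('compiled.findall: {:.3f}s'.format(after))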
pull/569/head
osagit 2021-05-07 14:24:41 +02:00 committed by GitHub
parent 4d6de7a397
commit 22693dca1c
1 changed file with 111 additions and 97 deletions


@@ -5,6 +5,10 @@
 Dectect Binary and decode it
 """
+
+##################################
+# Import External packages
+##################################
 import time
 import os
 import redis
@@ -13,16 +17,19 @@ from hashlib import sha1
 import magic
 import json
 import datetime
 from pubsublogger import publisher
+import re
+import signal
+from lib import Decoded
+
+##################################
+# Import Project packages
+##################################
+from module.abstract_module import AbstractModule
 from Helper import Process
 from packages import Item
-from lib import Decoded
-import re
-import signal
 
 class TimeoutException(Exception):
     pass
@@ -32,128 +39,135 @@ def timeout_handler(signum, frame):
 signal.signal(signal.SIGALRM, timeout_handler)
 
-def hex_decoder(hexStr):
-    #hexStr = ''.join( hex_string.split(" ") )
-    return bytes(bytearray([int(hexStr[i:i+2], 16) for i in range(0, len(hexStr), 2)]))
-
-def binary_decoder(binary_string):
-    return bytes(bytearray([int(binary_string[i:i+8], 2) for i in range(0, len(binary_string), 8)]))
-
-def base64_decoder(base64_string):
-    return base64.b64decode(base64_string)
-
-def decode_string(content, item_id, item_date, encoded_list, decoder_name, encoded_min_size):
-    find = False
-    for encoded in encoded_list:
-        if len(encoded) >= encoded_min_size:
-            decoded_file = decoder_function[decoder_name](encoded)
-            find = True
-
-            sha1_string = sha1(decoded_file).hexdigest()
-            mimetype = Decoded.get_file_mimetype(decoded_file)
-            if not mimetype:
-                print(item_id)
-                print(sha1_string)
-                raise Exception('Invalid mimetype')
-            Decoded.save_decoded_file_content(sha1_string, decoded_file, item_date, mimetype=mimetype)
-            Decoded.save_item_relationship(sha1_string, item_id)
-            Decoded.create_decoder_matadata(sha1_string, item_id, decoder_name)
-
-            #remove encoded from item content
-            content = content.replace(encoded, '', 1)
-
-            print('{} : {} - {}'.format(item_id, decoder_name, mimetype))
-    if(find):
-        set_out_item(decoder_name, item_id)
-
-    return content
-
-def set_out_item(decoder_name, item_id):
-    publisher.warning(decoder_name+' decoded')
-    #Send to duplicate
-    p.populate_set_out(item_id, 'Duplicate')
-
-    msg = 'infoleak:automatic-detection="'+decoder_name+'";{}'.format(item_id)
-    p.populate_set_out(msg, 'Tags')
-
-if __name__ == '__main__':
-    # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
-    # Port of the redis instance used by pubsublogger
-    publisher.port = 6380
-    # Script is the default channel used for the modules.
-    publisher.channel = 'Script'
-
-    # Section name in bin/packages/modules.cfg
-    config_section = 'Decoder'
-
-    # Setup the I/O queues
-    p = Process(config_section)
-
-    serv_metadata = redis.StrictRedis(
-        host=p.config.get("ARDB_Metadata", "host"),
-        port=p.config.getint("ARDB_Metadata", "port"),
-        db=p.config.getint("ARDB_Metadata", "db"),
-        decode_responses=True)
-
-    # Sent to the logging a description of the module
-    publisher.info("Decoder started")
-
-    regex_binary = '[0-1]{40,}'
-    #regex_hex = '(0[xX])?[A-Fa-f0-9]{40,}'
-    regex_hex = '[A-Fa-f0-9]{40,}'
-    regex_base64 = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)'
-
-    re.compile(regex_binary)
-    re.compile(regex_hex)
-    re.compile(regex_base64)
-
-    # map decoder function
-    decoder_function = {'binary':binary_decoder,'hexadecimal':hex_decoder, 'base64':base64_decoder}
-
-    hex_max_execution_time = p.config.getint("Hex", "max_execution_time")
-    binary_max_execution_time = p.config.getint("Binary", "max_execution_time")
-    base64_max_execution_time = p.config.getint("Base64", "max_execution_time")
-
-    # list all decoder yith regex,
-    decoder_binary = {'name': 'binary', 'regex': regex_binary, 'encoded_min_size': 300, 'max_execution_time': binary_max_execution_time}
-    decoder_hexadecimal = {'name': 'hexadecimal', 'regex': regex_hex, 'encoded_min_size': 300, 'max_execution_time': hex_max_execution_time}
-    decoder_base64 = {'name': 'base64', 'regex': regex_base64, 'encoded_min_size': 40, 'max_execution_time': base64_max_execution_time}
-
-    decoder_order = [ decoder_base64, decoder_binary, decoder_hexadecimal, decoder_base64]
-
-    for decoder in decoder_order:
-        serv_metadata.sadd('all_decoder', decoder['name'])
-
-    # Endless loop getting messages from the input queue
-    while True:
-        # Get one message from the input queue
-        message = p.get_from_set()
-        if message is None:
-            publisher.debug("{} queue is empty, waiting".format(config_section))
-            time.sleep(1)
-            continue
-
+
+class Decoder(AbstractModule):
+    """
+    Decoder module for AIL framework
+    """
+
+    # TODO to lambda expr
+    def hex_decoder(self, hexStr):
+        #hexStr = ''.join( hex_string.split(" ") )
+        return bytes(bytearray([int(hexStr[i:i+2], 16) for i in range(0, len(hexStr), 2)]))
+
+    # TODO to lambda expr
+    def binary_decoder(self, binary_string):
+        return bytes(bytearray([int(binary_string[i:i+8], 2) for i in range(0, len(binary_string), 8)]))
+
+    # TODO to lambda expr
+    def base64_decoder(self, base64_string):
+        return base64.b64decode(base64_string)
+
+    def __init__(self):
+        super(Decoder, self).__init__(logger_channel="script:decoder")
+
+        serv_metadata = redis.StrictRedis(
+            host=self.process.config.get("ARDB_Metadata", "host"),
+            port=self.process.config.getint("ARDB_Metadata", "port"),
+            db=self.process.config.getint("ARDB_Metadata", "db"),
+            decode_responses=True)
+
+        regex_binary = '[0-1]{40,}'
+        #regex_hex = '(0[xX])?[A-Fa-f0-9]{40,}'
+        regex_hex = '[A-Fa-f0-9]{40,}'
+        regex_base64 = '(?:[A-Za-z0-9+/]{4}){2,}(?:[A-Za-z0-9+/]{2}[AEIMQUYcgkosw048]=|[A-Za-z0-9+/][AQgw]==)'
+
+        cmp_regex_binary = re.compile(regex_binary)
+        cmp_regex_hex = re.compile(regex_hex)
+        cmp_regex_base64 = re.compile(regex_base64)
+
+        # map decoder function
+        self.decoder_function = {'binary':self.binary_decoder,'hexadecimal':self.hex_decoder, 'base64':self.base64_decoder}
+
+        hex_max_execution_time = self.process.config.getint("Hex", "max_execution_time")
+        binary_max_execution_time = self.process.config.getint("Binary", "max_execution_time")
+        base64_max_execution_time = self.process.config.getint("Base64", "max_execution_time")
+
+        # list all decoder with regex,
+        decoder_binary = {'name': 'binary', 'regex': cmp_regex_binary, 'encoded_min_size': 300, 'max_execution_time': binary_max_execution_time}
+        decoder_hexadecimal = {'name': 'hexadecimal', 'regex': cmp_regex_hex, 'encoded_min_size': 300, 'max_execution_time': hex_max_execution_time}
+        decoder_base64 = {'name': 'base64', 'regex': cmp_regex_base64, 'encoded_min_size': 40, 'max_execution_time': base64_max_execution_time}
+
+        self.decoder_order = [ decoder_base64, decoder_binary, decoder_hexadecimal, decoder_base64]
+
+        for decoder in self.decoder_order:
+            serv_metadata.sadd('all_decoder', decoder['name'])
+
+        # Waiting time in seconds between two processed messages
+        self.pending_seconds = 1
+
+        # Send module state to logs
+        self.redis_logger.info(f'Module {self.module_name} initialized')
+
+    def compute(self, message):
         obj_id = Item.get_item_id(message)
 
-        # Do something with the message from the queue
+        # Extract info from message
         content = Item.get_item_content(obj_id)
         date = Item.get_item_date(obj_id)
 
-        for decoder in decoder_order: # add threshold and size limit
+        for decoder in self.decoder_order: # add threshold and size limit
             # max execution time on regex
             signal.alarm(decoder['max_execution_time'])
 
             try:
-                encoded_list = re.findall(decoder['regex'], content)
+                encoded_list = decoder['regex'].findall(content)
             except TimeoutException:
                 encoded_list = []
-                p.incr_module_timeout_statistic() # add encoder type
-                print ("{0} processing timeout".format(obj_id))
+                self.process.incr_module_timeout_statistic() # add encoder type
+                self.redis_logger.debug(f"{obj_id} processing timeout")
                 continue
             else:
                 signal.alarm(0)
 
             if(len(encoded_list) > 0):
-                content = decode_string(content, message, date, encoded_list, decoder['name'], decoder['encoded_min_size'])
+                content = self.decode_string(content, message, date, encoded_list, decoder['name'], decoder['encoded_min_size'])
+
+    def decode_string(self, content, item_id, item_date, encoded_list, decoder_name, encoded_min_size):
+        find = False
+        for encoded in encoded_list:
+            if len(encoded) >= encoded_min_size:
+                decoded_file = self.decoder_function[decoder_name](encoded)
+                find = True
+
+                sha1_string = sha1(decoded_file).hexdigest()
+                mimetype = Decoded.get_file_mimetype(decoded_file)
+                if not mimetype:
+                    self.redis_logger.debug(item_id)
+                    self.redis_logger.debug(sha1_string)
+                    raise Exception('Invalid mimetype')
+                Decoded.save_decoded_file_content(sha1_string, decoded_file, item_date, mimetype=mimetype)
+                Decoded.save_item_relationship(sha1_string, item_id)
+                Decoded.create_decoder_matadata(sha1_string, item_id, decoder_name)
+
+                #remove encoded from item content
+                content = content.replace(encoded, '', 1)
+
+                self.redis_logger.debug(f'{item_id} : {decoder_name} - {mimetype}')
+        if(find):
+            self.set_out_item(decoder_name, item_id)
+
+        return content
+
+    def set_out_item(self, decoder_name, item_id):
+        self.redis_logger.warning(f'{decoder_name} decoded')
+
+        # Send to duplicate
+        self.process.populate_set_out(item_id, 'Duplicate')
+
+        # Send to Tags
+        msg = f'infoleak:automatic-detection="{decoder_name}";{item_id}'
+        self.process.populate_set_out(msg, 'Tags')
+
+
+if __name__ == '__main__':
+
+    module = Decoder()
+    module.run()
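
As background for the diff above: the signal.alarm() calls that bracket each findall() cap the regex's execution time, since a crafted item can make a backtracking search run for a very long time. A self-contained sketch of the same SIGALRM pattern (Unix-only; the function name and the 5-second limit are illustrative, not from the module):

    import re
    import signal

    class TimeoutException(Exception):
        pass

    def timeout_handler(signum, frame):
        raise TimeoutException

    signal.signal(signal.SIGALRM, timeout_handler)

    cmp_regex = re.compile('[A-Fa-f0-9]{40,}')

    def findall_with_timeout(content, max_execution_time=5):
        # Arm the alarm: if the search exceeds the limit, SIGALRM fires
        # and the handler converts it into a TimeoutException.
        signal.alarm(max_execution_time)
        try:
            return cmp_regex.findall(content)
        except TimeoutException:
            return []
        finally:
            # Always disarm the alarm so it cannot fire later.
            signal.alarm(0)

    print(findall_with_timeout('deadbeef' * 10))

The module clears the alarm in an else: branch; the finally: used here achieves the same and also covers the exception path.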