#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Duplicate module
====================

This huge module is, in short, checking duplicates.
Its input comes from other modules, namely:
    Credential

Performs comparisons with ssdeep and tlsh

"""

import os
import sys
import time

import datetime

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import Duplicate
from lib.objects.Items import Item


class Duplicates(AbstractModule):
    """Module that compares an item's ssdeep / tlsh hashes with stored hashes.

    For every configured algorithm the item hash is checked against the
    hashes recorded for a range of months; exact hits and hits whose
    similarity reaches the configured threshold are registered through
    ``Duplicate.add_duplicate``.
    """

    def __init__(self):
        """Read the ``Modules_Duplicates`` settings and set up per-algorithm state."""
        super().__init__()

        config_loader = ConfigLoader()
        ssdeep_threshold = config_loader.get_config_int(
            'Modules_Duplicates', 'threshold_duplicate_ssdeep')
        tlsh_threshold = config_loader.get_config_int(
            'Modules_Duplicates', 'threshold_duplicate_tlsh')
        # TODO / FIXME (from the original author): rename me
        self.min_item_size = float(
            config_loader.get_config_str('Modules_Duplicates', 'min_paste_size'))
        self.maximum_month_range = config_loader.get_config_int(
            'Modules_Duplicates', 'maximum_month_range')

        # One entry per algorithm; compute() adds a 'hash' key to each entry
        # for the item currently being processed.
        self.algos = {
            "ssdeep": {"threshold": ssdeep_threshold},
            "tlsh": {"threshold": tlsh_threshold},
        }

        self.logger.info(f"Module: {self.module_name} Launched")

    def compute(self, message):
        """Look for duplicates of the current object and record its hashes.

        :param message: queue message; not read by this method, the object is
            obtained through ``self.get_obj()``.
        :return: ``None``. Items smaller than ``self.min_item_size`` are
            skipped before any hash is computed.
        """
        # NOTE (from the original author): an IOError
        # "CRC Checksum Failed on : {id}" may show up around here.
        item = self.get_obj()

        # Skip items below the configured minimum size.
        if item.get_size() < self.min_item_size:
            return None

        # New hashes are saved under the current year+month ("%Y%m").
        current_ymonth = datetime.datetime.now().strftime("%Y%m")
        months_to_check = Duplicate.get_last_x_month_dates(self.maximum_month_range)

        started_at = time.time()

        # Compute both hashes from the raw bytes of the item.
        raw_content = item.get_content(r_type='bytes')
        self.algos['ssdeep']['hash'] = Duplicate.get_ssdeep_hash(raw_content)
        self.algos['tlsh']['hash'] = Duplicate.get_tlsh_hash(raw_content)

        # TODO: Handle computed duplicates

        nb_duplicates = 0

        for algo, algo_state in self.algos.items():
            obj_hash = algo_state['hash']

            for date_ymonth in months_to_check:
                # Identical hash already stored for that month: record it
                # with a similarity of 100 and move on to the next month.
                if Duplicate.exists_algo_hash_by_month(algo, obj_hash, date_ymonth):
                    Duplicate.add_duplicate(algo, obj_hash, 100, 'item', '',
                                            item.get_id(), date_ymonth)
                    nb_duplicates += 1
                    continue

                # Otherwise compare against every hash stored for that month.
                for known_hash in Duplicate.get_algo_hashs_by_month(algo, date_ymonth):
                    # FIXME (from the original author): try / catch
                    # 'hash not comparable, bad hash' errors here.
                    similarity = Duplicate.get_algo_similarity(algo, obj_hash, known_hash)
                    print(f'[{algo}] comparing: {obj_hash} and {known_hash} similarity: {similarity}')  # DEBUG:
                    if similarity >= algo_state['threshold']:
                        Duplicate.add_duplicate(algo, known_hash, similarity, 'item', '',
                                                item.get_id(), date_ymonth)
                        nb_duplicates += 1

            # Save the hash of this item for the current month.
            Duplicate.save_object_hash(algo, current_ymonth, algo_state['hash'], item.get_id())

        if nb_duplicates:
            self.redis_logger.info(f'Duplicate;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {nb_duplicates};{self.obj.get_global_id()}')

        elapsed = time.time() - started_at
        print(f'{self.obj.get_global_id()} Processed in {elapsed} sec')


if __name__ == "__main__":
    Duplicates().run()