mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			
		
			
				
	
	
		
			105 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
			
		
		
	
	
			105 lines
		
	
	
		
			3.6 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
#!/usr/bin/env python3
 | 
						|
# -*-coding:UTF-8 -*
 | 
						|
 | 
						|
"""
 | 
						|
The Duplicate module
 | 
						|
====================
 | 
						|
 | 
						|
In short, this module checks for duplicates.
 | 
						|
Its input comes from other modules, namely:
 | 
						|
    Credential
 | 
						|
 | 
						|
Perform comparisons with ssdeep and tlsh
 | 
						|
 | 
						|
"""
 | 
						|
 | 
						|
import os
 | 
						|
import sys
 | 
						|
import time
 | 
						|
 | 
						|
# from datetime import datetime, timedelta
 | 
						|
import datetime
 | 
						|
 | 
						|
sys.path.append(os.environ['AIL_BIN'])
 | 
						|
##################################
 | 
						|
# Import Project packages
 | 
						|
##################################
 | 
						|
from modules.abstract_module import AbstractModule
 | 
						|
from lib.ConfigLoader import ConfigLoader
 | 
						|
from lib import Duplicate
 | 
						|
 | 
						|
 | 
						|
class Duplicates(AbstractModule):
    """Duplicates module.

    For each object handed to ``compute()``, the ssdeep and tlsh hashes of
    its content are compared with the hashes stored for the months returned
    by ``Duplicate.get_last_x_month_dates(maximum_month_range)``. Every match
    is recorded with ``Duplicate.add_duplicate`` and the new hashes are then
    saved under the current month.
    """

    def __init__(self):
        """Read the ``Modules_Duplicates`` settings and set up the algorithms."""
        super().__init__()

        config_loader = ConfigLoader()
        threshold_ssdeep = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_ssdeep')
        threshold_tlsh = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_tlsh')
        # TODO: FIXME: rename the 'min_paste_size' option
        self.min_item_size = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size'))
        self.maximum_month_range = config_loader.get_config_int('Modules_Duplicates', 'maximum_month_range')

        # Per-algorithm settings. compute() additionally stores the hash of
        # the object being processed under the 'hash' key of each entry.
        self.algos = {
            "ssdeep": {"threshold": threshold_ssdeep},
            "tlsh": {"threshold": threshold_tlsh},
        }

        self.logger.info(f"Module: {self.module_name} Launched")

    def compute(self, message):
        """Search for duplicates of the current object and record them.

        Args:
            message: Message received by the module. It is not read here;
                the object to process is obtained from ``self.get_obj()``.

        Returns:
            None. Objects smaller than ``self.min_item_size`` are skipped
            without being hashed or saved.
        """
        # IOError: "CRC Checksum Failed on : {id}"

        item = self.get_obj()

        # Check file size
        if item.get_size() < self.min_item_size:
            return None

        curr_date_ymonth = datetime.datetime.now().strftime("%Y%m")
        last_month_dates = Duplicate.get_last_x_month_dates(self.maximum_month_range)

        # perf_counter() is not affected by system clock adjustments, unlike
        # time.time(), so the elapsed time reported below is reliable.
        started = time.perf_counter()

        # Get Hashs
        content = item.get_content(r_type='bytes')
        self.algos['ssdeep']['hash'] = Duplicate.get_ssdeep_hash(content)
        self.algos['tlsh']['hash'] = Duplicate.get_tlsh_hash(content)

        # TODO: Handle computed duplicates

        item_id = item.get_id()
        nb_duplicates = 0

        for algo, algo_conf in self.algos.items():
            obj_hash = algo_conf['hash']
            threshold = algo_conf['threshold']

            for date_ymonth in last_month_dates:
                # Identical hash already stored for this month: record it with
                # a similarity of 100 and skip the pairwise comparison.
                if Duplicate.exists_algo_hash_by_month(algo, obj_hash, date_ymonth):
                    Duplicate.add_duplicate(algo, obj_hash, 100, 'item', '', item_id, date_ymonth)
                    nb_duplicates += 1
                    continue

                for stored_hash in Duplicate.get_algo_hashs_by_month(algo, date_ymonth):
                    # FIXME: try - catch 'hash not comparable, bad hash'
                    similarity = Duplicate.get_algo_similarity(algo, obj_hash, stored_hash)
                    print(f'[{algo}] comparing: {obj_hash} and {stored_hash} similarity: {similarity}')  # DEBUG:
                    if similarity >= threshold:
                        Duplicate.add_duplicate(algo, stored_hash, similarity, 'item', '', item_id, date_ymonth)
                        nb_duplicates += 1

            # Save Hashs
            Duplicate.save_object_hash(algo, curr_date_ymonth, obj_hash, item_id)

        if nb_duplicates:
            self.logger.info(f'Duplicates {nb_duplicates};{self.obj.get_global_id()}')

        elapsed = time.perf_counter() - started
        print(f'{self.obj.get_global_id()} Processed in {elapsed} sec')
 | 
						|
 | 
						|
 | 
						|
if __name__ == "__main__":
    # Script entry point: instantiate the module and call its run() method.
    Duplicates().run()
 |