mirror of https://github.com/CIRCL/AIL-framework
				
				
				
			
		
			
				
	
	
		
			106 lines
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
			
		
		
	
	
			106 lines
		
	
	
		
			3.7 KiB
		
	
	
	
		
			Python
		
	
	
		
			Executable File
		
	
| #!/usr/bin/env python3
 | |
| # -*-coding:UTF-8 -*
 | |
| 
 | |
| """
 | |
| The Duplicate module
 | |
| ====================
 | |
| 
 | |
| In short, this module checks for duplicates.
 | |
| Its input comes from other modules, namely:
 | |
|     Credential
 | |
| 
 | |
| Performs comparisons with ssdeep and tlsh
 | |
| 
 | |
| """
 | |
| 
 | |
| import os
 | |
| import sys
 | |
| import time
 | |
| 
 | |
| # from datetime import datetime, timedelta
 | |
| import datetime
 | |
| 
 | |
| sys.path.append(os.environ['AIL_BIN'])
 | |
| ##################################
 | |
| # Import Project packages
 | |
| ##################################
 | |
| from modules.abstract_module import AbstractModule
 | |
| from lib.ConfigLoader import ConfigLoader
 | |
| from lib import Duplicate
 | |
| from lib.objects.Items import Item
 | |
| 
 | |
| 
 | |
class Duplicates(AbstractModule):
    """Module that checks each item's ssdeep and tlsh hashes for duplicates.

    For every processed item, both hashes are computed and looked up among
    the hashes stored for the configured range of months. Matches are
    registered through ``Duplicate.add_duplicate`` and the item's own hashes
    are then saved under the current month.
    """

    def __init__(self):
        """Read the ``Modules_Duplicates`` settings and prepare per-algorithm state."""
        super().__init__()

        conf = ConfigLoader()
        section = 'Modules_Duplicates'
        ssdeep_threshold = conf.get_config_int(section, 'threshold_duplicate_ssdeep')
        tlsh_threshold = conf.get_config_int(section, 'threshold_duplicate_tlsh')
        # Items whose size is below this value are skipped by compute().
        # TODO / FIXME: rename the 'min_paste_size' setting.
        self.min_item_size = float(conf.get_config_str(section, 'min_paste_size'))
        self.maximum_month_range = conf.get_config_int(section, 'maximum_month_range')

        # One entry per algorithm; compute() adds a 'hash' key for the
        # item currently being processed.
        self.algos = {
            "ssdeep": {"threshold": ssdeep_threshold},
            "tlsh": {"threshold": tlsh_threshold},
        }

        self.logger.info(f"Module: {self.module_name} Launched")

    def compute(self, message):
        """Search the stored hashes for duplicates of the current item.

        ``message`` is not read here; the item is obtained from
        ``self.get_obj()``. Always returns ``None``.
        """
        # NOTE(review): the original author recorded a known failure here:
        # IOError: "CRC Checksum Failed on : {id}" - source of the error not
        # visible in this file.
        item = self.get_obj()

        # Too small to be worth comparing
        if item.get_size() < self.min_item_size:
            return None

        current_ymonth = datetime.datetime.now().strftime("%Y%m")
        months = Duplicate.get_last_x_month_dates(self.maximum_month_range)

        started = time.time()

        # Hash the raw content once per algorithm
        raw = item.get_content(r_type='bytes')
        self.algos['ssdeep']['hash'] = Duplicate.get_ssdeep_hash(raw)
        self.algos['tlsh']['hash'] = Duplicate.get_tlsh_hash(raw)

        # TODO: Handle computed duplicates

        found = 0
        for algo, settings in self.algos.items():
            item_hash = settings['hash']
            for ymonth in months:
                if Duplicate.exists_algo_hash_by_month(algo, item_hash, ymonth):
                    # Same hash already stored for this month: registered
                    # with a similarity of 100, no pairwise scan needed.
                    Duplicate.add_duplicate(algo, item_hash, 100, 'item', '', item.get_id(), ymonth)
                    found += 1
                    continue

                # Otherwise compare against every hash stored for this month
                for known_hash in Duplicate.get_algo_hashs_by_month(algo, ymonth):
                    # FIXME: try - catch 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
                    score = Duplicate.get_algo_similarity(algo, item_hash, known_hash)
                    print(f'[{algo}] comparing: {item_hash} and {known_hash} similarity: {score}')  # DEBUG:
                    if score >= settings['threshold']:
                        Duplicate.add_duplicate(algo, known_hash, score, 'item', '', item.get_id(), ymonth)
                        found += 1

            # Store this item's hash under the current month
            Duplicate.save_object_hash(algo, current_ymonth, settings['hash'], item.get_id())

        if found:
            self.redis_logger.info(f'Duplicate;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {found};{item.get_id()}')

        finished = time.time()
        print(f'{item.get_id()} Processed in {finished-started} sec')
 | |
| 
 | |
| 
 | |
if __name__ == "__main__":
    # Script entry point: instantiate the module and call its run() method.
    Duplicates().run()
 |