2022-07-13 15:10:27 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
|
|
|
|
"""
|
|
|
|
The Duplicate module
|
|
|
|
====================
|
|
|
|
|
|
|
|
In short, this module checks items for duplicates.
|
|
|
|
Its input comes from other modules, namely:
|
|
|
|
Credential
|
|
|
|
|
|
|
|
Performs comparisons with ssdeep and tlsh.
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
|
2022-10-25 16:25:19 +02:00
|
|
|
# from datetime import datetime, timedelta
|
2022-07-13 15:10:27 +02:00
|
|
|
import datetime
|
|
|
|
|
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
|
|
|
from modules.abstract_module import AbstractModule
|
|
|
|
from lib.ConfigLoader import ConfigLoader
|
|
|
|
from lib import Duplicate
|
|
|
|
from lib.objects.Items import Item
|
|
|
|
|
|
|
|
|
|
|
|
class Duplicates(AbstractModule):
    """Duplicates module.

    For every processed item, the ssdeep and tlsh hashes of its content are
    computed and looked up, through the ``lib.Duplicate`` helpers, among the
    hashes returned for the configured range of months. The item's own
    hashes are then saved under the current month.
    """

    def __init__(self):
        """Read thresholds and limits from the ``Modules_Duplicates`` config section."""
        super().__init__()

        config = ConfigLoader()
        section = 'Modules_Duplicates'

        # Per-algorithm settings. compute() additionally stores the hash of
        # the item being processed under the 'hash' key of each entry.
        self.algos = {
            "ssdeep": {"threshold": config.get_config_int(section, 'threshold_duplicate_ssdeep')},
            "tlsh": {"threshold": config.get_config_int(section, 'threshold_duplicate_tlsh')},
        }
        # # TODO: # FIXME: rename me (config key still says "paste")
        self.min_item_size = float(config.get_config_str(section, 'min_paste_size'))
        self.maximum_month_range = config.get_config_int(section, 'maximum_month_range')

        self.logger.info(f"Module: {self.module_name} Launched")

    def compute(self, message):
        """Look for duplicates of the current object and save its hashes.

        ``message`` is not read here: the item comes from ``self.get_obj()``.
        Items whose size is below ``self.min_item_size`` are skipped and
        ``None`` is returned immediately.
        """
        # IOError: "CRC Checksum Failed on : {id}"
        item = self.get_obj()

        # Skip items that are too small
        if item.get_size() < self.min_item_size:
            return None

        current_ymonth = datetime.datetime.now().strftime("%Y%m")
        months = Duplicate.get_last_x_month_dates(self.maximum_month_range)

        started = time.time()

        # Hash the item content once per algorithm
        raw_content = item.get_content(r_type='bytes')
        self.algos['ssdeep']['hash'] = Duplicate.get_ssdeep_hash(raw_content)
        self.algos['tlsh']['hash'] = Duplicate.get_tlsh_hash(raw_content)

        # TODO: Handle computed duplicates

        nb_duplicates = 0

        for algo, settings in self.algos.items():
            item_hash = settings['hash']

            for ymonth in months:
                if not Duplicate.exists_algo_hash_by_month(algo, item_hash, ymonth):
                    # Hash not reported for that month: compare against
                    # every hash returned for it.
                    for known_hash in Duplicate.get_algo_hashs_by_month(algo, ymonth):
                        # # FIXME: try - catch 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
                        score = Duplicate.get_algo_similarity(algo, item_hash, known_hash)
                        print(f'[{algo}] comparing: {item_hash} and {known_hash} similarity: {score}')  # DEBUG:
                        if score >= settings['threshold']:
                            Duplicate.add_duplicate(algo, known_hash, score, 'item', '', item.get_id(), ymonth)
                            nb_duplicates += 1
                    continue

                # Same hash reported for that month: recorded with a similarity of 100
                Duplicate.add_duplicate(algo, item_hash, 100, 'item', '', item.get_id(), ymonth)
                nb_duplicates += 1

            # Save the item's own hash under the current month
            Duplicate.save_object_hash(algo, current_ymonth, settings['hash'], item.get_id())

        if nb_duplicates:
            self.redis_logger.info(f'Duplicate;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {nb_duplicates};{self.obj.get_global_id()}')

        finished = time.time()
        print(f'{self.obj.get_global_id()} Processed in {finished - started} sec')
|
2022-10-25 16:25:19 +02:00
|
|
|
# self.redis_logger.debug('{}Processed in {} sec'.format(to_print, y-x))
|
2022-07-13 15:10:27 +02:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the module and start it.
    Duplicates().run()
|