AIL-framework/bin/lib/Duplicate.py

115 lines
3.7 KiB
Python
Executable File

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
import os
import ssdeep
import sys
import time
import tlsh
import datetime
sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from lib.ConfigLoader import ConfigLoader
# Import-time setup: open the DB connection and read the module settings once.
config_loader = ConfigLoader()
# Connection handle used by every storage helper in this module.
r_serv_db = config_loader.get_db_conn("Kvrocks_DB")
# Read as a string from the 'Modules_Duplicates' section ('min_paste_size' option) and converted to float.
MIN_ITEM_SIZE = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: RENAME ME
# Drop the reference to the loader now that the values above have been extracted.
config_loader = None
#
#
# Hash != Duplicates => New correlation HASH => check if same hash if duplicate == 100
#
# Object Hash => correlation decoded => don't need correlation to exist
#
# New CORRELATION => HASH
# -> compute/get(if exist we have a correlation) hash -> get correlation same hash
#
#
# Duplicates between different objects ?????
# Diff Decoded -> Item => Diff Item decoded - Item
#
# Duplicates domains != Duplicates items
def get_ssdeep_hash(content):
    """Return the result of ``ssdeep.hash`` for ``content``."""
    fuzzy_hash = ssdeep.hash(content)
    return fuzzy_hash
def get_ssdeep_similarity(obj_hash, other_hash):
    """Return the result of ``ssdeep.compare`` for the two given hashes."""
    score = ssdeep.compare(obj_hash, other_hash)
    return score
def get_tlsh_hash(content):
    """Return the result of ``tlsh.hash`` for ``content``."""
    digest = tlsh.hash(content)
    return digest
def get_tlsh_similarity(obj_hash, other_hash):
    """Turn the ``tlsh.diffxlen`` value of two hashes into a similarity score.

    The value returned by ``tlsh.diffxlen`` is capped at 100 and then
    subtracted from 100, so a diff of 0 yields 100 and any diff of 100 or
    more yields 0.
    """
    distance = tlsh.diffxlen(obj_hash, other_hash)
    # Cap the diff so the resulting score never drops below 0.
    return 100 - min(distance, 100)
def get_algo_similarity(algo, obj_hash, other_hash):
    """Compare two hashes with the similarity function matching ``algo``.

    ``algo`` is expected to be ``'ssdeep'`` or ``'tlsh'``; for any other
    value nothing is computed and ``None`` is returned.
    """
    if algo == 'ssdeep':
        return get_ssdeep_similarity(obj_hash, other_hash)
    if algo == 'tlsh':
        return get_tlsh_similarity(obj_hash, other_hash)
    # Unknown algorithm: no comparison available.
    return None
def get_algo_hashs_by_month(algo, date_ymonth):
    """Return the field names (hashes) of the ``algo``/``date_ymonth`` hash table."""
    month_key = f'duplicates:hashs:{algo}:{date_ymonth}'
    return r_serv_db.hkeys(month_key)
def exists_algo_hash_by_month(algo, hash, date_ymonth):
    """Tell whether ``hash`` is a field of the ``algo``/``date_ymonth`` hash table."""
    month_key = f'duplicates:hashs:{algo}:{date_ymonth}'
    return r_serv_db.hexists(month_key, hash)
def get_object_id_by_hash(algo, hash, date_ymonth):
    """Return the value stored under ``hash`` in the ``algo``/``date_ymonth`` hash table."""
    month_key = f'duplicates:hashs:{algo}:{date_ymonth}'
    return r_serv_db.hget(month_key, hash)
def save_object_hash(algo, date_ymonth, hash, obj_id):
    """Store ``obj_id`` under ``hash`` in the ``algo``/``date_ymonth`` hash table."""
    month_key = f'duplicates:hashs:{algo}:{date_ymonth}'
    r_serv_db.hset(month_key, hash, obj_id)
def get_duplicates(obj_type, subtype, id):
    """Return the duplicates recorded for an object, grouped by duplicate id.

    Each member of the object's duplicate set has the form
    ``<similarity>:<algo>:<other id>``. The result maps every other id to a
    list of ``{'algo': ..., 'similarity': int}`` dicts.
    """
    members = r_serv_db.smembers(f'obj:duplicates:{obj_type}:{subtype}:{id}')
    grouped = {}
    for member in members:
        # Split on the first two ':' only; the remainder is kept whole as the id.
        score, algo_name, other_id = member.split(':', 2)
        entry = {'algo': algo_name, 'similarity': int(score)}
        grouped.setdefault(other_id, []).append(entry)
    return grouped
def _add_obj_duplicate(algo, similarity, obj_type, subtype, id, id_2):
    """Add ``<similarity>:<algo>:<id_2>`` to the duplicate set of object ``id``."""
    set_key = f'obj:duplicates:{obj_type}:{subtype}:{id}'
    member = f'{similarity}:{algo}:{id_2}'
    r_serv_db.sadd(set_key, member)
def add_obj_duplicate(algo, hash, similarity, obj_type, subtype, id, date_ymonth):
    """Record a two-way duplicate link between ``id`` and the object stored under ``hash``.

    The other object's id is looked up from ``hash`` in the
    ``algo``/``date_ymonth`` hash table. When ``similarity`` is 100, every
    object already recorded as a 100% duplicate of ``id`` for the same
    ``algo`` is also (re)linked with ``id`` in both directions.

    Fix: the loop used to pass ``dups[dup_id]`` (the list of algo dicts) as
    the object id, which wrote the list's repr into the Redis key and set
    member. It now passes ``dup_id``.

    NOTE(review): nothing here checks that the hash lookup found an object;
    presumably callers only pass hashes that were saved beforehand -- confirm.
    """
    obj2_id = get_object_id_by_hash(algo, hash, date_ymonth)
    # same content
    if similarity == 100:
        dups = get_duplicates(obj_type, subtype, id)
        for dup_id, algo_dicts in dups.items():
            is_exact_dup = any(
                algo_dict['similarity'] == 100 and algo_dict['algo'] == algo
                for algo_dict in algo_dicts
            )
            if is_exact_dup:
                # Link with the duplicate's id, not with its list of algo dicts.
                _add_obj_duplicate(algo, similarity, obj_type, subtype, id, dup_id)
                _add_obj_duplicate(algo, similarity, obj_type, subtype, dup_id, id)
    # Always store the relation in both directions.
    _add_obj_duplicate(algo, similarity, obj_type, subtype, id, obj2_id)
    _add_obj_duplicate(algo, similarity, obj_type, subtype, obj2_id, id)
def get_last_x_month_dates(nb_months):
    """Return ``YYYYMM`` strings for the current month and the ``nb_months`` before it.

    The list starts with the current month and goes backwards in time, so it
    holds ``nb_months + 1`` entries.
    """
    cursor = datetime.datetime.now()
    months = [cursor.strftime("%Y%m")]
    for _ in range(nb_months):
        # The day before the first of a month is the last day of the previous month.
        first_of_month = cursor.replace(day=1)
        cursor = first_of_month - datetime.timedelta(days=1)
        months.append(cursor.strftime("%Y%m"))
    return months
if __name__ == '__main__':
    # Manual check: print the current month and the 7 months before it.
    last_months = get_last_x_month_dates(7)
    print(last_months)