2022-07-13 15:10:27 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
|
|
|
|
import os
|
|
|
|
import ssdeep
|
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
import tlsh
|
|
|
|
|
|
|
|
import datetime
|
|
|
|
|
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
|
|
|
from lib.ConfigLoader import ConfigLoader
|
|
|
|
|
|
|
|
# Module-level configuration, read once at import time.
config_loader = ConfigLoader()
# Database connection shared by every helper in this module.
r_serv_db = config_loader.get_db_conn("Kvrocks_Duplicates")
# Value of the 'min_paste_size' option, parsed as a float.
# NOTE(review): presumably the minimum item size considered for duplicate
# detection -- it is not used in the visible part of this file; confirm.
MIN_ITEM_SIZE = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: RENAME ME
# Drop the reference to the loader now that the settings have been read.
config_loader = None
|
|
|
|
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# Hash != Duplicates => New correlation HASH => check if same hash if duplicate == 100
|
|
|
|
#
|
|
|
|
# Object Hash => correlation decoded => don't need correlation to exist
|
|
|
|
#
|
|
|
|
# New CORRELATION => HASH
|
|
|
|
# -> compute/get(if exist we have a correlation) hash -> get correlation same hash
|
|
|
|
#
|
|
|
|
#
|
|
|
|
# Duplicates between different objects ?????
|
|
|
|
# Diff Decoded -> Item => Diff Item decoded - Item
|
|
|
|
#
|
|
|
|
# Duplicates domains != Duplicates items
|
|
|
|
|
|
|
|
|
|
|
|
def get_ssdeep_hash(content):
    """Compute the ssdeep hash of ``content``.

    :param content: data handed unchanged to ``ssdeep.hash``.
    :return: whatever ``ssdeep.hash`` returns for ``content``.
    """
    fuzzy_hash = ssdeep.hash(content)
    return fuzzy_hash
|
|
|
|
|
|
|
|
def get_ssdeep_similarity(obj_hash, other_hash):
    """Compare two ssdeep hashes.

    :param obj_hash: ssdeep hash of the reference object.
    :param other_hash: ssdeep hash to compare against.
    :return: the score returned by ``ssdeep.compare`` for the two hashes.
    """
    score = ssdeep.compare(obj_hash, other_hash)
    return score
|
|
|
|
|
|
|
|
def get_tlsh_hash(content):
    """Compute the TLSH hash of ``content``.

    :param content: data handed unchanged to ``tlsh.hash``.
    :return: whatever ``tlsh.hash`` returns for ``content``.
    """
    locality_hash = tlsh.hash(content)
    return locality_hash
|
|
|
|
|
|
|
|
def get_tlsh_similarity(obj_hash, other_hash):
    """Turn the TLSH difference of two hashes into a 0-100 similarity score.

    ``tlsh.diffxlen`` yields a difference value; it is capped at 100 and then
    subtracted from 100, so a difference of 0 gives a similarity of 100 and
    any difference of 100 or more gives a similarity of 0.

    :param obj_hash: TLSH hash of the reference object.
    :param other_hash: TLSH hash to compare against.
    :return: ``100 - min(difference, 100)``.
    """
    difference = tlsh.diffxlen(obj_hash, other_hash)
    # Cap the difference so the resulting similarity never drops below 0.
    capped = 100 if difference > 100 else difference
    return 100 - capped
|
|
|
|
|
|
|
|
def get_algo_similarity(algo, obj_hash, other_hash):
    """Compute the similarity of two hashes with the named algorithm.

    :param algo: algorithm name, ``'ssdeep'`` or ``'tlsh'``.
    :param obj_hash: hash of the reference object.
    :param other_hash: hash to compare against.
    :return: the similarity computed by the matching helper, or ``None``
        when ``algo`` is neither ``'ssdeep'`` nor ``'tlsh'``.
    """
    if algo == 'ssdeep':
        return get_ssdeep_similarity(obj_hash, other_hash)
    if algo == 'tlsh':
        return get_tlsh_similarity(obj_hash, other_hash)
    # Unknown algorithm: no similarity can be computed.
    return None
|
|
|
|
|
|
|
|
def get_algo_hashs_by_month(algo, date_ymonth):
    """List the hashes stored for one algorithm and one month.

    :param algo: algorithm name used in the storage key.
    :param date_ymonth: month identifier used in the storage key.
    :return: the field names of the ``duplicates:hashs:<algo>:<month>`` hash.
    """
    month_key = f'duplicates:hashs:{algo}:{date_ymonth}'
    return r_serv_db.hkeys(month_key)
|
|
|
|
|
|
|
|
def exists_algo_hash_by_month(algo, hash, date_ymonth):
    """Tell whether ``hash`` is stored for the given algorithm and month.

    :param algo: algorithm name used in the storage key.
    :param hash: hash value looked up as a field name.
    :param date_ymonth: month identifier used in the storage key.
    :return: the result of ``HEXISTS`` on ``duplicates:hashs:<algo>:<month>``.
    """
    month_key = f'duplicates:hashs:{algo}:{date_ymonth}'
    return r_serv_db.hexists(month_key, hash)
|
|
|
|
|
|
|
|
def get_object_id_by_hash(algo, hash, date_ymonth):
    """Fetch the object ID stored under ``hash`` for an algorithm and month.

    :param algo: algorithm name used in the storage key.
    :param hash: hash value looked up as a field name.
    :param date_ymonth: month identifier used in the storage key.
    :return: the value stored for ``hash`` in
        ``duplicates:hashs:<algo>:<month>`` (the result of ``HGET``).
    """
    month_key = f'duplicates:hashs:{algo}:{date_ymonth}'
    return r_serv_db.hget(month_key, hash)
|
|
|
|
|
|
|
|
def save_object_hash(algo, date_ymonth, hash, obj_id):
    """Record that ``hash`` belongs to ``obj_id`` for an algorithm and month.

    :param algo: algorithm name used in the storage key.
    :param date_ymonth: month identifier used in the storage key.
    :param hash: hash value, stored as the field name.
    :param obj_id: object ID, stored as the field value.
    """
    month_key = f'duplicates:hashs:{algo}:{date_ymonth}'
    r_serv_db.hset(month_key, hash, obj_id)
|
|
|
|
|
|
|
|
|
2022-11-29 16:01:01 +01:00
|
|
|
def get_obj_duplicates(obj_type, subtype, obj_id):
    """Return the duplicates recorded for an object, grouped by duplicate ID.

    Each stored member has the form ``<similarity>:<algo>:<id>``; it is split
    on the first two colons only, so the ID part may itself contain colons.

    :param obj_type: object type used in the storage key.
    :param subtype: object subtype used in the storage key.
    :param obj_id: ID of the object whose duplicates are requested.
    :return: dict mapping each duplicate ID to a list of
        ``{'algo': <str>, 'similarity': <int>}`` entries.
    """
    members = r_serv_db.smembers(f'obj:duplicates:{obj_type}:{subtype}:{obj_id}')
    grouped = {}
    for member in members:
        raw_similarity, algo, other_id = member.split(':', 2)
        match = {'algo': algo, 'similarity': int(raw_similarity)}
        grouped.setdefault(other_id, []).append(match)
    return grouped
|
|
|
|
|
2022-11-29 16:01:01 +01:00
|
|
|
def add_obj_duplicate(algo, similarity, obj_type, subtype, obj_id, id_2):
    """Record ``id_2`` as a duplicate of ``obj_id`` (one direction only).

    :param algo: algorithm name stored in the member.
    :param similarity: similarity score stored in the member.
    :param obj_type: object type used in the storage key.
    :param subtype: object subtype used in the storage key.
    :param obj_id: ID of the object owning the duplicates set.
    :param id_2: ID of the duplicate object stored in the member.
    """
    set_key = f'obj:duplicates:{obj_type}:{subtype}:{obj_id}'
    member = f'{similarity}:{algo}:{id_2}'
    r_serv_db.sadd(set_key, member)
|
2022-07-13 15:10:27 +02:00
|
|
|
|
|
|
|
|
2022-11-29 16:01:01 +01:00
|
|
|
def add_duplicate(algo, hash_, similarity, obj_type, subtype, id, date_ymonth):
    """Register a duplicate relation between ``id`` and the owner of ``hash_``.

    The object that owns ``hash_`` for the given algorithm and month is looked
    up, and the relation is stored in both directions. When the similarity is
    100 (same content), every object already recorded as a 100% duplicate of
    ``id`` for the same algorithm is (re-)linked with ``id`` in both
    directions as well.

    :param algo: algorithm name (e.g. ``'ssdeep'`` or ``'tlsh'``).
    :param hash_: hash of the matching object, used to resolve its ID.
    :param similarity: similarity score between the two objects.
    :param obj_type: object type used in the storage key.
    :param subtype: object subtype used in the storage key.
    :param id: ID of the object being processed.
    :param date_ymonth: month identifier under which ``hash_`` was saved.
    """
    obj2_id = get_object_id_by_hash(algo, hash_, date_ymonth)
    # same content
    if similarity == 100:
        dups = get_obj_duplicates(obj_type, subtype, id)
        for dup_id, dup_matches in dups.items():
            same_content = any(
                match['similarity'] == 100 and match['algo'] == algo
                for match in dup_matches
            )
            if same_content:
                # Link by the duplicate's ID. The previous code passed
                # dups[dup_id] (the list of match dicts), which wrote the
                # list's repr into the Redis key and set member.
                add_obj_duplicate(algo, similarity, obj_type, subtype, id, dup_id)
                add_obj_duplicate(algo, similarity, obj_type, subtype, dup_id, id)
    # Always store the new relation symmetrically.
    add_obj_duplicate(algo, similarity, obj_type, subtype, id, obj2_id)
    add_obj_duplicate(algo, similarity, obj_type, subtype, obj2_id, id)
|
|
|
|
|
|
|
|
# TODO
|
|
|
|
def delete_obj_duplicates():
    """Placeholder: not implemented yet, does nothing and returns ``None``."""
    return None
|
|
|
|
|
|
|
|
# TODO
|
|
|
|
def delete_obj_duplicate():
    """Placeholder: not implemented yet, does nothing and returns ``None``."""
    return None
|
2022-07-13 15:10:27 +02:00
|
|
|
|
|
|
|
def get_last_x_month_dates(nb_months):
    """Return the current month and the ``nb_months`` months before it.

    :param nb_months: how many previous months to include.
    :return: list of ``nb_months + 1`` strings in ``YYYYMM`` format, starting
        with the current month and going backwards in time.
    """
    one_day = datetime.timedelta(days=1)
    cursor = datetime.datetime.now()
    months = [cursor.strftime("%Y%m")]
    for _ in range(nb_months):
        # The day before the 1st is always in the previous month.
        cursor = cursor.replace(day=1) - one_day
        months.append(cursor.strftime("%Y%m"))
    return months
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual check: print the current month followed by the 7 previous ones.
    res = get_last_x_month_dates(nb_months=7)
    print(res)
|
|
|
|
|