From 2f8a5a333ad9f940c70c7a21af81a9d65bc5ad62 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 13 Jul 2022 15:10:27 +0200 Subject: [PATCH] chg; [Duplicates module] refactor module + DB keys --- bin/DB_KVROCKS_MIGRATION.py | 2 + bin/Duplicates.py | 198 ------------------ bin/lib/Duplicate.py | 130 ++++++++++++ bin/lib/item_basic.py | 10 + bin/lib/objects/Items.py | 131 ++++++++++-- bin/lib/objects/abstract_object.py | 4 + bin/lib/objects/abstract_subtype_object.py | 32 ++- bin/modules/Duplicates.py | 108 ++++++++++ bin/packages/modules.cfg | 29 +-- var/www/blueprints/objects_item.py | 50 ++--- var/www/templates/objects/item/show_item.html | 76 +++---- 11 files changed, 480 insertions(+), 290 deletions(-) delete mode 100755 bin/Duplicates.py create mode 100755 bin/lib/Duplicate.py create mode 100755 bin/modules/Duplicates.py diff --git a/bin/DB_KVROCKS_MIGRATION.py b/bin/DB_KVROCKS_MIGRATION.py index 6f6cc3e9..3bdd125c 100755 --- a/bin/DB_KVROCKS_MIGRATION.py +++ b/bin/DB_KVROCKS_MIGRATION.py @@ -217,6 +217,8 @@ def item_submit_migration(): # /!\ KEY COLISION # # TODO: change db def tags_migration(): + + pass diff --git a/bin/Duplicates.py b/bin/Duplicates.py deleted file mode 100755 index 2057116b..00000000 --- a/bin/Duplicates.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -""" -The Duplicate module -==================== - -This huge module is, in short term, checking duplicates. -Its input comes from other modules, namely: - Credential, CreditCard, Keys, Mails, SQLinjectionDetection, CVE and Phone - -This one differ from v1 by only using redis and not json file stored on disk - -Perform comparisions with ssdeep and tlsh - -Requirements: -------------- - - -""" -import redis -import os -import time -from datetime import datetime, timedelta -import json -import ssdeep -import tlsh -from packages import Paste -from pubsublogger import publisher - -from Helper import Process - -if __name__ == "__main__": - publisher.port = 6380 - publisher.channel = "Script" - - config_section = 'Duplicates' - - p = Process(config_section) - - PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) - - maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range")) - threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep")) - threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh")) - threshold_set = {} - threshold_set['ssdeep'] = threshold_duplicate_ssdeep - threshold_set['tlsh'] = threshold_duplicate_tlsh - min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size")) - - # REDIS # - dico_redis = {} - date_today = datetime.today() - for year in range(2013, date_today.year+1): - for month in range(0, 13): - dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis( - host=p.config.get("ARDB_DB", "host"), - port=p.config.get("ARDB_DB", "port"), - db=str(year) + str(month), - decode_responses=True) - - # FUNCTIONS # - publisher.info("Script duplicate started") - - while True: - try: - hash_dico = {} - dupl = set() - dico_range_list = [] - - x = time.time() - - message = p.get_from_set() - if message is not None: - path = message - PST = Paste.Paste(path) - else: - publisher.debug("Script Attribute is idling 10s") - print('sleeping') - time.sleep(10) - continue - - # the paste is too small - if (PST._get_p_size() < min_paste_size): - continue - - PST._set_p_hash_kind("ssdeep") - PST._set_p_hash_kind("tlsh") - - # Assignate 
the correct redis connexion - r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month] - - # Creating the dico name: yyyymm - # Get the date of the range - date_range = date_today - timedelta(days = maximum_month_range*30.4166666) - num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month) - for diff_month in range(0, num_of_month+1): - curr_date_range = date_today - timedelta(days = diff_month*30.4166666) - to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2) - dico_range_list.append(to_append) - - # Use all dico in range - dico_range_list = dico_range_list[0:maximum_month_range] - - # UNIQUE INDEX HASHS TABLE - yearly_index = str(date_today.year)+'00' - r_serv0 = dico_redis[yearly_index] - r_serv0.incr("current_index") - index = (r_serv0.get("current_index")) + str(PST.p_date) - - # Open selected dico range - opened_dico = [] - for dico_name in dico_range_list: - opened_dico.append([dico_name, dico_redis[dico_name]]) - - # retrieve hash from paste - paste_hashes = PST._get_p_hash() - - # Go throught the Database of the dico (of the month) - for curr_dico_name, curr_dico_redis in opened_dico: - for hash_type, paste_hash in paste_hashes.items(): - for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type): - - try: - if hash_type == 'ssdeep': - percent = 100-ssdeep.compare(dico_hash, paste_hash) - else: - percent = tlsh.diffxlen(dico_hash, paste_hash) - if percent > 100: - percent = 100 - - threshold_duplicate = threshold_set[hash_type] - if percent < threshold_duplicate: - percent = 100 - percent if hash_type == 'ssdeep' else percent #recovert the correct percent value for ssdeep - # Go throught the Database of the dico filter (month) - r_serv_dico = dico_redis[curr_dico_name] - - # index of paste - index_current = r_serv_dico.get(dico_hash) - index_current = index_current - paste_path = r_serv_dico.get(index_current) - paste_path = paste_path - paste_date = r_serv_dico.get(index_current+'_date') - paste_date = paste_date - paste_date = paste_date if paste_date != None else "No date available" - if paste_path != None: - paste_path = paste_path.replace(PASTES_FOLDER+'/', '', 1) - if paste_path != PST.p_rel_path: - hash_dico[dico_hash] = (hash_type, paste_path, percent, paste_date) - - print('['+hash_type+'] '+'comparing: ' + str(PST.p_rel_path) + ' and ' + str(paste_path) + ' percentage: ' + str(percent)) - - except Exception: - print('hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash) - - # Add paste in DB after checking to prevent its analysis twice - # hash_type_i -> index_i AND index_i -> PST.PATH - r_serv1.set(index, PST.p_rel_path) - r_serv1.set(index+'_date', PST._get_p_date()) - r_serv1.sadd("INDEX", index) - # Adding hashes in Redis - for hash_type, paste_hash in paste_hashes.items(): - r_serv1.set(paste_hash, index) - #bad hash - if paste_hash == '': - print('bad Hash: ' + hash_type) - else: - r_serv1.sadd("HASHS_"+hash_type, paste_hash) - - ##################### Similarity found ####################### - - # if there is data in this dictionnary - if len(hash_dico) != 0: - # paste_tuple = (hash_type, date, paste_path, percent) - for dico_hash, paste_tuple in hash_dico.items(): - dupl.add(paste_tuple) - - # Creating the object attribute and save it. 
- to_print = 'Duplicate;{};{};{};'.format( - PST.p_source, PST.p_date, PST.p_name) - if dupl != []: - dupl = list(dupl) - PST.__setattr__("p_duplicate", dupl) - PST.save_attribute_duplicate(dupl) - PST.save_others_pastes_attribute_duplicate(dupl) - publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_rel_path)) - print('{}Detected {}'.format(to_print, len(dupl))) - print('') - - y = time.time() - - publisher.debug('{}Processed in {} sec'.format(to_print, y-x)) - - except IOError: - to_print = 'Duplicate;{};{};{};'.format( - PST.p_source, PST.p_date, PST.p_name) - print("CRC Checksum Failed on :", PST.p_rel_path) - publisher.error('{}CRC Checksum Failed'.format(to_print)) diff --git a/bin/lib/Duplicate.py b/bin/lib/Duplicate.py new file mode 100755 index 00000000..99de95e3 --- /dev/null +++ b/bin/lib/Duplicate.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import ssdeep +import sys +import time +import tlsh + +import datetime + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.ConfigLoader import ConfigLoader + +config_loader = ConfigLoader() +r_serv_db = config_loader.get_redis_conn("Kvrocks_DB") +MIN_ITEM_SIZE = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: RENAME ME +config_loader = None + +# +# +# Hash != Duplicates => New correlation HASH => check if same hash if duplicate == 100 +# +# Object Hash => correlation decoded => don't need correlation to exists +# +# New CORRELATION => HASH +# -> compute/get(if exist we have a correlation) hash -> get correlation same hash +# +# +# Duplicates between differents objects ????? +# Diff Decoded -> Item => Diff Item decoded - Item +# +# Duplicates domains != Duplicates items + + + +def get_ssdeep_hash(content): + return ssdeep.hash(content) + +def get_ssdeep_similarity(obj_hash, other_hash): + return ssdeep.compare(obj_hash, other_hash) + +def get_tlsh_hash(content): + return tlsh.hash(content) + +def get_tlsh_similarity(obj_hash, other_hash): + similarity = tlsh.diffxlen(obj_hash, other_hash) + if similarity > 100: + similarity = 100 + similarity = 100 - similarity + return similarity + +def get_algo_similarity(algo, obj_hash, other_hash): + if algo == 'ssdeep': + return get_ssdeep_similarity(obj_hash, other_hash) + elif algo == 'tlsh': + return get_tlsh_similarity(obj_hash, other_hash) + +def get_algo_hashs_by_month(algo, date_ymonth): + return r_serv_db.hkeys(f'duplicates:hashs:{algo}:{date_ymonth}') + +def exists_algo_hash_by_month(algo, hash, date_ymonth): + return r_serv_db.hexists(f'duplicates:hashs:{algo}:{date_ymonth}', hash) + +def get_object_id_by_hash(algo, hash, date_ymonth): + return r_serv_db.hget(f'duplicates:hashs:{algo}:{date_ymonth}', hash) + +def save_object_hash(algo, date_ymonth, hash, obj_id): + r_serv_db.hset(f'duplicates:hashs:{algo}:{date_ymonth}', hash, obj_id) + + +def get_duplicates(obj_type, subtype, id): + dict_dup = {} + duplicates = r_serv_db.smembers(f'obj:duplicates:{obj_type}:{subtype}:{id}') + for str_dup in duplicates: + similarity, algo, id = str_dup.split(':', 2) + if not dict_dup.get(id): + dict_dup[id] = [] + dict_dup[id].append({'algo': algo, 'similarity': int(similarity)}) + return dict_dup + + +def _add_obj_duplicate(algo, similarity, obj_type, subtype, id, id_2): + r_serv_db.sadd(f'obj:duplicates:{obj_type}:{subtype}:{id}', f'{similarity}:{algo}:{id_2}') + +def add_obj_duplicate(algo, hash, similarity, obj_type, 
subtype, id, date_ymonth): + obj2_id = get_object_id_by_hash(algo, hash, date_ymonth) + # same content + if similarity == 100: + dups = get_duplicates(obj_type, subtype, id) + for dup_id in dups: + for algo_dict in dups[dup_id]: + if algo_dict['similarity'] == 100 and algo_dict['algo'] == algo: + _add_obj_duplicate(algo, similarity, obj_type, subtype, id, dups[dup_id]) + _add_obj_duplicate(algo, similarity, obj_type, subtype, dups[dup_id], id) + _add_obj_duplicate(algo, similarity, obj_type, subtype, id, obj2_id) + _add_obj_duplicate(algo, similarity, obj_type, subtype, obj2_id, id) + + + + +def get_last_x_month_dates(nb_months): + now = datetime.datetime.now() + result = [now.strftime("%Y%m")] + for x in range(0, nb_months): + now = now.replace(day=1) - datetime.timedelta(days=1) + result.append(now.strftime("%Y%m")) + return result + + + +if __name__ == '__main__': + res = get_last_x_month_dates(7) + print(res) + + + + + + + + + + + + +################################# diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py index 608a5ba6..c212f5ab 100755 --- a/bin/lib/item_basic.py +++ b/bin/lib/item_basic.py @@ -54,6 +54,16 @@ def is_crawled(item_id): def get_item_domain(item_id): return item_id[19:-36] +def get_item_content_binary(item_id): + item_full_path = os.path.join(PASTES_FOLDER, item_id) + try: + with gzip.open(item_full_path, 'rb') as f: + item_content = f.read() + except Exception as e: + print(e) + item_content = '' + return item_content + def get_item_content(item_id): item_full_path = os.path.join(PASTES_FOLDER, item_id) try: diff --git a/bin/lib/objects/Items.py b/bin/lib/objects/Items.py index d3011b31..a0dc8ab9 100755 --- a/bin/lib/objects/Items.py +++ b/bin/lib/objects/Items.py @@ -91,11 +91,14 @@ class Item(AbstractObject): else: return filename - def get_content(self): + def get_content(self, binary=False): """ Returns Item content """ - return item_basic.get_item_content(self.id) + if binary: + return item_basic.get_item_content_binary(self.id) + else: + return item_basic.get_item_content(self.id) def get_raw_content(self): filepath = self.get_filename() @@ -110,15 +113,34 @@ class Item(AbstractObject): content = base64.b64encode(content) return content.decode() + def get_html2text_content(self, content=None, ignore_links=False): + if not content: + content = self.get_content() + h = html2text.HTML2Text() + h.ignore_links = ignore_links + h.ignore_images = ignore_links + return h.handle(content) + + def get_size(self, str=False): + size = os.path.getsize(self.get_filename())/1024.0 + if str: + size = round(size, 2) + return size + def get_ail_2_ail_payload(self): payload = {'raw': self.get_gzip_content(b64=True)} return payload - def set_origin(self): # set_parent ? - pass + def set_father(self, father_id): # UPDATE KEYS ????????????????????????????? + r_serv_metadata.sadd(f'paste_children:{father_id}', self.id) + r_serv_metadata.hset(f'paste_metadata:{self.id}', 'father', father_id) + + #f'obj:children:{obj_type}:{subtype}:{id}, {obj_type}:{subtype}:{id} + #f'obj:metadata:{obj_type}:{subtype}:{id}', 'father', fathe + # => ON Object LEVEL ????????? + + - def add_duplicate(self): - pass def sanitize_id(self): pass @@ -150,18 +172,25 @@ class Item(AbstractObject): # origin # duplicate -> all item iterations ??? 
# - def create(self, content, tags, origin=None, duplicate=None): - self.save_on_disk(content, binary=True, compressed=False, base64=False) + def create(self, content, tags, father=None, duplicates=[], _save=True): + if _save: + self.save_on_disk(content, binary=True, compressed=False, base64=False) # # TODO: # for tag in tags: # self.add_tag(tag) - if origin: + if father: + pass + + for obj_id in duplicates: + for dup in duplicates[obj_id]: + self.add_duplicate(obj_id, dup['algo'], dup['similarity']) + + + - if duplicate: - pass # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ... @@ -204,6 +233,80 @@ class Item(AbstractObject): def exist_correlation(self): pass + def is_crawled(self): + return self.id.startswith('crawled') + + # if is_crawled + def get_domain(self): + return self.id[19:-36] + + def get_screenshot(self): + s = r_serv_metadata.hget(f'paste_metadata:{self.id}', 'screenshot') + if s: + return os.path.join(s[0:2], s[2:4], s[4:6], s[6:8], s[8:10], s[10:12], s[12:]) + + def get_har(self): + har_path = os.path.join(har_directory, self.id) + '.json' + if os.path.isfile(har_path): + return har_path + else: + return None + + def get_url(self): + return r_serv_metadata.hget(f'paste_metadata:{self.id}', 'real_link') + + # options: set of optional meta fields + def get_meta(self, options=set()): + meta = {} + meta['id'] = self.id + meta['date'] = self.get_date(separator=True) ############################ # TODO: + meta['source'] = self.get_source() + meta['tags'] = self.get_tags() + # optional meta fields + if 'content' in options: + meta['content'] = self.get_content() + if 'crawler' in options: + if self.is_crawled(): + tags = meta.get('tags') + meta['crawler'] = self.get_meta_crawler(tags=tags) + if 'duplicates' in options: + meta['duplicates'] = self.get_duplicates() + if 'lines' in options: + content = meta.get('content') + meta['lines'] = self.get_meta_lines(content=content) + if 'size' in options: + meta['size'] = self.get_size(str=True) + + # # TODO: ADD GET FATHER + + # meta['encoding'] = None + return meta + + def get_meta_crawler(self, tags=[]): + crawler = {} + if self.is_crawled(): + crawler['domain'] = self.get_domain() + crawler['har'] = self.get_har() + crawler['screenshot'] = self.get_screenshot() + crawler['url'] = self.get_url() + if not tags: + tags = self.get_tags() + crawler['is_tags_safe'] = Tag.is_tags_safe(tags) + return crawler + + def get_meta_lines(self, content=None): + if not content: + content = self.get_content() + max_length = 0 + line_id = 0 + nb_line = 0 + for line in content.splitlines(): + length = len(line) + if length > max_length: + max_length = length + nb_line += 1 + return {'nb': nb_line, 'max_length': max_length} + ############################################################################ ############################################################################ @@ -547,7 +650,7 @@ def get_item_list_desc(list_item_id): def is_crawled(item_id): return item_basic.is_crawled(item_id) -def get_crawler_matadata(item_id, ltags=None): +def get_crawler_matadata(item_id, tags=None): dict_crawler = {} if is_crawled(item_id): dict_crawler['domain'] = get_item_domain(item_id) @@ -759,5 +862,7 @@ def delete_domain_node(item_id): if __name__ == '__main__': content = 'test file content' + duplicates = {'tests/2020/01/02/test.gz': [{'algo':'ssdeep', 'similarity':75}, {'algo':'tlsh', 'similarity':45}]} + item = Item('tests/2020/01/02/test_save.gz') - item.save_on_disk(content, binary=False) + 
item.create(content, _save=False) diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py index 8e14590c..bc880ac7 100755 --- a/bin/lib/objects/abstract_object.py +++ b/bin/lib/objects/abstract_object.py @@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from packages import Tag +from lib import Duplicate from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations from lib.Tracker import is_obj_tracked, get_obj_all_trackers, delete_obj_trackers @@ -69,6 +70,9 @@ class AbstractObject(ABC): tags = set(tags) return tags + def get_duplicates(self): + return Duplicate.get_duplicates(self.type, self.get_subtype(r_str=True), self.id) + ## ADD TAGS ???? #def add_tags(self): diff --git a/bin/lib/objects/abstract_subtype_object.py b/bin/lib/objects/abstract_subtype_object.py index 741df301..7a86ea33 100755 --- a/bin/lib/objects/abstract_subtype_object.py +++ b/bin/lib/objects/abstract_subtype_object.py @@ -113,21 +113,49 @@ class AbstractSubtypeObject(AbstractObject): if date > last_seen: self.set_last_seen(date) - def add(self, date): + def add(self, date, item_id): self.update_correlation_daterange() # daily r_metadata.hincrby(f'{self.type}:{self.subtype}:{date}', self.id, 1) # all type r_metadata.zincrby(f'{self.type}_all:{self.subtype}', self.id, 1) + ####################################################################### + ####################################################################### + # REPLACE WITH CORRELATION ????? + + # global set + r_serv_metadata.sadd(f'set_{self.type}_{self.subtype}:{self.id}', item_id) + + ## object_metadata + # item + r_serv_metadata.sadd(f'item_{self.type}_{self.subtype}:{item_id}', self.id) + + # new correlation + # + # How to filter by correlation type ???? + # + f'correlation:obj:{self.type}:{self.subtype}:{self.id}', f'{obj_type}:{obj_subtype}:{obj_id}' + f'correlation:obj:{self.type}:{self.subtype}:{obj_type}:{self.id}', f'{obj_subtype}:{obj_id}' + + # + # + # + # + # + # + # + # - # # domain # if item_basic.is_crawled(item_id): # domain = item_basic.get_item_domain(item_id) # self.save_domain_correlation(domain, subtype, obj_id) + def create(self, first_seen, last_seen): + pass + def _delete(self): diff --git a/bin/modules/Duplicates.py b/bin/modules/Duplicates.py new file mode 100755 index 00000000..169295ae --- /dev/null +++ b/bin/modules/Duplicates.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +""" +The Duplicate module +==================== + +This huge module is, in short term, checking duplicates. 
+Its input comes from other modules, namely: + Credential + +Perform comparisions with ssdeep and tlsh + +""" +import redis + + +import os +import sys +import time + +#from datetime import datetime, timedelta +import datetime + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from modules.abstract_module import AbstractModule +from lib.ConfigLoader import ConfigLoader +from lib import Duplicate +from lib.objects.Items import Item + + +class Duplicates(AbstractModule): + """Duplicates module.""" + + def __init__(self): + super(Duplicates, self).__init__() + + config_loader = ConfigLoader() + THRESHOLD_SSDEEP = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_ssdeep') + THRESHOLD_TLSH = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_tlsh') + self.min_item_size = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: # FIXME: rename me + self.maximum_month_range = config_loader.get_config_int('Modules_Duplicates', 'maximum_month_range') + + self.algos = { + "ssdeep": {"threshold": THRESHOLD_SSDEEP}, + "tlsh": {"threshold": THRESHOLD_TLSH} + } + + self.redis_logger.info(f"Module: {self.module_name} Launched") + + + def compute(self, message): + # IOError: "CRC Checksum Failed on : {id}" + + item = Item(message) + + # Check file size + if item.get_size() < self.min_item_size: + return None + + # one month + curr_date_ymonth = datetime.datetime.now().strftime("%Y%m") + last_month_dates = Duplicate.get_last_x_month_dates(self.maximum_month_range) + + x = time.time() + + # Get Hashs + content = item.get_content(binary=True) + self.algos['ssdeep']['hash'] = Duplicate.get_ssdeep_hash(content) + self.algos['tlsh']['hash'] = Duplicate.get_tlsh_hash(content) + + # TODO: Handle coputed duplicates + + nb_duplicates = 0 + + for algo in self.algos: + obj_hash = self.algos[algo]['hash'] + for date_ymonth in last_month_dates: + if Duplicate.exists_algo_hash_by_month(algo, obj_hash, date_ymonth): + Duplicate.add_obj_duplicate(algo, obj_hash, 100, 'item', '', item.get_id(), date_ymonth) + nb_duplicates +=1 + else: + for hash in Duplicate.get_algo_hashs_by_month(algo, date_ymonth): + # # FIXME: try - catch 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash + similarity = Duplicate.get_algo_similarity(algo, obj_hash, hash) + print(f'[{algo}] comparing: {obj_hash} and {hash} similarity: {similarity}') # DEBUG: + if similarity >= self.algos[algo]['threshold']: + Duplicate.add_obj_duplicate(algo, hash, similarity, 'item', '', item.get_id(), date_ymonth) + nb_duplicates +=1 + + # Save Hashs + Duplicate.save_object_hash(algo, curr_date_ymonth, self.algos[algo]['hash'], item.get_id()) + + if nb_duplicates: + self.redis_logger.info(f'Duplicate;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {nb_duplicates};{item.get_id()}') + + y = time.time() + print(f'{item.get_id()} Processed in {y-x} sec') + #self.redis_logger.debug('{}Processed in {} sec'.format(to_print, y-x)) + + +if __name__ == "__main__": + + module = Duplicates() + module.run() diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 0a2ae61a..5b2e672b 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -66,15 +66,15 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R [CreditCards] subscribe = Redis_CreditCards -publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags +publish = 
Redis_ModuleStats,Redis_Tags [BankAccount] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Mail] subscribe = Redis_Mail -publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags +publish = Redis_ModuleStats,Redis_Tags [Onion] subscribe = Redis_Onion @@ -92,11 +92,11 @@ publish = Redis_Url [LibInjection] subscribe = Redis_Url -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [SQLInjectionDetection] subscribe = Redis_Url -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [ModuleStats] subscribe = Redis_ModuleStats @@ -128,31 +128,31 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags [Cve] subscribe = Redis_Cve -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Phone] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Keys] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_PgpDump,Redis_Tags +publish = Redis_PgpDump,Redis_Tags [PgpDump] subscribe = Redis_PgpDump -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [ApiKey] subscribe = Redis_ApiKey -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Decoder] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Bitcoin] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [submit_paste] subscribe = Redis @@ -164,7 +164,8 @@ publish = Redis_Mixer,Redis_Tags [IP] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Zerobins] -subscribe = Redis_Url \ No newline at end of file +subscribe = Redis_Url + diff --git a/var/www/blueprints/objects_item.py b/var/www/blueprints/objects_item.py index 2b951353..0d2e0da6 100644 --- a/var/www/blueprints/objects_item.py +++ b/var/www/blueprints/objects_item.py @@ -15,12 +15,15 @@ from flask_login import login_required, current_user # Import Role_Manager from Role_Manager import login_admin, login_analyst, login_read_only -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages')) -import Item -import Tag +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib import item_basic +from lib.objects.Items import Item +from export import Export +from packages import Tag -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'export')) -import Export # ============ BLUEPRINT ============ objects_item = Blueprint('objects_item', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/item')) @@ -38,28 +41,22 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] @login_read_only def showItem(): # # TODO: support post item_id = request.args.get('id') - if not item_id or not Item.exist_item(item_id): + if not item_id or not item_basic.exist_item(item_id): abort(404) - dict_item = {} - dict_item['id'] = item_id - dict_item['name'] = dict_item['id'].replace('/', ' / ') - dict_item['father'] = Item.get_item_parent(item_id) - dict_item['content'] = Item.get_item_content(item_id) - dict_item['metadata'] = Item.get_item_metadata(item_id, item_content=dict_item['content']) - dict_item['tags'] = Tag.get_obj_tag(item_id) - #dict_item['duplicates'] = Item.get_item_nb_duplicates(item_id) - dict_item['duplicates'] = Item.get_item_duplicates_dict(item_id) - dict_item['crawler'] = Item.get_crawler_matadata(item_id, ltags=dict_item['tags']) + item = Item(item_id) + meta = item.get_meta(options=set(['content', 'crawler', 'duplicates', 'lines', 'size'])) + meta['name'] = 
meta['id'].replace('/', ' / ') + meta['father'] = item_basic.get_item_parent(item_id) ## EXPORT SECTION # # TODO: ADD in Export SECTION - dict_item['hive_case'] = Export.get_item_hive_cases(item_id) + meta['hive_case'] = Export.get_item_hive_cases(item_id) return render_template("show_item.html", bootstrap_label=bootstrap_label, - modal_add_tags=Tag.get_modal_add_tags(dict_item['id'], object_type='item'), + modal_add_tags=Tag.get_modal_add_tags(meta['id'], object_type='item'), is_hive_connected=Export.get_item_hive_cases(item_id), - dict_item=dict_item) + meta=meta) # kvrocks data @@ -74,24 +71,27 @@ def showItem(): # # TODO: support post @login_read_only def html2text(): # # TODO: support post item_id = request.args.get('id') - if not item_id or not Item.exist_item(item_id): + if not item_id or not item_basic.exist_item(item_id): abort(404) - return Item.get_item_content_html2text(item_id) + item = Item(item_id) + return item.get_html2text_content() @objects_item.route("/object/item/raw_content") @login_required @login_read_only def item_raw_content(): # # TODO: support post item_id = request.args.get('id') - if not item_id or not Item.exist_item(item_id): + if not item_id or not item_basic.exist_item(item_id): abort(404) - return Response(Item.get_item_content(item_id), mimetype='text/plain') + item = Item(item_id) + return Response(item.get_content(), mimetype='text/plain') @objects_item.route("/object/item/download") @login_required @login_read_only def item_download(): # # TODO: support post item_id = request.args.get('id') - if not item_id or not Item.exist_item(item_id): + if not item_id or not item_basic.exist_item(item_id): abort(404) - return send_file(Item.get_raw_content(item_id), attachment_filename=item_id, as_attachment=True) + item = Item(item_id) + return send_file(item.get_raw_content(), attachment_filename=item_id, as_attachment=True) diff --git a/var/www/templates/objects/item/show_item.html b/var/www/templates/objects/item/show_item.html index 6442e0ae..198ea02d 100644 --- a/var/www/templates/objects/item/show_item.html +++ b/var/www/templates/objects/item/show_item.html @@ -38,7 +38,7 @@
-						{{ dict_item['name'] }}
+						{{ meta['name'] }}

@@ -46,7 +46,7 @@ - + @@ -54,12 +54,12 @@ - - - - - - + + + + + +
Date SourceEncoding Size (Kb) Number of lines Max line length
-						{{ dict_item['metadata']['date'] }} {{ dict_item['metadata']['source'] }} {{ dict_item['metadata']['encoding'] }} {{ dict_item['metadata']['size'] }} {{ dict_item['metadata']['lines']['nb'] }} {{ dict_item['metadata']['lines']['max_length'] }}
+						{{ meta['date'] }} {{ meta['source'] }} {{ meta['size'] }} {{ meta['lines']['nb'] }} {{ meta['lines']['max_length'] }}
@@ -68,9 +68,9 @@
{% include 'modals/edit_tag.html' %} - {% for tag in dict_item['tags'] %} + {% for tag in meta['tags'] %} @@ -84,21 +84,21 @@
- {% if dict_item['father'] %} + {% if meta['father'] %} {% endif %}
- +
- {% with obj_type='item', obj_id=dict_item['id'], obj_subtype=''%} + {% with obj_type='item', obj_id=meta['id'], obj_subtype=''%} {% include 'modals/investigations_register_obj.html' %} {% endwith %}
@@ -108,7 +108,7 @@
- {% with obj_type='item', obj_id=dict_item['id'], obj_lvl=0%} + {% with obj_type='item', obj_id=meta['id'], obj_lvl=0%} {% include 'import_export/block_add_user_object_to_export.html' %} {% endwith %}
@@ -134,14 +134,14 @@
{% endif %} - {% if dict_item['hive_case'] %} + {% if meta['hive_case'] %}
  • The Hive Case already Created
  • {{ hive_url }}
    {% endif %} - {% if dict_item['duplicates'] != 0 %} + {% if meta['duplicates'] != 0 %}
    @@ -149,7 +149,7 @@
    duplicates   -
    {{dict_item['duplicates']|length}}
    +
    {{meta['duplicates']|length}}
    @@ -173,19 +173,19 @@ - {% for duplicate_id in dict_item['duplicates'] %} + {% for duplicate_id in meta['duplicates'] %} - {{dict_item['duplicates'][duplicate_id]['date']}} + {{meta['duplicates'][duplicate_id]['date']}} - {%for algo in dict_item['duplicates'][duplicate_id]['algo']|sort()%} + {%for dict_algo in meta['duplicates'][duplicate_id]|sort(attribute='algo')%} - + @@ -200,7 +200,7 @@ {% endfor %} @@ -261,7 +261,7 @@ {% endif %} - {% if dict_item['crawler'] %} + {% if meta['crawler'] %}
    @@ -294,18 +294,18 @@
    @@ -318,11 +318,11 @@
    - +
    -
-											{{algo}}
+											{{dict_algo['algo']}}
    -
    - {{dict_item['duplicates'][duplicate_id]['algo'][algo]}}% +
    + {{dict_algo['similarity']}}%
    - +
    - {{ dict_item['father'] }} + {{ meta['father'] }}
    - {{ dict_item['crawler']['domain'] }} + {{ meta['crawler']['domain'] }}
    url - {{ dict_item['crawler']['url'] }} + {{ meta['crawler']['url'] }}
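
A minimal usage sketch of the new lib/Duplicate.py helpers, mirroring the flow in bin/modules/Duplicates.py above. It assumes a configured AIL environment (AIL_BIN set, Kvrocks_DB reachable); the item id and the similarity threshold below are illustrative only, not values fixed by this patch.

import os
import sys
import datetime

sys.path.append(os.environ['AIL_BIN'])
from lib import Duplicate
from lib.objects.Items import Item

item = Item('tests/2020/01/02/test.gz')          # hypothetical item id
content = item.get_content(binary=True)          # binary content, as used by the module
obj_hash = Duplicate.get_ssdeep_hash(content)    # or Duplicate.get_tlsh_hash(content)
curr_month = datetime.datetime.now().strftime("%Y%m")

# walk the last months' hash buckets, exactly one bucket per YYYYMM
for date_ymonth in Duplicate.get_last_x_month_dates(6):
    if Duplicate.exists_algo_hash_by_month('ssdeep', obj_hash, date_ymonth):
        # exact hash match -> register a 100% duplicate with the item that stored this hash
        Duplicate.add_obj_duplicate('ssdeep', obj_hash, 100, 'item', '', item.get_id(), date_ymonth)
    else:
        for other_hash in Duplicate.get_algo_hashs_by_month('ssdeep', date_ymonth):
            similarity = Duplicate.get_algo_similarity('ssdeep', obj_hash, other_hash)
            if similarity >= 50:                 # illustrative; the module reads threshold_duplicate_ssdeep from the cfg
                Duplicate.add_obj_duplicate('ssdeep', other_hash, similarity, 'item', '', item.get_id(), date_ymonth)

# register the new hash so future items can match against it
Duplicate.save_object_hash('ssdeep', curr_month, obj_hash, item.get_id())

# duplicates are stored per object and read back as {other_id: [{'algo': ..., 'similarity': ...}]}
print(Duplicate.get_duplicates('item', '', item.get_id()))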