chg: [Duplicates module] refactor module + DB keys

2022-07-13 15:10:27 +02:00 · 2022-07-13 15:10:27 +02:00 · 2f8a5a333a
parent 8672671e51
commit 2f8a5a333a
11 changed files with 480 additions and 290 deletions
--- a/bin/DB_KVROCKS_MIGRATION.py
+++ b/bin/DB_KVROCKS_MIGRATION.py
@ -217,6 +217,8 @@ def item_submit_migration():
 # /!\ KEY COLISION
 # # TODO: change db
 def tags_migration():
+    
+

    pass

--- a/bin/Duplicates.py
+++ b/bin/Duplicates.py
@ -1,198 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:UTF-8 -*
-
-"""
-The Duplicate module
-====================
-
-This huge module is, in short term, checking duplicates.
-Its input comes from other modules, namely:
-    Credential, CreditCard, Keys, Mails, SQLinjectionDetection, CVE and Phone
-
-This one differ from v1 by only using redis and not json file stored on disk
-
-Perform comparisions with ssdeep and tlsh
-
-Requirements:
-------------
-
-
-"""
-import redis
-import os
-import time
-from datetime import datetime, timedelta
-import json
-import ssdeep
-import tlsh
-from packages import Paste
-from pubsublogger import publisher
-
-from Helper import Process
-
-if __name__ == "__main__":
-    publisher.port = 6380
-    publisher.channel = "Script"
-
-    config_section = 'Duplicates'
-
-    p = Process(config_section)
-
-    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"))
-
-    maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
-    threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep"))
-    threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh"))
-    threshold_set = {}
-    threshold_set['ssdeep'] = threshold_duplicate_ssdeep
-    threshold_set['tlsh'] = threshold_duplicate_tlsh
-    min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
-
-    # REDIS #
-    dico_redis = {}
-    date_today = datetime.today()
-    for year in range(2013, date_today.year+1):
-        for month in range(0, 13):
-            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
-                host=p.config.get("ARDB_DB", "host"),
-                port=p.config.get("ARDB_DB", "port"),
-                db=str(year) + str(month),
-                decode_responses=True)
-
-    # FUNCTIONS #
-    publisher.info("Script duplicate started")
-
-    while True:
-        try:
-            hash_dico = {}
-            dupl = set()
-            dico_range_list = []
-
-            x = time.time()
-
-            message = p.get_from_set()
-            if message is not None:
-                path = message
-                PST = Paste.Paste(path)
-            else:
-                publisher.debug("Script Attribute is idling 10s")
-                print('sleeping')
-                time.sleep(10)
-                continue
-
-            # the paste is too small
-            if (PST._get_p_size() < min_paste_size):
-                continue
-
-            PST._set_p_hash_kind("ssdeep")
-            PST._set_p_hash_kind("tlsh")
-
-            # Assignate the correct redis connexion
-            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
-
-            # Creating the dico name: yyyymm
-            # Get the date of the range
-            date_range = date_today - timedelta(days = maximum_month_range*30.4166666)
-            num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month)
-            for diff_month in range(0, num_of_month+1):
-                curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
-                to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
-                dico_range_list.append(to_append)
-
-            # Use all dico in range
-            dico_range_list = dico_range_list[0:maximum_month_range]
-
-            # UNIQUE INDEX HASHS TABLE
-            yearly_index = str(date_today.year)+'00'
-            r_serv0 = dico_redis[yearly_index]
-            r_serv0.incr("current_index")
-            index = (r_serv0.get("current_index")) + str(PST.p_date)
-
-            # Open selected dico range
-            opened_dico = []
-            for dico_name in dico_range_list:
-                opened_dico.append([dico_name, dico_redis[dico_name]])
-
-            # retrieve hash from paste
-            paste_hashes = PST._get_p_hash()
-
-            # Go throught the Database of the dico (of the month)
-            for curr_dico_name, curr_dico_redis in opened_dico:
-                for hash_type, paste_hash in paste_hashes.items():
-                    for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type):
-
-                        try:
-                            if hash_type == 'ssdeep':
-                                percent = 100-ssdeep.compare(dico_hash, paste_hash)
-                            else:
-                                percent = tlsh.diffxlen(dico_hash, paste_hash)
-                                if percent > 100:
-                                    percent = 100
-
-                            threshold_duplicate = threshold_set[hash_type]
-                            if percent < threshold_duplicate:
-                                percent = 100 - percent if hash_type == 'ssdeep' else percent #recovert the correct percent value for ssdeep
-                                # Go throught the Database of the dico filter (month)
-                                r_serv_dico = dico_redis[curr_dico_name]
-
-                                # index of paste
-                                index_current = r_serv_dico.get(dico_hash)
-                                index_current = index_current
-                                paste_path = r_serv_dico.get(index_current)
-                                paste_path = paste_path
-                                paste_date = r_serv_dico.get(index_current+'_date')
-                                paste_date = paste_date
-                                paste_date = paste_date if paste_date != None else "No date available"
-                                if paste_path != None:
-                                    paste_path = paste_path.replace(PASTES_FOLDER+'/', '', 1)
-                                    if paste_path != PST.p_rel_path:
-                                        hash_dico[dico_hash] = (hash_type, paste_path, percent, paste_date)
-
-                                        print('['+hash_type+'] '+'comparing: ' + str(PST.p_rel_path) + '  and  ' + str(paste_path) + ' percentage: ' + str(percent))
-
-                        except Exception:
-                            print('hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash)
-
-            # Add paste in DB after checking to prevent its analysis twice
-            # hash_type_i -> index_i  AND  index_i -> PST.PATH
-            r_serv1.set(index, PST.p_rel_path)
-            r_serv1.set(index+'_date', PST._get_p_date())
-            r_serv1.sadd("INDEX", index)
-            # Adding hashes in Redis
-            for hash_type, paste_hash in paste_hashes.items():
-                r_serv1.set(paste_hash, index)
-                #bad hash
-                if paste_hash == '':
-                    print('bad Hash: ' + hash_type)
-                else:
-                    r_serv1.sadd("HASHS_"+hash_type, paste_hash)
-
-    ##################### Similarity found  #######################
-
-            # if there is data in this dictionnary
-            if len(hash_dico) != 0:
-                # paste_tuple = (hash_type, date, paste_path, percent)
-                for dico_hash, paste_tuple in hash_dico.items():
-                    dupl.add(paste_tuple)
-
-                # Creating the object attribute and save it.
-                to_print = 'Duplicate;{};{};{};'.format(
-                    PST.p_source, PST.p_date, PST.p_name)
-                if dupl != []:
-                    dupl = list(dupl)
-                    PST.__setattr__("p_duplicate", dupl)
-                    PST.save_attribute_duplicate(dupl)
-                    PST.save_others_pastes_attribute_duplicate(dupl)
-                    publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_rel_path))
-                    print('{}Detected {}'.format(to_print, len(dupl)))
-                    print('')
-
-                y = time.time()
-
-                publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
-
-        except IOError:
-            to_print = 'Duplicate;{};{};{};'.format(
-                PST.p_source, PST.p_date, PST.p_name)
-            print("CRC Checksum Failed on :", PST.p_rel_path)
-            publisher.error('{}CRC Checksum Failed'.format(to_print))
--- a/bin/lib/Duplicate.py
+++ b/bin/lib/Duplicate.py
@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+import os
+import ssdeep
+import sys
+import time
+import tlsh
+
+import datetime
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from lib.ConfigLoader import ConfigLoader
+
+config_loader = ConfigLoader()
+r_serv_db = config_loader.get_redis_conn("Kvrocks_DB")
+MIN_ITEM_SIZE = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: RENAME ME
+config_loader = None
+
+#
+#
+# Hash != Duplicates => New correlation HASH => check if same hash if duplicate == 100
+#
+# Object Hash => correlation decoded => don't need correlation to exist
+#
+# New CORRELATION => HASH
+#                     -> compute/get(if exist we have a correlation) hash -> get correlation same hash
+#
+#
+# Duplicates between different objects ?????
+#         Diff Decoded -> Item => Diff Item decoded - Item
+#
+# Duplicates domains != Duplicates items
+
+
+
+def get_ssdeep_hash(content):
+    return ssdeep.hash(content)
+
+def get_ssdeep_similarity(obj_hash, other_hash):
+    return ssdeep.compare(obj_hash, other_hash)
+
+def get_tlsh_hash(content):
+    return tlsh.hash(content)
+
+def get_tlsh_similarity(obj_hash, other_hash):
+    similarity = tlsh.diffxlen(obj_hash, other_hash)
+    if similarity > 100:
+        similarity = 100
+    similarity = 100 - similarity
+    return similarity
+
+def get_algo_similarity(algo, obj_hash, other_hash):
+    if algo == 'ssdeep':
+        return get_ssdeep_similarity(obj_hash, other_hash)
+    elif algo == 'tlsh':
+        return get_tlsh_similarity(obj_hash, other_hash)
+
+def get_algo_hashs_by_month(algo, date_ymonth):
+    return r_serv_db.hkeys(f'duplicates:hashs:{algo}:{date_ymonth}')
+
+def exists_algo_hash_by_month(algo, hash, date_ymonth):
+    return r_serv_db.hexists(f'duplicates:hashs:{algo}:{date_ymonth}', hash)
+
+def get_object_id_by_hash(algo, hash, date_ymonth):
+    return r_serv_db.hget(f'duplicates:hashs:{algo}:{date_ymonth}', hash)
+
+def save_object_hash(algo, date_ymonth, hash, obj_id):
+    r_serv_db.hset(f'duplicates:hashs:{algo}:{date_ymonth}', hash, obj_id)
+
+
+def get_duplicates(obj_type, subtype, id):
+    dict_dup = {}
+    duplicates = r_serv_db.smembers(f'obj:duplicates:{obj_type}:{subtype}:{id}')
+    for str_dup in duplicates:
+        similarity, algo, id = str_dup.split(':', 2)
+        if not dict_dup.get(id):
+            dict_dup[id] = []
+        dict_dup[id].append({'algo': algo, 'similarity': int(similarity)})
+    return dict_dup
+
+
+def _add_obj_duplicate(algo, similarity, obj_type, subtype, id, id_2):
+    r_serv_db.sadd(f'obj:duplicates:{obj_type}:{subtype}:{id}', f'{similarity}:{algo}:{id_2}')
+
+def add_obj_duplicate(algo, hash, similarity, obj_type, subtype, id, date_ymonth):
+    obj2_id = get_object_id_by_hash(algo, hash, date_ymonth)
+    # same content
+    if similarity == 100:
+        dups = get_duplicates(obj_type, subtype, id)
+        for dup_id in dups:
+            for algo_dict in dups[dup_id]:
+                if algo_dict['similarity'] == 100 and algo_dict['algo'] == algo:
+                    _add_obj_duplicate(algo, similarity, obj_type, subtype, id, dups[dup_id])
+                    _add_obj_duplicate(algo, similarity, obj_type, subtype, dups[dup_id], id)
+    _add_obj_duplicate(algo, similarity, obj_type, subtype, id, obj2_id)
+    _add_obj_duplicate(algo, similarity, obj_type, subtype, obj2_id, id)
+
+
+
+
+def get_last_x_month_dates(nb_months):
+    now = datetime.datetime.now()
+    result = [now.strftime("%Y%m")]
+    for x in range(0, nb_months):
+        now = now.replace(day=1) - datetime.timedelta(days=1)
+        result.append(now.strftime("%Y%m"))
+    return result
+
+
+
+if __name__ == '__main__':
+    res = get_last_x_month_dates(7)
+    print(res)
+
+
+
+
+
+
+
+
+
+
+
+
+#################################
--- a/bin/lib/item_basic.py
+++ b/bin/lib/item_basic.py
@ -54,6 +54,16 @@ def is_crawled(item_id):
 def get_item_domain(item_id):
    return item_id[19:-36]

+def get_item_content_binary(item_id):
+    item_full_path = os.path.join(PASTES_FOLDER, item_id)
+    try:
+        with gzip.open(item_full_path, 'rb') as f:
+            item_content = f.read()
+    except Exception as e:
+        print(e)
+        item_content = ''
+    return item_content
+
 def get_item_content(item_id):
    item_full_path = os.path.join(PASTES_FOLDER, item_id)
    try:
--- a/bin/lib/objects/Items.py
+++ b/bin/lib/objects/Items.py
@ -91,11 +91,14 @@ class Item(AbstractObject):
        else:
            return filename

-    def get_content(self):
+    def get_content(self, binary=False):
        """
        Returns Item content
        """
-        return item_basic.get_item_content(self.id)
+        if binary:
+            return item_basic.get_item_content_binary(self.id)
+        else:
+            return item_basic.get_item_content(self.id)

    def get_raw_content(self):
        filepath = self.get_filename()
@ -110,15 +113,34 @@ class Item(AbstractObject):
            content = base64.b64encode(content)
        return content.decode()

+    def get_html2text_content(self, content=None, ignore_links=False):
+        if not content:
+            content = self.get_content()
+        h = html2text.HTML2Text()
+        h.ignore_links = ignore_links
+        h.ignore_images = ignore_links
+        return h.handle(content)
+
+    def get_size(self, str=False):
+        size = os.path.getsize(self.get_filename())/1024.0
+        if str:
+            size = round(size, 2)
+        return size
+
    def get_ail_2_ail_payload(self):
        payload = {'raw': self.get_gzip_content(b64=True)}
        return payload

-    def set_origin(self): # set_parent ?
-        pass
+    def set_father(self, father_id): # UPDATE KEYS ?????????????????????????????
+        r_serv_metadata.sadd(f'paste_children:{father_id}', self.id)
+        r_serv_metadata.hset(f'paste_metadata:{self.id}', 'father', father_id)
+
+        #f'obj:children:{obj_type}:{subtype}:{id}, {obj_type}:{subtype}:{id}
+        #f'obj:metadata:{obj_type}:{subtype}:{id}', 'father', fathe
+        #  => ON Object LEVEL ?????????
+
+

-    def add_duplicate(self):
-        pass

    def sanitize_id(self):
        pass
@ -150,18 +172,25 @@ class Item(AbstractObject):
    # origin
    # duplicate -> all item iterations ???
    #
-    def create(self, content, tags, origin=None, duplicate=None):
-        self.save_on_disk(content, binary=True, compressed=False, base64=False)
+    def create(self, content, tags, father=None, duplicates=[], _save=True):
+        if _save:
+            self.save_on_disk(content, binary=True, compressed=False, base64=False)

        # # TODO:
        # for tag in tags:
        #     self.add_tag(tag)

-        if origin:
+        if father:
+            pass
+
+        for obj_id in duplicates:
+            for dup in duplicates[obj_id]:
+                self.add_duplicate(obj_id, dup['algo'], dup['similarity'])
+
+
+

-        if duplicate:

-        pass

    # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\
    # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ...
@ -204,6 +233,80 @@ class Item(AbstractObject):
    def exist_correlation(self):
        pass

+    def is_crawled(self):
+        return self.id.startswith('crawled')
+
+    # if is_crawled
+    def get_domain(self):
+        return self.id[19:-36]
+
+    def get_screenshot(self):
+        s = r_serv_metadata.hget(f'paste_metadata:{self.id}', 'screenshot')
+        if s:
+            return os.path.join(s[0:2], s[2:4], s[4:6], s[6:8], s[8:10], s[10:12], s[12:])
+
+    def get_har(self):
+        har_path = os.path.join(har_directory, self.id) + '.json'
+        if os.path.isfile(har_path):
+            return har_path
+        else:
+            return None
+
+    def get_url(self):
+        return r_serv_metadata.hget(f'paste_metadata:{self.id}', 'real_link')
+
+    # options: set of optional meta fields
+    def get_meta(self, options=set()):
+        meta = {}
+        meta['id'] = self.id
+        meta['date'] = self.get_date(separator=True) ############################ # TODO:
+        meta['source'] = self.get_source()
+        meta['tags'] = self.get_tags()
+        # optional meta fields
+        if 'content' in options:
+            meta['content'] = self.get_content()
+        if 'crawler' in options:
+            if self.is_crawled():
+                tags = meta.get('tags')
+                meta['crawler'] = self.get_meta_crawler(tags=tags)
+        if 'duplicates' in options:
+            meta['duplicates'] = self.get_duplicates()
+        if 'lines' in options:
+            content = meta.get('content')
+            meta['lines'] = self.get_meta_lines(content=content)
+        if 'size' in options:
+            meta['size'] = self.get_size(str=True)
+
+        # # TODO: ADD GET FATHER
+
+        # meta['encoding'] = None
+        return meta
+
+    def get_meta_crawler(self, tags=[]):
+        crawler = {}
+        if self.is_crawled():
+            crawler['domain'] = self.get_domain()
+            crawler['har'] = self.get_har()
+            crawler['screenshot'] = self.get_screenshot()
+            crawler['url'] = self.get_url()
+            if not tags:
+                tags = self.get_tags()
+            crawler['is_tags_safe'] = Tag.is_tags_safe(tags)
+        return crawler
+
+    def get_meta_lines(self, content=None):
+        if not content:
+            content = self.get_content()
+        max_length = 0
+        line_id = 0
+        nb_line = 0
+        for line in content.splitlines():
+            length = len(line)
+            if length > max_length:
+                max_length = length
+            nb_line += 1
+        return {'nb': nb_line, 'max_length': max_length}
+
    ############################################################################
    ############################################################################

@ -547,7 +650,7 @@ def get_item_list_desc(list_item_id):
 def is_crawled(item_id):
    return item_basic.is_crawled(item_id)

-def get_crawler_matadata(item_id, ltags=None):
+def get_crawler_matadata(item_id, tags=None):
    dict_crawler = {}
    if is_crawled(item_id):
        dict_crawler['domain'] = get_item_domain(item_id)
@ -759,5 +862,7 @@ def delete_domain_node(item_id):

 if __name__ == '__main__':
    content = 'test file content'
+    duplicates = {'tests/2020/01/02/test.gz': [{'algo':'ssdeep', 'similarity':75}, {'algo':'tlsh', 'similarity':45}]}
+
    item = Item('tests/2020/01/02/test_save.gz')
-    item.save_on_disk(content, binary=False)
+    item.create(content, _save=False)
--- a/bin/lib/objects/abstract_object.py
+++ b/bin/lib/objects/abstract_object.py
@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN'])
 # Import Project packages
 ##################################
 from packages import Tag
+from lib import Duplicate
 from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations
 from lib.Tracker import is_obj_tracked, get_obj_all_trackers, delete_obj_trackers

@ -69,6 +70,9 @@ class AbstractObject(ABC):
            tags = set(tags)
        return tags

+    def get_duplicates(self):
+        return Duplicate.get_duplicates(self.type, self.get_subtype(r_str=True), self.id)
+
    ## ADD TAGS ????
    #def add_tags(self):

--- a/bin/lib/objects/abstract_subtype_object.py
+++ b/bin/lib/objects/abstract_subtype_object.py
@ -113,21 +113,49 @@ class AbstractSubtypeObject(AbstractObject):
            if date > last_seen:
                self.set_last_seen(date)

-    def add(self, date):
+    def add(self, date, item_id):
        self.update_correlation_daterange()
        # daily
        r_metadata.hincrby(f'{self.type}:{self.subtype}:{date}', self.id, 1)
        # all type
        r_metadata.zincrby(f'{self.type}_all:{self.subtype}', self.id, 1)

+        #######################################################################
+        #######################################################################
+        # REPLACE WITH CORRELATION ?????
+
+        # global set
+        r_serv_metadata.sadd(f'set_{self.type}_{self.subtype}:{self.id}', item_id)
+
+        ## object_metadata
+        # item
+        r_serv_metadata.sadd(f'item_{self.type}_{self.subtype}:{item_id}', self.id)
+
+        # new correlation
+        #
+        #       How to filter by correlation type ????
+        #
+        f'correlation:obj:{self.type}:{self.subtype}:{self.id}',                f'{obj_type}:{obj_subtype}:{obj_id}'
+        f'correlation:obj:{self.type}:{self.subtype}:{obj_type}:{self.id}',     f'{obj_subtype}:{obj_id}'
+
+        #
+        #
+        #
+        #
+        #
+        #
+        #
+        #

-    

        # # domain
        # if item_basic.is_crawled(item_id):
        #     domain = item_basic.get_item_domain(item_id)
        #     self.save_domain_correlation(domain, subtype, obj_id)

+    def create(self, first_seen, last_seen):
+        pass
+


    def _delete(self):
--- a/bin/modules/Duplicates.py
+++ b/bin/modules/Duplicates.py
@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+
+"""
+The Duplicate module
+====================
+
+In short, this module checks items for duplicates.
+Its input comes from other modules, namely:
+    Credential
+
+Perform comparisons with ssdeep and tlsh
+
+"""
+import redis
+
+
+import os
+import sys
+import time
+
+#from datetime import datetime, timedelta
+import datetime
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from modules.abstract_module import AbstractModule
+from lib.ConfigLoader import ConfigLoader
+from lib import Duplicate
+from lib.objects.Items import Item
+
+
+class Duplicates(AbstractModule):
+    """Duplicates module."""
+
+    def __init__(self):
+        super(Duplicates, self).__init__()
+
+        config_loader = ConfigLoader()
+        THRESHOLD_SSDEEP = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_ssdeep')
+        THRESHOLD_TLSH = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_tlsh')
+        self.min_item_size = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: # FIXME: rename me
+        self.maximum_month_range = config_loader.get_config_int('Modules_Duplicates', 'maximum_month_range')
+
+        self.algos = {
+                        "ssdeep": {"threshold": THRESHOLD_SSDEEP},
+                        "tlsh": {"threshold": THRESHOLD_TLSH}
+                     }
+
+        self.redis_logger.info(f"Module: {self.module_name} Launched")
+
+
+    def compute(self, message):
+        # IOError: "CRC Checksum Failed on : {id}"
+
+        item = Item(message)
+
+        # Check file size
+        if item.get_size() < self.min_item_size:
+            return None
+
+        # one month
+        curr_date_ymonth = datetime.datetime.now().strftime("%Y%m")
+        last_month_dates = Duplicate.get_last_x_month_dates(self.maximum_month_range)
+
+        x = time.time()
+
+        # Get Hashs
+        content = item.get_content(binary=True)
+        self.algos['ssdeep']['hash'] = Duplicate.get_ssdeep_hash(content)
+        self.algos['tlsh']['hash'] = Duplicate.get_tlsh_hash(content)
+
+        # TODO: Handle computed duplicates
+
+        nb_duplicates = 0
+
+        for algo in self.algos:
+            obj_hash = self.algos[algo]['hash']
+            for date_ymonth in last_month_dates:
+                if Duplicate.exists_algo_hash_by_month(algo, obj_hash, date_ymonth):
+                    Duplicate.add_obj_duplicate(algo, obj_hash, 100, 'item', '', item.get_id(), date_ymonth)
+                    nb_duplicates +=1
+                else:
+                    for hash in Duplicate.get_algo_hashs_by_month(algo, date_ymonth):
+                        # # FIXME:  try - catch 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
+                        similarity = Duplicate.get_algo_similarity(algo, obj_hash, hash)
+                        print(f'[{algo}] comparing: {obj_hash} and {hash} similarity: {similarity}') # DEBUG:
+                        if similarity >= self.algos[algo]['threshold']:
+                            Duplicate.add_obj_duplicate(algo, hash, similarity, 'item', '', item.get_id(), date_ymonth)
+                            nb_duplicates +=1
+
+            # Save Hashs
+            Duplicate.save_object_hash(algo, curr_date_ymonth, self.algos[algo]['hash'], item.get_id())
+
+        if nb_duplicates:
+            self.redis_logger.info(f'Duplicate;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {nb_duplicates};{item.get_id()}')
+
+        y = time.time()
+        print(f'{item.get_id()} Processed in {y-x} sec')
+        #self.redis_logger.debug('{}Processed in {} sec'.format(to_print, y-x))
+
+
+if __name__ == "__main__":
+
+    module = Duplicates()
+    module.run()
--- a/bin/packages/modules.cfg
+++ b/bin/packages/modules.cfg
@ -66,15 +66,15 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R

 [CreditCards]
 subscribe = Redis_CreditCards
-publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags
+publish = Redis_ModuleStats,Redis_Tags

 [BankAccount]
 subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [Mail]
 subscribe = Redis_Mail
-publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags
+publish = Redis_ModuleStats,Redis_Tags

 [Onion]
 subscribe = Redis_Onion
@ -92,11 +92,11 @@ publish = Redis_Url

 [LibInjection]
 subscribe = Redis_Url
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [SQLInjectionDetection]
 subscribe = Redis_Url
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [ModuleStats]
 subscribe = Redis_ModuleStats
@ -128,31 +128,31 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags

 [Cve]
 subscribe = Redis_Cve
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [Phone]
 subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [Keys]
 subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_PgpDump,Redis_Tags
+publish = Redis_PgpDump,Redis_Tags

 [PgpDump]
 subscribe = Redis_PgpDump
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [ApiKey]
 subscribe = Redis_ApiKey
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [Decoder]
 subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [Bitcoin]
 subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [submit_paste]
 subscribe = Redis
@ -164,7 +164,8 @@ publish = Redis_Mixer,Redis_Tags

 [IP]
 subscribe = Redis_Global
-publish = Redis_Duplicate,Redis_Tags
+publish = Redis_Tags

 [Zerobins]
-subscribe = Redis_Url
+subscribe = Redis_Url
+
--- a/var/www/blueprints/objects_item.py
+++ b/var/www/blueprints/objects_item.py
@ -15,12 +15,15 @@ from flask_login import login_required, current_user
 # Import Role_Manager
 from Role_Manager import login_admin, login_analyst, login_read_only

-sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages'))
-import Item
-import Tag
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from lib import item_basic
+from lib.objects.Items import Item
+from export import Export
+from packages import Tag

-sys.path.append(os.path.join(os.environ['AIL_BIN'], 'export'))
-import Export

 # ============ BLUEPRINT ============
 objects_item = Blueprint('objects_item', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/item'))
@ -38,28 +41,22 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info']
@login_read_only
 def showItem(): # # TODO: support post
    item_id = request.args.get('id')
-    if not item_id or not Item.exist_item(item_id):
+    if not item_id or not item_basic.exist_item(item_id):
        abort(404)

-    dict_item = {}
-    dict_item['id'] = item_id
-    dict_item['name'] = dict_item['id'].replace('/', ' / ')
-    dict_item['father'] = Item.get_item_parent(item_id)
-    dict_item['content'] = Item.get_item_content(item_id)
-    dict_item['metadata'] = Item.get_item_metadata(item_id, item_content=dict_item['content'])
-    dict_item['tags'] = Tag.get_obj_tag(item_id)
-    #dict_item['duplicates'] = Item.get_item_nb_duplicates(item_id)
-    dict_item['duplicates'] = Item.get_item_duplicates_dict(item_id)
-    dict_item['crawler'] = Item.get_crawler_matadata(item_id, ltags=dict_item['tags'])
+    item = Item(item_id)
+    meta = item.get_meta(options=set(['content', 'crawler', 'duplicates', 'lines', 'size']))

+    meta['name'] = meta['id'].replace('/', ' / ')
+    meta['father'] = item_basic.get_item_parent(item_id)
    ## EXPORT SECTION
    # # TODO: ADD in Export SECTION
-    dict_item['hive_case'] = Export.get_item_hive_cases(item_id)
+    meta['hive_case'] = Export.get_item_hive_cases(item_id)

    return render_template("show_item.html", bootstrap_label=bootstrap_label,
-                            modal_add_tags=Tag.get_modal_add_tags(dict_item['id'], object_type='item'),
+                            modal_add_tags=Tag.get_modal_add_tags(meta['id'], object_type='item'),
                            is_hive_connected=Export.get_item_hive_cases(item_id),
-                            dict_item=dict_item)
+                            meta=meta)

    # kvrocks data

@ -74,24 +71,27 @@ def showItem(): # # TODO: support post
@login_read_only
 def html2text(): # # TODO: support post
    item_id = request.args.get('id')
-    if not item_id or not Item.exist_item(item_id):
+    if not item_id or not item_basic.exist_item(item_id):
        abort(404)
-    return Item.get_item_content_html2text(item_id)
+    item = Item(item_id)
+    return item.get_html2text_content()

@objects_item.route("/object/item/raw_content")
@login_required
@login_read_only
 def item_raw_content(): # # TODO: support post
    item_id = request.args.get('id')
-    if not item_id or not Item.exist_item(item_id):
+    if not item_id or not item_basic.exist_item(item_id):
        abort(404)
-    return Response(Item.get_item_content(item_id), mimetype='text/plain')
+    item = Item(item_id)
+    return Response(item.get_content(), mimetype='text/plain')

@objects_item.route("/object/item/download")
@login_required
@login_read_only
 def item_download(): # # TODO: support post
    item_id = request.args.get('id')
-    if not item_id or not Item.exist_item(item_id):
+    if not item_id or not item_basic.exist_item(item_id):
        abort(404)
-    return send_file(Item.get_raw_content(item_id), attachment_filename=item_id, as_attachment=True)
+    item = Item(item_id)
+    return send_file(item.get_raw_content(), attachment_filename=item_id, as_attachment=True)
--- a/var/www/templates/objects/item/show_item.html
+++ b/var/www/templates/objects/item/show_item.html
@ -38,7 +38,7 @@

  <div class="card my-2 mx-2">
    <div class="card-header bg-dark">
-      <h3 class="text-white text-center" >{{ dict_item['name'] }}</h3>
+      <h3 class="text-white text-center" >{{ meta['name'] }}</h3>
    </div>
    <div class="card-body pb-1">
      <table class="table table-condensed">
@ -46,7 +46,7 @@
          <tr>
            <th>Date</th>
            <th>Source</th>
-            <th>Encoding</th>
+            <!-- <th>Encoding</th> -->
            <th>Size (Kb)</th>
            <th>Number of lines</th>
            <th>Max line length</th>
@ -54,12 +54,12 @@
        </thead>
        <tbody>
          <tr>
-          <td>{{ dict_item['metadata']['date'] }}</td>
-          <td>{{ dict_item['metadata']['source'] }}</td>
-          <td>{{ dict_item['metadata']['encoding'] }}</td>
-          <td>{{ dict_item['metadata']['size'] }}</td>
-          <td>{{ dict_item['metadata']['lines']['nb'] }}</td>
-          <td>{{ dict_item['metadata']['lines']['max_length'] }}</td>
+          <td>{{ meta['date'] }}</td>
+          <td>{{ meta['source'] }}</td>
+          <!-- <td>{{ meta['encoding'] }}</td> -->
+          <td>{{ meta['size'] }}</td>
+          <td>{{ meta['lines']['nb'] }}</td>
+          <td>{{ meta['lines']['max_length'] }}</td>
          </tr>
        </tbody>
      </table>
@ -68,9 +68,9 @@
        <h5>
          <div>
            {% include 'modals/edit_tag.html' %}
-            {% for tag in dict_item['tags'] %}
+            {% for tag in meta['tags'] %}
              <button class="btn btn-{{ bootstrap_label[loop.index0 % 5] }}" data-toggle="modal" data-target="#edit_tags_modal"
-              data-tagid="{{ tag }}" data-objtype="item" data-objid="{{ dict_item['id'] }}">
+              data-tagid="{{ tag }}" data-objtype="item" data-objid="{{ meta['id'] }}">
                {{ tag }}
              </button>

@ -84,21 +84,21 @@
        </h5>
      </div>

-      {% if dict_item['father'] %}
+      {% if meta['father'] %}
        <div class="mt-3">
-          Father: <a href="{{ url_for('objects_item.showItem')}}?id={{dict_item['father']}}" target="_blank">{{dict_item['father']}}</a>
+          Father: <a href="{{ url_for('objects_item.showItem')}}?id={{meta['father']}}" target="_blank">{{meta['father']}}</a>
        </div>
      {% endif %}

      <div class="d-flex flex-row-reverse bd-highlight">
        <div>
-          <a href="{{ url_for('correlation.show_correlation')}}?object_type=paste&correlation_id={{ dict_item['id'] }}&correlation_objects=paste" target="_blank">
+          <a href="{{ url_for('correlation.show_correlation')}}?object_type=paste&correlation_id={{ meta['id'] }}&correlation_objects=paste" target="_blank">
 						<button class="btn btn-lg btn-info"><i class="fas fa-project-diagram"></i> Correlations Graph
 						</button>
 					</a>
        </div>
        <div>
-          {% with obj_type='item', obj_id=dict_item['id'], obj_subtype=''%}
+          {% with obj_type='item', obj_id=meta['id'], obj_subtype=''%}
            {% include 'modals/investigations_register_obj.html' %}
          {% endwith %}
          <div class="mr-2">
@ -108,7 +108,7 @@
          </div>
        </div>
        <div class="mx-2">
-          {% with obj_type='item', obj_id=dict_item['id'], obj_lvl=0%}
+          {% with obj_type='item', obj_id=meta['id'], obj_lvl=0%}
            {% include 'import_export/block_add_user_object_to_export.html' %}
          {% endwith %}
        </div>
@ -134,14 +134,14 @@
    </div>
  {% endif %}

-  {% if dict_item['hive_case'] %}
+  {% if meta['hive_case'] %}
    <div class="list-group" id="misp_event">
      <li class="list-group-item active">The Hive Case already Created</li>
      <a target="_blank" href="{{ hive_url }}" class="list-group-item">{{ hive_url }}</a>
    </div>
  {% endif %}

-  {% if  dict_item['duplicates'] != 0 %}
+  {# duplicates is iterated and measured with |length in this block, so test for emptiness: `!= 0` is always true for a collection #}
+  {% if meta['duplicates'] %}
    <div id="accordionDuplicate" class="mb-2 mx-3">
      <div class="card">
        <div class="card-header py-1" id="headingDuplicate">
@ -149,7 +149,7 @@
            <div class="col-11">
              <div class="mt-2">
                <i class="far fa-clone"></i> duplicates&nbsp;&nbsp;
-                <div class="badge badge-warning">{{dict_item['duplicates']|length}}</div>
+                <div class="badge badge-warning">{{meta['duplicates']|length}}</div>
              </div>
            </div>
            <div class="col-1">
@ -173,19 +173,19 @@
                </tr>
              </thead>
              <tbody>
-                {% for duplicate_id in dict_item['duplicates'] %}
+                {% for duplicate_id in meta['duplicates'] %}
                  <tr>
-                    <td>{{dict_item['duplicates'][duplicate_id]['date']}}</td>
+                    <td>{{meta['duplicates'][duplicate_id]['date']}}</td>
                    <td class="py-0">
                      <table class="table table-borderless table-sm my-0">
                        <tbody>
-                          {%for algo in dict_item['duplicates'][duplicate_id]['algo']|sort()%}
+                          {%for dict_algo in meta['duplicates'][duplicate_id]|sort(attribute='algo')%}
                            <tr>
-                              <td class="py-0">{{algo}}</td>
+                              <td class="py-0">{{dict_algo['algo']}}</td>
                              <td class="w-100 py-0">
                                <div class="progress mt-1">
-                                  <div class="progress-bar progress-bar-striped {%if algo=='tlsh'%}bg-secondary{%endif%}" role="progressbar" style="width: {{dict_item['duplicates'][duplicate_id]['algo'][algo]}}%;" aria-valuenow="{{dict_item['duplicates'][duplicate_id]['algo'][algo]}}" aria-valuemin="0" aria-valuemax="100">
-                                    {{dict_item['duplicates'][duplicate_id]['algo'][algo]}}%
+                                  <div class="progress-bar progress-bar-striped {%if dict_algo['algo']=='tlsh'%}bg-secondary{%endif%}" role="progressbar" style="width: {{dict_algo['similarity']}}%;" aria-valuenow="{{dict_algo['similarity']}}" aria-valuemin="0" aria-valuemax="100">
+                                    {{dict_algo['similarity']}}%
                                  </div>
                                </div>
                              </td>
@ -200,7 +200,7 @@
                      </a>
                    </td>
                    <td>
-                      <a target="_blank" href="{{ url_for('showsavedpastes.showDiff') }}?s1={{dict_item['id']}}&s2={{duplicate_id}}" class="fa fa-columns" title="Show diff"></a>
+                      <a target="_blank" href="{{ url_for('showsavedpastes.showDiff') }}?s1={{meta['id']}}&s2={{duplicate_id}}" class="fa fa-columns" title="Show diff"></a>
                    </td>
                  </tr>
                {% endfor %}
@ -261,7 +261,7 @@
  {% endif %}


-  {% if dict_item['crawler'] %}
+  {% if meta['crawler'] %}
  <div id="accordionCrawler" class="mb-3 mx-3">
    <div class="card">
      <div class="card-header py-1" id="headingCrawler">
@ -294,18 +294,18 @@
                  <tr>
                    <td><i class="far fa-file"></i></td>
                    <td>
-                      <a class="badge" target="_blank" href="{{ url_for('objects_item.showItem', paste=dict_item['father']) }}" />{{ dict_item['father'] }}</a>
+                      {# pass the item as `id`, the query argument the other showItem link in this template uses (was `paste`) #}
+                      <a class="badge" target="_blank" href="{{ url_for('objects_item.showItem', id=meta['father']) }}">{{ meta['father'] }}</a>
                    </td>
                  </tr>
                    <td><i class="fab fa-html5"></i></td>
                    <td>
-                      <a class="badge" target="_blank" href="{{ url_for('crawler_splash.showDomain', domain=dict_item['crawler']['domain']) }}" />{{ dict_item['crawler']['domain'] }}</a>
+                      <a class="badge" target="_blank" href="{{ url_for('crawler_splash.showDomain', domain=meta['crawler']['domain']) }}">{{ meta['crawler']['domain'] }}</a>
                    </td>
                  </tr>
                  <tr>
                    <td>url</td>
                    <td>
-                      {{ dict_item['crawler']['url'] }}
+                      {{ meta['crawler']['url'] }}
                    </td>
                  </tr>
                </tbody>
@ -318,11 +318,11 @@
              <div class="card-body py-2">
                <div class="row">
                  <div class="col-md-8">
-                    <input class="custom-range mt-2" id="blocks" type="range" min="1" max="50" value="{%if dict_item['crawler']['is_tags_safe']%}13{%else%}0{%endif%}">
+                    <input class="custom-range mt-2" id="blocks" type="range" min="1" max="50" value="{%if meta['crawler']['is_tags_safe']%}13{%else%}0{%endif%}">
                  </div>
                  <div class="col-md-4">
-                    <button class="btn {%if dict_item['crawler']['is_tags_safe']%}btn-primary{%else%}btn-danger{%endif%}" onclick="blocks.value=50;pixelate();">
-                      {%if dict_item['crawler']['is_tags_safe']%}
+                    <button class="btn {%if meta['crawler']['is_tags_safe']%}btn-primary{%else%}btn-danger{%endif%}" onclick="blocks.value=50;pixelate();">
+                      {%if meta['crawler']['is_tags_safe']%}
                        <i class="fas fas fa-plus-square"></i>
                      {%else%}
                        <i class="fas fa-exclamation-triangle"></i>
@ -358,8 +358,8 @@
        <li class="nav-item dropdown">
          <a class="nav-link dropdown-toggle" data-toggle="dropdown" href="#">Others</a>
          <div class="dropdown-menu">
-            <a class="dropdown-item" href="{{ url_for('objects_item.item_raw_content', id=dict_item['id']) }}"><i class="far fa-file"></i> &nbsp;Raw Content</a>
-            <a class="dropdown-item" href="{{ url_for('objects_item.item_download', id=dict_item['id']) }}"><i class="fas fa-download"></i> &nbsp;Download</i></a>
+            <a class="dropdown-item" href="{{ url_for('objects_item.item_raw_content', id=meta['id']) }}"><i class="far fa-file"></i> &nbsp;Raw Content</a>
+            <a class="dropdown-item" href="{{ url_for('objects_item.item_download', id=meta['id']) }}"><i class="fas fa-download"></i> &nbsp;Download</a>
          </div>
        </li>
      </ul>
@ -367,7 +367,7 @@

      <div class="tab-content" id="pills-tabContent">
        <div class="tab-pane fade show active" id="pills-content" role="tabpanel" aria-labelledby="pills-content-tab">
-          <p class="my-0"> <pre class="border">{{ dict_item['content'] }}</pre></p>
+          <p class="my-0"> <pre class="border">{{ meta['content'] }}</pre></p>
        </div>
        <div class="tab-pane fade" id="pills-html2text" role="tabpanel" aria-labelledby="pills-html2text-tab">
          <p class="my-0"> <pre id="html2text-container" class="border"></pre></p>
@ -393,7 +393,7 @@

      $('#pills-html2text-tab').on('shown.bs.tab', function (e) {
        if ($('#html2text-container').is(':empty')){
-          $.get("{{ url_for('objects_item.html2text') }}?id={{ dict_item['id'] }}").done(function(data){
+          $.get("{{ url_for('objects_item.html2text') }}?id={{ meta['id'] }}").done(function(data){
            $('#html2text-container').text(data);
          });

@ -401,7 +401,7 @@
      });
  </script>

-{% if dict_item['crawler'] %}
+{% if meta['crawler'] %}
  <script>
  var ctx = canvas.getContext('2d'), img = new Image();

@ -413,7 +413,7 @@
  img.addEventListener("error", img_error);
  var draw_img = false;

-  img.src = "{{ url_for('showsavedpastes.screenshot', filename=dict_item['crawler']['screenshot']) }}";
+  img.src = "{{ url_for('showsavedpastes.screenshot', filename=meta['crawler']['screenshot']) }}";

  function pixelate() {