From 2f8a5a333ad9f940c70c7a21af81a9d65bc5ad62 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 13 Jul 2022 15:10:27 +0200 Subject: [PATCH] chg; [Duplicates module] refactor module + DB keys --- bin/DB_KVROCKS_MIGRATION.py | 2 + bin/Duplicates.py | 198 ------------------ bin/lib/Duplicate.py | 130 ++++++++++++ bin/lib/item_basic.py | 10 + bin/lib/objects/Items.py | 131 ++++++++++-- bin/lib/objects/abstract_object.py | 4 + bin/lib/objects/abstract_subtype_object.py | 32 ++- bin/modules/Duplicates.py | 108 ++++++++++ bin/packages/modules.cfg | 29 +-- var/www/blueprints/objects_item.py | 50 ++--- var/www/templates/objects/item/show_item.html | 76 +++---- 11 files changed, 480 insertions(+), 290 deletions(-) delete mode 100755 bin/Duplicates.py create mode 100755 bin/lib/Duplicate.py create mode 100755 bin/modules/Duplicates.py diff --git a/bin/DB_KVROCKS_MIGRATION.py b/bin/DB_KVROCKS_MIGRATION.py index 6f6cc3e9..3bdd125c 100755 --- a/bin/DB_KVROCKS_MIGRATION.py +++ b/bin/DB_KVROCKS_MIGRATION.py @@ -217,6 +217,8 @@ def item_submit_migration(): # /!\ KEY COLISION # # TODO: change db def tags_migration(): + + pass diff --git a/bin/Duplicates.py b/bin/Duplicates.py deleted file mode 100755 index 2057116b..00000000 --- a/bin/Duplicates.py +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env python3 -# -*-coding:UTF-8 -* - -""" -The Duplicate module -==================== - -This huge module is, in short term, checking duplicates. -Its input comes from other modules, namely: - Credential, CreditCard, Keys, Mails, SQLinjectionDetection, CVE and Phone - -This one differ from v1 by only using redis and not json file stored on disk - -Perform comparisions with ssdeep and tlsh - -Requirements: -------------- - - -""" -import redis -import os -import time -from datetime import datetime, timedelta -import json -import ssdeep -import tlsh -from packages import Paste -from pubsublogger import publisher - -from Helper import Process - -if __name__ == "__main__": - publisher.port = 6380 - publisher.channel = "Script" - - config_section = 'Duplicates' - - p = Process(config_section) - - PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes")) - - maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range")) - threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep")) - threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh")) - threshold_set = {} - threshold_set['ssdeep'] = threshold_duplicate_ssdeep - threshold_set['tlsh'] = threshold_duplicate_tlsh - min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size")) - - # REDIS # - dico_redis = {} - date_today = datetime.today() - for year in range(2013, date_today.year+1): - for month in range(0, 13): - dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis( - host=p.config.get("ARDB_DB", "host"), - port=p.config.get("ARDB_DB", "port"), - db=str(year) + str(month), - decode_responses=True) - - # FUNCTIONS # - publisher.info("Script duplicate started") - - while True: - try: - hash_dico = {} - dupl = set() - dico_range_list = [] - - x = time.time() - - message = p.get_from_set() - if message is not None: - path = message - PST = Paste.Paste(path) - else: - publisher.debug("Script Attribute is idling 10s") - print('sleeping') - time.sleep(10) - continue - - # the paste is too small - if (PST._get_p_size() < min_paste_size): - continue - - PST._set_p_hash_kind("ssdeep") - PST._set_p_hash_kind("tlsh") - - # Assignate 
the correct redis connexion - r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month] - - # Creating the dico name: yyyymm - # Get the date of the range - date_range = date_today - timedelta(days = maximum_month_range*30.4166666) - num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month) - for diff_month in range(0, num_of_month+1): - curr_date_range = date_today - timedelta(days = diff_month*30.4166666) - to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2) - dico_range_list.append(to_append) - - # Use all dico in range - dico_range_list = dico_range_list[0:maximum_month_range] - - # UNIQUE INDEX HASHS TABLE - yearly_index = str(date_today.year)+'00' - r_serv0 = dico_redis[yearly_index] - r_serv0.incr("current_index") - index = (r_serv0.get("current_index")) + str(PST.p_date) - - # Open selected dico range - opened_dico = [] - for dico_name in dico_range_list: - opened_dico.append([dico_name, dico_redis[dico_name]]) - - # retrieve hash from paste - paste_hashes = PST._get_p_hash() - - # Go throught the Database of the dico (of the month) - for curr_dico_name, curr_dico_redis in opened_dico: - for hash_type, paste_hash in paste_hashes.items(): - for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type): - - try: - if hash_type == 'ssdeep': - percent = 100-ssdeep.compare(dico_hash, paste_hash) - else: - percent = tlsh.diffxlen(dico_hash, paste_hash) - if percent > 100: - percent = 100 - - threshold_duplicate = threshold_set[hash_type] - if percent < threshold_duplicate: - percent = 100 - percent if hash_type == 'ssdeep' else percent #recovert the correct percent value for ssdeep - # Go throught the Database of the dico filter (month) - r_serv_dico = dico_redis[curr_dico_name] - - # index of paste - index_current = r_serv_dico.get(dico_hash) - index_current = index_current - paste_path = r_serv_dico.get(index_current) - paste_path = paste_path - paste_date = r_serv_dico.get(index_current+'_date') - paste_date = paste_date - paste_date = paste_date if paste_date != None else "No date available" - if paste_path != None: - paste_path = paste_path.replace(PASTES_FOLDER+'/', '', 1) - if paste_path != PST.p_rel_path: - hash_dico[dico_hash] = (hash_type, paste_path, percent, paste_date) - - print('['+hash_type+'] '+'comparing: ' + str(PST.p_rel_path) + ' and ' + str(paste_path) + ' percentage: ' + str(percent)) - - except Exception: - print('hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash) - - # Add paste in DB after checking to prevent its analysis twice - # hash_type_i -> index_i AND index_i -> PST.PATH - r_serv1.set(index, PST.p_rel_path) - r_serv1.set(index+'_date', PST._get_p_date()) - r_serv1.sadd("INDEX", index) - # Adding hashes in Redis - for hash_type, paste_hash in paste_hashes.items(): - r_serv1.set(paste_hash, index) - #bad hash - if paste_hash == '': - print('bad Hash: ' + hash_type) - else: - r_serv1.sadd("HASHS_"+hash_type, paste_hash) - - ##################### Similarity found ####################### - - # if there is data in this dictionnary - if len(hash_dico) != 0: - # paste_tuple = (hash_type, date, paste_path, percent) - for dico_hash, paste_tuple in hash_dico.items(): - dupl.add(paste_tuple) - - # Creating the object attribute and save it. 
- to_print = 'Duplicate;{};{};{};'.format( - PST.p_source, PST.p_date, PST.p_name) - if dupl != []: - dupl = list(dupl) - PST.__setattr__("p_duplicate", dupl) - PST.save_attribute_duplicate(dupl) - PST.save_others_pastes_attribute_duplicate(dupl) - publisher.info('{}Detected {};{}'.format(to_print, len(dupl), PST.p_rel_path)) - print('{}Detected {}'.format(to_print, len(dupl))) - print('') - - y = time.time() - - publisher.debug('{}Processed in {} sec'.format(to_print, y-x)) - - except IOError: - to_print = 'Duplicate;{};{};{};'.format( - PST.p_source, PST.p_date, PST.p_name) - print("CRC Checksum Failed on :", PST.p_rel_path) - publisher.error('{}CRC Checksum Failed'.format(to_print)) diff --git a/bin/lib/Duplicate.py b/bin/lib/Duplicate.py new file mode 100755 index 00000000..99de95e3 --- /dev/null +++ b/bin/lib/Duplicate.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +import os +import ssdeep +import sys +import time +import tlsh + +import datetime + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib.ConfigLoader import ConfigLoader + +config_loader = ConfigLoader() +r_serv_db = config_loader.get_redis_conn("Kvrocks_DB") +MIN_ITEM_SIZE = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: RENAME ME +config_loader = None + +# +# +# Hash != Duplicates => New correlation HASH => check if same hash if duplicate == 100 +# +# Object Hash => correlation decoded => don't need correlation to exists +# +# New CORRELATION => HASH +# -> compute/get(if exist we have a correlation) hash -> get correlation same hash +# +# +# Duplicates between differents objects ????? +# Diff Decoded -> Item => Diff Item decoded - Item +# +# Duplicates domains != Duplicates items + + + +def get_ssdeep_hash(content): + return ssdeep.hash(content) + +def get_ssdeep_similarity(obj_hash, other_hash): + return ssdeep.compare(obj_hash, other_hash) + +def get_tlsh_hash(content): + return tlsh.hash(content) + +def get_tlsh_similarity(obj_hash, other_hash): + similarity = tlsh.diffxlen(obj_hash, other_hash) + if similarity > 100: + similarity = 100 + similarity = 100 - similarity + return similarity + +def get_algo_similarity(algo, obj_hash, other_hash): + if algo == 'ssdeep': + return get_ssdeep_similarity(obj_hash, other_hash) + elif algo == 'tlsh': + return get_tlsh_similarity(obj_hash, other_hash) + +def get_algo_hashs_by_month(algo, date_ymonth): + return r_serv_db.hkeys(f'duplicates:hashs:{algo}:{date_ymonth}') + +def exists_algo_hash_by_month(algo, hash, date_ymonth): + return r_serv_db.hexists(f'duplicates:hashs:{algo}:{date_ymonth}', hash) + +def get_object_id_by_hash(algo, hash, date_ymonth): + return r_serv_db.hget(f'duplicates:hashs:{algo}:{date_ymonth}', hash) + +def save_object_hash(algo, date_ymonth, hash, obj_id): + r_serv_db.hset(f'duplicates:hashs:{algo}:{date_ymonth}', hash, obj_id) + + +def get_duplicates(obj_type, subtype, id): + dict_dup = {} + duplicates = r_serv_db.smembers(f'obj:duplicates:{obj_type}:{subtype}:{id}') + for str_dup in duplicates: + similarity, algo, id = str_dup.split(':', 2) + if not dict_dup.get(id): + dict_dup[id] = [] + dict_dup[id].append({'algo': algo, 'similarity': int(similarity)}) + return dict_dup + + +def _add_obj_duplicate(algo, similarity, obj_type, subtype, id, id_2): + r_serv_db.sadd(f'obj:duplicates:{obj_type}:{subtype}:{id}', f'{similarity}:{algo}:{id_2}') + +def add_obj_duplicate(algo, hash, similarity, obj_type, 
subtype, id, date_ymonth): + obj2_id = get_object_id_by_hash(algo, hash, date_ymonth) + # same content + if similarity == 100: + dups = get_duplicates(obj_type, subtype, id) + for dup_id in dups: + for algo_dict in dups[dup_id]: + if algo_dict['similarity'] == 100 and algo_dict['algo'] == algo: + _add_obj_duplicate(algo, similarity, obj_type, subtype, id, dups[dup_id]) + _add_obj_duplicate(algo, similarity, obj_type, subtype, dups[dup_id], id) + _add_obj_duplicate(algo, similarity, obj_type, subtype, id, obj2_id) + _add_obj_duplicate(algo, similarity, obj_type, subtype, obj2_id, id) + + + + +def get_last_x_month_dates(nb_months): + now = datetime.datetime.now() + result = [now.strftime("%Y%m")] + for x in range(0, nb_months): + now = now.replace(day=1) - datetime.timedelta(days=1) + result.append(now.strftime("%Y%m")) + return result + + + +if __name__ == '__main__': + res = get_last_x_month_dates(7) + print(res) + + + + + + + + + + + + +################################# diff --git a/bin/lib/item_basic.py b/bin/lib/item_basic.py index 608a5ba6..c212f5ab 100755 --- a/bin/lib/item_basic.py +++ b/bin/lib/item_basic.py @@ -54,6 +54,16 @@ def is_crawled(item_id): def get_item_domain(item_id): return item_id[19:-36] +def get_item_content_binary(item_id): + item_full_path = os.path.join(PASTES_FOLDER, item_id) + try: + with gzip.open(item_full_path, 'rb') as f: + item_content = f.read() + except Exception as e: + print(e) + item_content = '' + return item_content + def get_item_content(item_id): item_full_path = os.path.join(PASTES_FOLDER, item_id) try: diff --git a/bin/lib/objects/Items.py b/bin/lib/objects/Items.py index d3011b31..a0dc8ab9 100755 --- a/bin/lib/objects/Items.py +++ b/bin/lib/objects/Items.py @@ -91,11 +91,14 @@ class Item(AbstractObject): else: return filename - def get_content(self): + def get_content(self, binary=False): """ Returns Item content """ - return item_basic.get_item_content(self.id) + if binary: + return item_basic.get_item_content_binary(self.id) + else: + return item_basic.get_item_content(self.id) def get_raw_content(self): filepath = self.get_filename() @@ -110,15 +113,34 @@ class Item(AbstractObject): content = base64.b64encode(content) return content.decode() + def get_html2text_content(self, content=None, ignore_links=False): + if not content: + content = self.get_content() + h = html2text.HTML2Text() + h.ignore_links = ignore_links + h.ignore_images = ignore_links + return h.handle(content) + + def get_size(self, str=False): + size = os.path.getsize(self.get_filename())/1024.0 + if str: + size = round(size, 2) + return size + def get_ail_2_ail_payload(self): payload = {'raw': self.get_gzip_content(b64=True)} return payload - def set_origin(self): # set_parent ? - pass + def set_father(self, father_id): # UPDATE KEYS ????????????????????????????? + r_serv_metadata.sadd(f'paste_children:{father_id}', self.id) + r_serv_metadata.hset(f'paste_metadata:{self.id}', 'father', father_id) + + #f'obj:children:{obj_type}:{subtype}:{id}, {obj_type}:{subtype}:{id} + #f'obj:metadata:{obj_type}:{subtype}:{id}', 'father', fathe + # => ON Object LEVEL ????????? + + - def add_duplicate(self): - pass def sanitize_id(self): pass @@ -150,18 +172,25 @@ class Item(AbstractObject): # origin # duplicate -> all item iterations ??? 
# - def create(self, content, tags, origin=None, duplicate=None): - self.save_on_disk(content, binary=True, compressed=False, base64=False) + def create(self, content, tags, father=None, duplicates=[], _save=True): + if _save: + self.save_on_disk(content, binary=True, compressed=False, base64=False) # # TODO: # for tag in tags: # self.add_tag(tag) - if origin: + if father: + pass + + for obj_id in duplicates: + for dup in duplicates[obj_id]: + self.add_duplicate(obj_id, dup['algo'], dup['similarity']) + + + - if duplicate: - pass # # WARNING: UNCLEAN DELETE /!\ TEST ONLY /!\ # TODO: DELETE ITEM CORRELATION + TAGS + METADATA + ... @@ -204,6 +233,80 @@ class Item(AbstractObject): def exist_correlation(self): pass + def is_crawled(self): + return self.id.startswith('crawled') + + # if is_crawled + def get_domain(self): + return self.id[19:-36] + + def get_screenshot(self): + s = r_serv_metadata.hget(f'paste_metadata:{self.id}', 'screenshot') + if s: + return os.path.join(s[0:2], s[2:4], s[4:6], s[6:8], s[8:10], s[10:12], s[12:]) + + def get_har(self): + har_path = os.path.join(har_directory, self.id) + '.json' + if os.path.isfile(har_path): + return har_path + else: + return None + + def get_url(self): + return r_serv_metadata.hget(f'paste_metadata:{self.id}', 'real_link') + + # options: set of optional meta fields + def get_meta(self, options=set()): + meta = {} + meta['id'] = self.id + meta['date'] = self.get_date(separator=True) ############################ # TODO: + meta['source'] = self.get_source() + meta['tags'] = self.get_tags() + # optional meta fields + if 'content' in options: + meta['content'] = self.get_content() + if 'crawler' in options: + if self.is_crawled(): + tags = meta.get('tags') + meta['crawler'] = self.get_meta_crawler(tags=tags) + if 'duplicates' in options: + meta['duplicates'] = self.get_duplicates() + if 'lines' in options: + content = meta.get('content') + meta['lines'] = self.get_meta_lines(content=content) + if 'size' in options: + meta['size'] = self.get_size(str=True) + + # # TODO: ADD GET FATHER + + # meta['encoding'] = None + return meta + + def get_meta_crawler(self, tags=[]): + crawler = {} + if self.is_crawled(): + crawler['domain'] = self.get_domain() + crawler['har'] = self.get_har() + crawler['screenshot'] = self.get_screenshot() + crawler['url'] = self.get_url() + if not tags: + tags = self.get_tags() + crawler['is_tags_safe'] = Tag.is_tags_safe(tags) + return crawler + + def get_meta_lines(self, content=None): + if not content: + content = self.get_content() + max_length = 0 + line_id = 0 + nb_line = 0 + for line in content.splitlines(): + length = len(line) + if length > max_length: + max_length = length + nb_line += 1 + return {'nb': nb_line, 'max_length': max_length} + ############################################################################ ############################################################################ @@ -547,7 +650,7 @@ def get_item_list_desc(list_item_id): def is_crawled(item_id): return item_basic.is_crawled(item_id) -def get_crawler_matadata(item_id, ltags=None): +def get_crawler_matadata(item_id, tags=None): dict_crawler = {} if is_crawled(item_id): dict_crawler['domain'] = get_item_domain(item_id) @@ -759,5 +862,7 @@ def delete_domain_node(item_id): if __name__ == '__main__': content = 'test file content' + duplicates = {'tests/2020/01/02/test.gz': [{'algo':'ssdeep', 'similarity':75}, {'algo':'tlsh', 'similarity':45}]} + item = Item('tests/2020/01/02/test_save.gz') - item.save_on_disk(content, binary=False) + 
item.create(content, _save=False) diff --git a/bin/lib/objects/abstract_object.py b/bin/lib/objects/abstract_object.py index 8e14590c..bc880ac7 100755 --- a/bin/lib/objects/abstract_object.py +++ b/bin/lib/objects/abstract_object.py @@ -17,6 +17,7 @@ sys.path.append(os.environ['AIL_BIN']) # Import Project packages ################################## from packages import Tag +from lib import Duplicate from lib.Investigations import is_object_investigated, get_obj_investigations, delete_obj_investigations from lib.Tracker import is_obj_tracked, get_obj_all_trackers, delete_obj_trackers @@ -69,6 +70,9 @@ class AbstractObject(ABC): tags = set(tags) return tags + def get_duplicates(self): + return Duplicate.get_duplicates(self.type, self.get_subtype(r_str=True), self.id) + ## ADD TAGS ???? #def add_tags(self): diff --git a/bin/lib/objects/abstract_subtype_object.py b/bin/lib/objects/abstract_subtype_object.py index 741df301..7a86ea33 100755 --- a/bin/lib/objects/abstract_subtype_object.py +++ b/bin/lib/objects/abstract_subtype_object.py @@ -113,21 +113,49 @@ class AbstractSubtypeObject(AbstractObject): if date > last_seen: self.set_last_seen(date) - def add(self, date): + def add(self, date, item_id): self.update_correlation_daterange() # daily r_metadata.hincrby(f'{self.type}:{self.subtype}:{date}', self.id, 1) # all type r_metadata.zincrby(f'{self.type}_all:{self.subtype}', self.id, 1) + ####################################################################### + ####################################################################### + # REPLACE WITH CORRELATION ????? + + # global set + r_serv_metadata.sadd(f'set_{self.type}_{self.subtype}:{self.id}', item_id) + + ## object_metadata + # item + r_serv_metadata.sadd(f'item_{self.type}_{self.subtype}:{item_id}', self.id) + + # new correlation + # + # How to filter by correlation type ???? + # + f'correlation:obj:{self.type}:{self.subtype}:{self.id}', f'{obj_type}:{obj_subtype}:{obj_id}' + f'correlation:obj:{self.type}:{self.subtype}:{obj_type}:{self.id}', f'{obj_subtype}:{obj_id}' + + # + # + # + # + # + # + # + # - # # domain # if item_basic.is_crawled(item_id): # domain = item_basic.get_item_domain(item_id) # self.save_domain_correlation(domain, subtype, obj_id) + def create(self, first_seen, last_seen): + pass + def _delete(self): diff --git a/bin/modules/Duplicates.py b/bin/modules/Duplicates.py new file mode 100755 index 00000000..169295ae --- /dev/null +++ b/bin/modules/Duplicates.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +# -*-coding:UTF-8 -* + +""" +The Duplicate module +==================== + +This huge module is, in short term, checking duplicates. 
+Its input comes from other modules, namely: + Credential + +Perform comparisions with ssdeep and tlsh + +""" +import redis + + +import os +import sys +import time + +#from datetime import datetime, timedelta +import datetime + +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from modules.abstract_module import AbstractModule +from lib.ConfigLoader import ConfigLoader +from lib import Duplicate +from lib.objects.Items import Item + + +class Duplicates(AbstractModule): + """Duplicates module.""" + + def __init__(self): + super(Duplicates, self).__init__() + + config_loader = ConfigLoader() + THRESHOLD_SSDEEP = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_ssdeep') + THRESHOLD_TLSH = config_loader.get_config_int('Modules_Duplicates', 'threshold_duplicate_tlsh') + self.min_item_size = float(config_loader.get_config_str('Modules_Duplicates', 'min_paste_size')) # # TODO: # FIXME: rename me + self.maximum_month_range = config_loader.get_config_int('Modules_Duplicates', 'maximum_month_range') + + self.algos = { + "ssdeep": {"threshold": THRESHOLD_SSDEEP}, + "tlsh": {"threshold": THRESHOLD_TLSH} + } + + self.redis_logger.info(f"Module: {self.module_name} Launched") + + + def compute(self, message): + # IOError: "CRC Checksum Failed on : {id}" + + item = Item(message) + + # Check file size + if item.get_size() < self.min_item_size: + return None + + # one month + curr_date_ymonth = datetime.datetime.now().strftime("%Y%m") + last_month_dates = Duplicate.get_last_x_month_dates(self.maximum_month_range) + + x = time.time() + + # Get Hashs + content = item.get_content(binary=True) + self.algos['ssdeep']['hash'] = Duplicate.get_ssdeep_hash(content) + self.algos['tlsh']['hash'] = Duplicate.get_tlsh_hash(content) + + # TODO: Handle coputed duplicates + + nb_duplicates = 0 + + for algo in self.algos: + obj_hash = self.algos[algo]['hash'] + for date_ymonth in last_month_dates: + if Duplicate.exists_algo_hash_by_month(algo, obj_hash, date_ymonth): + Duplicate.add_obj_duplicate(algo, obj_hash, 100, 'item', '', item.get_id(), date_ymonth) + nb_duplicates +=1 + else: + for hash in Duplicate.get_algo_hashs_by_month(algo, date_ymonth): + # # FIXME: try - catch 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash + similarity = Duplicate.get_algo_similarity(algo, obj_hash, hash) + print(f'[{algo}] comparing: {obj_hash} and {hash} similarity: {similarity}') # DEBUG: + if similarity >= self.algos[algo]['threshold']: + Duplicate.add_obj_duplicate(algo, hash, similarity, 'item', '', item.get_id(), date_ymonth) + nb_duplicates +=1 + + # Save Hashs + Duplicate.save_object_hash(algo, curr_date_ymonth, self.algos[algo]['hash'], item.get_id()) + + if nb_duplicates: + self.redis_logger.info(f'Duplicate;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {nb_duplicates};{item.get_id()}') + + y = time.time() + print(f'{item.get_id()} Processed in {y-x} sec') + #self.redis_logger.debug('{}Processed in {} sec'.format(to_print, y-x)) + + +if __name__ == "__main__": + + module = Duplicates() + module.run() diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index 0a2ae61a..5b2e672b 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -66,15 +66,15 @@ publish = Redis_CreditCards,Redis_Mail,Redis_Onion,Redis_Urls,Redis_Credential,R [CreditCards] subscribe = Redis_CreditCards -publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags +publish = 
Redis_ModuleStats,Redis_Tags [BankAccount] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Mail] subscribe = Redis_Mail -publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags +publish = Redis_ModuleStats,Redis_Tags [Onion] subscribe = Redis_Onion @@ -92,11 +92,11 @@ publish = Redis_Url [LibInjection] subscribe = Redis_Url -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [SQLInjectionDetection] subscribe = Redis_Url -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [ModuleStats] subscribe = Redis_ModuleStats @@ -128,31 +128,31 @@ publish = Redis_Duplicate,Redis_ModuleStats,Redis_Tags [Cve] subscribe = Redis_Cve -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Phone] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Keys] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_PgpDump,Redis_Tags +publish = Redis_PgpDump,Redis_Tags [PgpDump] subscribe = Redis_PgpDump -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [ApiKey] subscribe = Redis_ApiKey -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Decoder] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Bitcoin] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [submit_paste] subscribe = Redis @@ -164,7 +164,8 @@ publish = Redis_Mixer,Redis_Tags [IP] subscribe = Redis_Global -publish = Redis_Duplicate,Redis_Tags +publish = Redis_Tags [Zerobins] -subscribe = Redis_Url \ No newline at end of file +subscribe = Redis_Url + diff --git a/var/www/blueprints/objects_item.py b/var/www/blueprints/objects_item.py index 2b951353..0d2e0da6 100644 --- a/var/www/blueprints/objects_item.py +++ b/var/www/blueprints/objects_item.py @@ -15,12 +15,15 @@ from flask_login import login_required, current_user # Import Role_Manager from Role_Manager import login_admin, login_analyst, login_read_only -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages')) -import Item -import Tag +sys.path.append(os.environ['AIL_BIN']) +################################## +# Import Project packages +################################## +from lib import item_basic +from lib.objects.Items import Item +from export import Export +from packages import Tag -sys.path.append(os.path.join(os.environ['AIL_BIN'], 'export')) -import Export # ============ BLUEPRINT ============ objects_item = Blueprint('objects_item', __name__, template_folder=os.path.join(os.environ['AIL_FLASK'], 'templates/objects/item')) @@ -38,28 +41,22 @@ bootstrap_label = ['primary', 'success', 'danger', 'warning', 'info'] @login_read_only def showItem(): # # TODO: support post item_id = request.args.get('id') - if not item_id or not Item.exist_item(item_id): + if not item_id or not item_basic.exist_item(item_id): abort(404) - dict_item = {} - dict_item['id'] = item_id - dict_item['name'] = dict_item['id'].replace('/', ' / ') - dict_item['father'] = Item.get_item_parent(item_id) - dict_item['content'] = Item.get_item_content(item_id) - dict_item['metadata'] = Item.get_item_metadata(item_id, item_content=dict_item['content']) - dict_item['tags'] = Tag.get_obj_tag(item_id) - #dict_item['duplicates'] = Item.get_item_nb_duplicates(item_id) - dict_item['duplicates'] = Item.get_item_duplicates_dict(item_id) - dict_item['crawler'] = Item.get_crawler_matadata(item_id, ltags=dict_item['tags']) + item = Item(item_id) + meta = item.get_meta(options=set(['content', 'crawler', 'duplicates', 'lines', 'size'])) + meta['name'] = 
meta['id'].replace('/', ' / ') + meta['father'] = item_basic.get_item_parent(item_id) ## EXPORT SECTION # # TODO: ADD in Export SECTION - dict_item['hive_case'] = Export.get_item_hive_cases(item_id) + meta['hive_case'] = Export.get_item_hive_cases(item_id) return render_template("show_item.html", bootstrap_label=bootstrap_label, - modal_add_tags=Tag.get_modal_add_tags(dict_item['id'], object_type='item'), + modal_add_tags=Tag.get_modal_add_tags(meta['id'], object_type='item'), is_hive_connected=Export.get_item_hive_cases(item_id), - dict_item=dict_item) + meta=meta) # kvrocks data @@ -74,24 +71,27 @@ def showItem(): # # TODO: support post @login_read_only def html2text(): # # TODO: support post item_id = request.args.get('id') - if not item_id or not Item.exist_item(item_id): + if not item_id or not item_basic.exist_item(item_id): abort(404) - return Item.get_item_content_html2text(item_id) + item = Item(item_id) + return item.get_html2text_content() @objects_item.route("/object/item/raw_content") @login_required @login_read_only def item_raw_content(): # # TODO: support post item_id = request.args.get('id') - if not item_id or not Item.exist_item(item_id): + if not item_id or not item_basic.exist_item(item_id): abort(404) - return Response(Item.get_item_content(item_id), mimetype='text/plain') + item = Item(item_id) + return Response(item.get_content(), mimetype='text/plain') @objects_item.route("/object/item/download") @login_required @login_read_only def item_download(): # # TODO: support post item_id = request.args.get('id') - if not item_id or not Item.exist_item(item_id): + if not item_id or not item_basic.exist_item(item_id): abort(404) - return send_file(Item.get_raw_content(item_id), attachment_filename=item_id, as_attachment=True) + item = Item(item_id) + return send_file(item.get_raw_content(), attachment_filename=item_id, as_attachment=True) diff --git a/var/www/templates/objects/item/show_item.html b/var/www/templates/objects/item/show_item.html index 6442e0ae..198ea02d 100644 --- a/var/www/templates/objects/item/show_item.html +++ b/var/www/templates/objects/item/show_item.html @@ -38,7 +38,7 @@
-						{{ dict_item['name'] }}
+						{{ meta['name'] }}

@@ -46,7 +46,7 @@ - + @@ -54,12 +54,12 @@ - - - - - - + + + + + +
Date SourceEncoding Size (Kb) Number of lines Max line length
-						{{ dict_item['metadata']['date'] }} {{ dict_item['metadata']['source'] }} {{ dict_item['metadata']['encoding'] }} {{ dict_item['metadata']['size'] }} {{ dict_item['metadata']['lines']['nb'] }} {{ dict_item['metadata']['lines']['max_length'] }}
+						{{ meta['date'] }} {{ meta['source'] }} {{ meta['size'] }} {{ meta['lines']['nb'] }} {{ meta['lines']['max_length'] }}
@@ -68,9 +68,9 @@
{% include 'modals/edit_tag.html' %} - {% for tag in dict_item['tags'] %} + {% for tag in meta['tags'] %} @@ -84,21 +84,21 @@
- {% if dict_item['father'] %} + {% if meta['father'] %} {% endif %}
- +
- {% with obj_type='item', obj_id=dict_item['id'], obj_subtype=''%} + {% with obj_type='item', obj_id=meta['id'], obj_subtype=''%} {% include 'modals/investigations_register_obj.html' %} {% endwith %}
@@ -108,7 +108,7 @@
- {% with obj_type='item', obj_id=dict_item['id'], obj_lvl=0%} + {% with obj_type='item', obj_id=meta['id'], obj_lvl=0%} {% include 'import_export/block_add_user_object_to_export.html' %} {% endwith %}
@@ -134,14 +134,14 @@
{% endif %} - {% if dict_item['hive_case'] %} + {% if meta['hive_case'] %}
  • The Hive Case already Created
  • {{ hive_url }}
    {% endif %} - {% if dict_item['duplicates'] != 0 %} + {% if meta['duplicates'] != 0 %}
    @@ -149,7 +149,7 @@
    duplicates   -
    {{dict_item['duplicates']|length}}
    +
    {{meta['duplicates']|length}}
    @@ -173,19 +173,19 @@ - {% for duplicate_id in dict_item['duplicates'] %} + {% for duplicate_id in meta['duplicates'] %} - {{dict_item['duplicates'][duplicate_id]['date']}} + {{meta['duplicates'][duplicate_id]['date']}} - {%for algo in dict_item['duplicates'][duplicate_id]['algo']|sort()%} + {%for dict_algo in meta['duplicates'][duplicate_id]|sort(attribute='algo')%} - + @@ -200,7 +200,7 @@ {% endfor %} @@ -261,7 +261,7 @@ {% endif %} - {% if dict_item['crawler'] %} + {% if meta['crawler'] %}
    @@ -294,18 +294,18 @@
    @@ -318,11 +318,11 @@
    - +
    -
-											{{algo}}
+											{{dict_algo['algo']}}
    -
    - {{dict_item['duplicates'][duplicate_id]['algo'][algo]}}% +
    + {{dict_algo['similarity']}}%
    - +
    - {{ dict_item['father'] }} + {{ meta['father'] }}
    - {{ dict_item['crawler']['domain'] }} + {{ meta['crawler']['domain'] }}
    url - {{ dict_item['crawler']['url'] }} + {{ meta['crawler']['url'] }}
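
A minimal usage sketch of the new lib/Duplicate.py helpers, mirroring the flow in bin/modules/Duplicates.py above. It assumes a configured AIL environment (AIL_BIN set, Kvrocks_DB reachable); the item id and the similarity threshold below are illustrative only, not values fixed by this patch.

import os
import sys
import datetime

sys.path.append(os.environ['AIL_BIN'])
from lib import Duplicate
from lib.objects.Items import Item

item = Item('tests/2020/01/02/test.gz')          # hypothetical item id
content = item.get_content(binary=True)          # binary content, as used by the module
obj_hash = Duplicate.get_ssdeep_hash(content)    # or Duplicate.get_tlsh_hash(content)
curr_month = datetime.datetime.now().strftime("%Y%m")

# walk the last months' hash buckets, exactly one bucket per YYYYMM
for date_ymonth in Duplicate.get_last_x_month_dates(6):
    if Duplicate.exists_algo_hash_by_month('ssdeep', obj_hash, date_ymonth):
        # exact hash match -> register a 100% duplicate with the item that stored this hash
        Duplicate.add_obj_duplicate('ssdeep', obj_hash, 100, 'item', '', item.get_id(), date_ymonth)
    else:
        for other_hash in Duplicate.get_algo_hashs_by_month('ssdeep', date_ymonth):
            similarity = Duplicate.get_algo_similarity('ssdeep', obj_hash, other_hash)
            if similarity >= 50:                 # illustrative; the module reads threshold_duplicate_ssdeep from the cfg
                Duplicate.add_obj_duplicate('ssdeep', other_hash, similarity, 'item', '', item.get_id(), date_ymonth)

# register the new hash so future items can match against it
Duplicate.save_object_hash('ssdeep', curr_month, obj_hash, item.get_id())

# duplicates are stored per object and read back as {other_id: [{'algo': ..., 'similarity': ...}]}
print(Duplicate.get_duplicates('item', '', item.get_id()))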