Draft: added new duplicate hash comparison - tlsh

2016-08-04 11:55:38 +02:00 · 2016-08-04 11:55:38 +02:00 · d9316771cd
parent 50d2848a40
commit d9316771cd
6 changed files with 125 additions and 79 deletions
--- a/bin/Duplicate_ssdeep_v2.py
+++ b/bin/Duplicate_ssdeep_v2.py
@ -22,6 +22,7 @@ import time
 from datetime import datetime, timedelta
 import json
 import ssdeep
+import tlsh
 from packages import Paste
 from pubsublogger import publisher

@ -36,8 +37,12 @@ if __name__ == "__main__":
    p = Process(config_section)

    maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
-    threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate")) 
-    min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size")) 
+    threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep"))
+    threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh"))
+    threshold_set = {}
+    threshold_set['ssdeep'] = threshold_duplicate_ssdeep 
+    threshold_set['tlsh'] = threshold_duplicate_tlsh 
+    min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))

    # REDIS #
    dico_redis = {}
@ -47,7 +52,7 @@ if __name__ == "__main__":
            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
                host=p.config.get("Redis_Level_DB", "host"), port=year,
                db=month)
-	    #print("dup: "+str(year)+str(month).zfill(2)+"\n")
+            #print("dup: "+str(year)+str(month).zfill(2)+"\n")

    # FUNCTIONS #
    publisher.info("Script duplicate started")
@ -70,10 +75,11 @@ if __name__ == "__main__":
                continue

            # the paste is too small
-            if (PST._get_p_size() < min_paste_size): 
+            if (PST._get_p_size() < min_paste_size):
                continue

            PST._set_p_hash_kind("ssdeep")
+            PST._set_p_hash_kind("tlsh")

            # Assign the correct redis connection
            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
@ -86,7 +92,7 @@ if __name__ == "__main__":
                curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
                to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
                dico_range_list.append(to_append)
-            
+
            # Use all dico in range
            dico_range_list = dico_range_list[0:maximum_month_range]

@ -95,43 +101,47 @@ if __name__ == "__main__":
            r_serv0 = dico_redis[yearly_index]
            r_serv0.incr("current_index")
            index = r_serv0.get("current_index")+str(PST.p_date)
-            
-            # Open selected dico range 
+
+            # Open selected dico range
            opened_dico = []
            for dico_name in dico_range_list:
                opened_dico.append([dico_name, dico_redis[dico_name]])
-              
+
            # retrieve hash from paste
-            paste_hash = PST._get_p_hash()
-            
+            paste_hashes = PST._get_p_hash()
+
            # Go through the Database of the dico (of the month)
            for curr_dico_name, curr_dico_redis in opened_dico:
-                for dico_hash in curr_dico_redis.smembers('HASHS'):
-                    try:
-                        percent = ssdeep.compare(dico_hash, paste_hash)
-                        if percent > threshold_duplicate:
-                            # Go throught the Database of the dico filter (month)
-                            r_serv_dico = dico_redis[curr_dico_name]
-                            
-                            # index of paste
-                            index_current = r_serv_dico.get(dico_hash)
-                            paste_path = r_serv_dico.get(index_current)
-                            if paste_path != None:
-                                hash_dico[dico_hash] = (paste_path, percent)
+                for hash_type, paste_hash in paste_hashes.iteritems():
+                    for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type):
+                        try:
+                            percent = 100-ssdeep.compare(dico_hash, paste_hash) if hash_type == 'ssdeep' else tlsh.diffxlen(dico_hash, paste_hash)
+                            threshold_duplicate = threshold_set[hash_type]
+                            if percent < threshold_duplicate:
+                                # Go through the Database of the dico filter (month)
+                                r_serv_dico = dico_redis[curr_dico_name]

-                            #print 'comparing: ' + str(PST.p_path[44:]) + '  and  ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
-                    except:
-                        # ssdeep hash not comparable
-                        print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash
-                        curr_dico_redis.srem('HASHS', dico_hash)
+                                # index of paste
+                                index_current = r_serv_dico.get(dico_hash)
+                                paste_path = r_serv_dico.get(index_current)
+                                if paste_path != None:
+                                    hash_dico[dico_hash] = (hash_type, paste_path, percent)
+
+                                print '['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + '  and  ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
+                        except Exception,e:
+                            print str(e)
+                            # ssdeep hash not comparable
+                            #print 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
+                            #curr_dico_redis.srem('HASHS', dico_hash)

            # Add paste in DB after checking to prevent its analysis twice
            # hash_i -> index_i  AND  index_i -> PST.PATH
            r_serv1.set(index, PST.p_path)
            r_serv1.sadd("INDEX", index)
            # Adding the hash in Redis
-            r_serv1.set(paste_hash, index)
-            r_serv1.sadd("HASHS", paste_hash)
+            for hash_type, paste_hash in paste_hashes.iteritems():
+                r_serv1.set(paste_hash, index)
+                r_serv1.sadd("HASHS_"+hash_type, paste_hash)
    ##################### Similarity found  #######################

            # if there is data in this dictionary
@ -153,7 +163,7 @@ if __name__ == "__main__":

                publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
                #print '{}Processed in {} sec'.format(to_print, y-x)
-           
+
        except IOError:
            to_print = 'Duplicate;{};{};{};'.format(
                PST.p_source, PST.p_date, PST.p_name)
--- a/bin/packages/Hash.py
+++ b/bin/packages/Hash.py
@ -2,6 +2,7 @@ import hashlib
 import crcmod
 import mmh3
 import ssdeep
+import tlsh


 class Hash(object):
@ -36,4 +37,7 @@ class Hash(object):
        elif self.name == "ssdeep":
            hash = ssdeep.hash(string)

+        elif self.name == "tlsh":
+            hash = tlsh.hash(string)
+
        return hash
--- a/bin/packages/Paste.py
+++ b/bin/packages/Paste.py
@ -86,8 +86,8 @@ class Paste(object):
        self.p_source = var[-5]

        self.p_encoding = None
-        self.p_hash_kind = None
-        self.p_hash = None
+        self.p_hash_kind = {}
+        self.p_hash = {}
        self.p_langage = None
        self.p_nb_lines = None
        self.p_max_length_line = None
@ -159,7 +159,7 @@ class Paste(object):
        .. seealso:: Hash.py Object to get the available hashs.

        """
-        self.p_hash_kind = Hash(hashkind)
+        self.p_hash_kind[hashkind] = (Hash(hashkind))

    def _get_p_hash(self):
        """
@ -174,7 +174,8 @@ class Paste(object):
        .. seealso:: _set_p_hash_kind("md5")

        """
-        self.p_hash = self.p_hash_kind.Calculate(self.get_p_content())
+        for hash_name, the_hash in self.p_hash_kind.iteritems():
+            self.p_hash[hash_name] = the_hash.Calculate(self.get_p_content())
        return self.p_hash

    def _get_p_language(self):
@ -202,42 +203,6 @@ class Paste(object):
    def _get_p_size(self):
        return self.p_size

-    def _get_hash_lines(self, min=1, start=1, jump=10):
-        """
-        Returning all the lines of the paste hashed.
-
-        :param min: -- (int) Minimum line length to be hashed.
-        :param start: -- (int) Number the line where to start.
-        :param jump: -- (int) Granularity of the hashing 0 or 1 means no jumps
-        (Maximum Granularity)
-
-        :return: a set([]) of hash.
-
-        .. warning:: Using a set here mean that this function will only return uniq hash.
-
-        If the paste is composed with 1000 time the same line, this function will return
-        just once the line.
-
-        This choice was made to avoid a certain redundancy and useless hash checking.
-
-        :Example: PST._get_hash_lines(1, 1, 0)
-
-        .. note:: You need first to "declare which kind of hash you want to use
-        before using this function
-        .. seealso:: _set_p_hash_kind("md5")
-
-        """
-        S = set([])
-        f = self.get_p_content_as_file()
-        for num, line in enumerate(f, start):
-            if len(line) >= min:
-                if jump > 1:
-                    if (num % jump) == 1:
-                        S.add(self.p_hash_kind.Calculate(line))
-                else:
-                    S.add(self.p_hash_kind.Calculate(line))
-        return S
-
    def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10):
        """
        Returning the percent of similarity with another paste.
@ -329,7 +294,10 @@ class Paste(object):
            self.store.hset(self.p_path, attr_name, json.dumps(value))

    def _get_from_redis(self, r_serv):
-        return r_serv.hgetall(self.p_hash)
+        """
+        Look up, for every hash held in self.p_hash, what r_serv stores under it.
+
+        :param r_serv: -- connection object exposing hgetall()
+        (presumably a redis client -- TODO confirm against callers).
+
+        :return: a dict mapping each hash name found in self.p_hash to the
+        result of r_serv.hgetall() called with the matching hash value.
+
+        .. note:: self.p_hash is filled by _get_p_hash(); while it is empty
+        the returned dict is empty too.
+
+        """
+        ans = {}
+        # Iterating the dict itself only yields its keys; iteritems() is
+        # required to get the (hash_name, hash_value) pairs unpacked here.
+        for hash_name, the_hash in self.p_hash.iteritems():
+            ans[hash_name] = r_serv.hgetall(the_hash)
+        return ans

    def _get_top_words(self, sort=False):
        """
--- a/installing_deps.sh
+++ b/installing_deps.sh
@ -39,6 +39,12 @@ echo '/usr/local/lib' | sudo tee -a /etc/ld.so.conf.d/faup.conf
 sudo ldconfig
 popd

+# tlsh
+# Cloned over https like the other dependencies of this script
+test ! -d tlsh && git clone https://github.com/trendmicro/tlsh.git
+pushd tlsh/
+# The tlsh repository provides its build script as make.sh
+./make.sh
+popd
+
 # REDIS LEVEL DB #
 test ! -d redis-leveldb/ && git clone https://github.com/KDr2/redis-leveldb.git
 pushd redis-leveldb/
@ -72,6 +78,10 @@ pushd faup/src/lib/bindings/python/
 python setup.py install
 popd

+# Py tlsh
+pushd tlsh/py_ext
+python setup.py build
+python setup.py install
+# Leave tlsh/py_ext again: the next step relies on $(pwd) being the install root
+popd

 # Download the necessary NLTK corpora
 HOME=$(pwd) python -m textblob.download_corpora
--- a/var/www/Flask_server.py
+++ b/var/www/Flask_server.py
@ -14,6 +14,8 @@ import Paste
 from Date import Date

 # CONFIG #
+tlsh_to_percent = 1000.0
+
 configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
 if not os.path.exists(configfile):
    raise Exception('Unable to find the configuration file. \
@ -74,13 +76,28 @@ def parseStringToList(the_string):
                strList += c
        else:
            the_list = strList.split(',')
-            if len(the_list) == 2:
+            if len(the_list) == 3:
+               elemList = elemList + the_list
+            elif len(the_list) == 2:
               elemList.append(the_list)
            elif len(the_list) > 1:
               elemList.append(the_list[1:])
            strList = ""
    return elemList

+def parseStringToList2(the_string):
+    """
+    Parse the string form of a list of lists into a list of parsed entries.
+
+    The outer brackets are removed, the remainder is split on '], [' and
+    every piece (with its own brackets restored) is handed to
+    parseStringToList.
+
+    :param the_string: -- (str) text shaped like "[[a, b, c], [d, e, f]]"
+    (assumed to be what paste._get_p_duplicate() yields -- TODO confirm).
+
+    :return: a list holding one parseStringToList result per inner list,
+    or an empty list when nothing sits between the outer brackets.
+
+    """
+    # Nothing to parse: empty/None input or an empty outer list "[]".
+    if not the_string or not the_string[1:-1]:
+        return []
+
+    # Drop the outer '[' and ']' so that only the inner lists remain.
+    tab_str = the_string[1:-1].split('], [')
+    last = len(tab_str) - 1
+    res = []
+    for i, elem in enumerate(tab_str):
+        # The split removed the ']' closing every piece but the last one ...
+        if i < last:
+            elem = elem + ']'
+        # ... and the '[' opening every piece but the first one.
+        if i > 0:
+            elem = '[' + elem
+        res.append(parseStringToList(elem))
+    return res
+
+
 def showpaste(content_range):    
    requested_path = request.args.get('paste', '')
    paste = Paste.Paste(requested_path)
@ -93,19 +110,47 @@ def showpaste(content_range):
    p_mime = paste.p_mime
    p_lineinfo = paste.get_lines_info()
    p_content = paste.get_p_content().decode('utf-8', 'ignore')
-    p_duplicate_full_list = parseStringToList(paste._get_p_duplicate())
+    p_duplicate_full_list = parseStringToList2(paste._get_p_duplicate())
    p_duplicate_list = []
    p_simil_list = []
+    p_hashtype_list = []
+

    for dup_list in p_duplicate_full_list:
-        path, simil_percent = dup_list
+        if dup_list[0] == "tlsh":
+            dup_list[2] = int(((tlsh_to_percent - float(dup_list[2])) / tlsh_to_percent)*100)
+        else:
+            dup_list[2] = int(dup_list[2])
+            
+    p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True)
+
+    new_dup_list = []
+    dup_list_removed = []
+    for dup_list_index in range(0, len(p_duplicate_full_list)):
+        if dup_list_index in dup_list_removed:
+            continue
+        indices = [i for i, x in enumerate(p_duplicate_full_list) if x[1] == p_duplicate_full_list[dup_list_index][1]]
+        hash_types = []
+        comp_vals = []
+        for i in indices:
+            hash_types.append(p_duplicate_full_list[i][0])
+            comp_vals.append(p_duplicate_full_list[i][2])
+            dup_list_removed.append(i)
+
+        hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
+        comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)
+        new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals])
+
+    for dup_list in new_dup_list:
+        hash_type, path, simil_percent = dup_list
        p_duplicate_list.append(path)
        p_simil_list.append(simil_percent)
+        p_hashtype_list.append(hash_type)

    if content_range != 0:
       p_content = p_content[0:content_range] 

-    return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list)
+    return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list)

 def get_date_range(num_day):
    curr_date = datetime.date.today()
--- a/var/www/templates/show_saved_paste.html
+++ b/var/www/templates/show_saved_paste.html
@ -43,16 +43,25 @@
  </div>
  <div class="panel-body" id="panel-body">
  {% if  duplicate_list|length == 0 %}
-      <h4> No Duplicate </h4>
+      <h3> No Duplicate </h3>
  {% else %}
-      <h4> Duplicate list: </h4>
+      <h3> Duplicate list: </h3>
+      <table style="width:100%">
      {% set i = 0 %}
+      <tr>
+          <th style="text-align:left;">Hash type</th><th style="text-align:left;">Paste info</th>
+      </tr>
      {% for dup_path in duplicate_list %}
-          Similarity: {{ simil_list[i] }}% - <a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></br>
+          <tr>
+              <td>{{ hashtype_list[i] }}</td>
+              <td>Similarity: {{ simil_list[i] }}%</td>
+              <td><a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></td>
+          </tr>
          {% set i = i + 1 %}
      {% endfor %}
+      </table>
  {% endif %}
-    <h4> Content: </h4>
+    <h3> Content: </h3>
  <p data-initsize="{{ initsize }}"> <xmp id="paste-holder">{{ content }}</xmp></p>
  </div>
 </div>