diff --git a/bin/Duplicate_ssdeep_v2.py b/bin/Duplicate_ssdeep_v2.py index e8930c02..67fc14e2 100755 --- a/bin/Duplicate_ssdeep_v2.py +++ b/bin/Duplicate_ssdeep_v2.py @@ -22,6 +22,7 @@ import time from datetime import datetime, timedelta import json import ssdeep +import tlsh from packages import Paste from pubsublogger import publisher @@ -36,8 +37,12 @@ if __name__ == "__main__": p = Process(config_section) maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range")) - threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate")) - min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size")) + threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep")) + threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh")) + threshold_set = {} + threshold_set['ssdeep'] = threshold_duplicate_ssdeep + threshold_set['tlsh'] = threshold_duplicate_tlsh + min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size")) # REDIS # dico_redis = {} @@ -47,7 +52,7 @@ if __name__ == "__main__": dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis( host=p.config.get("Redis_Level_DB", "host"), port=year, db=month) - #print("dup: "+str(year)+str(month).zfill(2)+"\n") + #print("dup: "+str(year)+str(month).zfill(2)+"\n") # FUNCTIONS # publisher.info("Script duplicate started") @@ -70,10 +75,11 @@ if __name__ == "__main__": continue # the paste is too small - if (PST._get_p_size() < min_paste_size): + if (PST._get_p_size() < min_paste_size): continue PST._set_p_hash_kind("ssdeep") + PST._set_p_hash_kind("tlsh") # Assignate the correct redis connexion r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month] @@ -86,7 +92,7 @@ if __name__ == "__main__": curr_date_range = date_today - timedelta(days = diff_month*30.4166666) to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2) dico_range_list.append(to_append) - + # Use all dico in range dico_range_list = dico_range_list[0:maximum_month_range] @@ -95,43 +101,47 @@ if __name__ == "__main__": r_serv0 = dico_redis[yearly_index] r_serv0.incr("current_index") index = r_serv0.get("current_index")+str(PST.p_date) - - # Open selected dico range + + # Open selected dico range opened_dico = [] for dico_name in dico_range_list: opened_dico.append([dico_name, dico_redis[dico_name]]) - + # retrieve hash from paste - paste_hash = PST._get_p_hash() - + paste_hashes = PST._get_p_hash() + # Go throught the Database of the dico (of the month) for curr_dico_name, curr_dico_redis in opened_dico: - for dico_hash in curr_dico_redis.smembers('HASHS'): - try: - percent = ssdeep.compare(dico_hash, paste_hash) - if percent > threshold_duplicate: - # Go throught the Database of the dico filter (month) - r_serv_dico = dico_redis[curr_dico_name] - - # index of paste - index_current = r_serv_dico.get(dico_hash) - paste_path = r_serv_dico.get(index_current) - if paste_path != None: - hash_dico[dico_hash] = (paste_path, percent) + for hash_type, paste_hash in paste_hashes.iteritems(): + for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type): + try: + percent = 100-ssdeep.compare(dico_hash, paste_hash) if hash_type == 'ssdeep' else tlsh.diffxlen(dico_hash, paste_hash) + threshold_duplicate = threshold_set[hash_type] + if percent < threshold_duplicate: + # Go throught the Database of the dico filter (month) + r_serv_dico = dico_redis[curr_dico_name] - #print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent) - except: - # ssdeep hash not comparable - print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash - curr_dico_redis.srem('HASHS', dico_hash) + # index of paste + index_current = r_serv_dico.get(dico_hash) + paste_path = r_serv_dico.get(index_current) + if paste_path != None: + hash_dico[dico_hash] = (hash_type, paste_path, percent) + + print '['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent) + except Exception,e: + print str(e) + # ssdeep hash not comparable + #print 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash + #curr_dico_redis.srem('HASHS', dico_hash) # Add paste in DB after checking to prevent its analysis twice # hash_i -> index_i AND index_i -> PST.PATH r_serv1.set(index, PST.p_path) r_serv1.sadd("INDEX", index) # Adding the hash in Redis - r_serv1.set(paste_hash, index) - r_serv1.sadd("HASHS", paste_hash) + for hash_type, paste_hash in paste_hashes.iteritems(): + r_serv1.set(paste_hash, index) + r_serv1.sadd("HASHS_"+hash_type, paste_hash) ##################### Similarity found ####################### # if there is data in this dictionnary @@ -153,7 +163,7 @@ if __name__ == "__main__": publisher.debug('{}Processed in {} sec'.format(to_print, y-x)) #print '{}Processed in {} sec'.format(to_print, y-x) - + except IOError: to_print = 'Duplicate;{};{};{};'.format( PST.p_source, PST.p_date, PST.p_name) diff --git a/bin/packages/Hash.py b/bin/packages/Hash.py index 2f34c5c7..a55a8695 100644 --- a/bin/packages/Hash.py +++ b/bin/packages/Hash.py @@ -2,6 +2,7 @@ import hashlib import crcmod import mmh3 import ssdeep +import tlsh class Hash(object): @@ -36,4 +37,7 @@ class Hash(object): elif self.name == "ssdeep": hash = ssdeep.hash(string) + elif self.name == "tlsh": + hash = tlsh.hash(string) + return hash diff --git a/bin/packages/Paste.py b/bin/packages/Paste.py index 172f0931..90f7cae5 100755 --- a/bin/packages/Paste.py +++ b/bin/packages/Paste.py @@ -86,8 +86,8 @@ class Paste(object): self.p_source = var[-5] self.p_encoding = None - self.p_hash_kind = None - self.p_hash = None + self.p_hash_kind = {} + self.p_hash = {} self.p_langage = None self.p_nb_lines = None self.p_max_length_line = None @@ -159,7 +159,7 @@ class Paste(object): .. seealso:: Hash.py Object to get the available hashs. """ - self.p_hash_kind = Hash(hashkind) + self.p_hash_kind[hashkind] = (Hash(hashkind)) def _get_p_hash(self): """ @@ -174,7 +174,8 @@ class Paste(object): .. seealso:: _set_p_hash_kind("md5") """ - self.p_hash = self.p_hash_kind.Calculate(self.get_p_content()) + for hash_name, the_hash in self.p_hash_kind.iteritems(): + self.p_hash[hash_name] = the_hash.Calculate(self.get_p_content()) return self.p_hash def _get_p_language(self): @@ -202,42 +203,6 @@ class Paste(object): def _get_p_size(self): return self.p_size - def _get_hash_lines(self, min=1, start=1, jump=10): - """ - Returning all the lines of the paste hashed. - - :param min: -- (int) Minimum line length to be hashed. - :param start: -- (int) Number the line where to start. - :param jump: -- (int) Granularity of the hashing 0 or 1 means no jumps - (Maximum Granularity) - - :return: a set([]) of hash. - - .. warning:: Using a set here mean that this function will only return uniq hash. - - If the paste is composed with 1000 time the same line, this function will return - just once the line. - - This choice was made to avoid a certain redundancy and useless hash checking. - - :Example: PST._get_hash_lines(1, 1, 0) - - .. note:: You need first to "declare which kind of hash you want to use - before using this function - .. seealso:: _set_p_hash_kind("md5") - - """ - S = set([]) - f = self.get_p_content_as_file() - for num, line in enumerate(f, start): - if len(line) >= min: - if jump > 1: - if (num % jump) == 1: - S.add(self.p_hash_kind.Calculate(line)) - else: - S.add(self.p_hash_kind.Calculate(line)) - return S - def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10): """ Returning the percent of similarity with another paste. @@ -329,7 +294,10 @@ class Paste(object): self.store.hset(self.p_path, attr_name, json.dumps(value)) def _get_from_redis(self, r_serv): - return r_serv.hgetall(self.p_hash) + ans = {} + for hash_name, the_hash in self.p_hash: + ans[hash_name] = r_serv.hgetall(the_hash) + return ans def _get_top_words(self, sort=False): """ diff --git a/installing_deps.sh b/installing_deps.sh index ae4f3fc8..6eecc805 100755 --- a/installing_deps.sh +++ b/installing_deps.sh @@ -39,6 +39,12 @@ echo '/usr/local/lib' | sudo tee -a /etc/ld.so.conf.d/faup.conf sudo ldconfig popd +# tlsh +test ! -d tlsh && git clone git://github.com/trendmicro/tlsh.git +pushd tlsh/ +./make +popd + # REDIS LEVEL DB # test ! -d redis-leveldb/ && git clone https://github.com/KDr2/redis-leveldb.git pushd redis-leveldb/ @@ -72,6 +78,10 @@ pushd faup/src/lib/bindings/python/ python setup.py install popd +# Py tlsh +pushd tlsh/py_ext +python setup.py build +python setup.py install # Download the necessary NLTK corpora HOME=$(pwd) python -m textblob.download_corpora diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 42a28bd0..11836de4 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -14,6 +14,8 @@ import Paste from Date import Date # CONFIG # +tlsh_to_percent = 1000.0 + configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') if not os.path.exists(configfile): raise Exception('Unable to find the configuration file. \ @@ -74,13 +76,28 @@ def parseStringToList(the_string): strList += c else: the_list = strList.split(',') - if len(the_list) == 2: + if len(the_list) == 3: + elemList = elemList + the_list + elif len(the_list) == 2: elemList.append(the_list) elif len(the_list) > 1: elemList.append(the_list[1:]) strList = "" return elemList +def parseStringToList2(the_string): + res = [] + tab_str = the_string.split('], [') + tab_str[0] = tab_str[0][1:]+']' + tab_str[len(tab_str)-1] = '['+tab_str[len(tab_str)-1][:-1] + res.append(parseStringToList(tab_str[0])) + for i in range(1, len(tab_str)-2): + tab_str[i] = '['+tab_str[i]+']' + res.append(parseStringToList(tab_str[i])) + res.append(parseStringToList(tab_str[len(tab_str)-1])) + return res + + def showpaste(content_range): requested_path = request.args.get('paste', '') paste = Paste.Paste(requested_path) @@ -93,19 +110,47 @@ def showpaste(content_range): p_mime = paste.p_mime p_lineinfo = paste.get_lines_info() p_content = paste.get_p_content().decode('utf-8', 'ignore') - p_duplicate_full_list = parseStringToList(paste._get_p_duplicate()) + p_duplicate_full_list = parseStringToList2(paste._get_p_duplicate()) p_duplicate_list = [] p_simil_list = [] + p_hashtype_list = [] + for dup_list in p_duplicate_full_list: - path, simil_percent = dup_list + if dup_list[0] == "tlsh": + dup_list[2] = int(((tlsh_to_percent - float(dup_list[2])) / tlsh_to_percent)*100) + else: + dup_list[2] = int(dup_list[2]) + + p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True) + + new_dup_list = [] + dup_list_removed = [] + for dup_list_index in range(0, len(p_duplicate_full_list)): + if dup_list_index in dup_list_removed: + continue + indices = [i for i, x in enumerate(p_duplicate_full_list) if x[1] == p_duplicate_full_list[dup_list_index][1]] + hash_types = [] + comp_vals = [] + for i in indices: + hash_types.append(p_duplicate_full_list[i][0]) + comp_vals.append(p_duplicate_full_list[i][2]) + dup_list_removed.append(i) + + hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types) + comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals) + new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals]) + + for dup_list in new_dup_list: + hash_type, path, simil_percent = dup_list p_duplicate_list.append(path) p_simil_list.append(simil_percent) + p_hashtype_list.append(hash_type) if content_range != 0: p_content = p_content[0:content_range] - return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list) + return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list) def get_date_range(num_day): curr_date = datetime.date.today() diff --git a/var/www/templates/show_saved_paste.html b/var/www/templates/show_saved_paste.html index 707786d7..0da148f3 100644 --- a/var/www/templates/show_saved_paste.html +++ b/var/www/templates/show_saved_paste.html @@ -43,16 +43,25 @@
Hash type | Paste info | +|
---|---|---|
{{ hashtype_list[i] }} | +Similarity: {{ simil_list[i] }}% | +{{ dup_path }} | +