Draft: added new duplicate hash comparison - tlsh

pull/65/head
Mokaddem 2016-08-04 11:55:38 +02:00
parent 50d2848a40
commit d9316771cd
6 changed files with 125 additions and 79 deletions

View File

@ -22,6 +22,7 @@ import time
from datetime import datetime, timedelta
import json
import ssdeep
import tlsh
from packages import Paste
from pubsublogger import publisher
@ -36,8 +37,12 @@ if __name__ == "__main__":
p = Process(config_section)
maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep"))
threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh"))
threshold_set = {}
threshold_set['ssdeep'] = threshold_duplicate_ssdeep
threshold_set['tlsh'] = threshold_duplicate_tlsh
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
# REDIS #
dico_redis = {}
@ -47,7 +52,7 @@ if __name__ == "__main__":
dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
host=p.config.get("Redis_Level_DB", "host"), port=year,
db=month)
#print("dup: "+str(year)+str(month).zfill(2)+"\n")
#print("dup: "+str(year)+str(month).zfill(2)+"\n")
# FUNCTIONS #
publisher.info("Script duplicate started")
@ -70,10 +75,11 @@ if __name__ == "__main__":
continue
# the paste is too small
if (PST._get_p_size() < min_paste_size):
if (PST._get_p_size() < min_paste_size):
continue
PST._set_p_hash_kind("ssdeep")
PST._set_p_hash_kind("tlsh")
# Assignate the correct redis connexion
r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
@ -86,7 +92,7 @@ if __name__ == "__main__":
curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
dico_range_list.append(to_append)
# Use all dico in range
dico_range_list = dico_range_list[0:maximum_month_range]
@ -95,43 +101,47 @@ if __name__ == "__main__":
r_serv0 = dico_redis[yearly_index]
r_serv0.incr("current_index")
index = r_serv0.get("current_index")+str(PST.p_date)
# Open selected dico range
# Open selected dico range
opened_dico = []
for dico_name in dico_range_list:
opened_dico.append([dico_name, dico_redis[dico_name]])
# retrieve hash from paste
paste_hash = PST._get_p_hash()
paste_hashes = PST._get_p_hash()
# Go throught the Database of the dico (of the month)
for curr_dico_name, curr_dico_redis in opened_dico:
for dico_hash in curr_dico_redis.smembers('HASHS'):
try:
percent = ssdeep.compare(dico_hash, paste_hash)
if percent > threshold_duplicate:
# Go throught the Database of the dico filter (month)
r_serv_dico = dico_redis[curr_dico_name]
# index of paste
index_current = r_serv_dico.get(dico_hash)
paste_path = r_serv_dico.get(index_current)
if paste_path != None:
hash_dico[dico_hash] = (paste_path, percent)
for hash_type, paste_hash in paste_hashes.iteritems():
for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type):
try:
percent = 100-ssdeep.compare(dico_hash, paste_hash) if hash_type == 'ssdeep' else tlsh.diffxlen(dico_hash, paste_hash)
threshold_duplicate = threshold_set[hash_type]
if percent < threshold_duplicate:
# Go throught the Database of the dico filter (month)
r_serv_dico = dico_redis[curr_dico_name]
#print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
except:
# ssdeep hash not comparable
print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash
curr_dico_redis.srem('HASHS', dico_hash)
# index of paste
index_current = r_serv_dico.get(dico_hash)
paste_path = r_serv_dico.get(index_current)
if paste_path != None:
hash_dico[dico_hash] = (hash_type, paste_path, percent)
print '['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
except Exception,e:
print str(e)
# ssdeep hash not comparable
#print 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
#curr_dico_redis.srem('HASHS', dico_hash)
# Add paste in DB after checking to prevent its analysis twice
# hash_i -> index_i AND index_i -> PST.PATH
r_serv1.set(index, PST.p_path)
r_serv1.sadd("INDEX", index)
# Adding the hash in Redis
r_serv1.set(paste_hash, index)
r_serv1.sadd("HASHS", paste_hash)
for hash_type, paste_hash in paste_hashes.iteritems():
r_serv1.set(paste_hash, index)
r_serv1.sadd("HASHS_"+hash_type, paste_hash)
##################### Similarity found #######################
# if there is data in this dictionnary
@ -153,7 +163,7 @@ if __name__ == "__main__":
publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
#print '{}Processed in {} sec'.format(to_print, y-x)
except IOError:
to_print = 'Duplicate;{};{};{};'.format(
PST.p_source, PST.p_date, PST.p_name)

View File

@ -2,6 +2,7 @@ import hashlib
import crcmod
import mmh3
import ssdeep
import tlsh
class Hash(object):
@ -36,4 +37,7 @@ class Hash(object):
elif self.name == "ssdeep":
hash = ssdeep.hash(string)
elif self.name == "tlsh":
hash = tlsh.hash(string)
return hash

View File

@ -86,8 +86,8 @@ class Paste(object):
self.p_source = var[-5]
self.p_encoding = None
self.p_hash_kind = None
self.p_hash = None
self.p_hash_kind = {}
self.p_hash = {}
self.p_langage = None
self.p_nb_lines = None
self.p_max_length_line = None
@ -159,7 +159,7 @@ class Paste(object):
.. seealso:: Hash.py Object to get the available hashs.
"""
self.p_hash_kind = Hash(hashkind)
self.p_hash_kind[hashkind] = (Hash(hashkind))
def _get_p_hash(self):
"""
@ -174,7 +174,8 @@ class Paste(object):
.. seealso:: _set_p_hash_kind("md5")
"""
self.p_hash = self.p_hash_kind.Calculate(self.get_p_content())
for hash_name, the_hash in self.p_hash_kind.iteritems():
self.p_hash[hash_name] = the_hash.Calculate(self.get_p_content())
return self.p_hash
def _get_p_language(self):
@ -202,42 +203,6 @@ class Paste(object):
def _get_p_size(self):
return self.p_size
def _get_hash_lines(self, min=1, start=1, jump=10):
"""
Returning all the lines of the paste hashed.
:param min: -- (int) Minimum line length to be hashed.
:param start: -- (int) Number the line where to start.
:param jump: -- (int) Granularity of the hashing 0 or 1 means no jumps
(Maximum Granularity)
:return: a set([]) of hash.
.. warning:: Using a set here mean that this function will only return uniq hash.
If the paste is composed with 1000 time the same line, this function will return
just once the line.
This choice was made to avoid a certain redundancy and useless hash checking.
:Example: PST._get_hash_lines(1, 1, 0)
.. note:: You need first to "declare which kind of hash you want to use
before using this function
.. seealso:: _set_p_hash_kind("md5")
"""
S = set([])
f = self.get_p_content_as_file()
for num, line in enumerate(f, start):
if len(line) >= min:
if jump > 1:
if (num % jump) == 1:
S.add(self.p_hash_kind.Calculate(line))
else:
S.add(self.p_hash_kind.Calculate(line))
return S
def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10):
"""
Returning the percent of similarity with another paste.
@ -329,7 +294,10 @@ class Paste(object):
self.store.hset(self.p_path, attr_name, json.dumps(value))
def _get_from_redis(self, r_serv):
return r_serv.hgetall(self.p_hash)
ans = {}
for hash_name, the_hash in self.p_hash:
ans[hash_name] = r_serv.hgetall(the_hash)
return ans
def _get_top_words(self, sort=False):
"""

View File

@ -39,6 +39,12 @@ echo '/usr/local/lib' | sudo tee -a /etc/ld.so.conf.d/faup.conf
sudo ldconfig
popd
# tlsh
test ! -d tlsh && git clone git://github.com/trendmicro/tlsh.git
pushd tlsh/
./make
popd
# REDIS LEVEL DB #
test ! -d redis-leveldb/ && git clone https://github.com/KDr2/redis-leveldb.git
pushd redis-leveldb/
@ -72,6 +78,10 @@ pushd faup/src/lib/bindings/python/
python setup.py install
popd
# Py tlsh
pushd tlsh/py_ext
python setup.py build
python setup.py install
# Download the necessary NLTK corpora
HOME=$(pwd) python -m textblob.download_corpora

View File

@ -14,6 +14,8 @@ import Paste
from Date import Date
# CONFIG #
tlsh_to_percent = 1000.0
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
if not os.path.exists(configfile):
raise Exception('Unable to find the configuration file. \
@ -74,13 +76,28 @@ def parseStringToList(the_string):
strList += c
else:
the_list = strList.split(',')
if len(the_list) == 2:
if len(the_list) == 3:
elemList = elemList + the_list
elif len(the_list) == 2:
elemList.append(the_list)
elif len(the_list) > 1:
elemList.append(the_list[1:])
strList = ""
return elemList
def parseStringToList2(the_string):
res = []
tab_str = the_string.split('], [')
tab_str[0] = tab_str[0][1:]+']'
tab_str[len(tab_str)-1] = '['+tab_str[len(tab_str)-1][:-1]
res.append(parseStringToList(tab_str[0]))
for i in range(1, len(tab_str)-2):
tab_str[i] = '['+tab_str[i]+']'
res.append(parseStringToList(tab_str[i]))
res.append(parseStringToList(tab_str[len(tab_str)-1]))
return res
def showpaste(content_range):
requested_path = request.args.get('paste', '')
paste = Paste.Paste(requested_path)
@ -93,19 +110,47 @@ def showpaste(content_range):
p_mime = paste.p_mime
p_lineinfo = paste.get_lines_info()
p_content = paste.get_p_content().decode('utf-8', 'ignore')
p_duplicate_full_list = parseStringToList(paste._get_p_duplicate())
p_duplicate_full_list = parseStringToList2(paste._get_p_duplicate())
p_duplicate_list = []
p_simil_list = []
p_hashtype_list = []
for dup_list in p_duplicate_full_list:
path, simil_percent = dup_list
if dup_list[0] == "tlsh":
dup_list[2] = int(((tlsh_to_percent - float(dup_list[2])) / tlsh_to_percent)*100)
else:
dup_list[2] = int(dup_list[2])
p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True)
new_dup_list = []
dup_list_removed = []
for dup_list_index in range(0, len(p_duplicate_full_list)):
if dup_list_index in dup_list_removed:
continue
indices = [i for i, x in enumerate(p_duplicate_full_list) if x[1] == p_duplicate_full_list[dup_list_index][1]]
hash_types = []
comp_vals = []
for i in indices:
hash_types.append(p_duplicate_full_list[i][0])
comp_vals.append(p_duplicate_full_list[i][2])
dup_list_removed.append(i)
hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)
new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals])
for dup_list in new_dup_list:
hash_type, path, simil_percent = dup_list
p_duplicate_list.append(path)
p_simil_list.append(simil_percent)
p_hashtype_list.append(hash_type)
if content_range != 0:
p_content = p_content[0:content_range]
return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list)
return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list)
def get_date_range(num_day):
curr_date = datetime.date.today()

View File

@ -43,16 +43,25 @@
</div>
<div class="panel-body" id="panel-body">
{% if duplicate_list|length == 0 %}
<h4> No Duplicate </h4>
<h3> No Duplicate </h3>
{% else %}
<h4> Duplicate list: </h4>
<h3> Duplicate list: </h3>
<table style="width:100%">
{% set i = 0 %}
<tr>
<th style="text-align:left;">Hash type</th><th style="text-align:left;">Paste info</th>
</tr>
{% for dup_path in duplicate_list %}
Similarity: {{ simil_list[i] }}% - <a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></br>
<tr>
<td>{{ hashtype_list[i] }}</td>
<td>Similarity: {{ simil_list[i] }}%</td>
<td><a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></td>
</tr>
{% set i = i + 1 %}
{% endfor %}
</table>
{% endif %}
<h4> Content: </h4>
<h3> Content: </h3>
<p data-initsize="{{ initsize }}"> <xmp id="paste-holder">{{ content }}</xmp></p>
</div>
</div>