mirror of https://github.com/CIRCL/AIL-framework
Draft: added new duplicate hash comparison - tlsh
parent
50d2848a40
commit
d9316771cd
|
@ -22,6 +22,7 @@ import time
|
|||
from datetime import datetime, timedelta
|
||||
import json
|
||||
import ssdeep
|
||||
import tlsh
|
||||
from packages import Paste
|
||||
from pubsublogger import publisher
|
||||
|
||||
|
@ -36,8 +37,12 @@ if __name__ == "__main__":
|
|||
p = Process(config_section)
|
||||
|
||||
maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
|
||||
threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
|
||||
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
|
||||
threshold_duplicate_ssdeep = int(p.config.get("Modules_Duplicates", "threshold_duplicate_ssdeep"))
|
||||
threshold_duplicate_tlsh = int(p.config.get("Modules_Duplicates", "threshold_duplicate_tlsh"))
|
||||
threshold_set = {}
|
||||
threshold_set['ssdeep'] = threshold_duplicate_ssdeep
|
||||
threshold_set['tlsh'] = threshold_duplicate_tlsh
|
||||
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
|
||||
|
||||
# REDIS #
|
||||
dico_redis = {}
|
||||
|
@ -47,7 +52,7 @@ if __name__ == "__main__":
|
|||
dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
|
||||
host=p.config.get("Redis_Level_DB", "host"), port=year,
|
||||
db=month)
|
||||
#print("dup: "+str(year)+str(month).zfill(2)+"\n")
|
||||
#print("dup: "+str(year)+str(month).zfill(2)+"\n")
|
||||
|
||||
# FUNCTIONS #
|
||||
publisher.info("Script duplicate started")
|
||||
|
@ -70,10 +75,11 @@ if __name__ == "__main__":
|
|||
continue
|
||||
|
||||
# the paste is too small
|
||||
if (PST._get_p_size() < min_paste_size):
|
||||
if (PST._get_p_size() < min_paste_size):
|
||||
continue
|
||||
|
||||
PST._set_p_hash_kind("ssdeep")
|
||||
PST._set_p_hash_kind("tlsh")
|
||||
|
||||
# Assign the correct redis connection
|
||||
r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
|
||||
|
@ -86,7 +92,7 @@ if __name__ == "__main__":
|
|||
curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
|
||||
to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
|
||||
dico_range_list.append(to_append)
|
||||
|
||||
|
||||
# Use all dico in range
|
||||
dico_range_list = dico_range_list[0:maximum_month_range]
|
||||
|
||||
|
@ -95,43 +101,47 @@ if __name__ == "__main__":
|
|||
r_serv0 = dico_redis[yearly_index]
|
||||
r_serv0.incr("current_index")
|
||||
index = r_serv0.get("current_index")+str(PST.p_date)
|
||||
|
||||
# Open selected dico range
|
||||
|
||||
# Open selected dico range
|
||||
opened_dico = []
|
||||
for dico_name in dico_range_list:
|
||||
opened_dico.append([dico_name, dico_redis[dico_name]])
|
||||
|
||||
|
||||
# retrieve hash from paste
|
||||
paste_hash = PST._get_p_hash()
|
||||
|
||||
paste_hashes = PST._get_p_hash()
|
||||
|
||||
# Go through the Database of the dico (of the month)
|
||||
for curr_dico_name, curr_dico_redis in opened_dico:
|
||||
for dico_hash in curr_dico_redis.smembers('HASHS'):
|
||||
try:
|
||||
percent = ssdeep.compare(dico_hash, paste_hash)
|
||||
if percent > threshold_duplicate:
|
||||
# Go throught the Database of the dico filter (month)
|
||||
r_serv_dico = dico_redis[curr_dico_name]
|
||||
|
||||
# index of paste
|
||||
index_current = r_serv_dico.get(dico_hash)
|
||||
paste_path = r_serv_dico.get(index_current)
|
||||
if paste_path != None:
|
||||
hash_dico[dico_hash] = (paste_path, percent)
|
||||
for hash_type, paste_hash in paste_hashes.iteritems():
|
||||
for dico_hash in curr_dico_redis.smembers('HASHS_'+hash_type):
|
||||
try:
|
||||
percent = 100-ssdeep.compare(dico_hash, paste_hash) if hash_type == 'ssdeep' else tlsh.diffxlen(dico_hash, paste_hash)
|
||||
threshold_duplicate = threshold_set[hash_type]
|
||||
if percent < threshold_duplicate:
|
||||
# Go through the Database of the dico filter (month)
|
||||
r_serv_dico = dico_redis[curr_dico_name]
|
||||
|
||||
#print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
|
||||
except:
|
||||
# ssdeep hash not comparable
|
||||
print 'ssdeep hash not comparable, cleaning bad hash: '+dico_hash
|
||||
curr_dico_redis.srem('HASHS', dico_hash)
|
||||
# index of paste
|
||||
index_current = r_serv_dico.get(dico_hash)
|
||||
paste_path = r_serv_dico.get(index_current)
|
||||
if paste_path != None:
|
||||
hash_dico[dico_hash] = (hash_type, paste_path, percent)
|
||||
|
||||
print '['+hash_type+'] '+'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
|
||||
except Exception,e:
|
||||
print str(e)
|
||||
# ssdeep hash not comparable
|
||||
#print 'hash not comparable, bad hash: '+dico_hash+' , current_hash: '+paste_hash
|
||||
#curr_dico_redis.srem('HASHS', dico_hash)
|
||||
|
||||
# Add paste in DB after checking to prevent its analysis twice
|
||||
# hash_i -> index_i AND index_i -> PST.PATH
|
||||
r_serv1.set(index, PST.p_path)
|
||||
r_serv1.sadd("INDEX", index)
|
||||
# Adding the hash in Redis
|
||||
r_serv1.set(paste_hash, index)
|
||||
r_serv1.sadd("HASHS", paste_hash)
|
||||
for hash_type, paste_hash in paste_hashes.iteritems():
|
||||
r_serv1.set(paste_hash, index)
|
||||
r_serv1.sadd("HASHS_"+hash_type, paste_hash)
|
||||
##################### Similarity found #######################
|
||||
|
||||
# if there is data in this dictionary
|
||||
|
@ -153,7 +163,7 @@ if __name__ == "__main__":
|
|||
|
||||
publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
|
||||
#print '{}Processed in {} sec'.format(to_print, y-x)
|
||||
|
||||
|
||||
except IOError:
|
||||
to_print = 'Duplicate;{};{};{};'.format(
|
||||
PST.p_source, PST.p_date, PST.p_name)
|
||||
|
|
|
@ -2,6 +2,7 @@ import hashlib
|
|||
import crcmod
|
||||
import mmh3
|
||||
import ssdeep
|
||||
import tlsh
|
||||
|
||||
|
||||
class Hash(object):
|
||||
|
@ -36,4 +37,7 @@ class Hash(object):
|
|||
elif self.name == "ssdeep":
|
||||
hash = ssdeep.hash(string)
|
||||
|
||||
elif self.name == "tlsh":
|
||||
hash = tlsh.hash(string)
|
||||
|
||||
return hash
|
||||
|
|
|
@ -86,8 +86,8 @@ class Paste(object):
|
|||
self.p_source = var[-5]
|
||||
|
||||
self.p_encoding = None
|
||||
self.p_hash_kind = None
|
||||
self.p_hash = None
|
||||
self.p_hash_kind = {}
|
||||
self.p_hash = {}
|
||||
self.p_langage = None
|
||||
self.p_nb_lines = None
|
||||
self.p_max_length_line = None
|
||||
|
@ -159,7 +159,7 @@ class Paste(object):
|
|||
.. seealso:: Hash.py Object to get the available hashs.
|
||||
|
||||
"""
|
||||
self.p_hash_kind = Hash(hashkind)
|
||||
self.p_hash_kind[hashkind] = (Hash(hashkind))
|
||||
|
||||
def _get_p_hash(self):
|
||||
"""
|
||||
|
@ -174,7 +174,8 @@ class Paste(object):
|
|||
.. seealso:: _set_p_hash_kind("md5")
|
||||
|
||||
"""
|
||||
self.p_hash = self.p_hash_kind.Calculate(self.get_p_content())
|
||||
for hash_name, the_hash in self.p_hash_kind.iteritems():
|
||||
self.p_hash[hash_name] = the_hash.Calculate(self.get_p_content())
|
||||
return self.p_hash
|
||||
|
||||
def _get_p_language(self):
|
||||
|
@ -202,42 +203,6 @@ class Paste(object):
|
|||
def _get_p_size(self):
|
||||
return self.p_size
|
||||
|
||||
def _get_hash_lines(self, min=1, start=1, jump=10):
    """
    Hash the lines of the paste and return the distinct hashes.

    :param min: -- (int) Minimum length a line must have to be hashed.
    :param start: -- (int) Number given to the first line.
    :param jump: -- (int) Sampling step; 0 or 1 means every line is hashed
        (maximum granularity). With a larger value only the lines whose
        number modulo `jump` equals 1 are hashed.

    :return: a set([]) of hashes.

    .. warning:: Because a set is returned, each hash appears only once.
        A paste made of 1000 times the same line yields a single hash.
        This avoids redundant hashes and useless comparisons.

    :Example: PST._get_hash_lines(1, 1, 0)

    .. note:: The kind of hash to use must be declared before calling
        this function.

    .. seealso:: _set_p_hash_kind("md5")

    """
    sampling = jump > 1
    hashed_lines = set([])
    content = self.get_p_content_as_file()
    for line_no, text in enumerate(content, start):
        # Too short to be worth hashing.
        if len(text) < min:
            continue
        # When sampling, keep only one line out of `jump`.
        if sampling and (line_no % jump) != 1:
            continue
        hashed_lines.add(self.p_hash_kind.Calculate(text))
    return hashed_lines
|
||||
|
||||
def is_duplicate(self, obj, min=1, percent=50, start=1, jump=10):
|
||||
"""
|
||||
Returning the percent of similarity with another paste.
|
||||
|
@ -329,7 +294,10 @@ class Paste(object):
|
|||
self.store.hset(self.p_path, attr_name, json.dumps(value))
|
||||
|
||||
def _get_from_redis(self, r_serv):
    """
    Look up, for every hash stored in self.p_hash, the Redis hash saved
    under that value.

    :param r_serv: -- connection object exposing hgetall()
        (presumably a redis client; verify against callers)

    :return: (dict) hash name -> result of r_serv.hgetall(<hash value>)

    .. note:: self.p_hash starts as an empty dict and is filled by
        _get_p_hash(); the result is empty until that has been called.

    """
    ans = {}
    # p_hash maps a hash name to the hash value: iterate over the pairs.
    # Iterating the dict itself would only yield the names.
    for hash_name, the_hash in self.p_hash.items():
        ans[hash_name] = r_serv.hgetall(the_hash)
    return ans
|
||||
|
||||
def _get_top_words(self, sort=False):
|
||||
"""
|
||||
|
|
|
@ -39,6 +39,12 @@ echo '/usr/local/lib' | sudo tee -a /etc/ld.so.conf.d/faup.conf
|
|||
sudo ldconfig
|
||||
popd
|
||||
|
||||
# tlsh
# Cloned over https, like the other dependencies fetched by this script.
test ! -d tlsh && git clone https://github.com/trendmicro/tlsh.git
pushd tlsh/
# The TLSH repository ships its build script as make.sh.
./make.sh
popd
|
||||
|
||||
# REDIS LEVEL DB #
|
||||
test ! -d redis-leveldb/ && git clone https://github.com/KDr2/redis-leveldb.git
|
||||
pushd redis-leveldb/
|
||||
|
@ -72,6 +78,10 @@ pushd faup/src/lib/bindings/python/
|
|||
python setup.py install
|
||||
popd
|
||||
|
||||
# Py tlsh
pushd tlsh/py_ext
python setup.py build
python setup.py install
# Return to the previous directory so the following steps do not run
# from inside tlsh/py_ext.
popd
|
||||
|
||||
# Download the necessary NLTK corpora
|
||||
HOME=$(pwd) python -m textblob.download_corpora
|
||||
|
|
|
@ -14,6 +14,8 @@ import Paste
|
|||
from Date import Date
|
||||
|
||||
# CONFIG #
|
||||
tlsh_to_percent = 1000.0
|
||||
|
||||
configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
|
||||
if not os.path.exists(configfile):
|
||||
raise Exception('Unable to find the configuration file. \
|
||||
|
@ -74,13 +76,28 @@ def parseStringToList(the_string):
|
|||
strList += c
|
||||
else:
|
||||
the_list = strList.split(',')
|
||||
if len(the_list) == 2:
|
||||
if len(the_list) == 3:
|
||||
elemList = elemList + the_list
|
||||
elif len(the_list) == 2:
|
||||
elemList.append(the_list)
|
||||
elif len(the_list) > 1:
|
||||
elemList.append(the_list[1:])
|
||||
strList = ""
|
||||
return elemList
|
||||
|
||||
def parseStringToList2(the_string):
    """
    Parse the textual form of a list of lists, one inner list at a time.

    The outer pair of brackets is dropped, the remainder is cut on the
    '], [' boundary between inner lists, and every piece gets its own
    brackets back before being handed to parseStringToList().

    :param the_string: -- (str) text shaped like "[[a, b, c], [d, e, f]]"

    :return: a list with one parseStringToList() result per inner list, in
        their original order; an empty list when there is no inner list.

    """
    inner = the_string[1:-1]
    if not inner:
        # "[]" or an empty string: there is nothing to parse.
        return []

    pieces = inner.split('], [')
    last = len(pieces) - 1
    res = []
    for pos, piece in enumerate(pieces):
        # The split consumed the brackets sitting on each boundary; restore
        # them. The first piece still owns its '[' and the last its ']'.
        if pos != 0:
            piece = '[' + piece
        if pos != last:
            piece = piece + ']'
        res.append(parseStringToList(piece))
    return res
|
||||
|
||||
|
||||
def showpaste(content_range):
|
||||
requested_path = request.args.get('paste', '')
|
||||
paste = Paste.Paste(requested_path)
|
||||
|
@ -93,19 +110,47 @@ def showpaste(content_range):
|
|||
p_mime = paste.p_mime
|
||||
p_lineinfo = paste.get_lines_info()
|
||||
p_content = paste.get_p_content().decode('utf-8', 'ignore')
|
||||
p_duplicate_full_list = parseStringToList(paste._get_p_duplicate())
|
||||
p_duplicate_full_list = parseStringToList2(paste._get_p_duplicate())
|
||||
p_duplicate_list = []
|
||||
p_simil_list = []
|
||||
p_hashtype_list = []
|
||||
|
||||
|
||||
for dup_list in p_duplicate_full_list:
|
||||
path, simil_percent = dup_list
|
||||
if dup_list[0] == "tlsh":
|
||||
dup_list[2] = int(((tlsh_to_percent - float(dup_list[2])) / tlsh_to_percent)*100)
|
||||
else:
|
||||
dup_list[2] = int(dup_list[2])
|
||||
|
||||
p_duplicate_full_list.sort(lambda x,y: cmp(x[2], y[2]), reverse=True)
|
||||
|
||||
new_dup_list = []
|
||||
dup_list_removed = []
|
||||
for dup_list_index in range(0, len(p_duplicate_full_list)):
|
||||
if dup_list_index in dup_list_removed:
|
||||
continue
|
||||
indices = [i for i, x in enumerate(p_duplicate_full_list) if x[1] == p_duplicate_full_list[dup_list_index][1]]
|
||||
hash_types = []
|
||||
comp_vals = []
|
||||
for i in indices:
|
||||
hash_types.append(p_duplicate_full_list[i][0])
|
||||
comp_vals.append(p_duplicate_full_list[i][2])
|
||||
dup_list_removed.append(i)
|
||||
|
||||
hash_types = str(hash_types).replace("[","").replace("]","") if len(hash_types)==1 else str(hash_types)
|
||||
comp_vals = str(comp_vals).replace("[","").replace("]","") if len(comp_vals)==1 else str(comp_vals)
|
||||
new_dup_list.append([hash_types.replace("'", ""), p_duplicate_full_list[dup_list_index][1], comp_vals])
|
||||
|
||||
for dup_list in new_dup_list:
|
||||
hash_type, path, simil_percent = dup_list
|
||||
p_duplicate_list.append(path)
|
||||
p_simil_list.append(simil_percent)
|
||||
p_hashtype_list.append(hash_type)
|
||||
|
||||
if content_range != 0:
|
||||
p_content = p_content[0:content_range]
|
||||
|
||||
return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list)
|
||||
return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list, hashtype_list = p_hashtype_list)
|
||||
|
||||
def get_date_range(num_day):
|
||||
curr_date = datetime.date.today()
|
||||
|
|
|
@ -43,16 +43,25 @@
|
|||
</div>
|
||||
<div class="panel-body" id="panel-body">
|
||||
{% if duplicate_list|length == 0 %}
<h3> No Duplicate </h3>
{% else %}
<h3> Duplicate list: </h3>
<table style="width:100%">
<tr>
<th style="text-align:left;">Hash type</th><th style="text-align:left;">Similarity</th><th style="text-align:left;">Paste info</th>
</tr>
{# loop.index0 is used as the row index: a variable changed with "set" inside a for loop does not carry over to the next iteration. #}
{% for dup_path in duplicate_list %}
<tr>
<td>{{ hashtype_list[loop.index0] }}</td>
<td>Similarity: {{ simil_list[loop.index0] }}%</td>
<td><a target="_blank" href="{{ url_for('showsavedpaste') }}?paste={{ dup_path }}" id='dup_path'>{{ dup_path }}</a></td>
</tr>
{% endfor %}
</table>
{% endif %}
|
||||
<h4> Content: </h4>
|
||||
<h3> Content: </h3>
|
||||
<p data-initsize="{{ initsize }}"> <xmp id="paste-holder">{{ content }}</xmp></p>
|
||||
</div>
|
||||
</div>
|
||||
|
|
Loading…
Reference in New Issue