mirror of https://github.com/CIRCL/AIL-framework
Added two new version of duplicate module.
One with hashes are saved in json on disk The other with only leveldbpull/63/head
parent
14e9850dd6
commit
4f6813350b
|
@ -20,7 +20,6 @@ import json
|
|||
import ssdeep
|
||||
from packages import Paste
|
||||
from pubsublogger import publisher
|
||||
from pybloomfilter import BloomFilter
|
||||
|
||||
from Helper import Process
|
||||
|
||||
|
@ -29,10 +28,10 @@ if __name__ == "__main__":
|
|||
publisher.channel = "Script"
|
||||
|
||||
config_section = 'Duplicates'
|
||||
saved_dico_and_reload = 1 #min
|
||||
save_dico_and_reload = 1 #min
|
||||
time_1 = time.time()
|
||||
flag_reload = True
|
||||
flag_to_disk = False
|
||||
flag_reload_from_disk = True
|
||||
flag_write_to_disk = False
|
||||
|
||||
p = Process(config_section)
|
||||
|
||||
|
@ -81,18 +80,16 @@ if __name__ == "__main__":
|
|||
filedicopath_today = filedicopath
|
||||
|
||||
# Save I/O
|
||||
if time.time() - time_1 > saved_dico_and_reload*60:
|
||||
flag_to_disk = True
|
||||
if time.time() - time_1 > save_dico_and_reload*60:
|
||||
flag_write_to_disk = True
|
||||
|
||||
if os.path.exists(filedicopath):
|
||||
if flag_reload == True:
|
||||
flag_reload = False
|
||||
if flag_reload_from_disk == True:
|
||||
flag_reload_from_disk = False
|
||||
print 'Reloading'
|
||||
time_1 = time.time()
|
||||
with open(filedicopath, 'r') as fp:
|
||||
today_dico = json.load(fp)
|
||||
else:
|
||||
time_1 = time.time()
|
||||
today_dico = {}
|
||||
with open(filedicopath, 'w') as fp:
|
||||
json.dump(today_dico, fp)
|
||||
|
@ -105,44 +102,47 @@ if __name__ == "__main__":
|
|||
r_serv0 = dico_redis[yearly_index]
|
||||
r_serv0.incr("current_index")
|
||||
index = r_serv0.get("current_index")+str(PST.p_date)
|
||||
# HASHTABLES PER MONTH (because of r_serv1 changing db)
|
||||
r_serv1.set(index, PST.p_path)
|
||||
r_serv1.sadd("INDEX", index)
|
||||
|
||||
# For each dico
|
||||
opened_dico = []
|
||||
for dico in dico_path_set:
|
||||
# Opening dico
|
||||
if dico == filedicopath_today:
|
||||
opened_dico.append([dico, today_dico])
|
||||
with open(dico, 'r') as fp:
|
||||
opened_dico.append([dico, json.load(fp)])
|
||||
else:
|
||||
with open(dico, 'r') as fp:
|
||||
opened_dico.append([dico, json.load(fp)])
|
||||
|
||||
|
||||
#retrieve hash from paste
|
||||
paste_hash = PST._get_p_hash()
|
||||
# Adding the hash in Redis
|
||||
r_serv1.set(paste_hash, index)
|
||||
r_serv1.sadd("HASHS", paste_hash)
|
||||
|
||||
# Go throught the Database of the dico (of the month)
|
||||
threshold_dup = 10
|
||||
threshold_dup = 99
|
||||
for dico_name, dico in opened_dico:
|
||||
for dico_key, dico_hash in dico.items():
|
||||
percent = ssdeep.compare(dico_hash, paste_hash)
|
||||
if percent > threshold_dup:
|
||||
db = dico_name[-6:]
|
||||
# Go throught the Database of the bloom filter (month)
|
||||
# Go throught the Database of the dico filter (month)
|
||||
r_serv_dico = dico_redis[db]
|
||||
|
||||
# index of paste
|
||||
# FIXME Use r_serv_dico and do not consider only 1 server!!
|
||||
index_current = r_serv1.get(dico_hash)
|
||||
paste_path = r_serv1.get(index_current)
|
||||
index_current = r_serv_dico.get(dico_hash)
|
||||
paste_path = r_serv_dico.get(index_current)
|
||||
if paste_path != None:
|
||||
hash_dico[dico_hash] = (paste_path, percent)
|
||||
|
||||
print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
|
||||
print ' '+ PST.p_path[44:] +', '+ paste_path[44:]
|
||||
#print 'comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent)
|
||||
print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent)
|
||||
|
||||
# Add paste in DB to prevent its analyse twice
|
||||
# HASHTABLES PER MONTH (because of r_serv1 changing db)
|
||||
r_serv1.set(index, PST.p_path)
|
||||
r_serv1.sadd("INDEX", index)
|
||||
# Adding the hash in Redis
|
||||
r_serv1.set(paste_hash, index)
|
||||
r_serv1.sadd("HASHS", paste_hash)
|
||||
##################### Similarity found #######################
|
||||
|
||||
# if there is data in this dictionnary
|
||||
|
@ -168,9 +168,11 @@ if __name__ == "__main__":
|
|||
# Adding the hash in the dico of the month
|
||||
today_dico[index] = paste_hash
|
||||
|
||||
if flag_to_disk:
|
||||
flag_to_disk = False
|
||||
flag_reload = True
|
||||
if flag_write_to_disk:
|
||||
time_1 = time.time()
|
||||
flag_write_to_disk = False
|
||||
flag_reload_from_disk = True
|
||||
print 'writing'
|
||||
with open(filedicopath, 'w') as fp:
|
||||
json.dump(today_dico, fp)
|
||||
except IOError:
|
||||
|
|
|
@ -0,0 +1,160 @@
|
|||
#!/usr/bin/env python2
|
||||
# -*-coding:UTF-8 -*
|
||||
|
||||
"""
|
||||
The Duplicate module
|
||||
====================
|
||||
|
||||
This huge module is, in short term, checking duplicates.
|
||||
|
||||
This one differ from v1 by only using redis and not json file on disk
|
||||
|
||||
Requirements:
|
||||
-------------
|
||||
|
||||
|
||||
"""
|
||||
import redis
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
import json
|
||||
import ssdeep
|
||||
from packages import Paste
|
||||
from pubsublogger import publisher
|
||||
|
||||
from Helper import Process
|
||||
|
||||
if __name__ == "__main__":
|
||||
publisher.port = 6380
|
||||
publisher.channel = "Script"
|
||||
|
||||
config_section = 'Duplicates'
|
||||
|
||||
p = Process(config_section)
|
||||
|
||||
maximum_month_range = int(p.config.get("Modules_Duplicates", "maximum_month_range"))
|
||||
threshold_duplicate = int(p.config.get("Modules_Duplicates", "threshold_duplicate"))
|
||||
min_paste_size = float(p.config.get("Modules_Duplicates", "min_paste_size"))
|
||||
|
||||
# REDIS #
|
||||
dico_redis = {}
|
||||
date_today = datetime.today()
|
||||
for year in xrange(2013, date_today.year+1):
|
||||
for month in xrange(0, 13):
|
||||
dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
|
||||
host=p.config.get("Redis_Level_DB", "host"), port=year,
|
||||
db=month)
|
||||
#print("dup: "+str(year)+str(month).zfill(2)+"\n")
|
||||
|
||||
# FUNCTIONS #
|
||||
publisher.info("Script duplicate started")
|
||||
|
||||
while True:
|
||||
try:
|
||||
hash_dico = {}
|
||||
dupl = []
|
||||
dico_range_list = []
|
||||
|
||||
x = time.time()
|
||||
|
||||
message = p.get_from_set()
|
||||
if message is not None:
|
||||
path = message
|
||||
PST = Paste.Paste(path)
|
||||
else:
|
||||
publisher.debug("Script Attribute is idling 10s")
|
||||
time.sleep(10)
|
||||
continue
|
||||
|
||||
# the paste is too small
|
||||
if (PST._get_p_size() < min_paste_size):
|
||||
continue
|
||||
|
||||
PST._set_p_hash_kind("ssdeep")
|
||||
|
||||
# Assignate the correct redis connexion
|
||||
r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]
|
||||
|
||||
# Creating the dico name: yyyymm
|
||||
# Get the date of the range
|
||||
date_range = date_today - timedelta(days = maximum_month_range*30.4166666)
|
||||
num_of_month = (date_today.year - date_range.year)*12 + (date_today.month - date_range.month)
|
||||
for diff_month in xrange(0, num_of_month+1):
|
||||
curr_date_range = date_today - timedelta(days = diff_month*30.4166666)
|
||||
to_append = str(curr_date_range.year)+str(curr_date_range.month).zfill(2)
|
||||
dico_range_list.append(to_append)
|
||||
|
||||
# Use all dico in range
|
||||
dico_range_list = dico_range_list[0:maximum_month_range]
|
||||
|
||||
# UNIQUE INDEX HASHS TABLE
|
||||
yearly_index = str(date_today.year)+'00'
|
||||
r_serv0 = dico_redis[yearly_index]
|
||||
r_serv0.incr("current_index")
|
||||
index = r_serv0.get("current_index")+str(PST.p_date)
|
||||
|
||||
# Open selected dico range
|
||||
opened_dico = []
|
||||
for dico_name in dico_range_list:
|
||||
opened_dico.append([dico_name, dico_redis[dico_name]])
|
||||
|
||||
# retrieve hash from paste
|
||||
paste_hash = PST._get_p_hash()
|
||||
|
||||
# Go throught the Database of the dico (of the month)
|
||||
for curr_dico_name, curr_dico_redis in opened_dico:
|
||||
for dico_hash in curr_dico_redis.smembers('HASHS'):
|
||||
try:
|
||||
percent = ssdeep.compare(dico_hash, paste_hash)
|
||||
if percent > threshold_duplicate:
|
||||
# Go throught the Database of the dico filter (month)
|
||||
r_serv_dico = dico_redis[curr_dico_name]
|
||||
|
||||
# index of paste
|
||||
index_current = r_serv_dico.get(dico_hash)
|
||||
paste_path = r_serv_dico.get(index_current)
|
||||
if paste_path != None:
|
||||
hash_dico[dico_hash] = (paste_path, percent)
|
||||
|
||||
print 'comparing: ' + str(PST.p_path[44:]) + ' and ' + str(paste_path[44:]) + ' percentage: ' + str(percent)
|
||||
#print ' '+ PST.p_path[44:] +', '+ paste_path[44:] + ', ' + str(percent)
|
||||
except:
|
||||
# ssdeep hash not comparable
|
||||
print 'ssdeep hash not comparable'
|
||||
publisher.error('ssdeep hash not comparable')
|
||||
|
||||
# Add paste in DB after checking to prevent its analysis twice
|
||||
# hash_i -> index_i AND index_i -> PST.PATH
|
||||
r_serv1.set(index, PST.p_path)
|
||||
r_serv1.sadd("INDEX", index)
|
||||
# Adding the hash in Redis
|
||||
r_serv1.set(paste_hash, index)
|
||||
r_serv1.sadd("HASHS", paste_hash)
|
||||
##################### Similarity found #######################
|
||||
|
||||
# if there is data in this dictionnary
|
||||
if len(hash_dico) != 0:
|
||||
# paste_tuple = (paste_path, percent)
|
||||
for dico_hash, paste_tuple in hash_dico.items():
|
||||
dupl.append(paste_tuple)
|
||||
|
||||
# Creating the object attribute and save it.
|
||||
to_print = 'Duplicate;{};{};{};'.format(
|
||||
PST.p_source, PST.p_date, PST.p_name)
|
||||
if dupl != []:
|
||||
PST.__setattr__("p_duplicate", dupl)
|
||||
PST.save_attribute_redis("p_duplicate", dupl)
|
||||
publisher.info('{}Detected {}'.format(to_print, len(dupl)))
|
||||
#print '{}Detected {}'.format(to_print, len(dupl))
|
||||
|
||||
y = time.time()
|
||||
|
||||
publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
|
||||
#print '{}Processed in {} sec'.format(to_print, y-x)
|
||||
|
||||
except IOError:
|
||||
to_print = 'Duplicate;{};{};{};'.format(
|
||||
PST.p_source, PST.p_date, PST.p_name)
|
||||
print "CRC Checksum Failed on :", PST.p_path
|
||||
publisher.error('{}CRC Checksum Failed'.format(to_print))
|
|
@ -105,7 +105,7 @@ function launching_scripts {
|
|||
|
||||
screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
|
||||
sleep 0.1
|
||||
screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep.py; read x'
|
||||
screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep_v2.py; read x'
|
||||
sleep 0.1
|
||||
screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
|
||||
sleep 0.1
|
||||
|
|
|
@ -4,6 +4,13 @@ pastes = PASTES
|
|||
wordtrending_csv = var/www/static/csv/wordstrendingdata
|
||||
wordsfile = files/wordfile
|
||||
|
||||
#### Modules ####
|
||||
[Modules_Duplicates]
|
||||
#Number of month to look back
|
||||
maximum_month_range = 3
|
||||
#The value where two pastes are considerate duplicate.
|
||||
threshold_duplicate = 50
|
||||
|
||||
##### Redis #####
|
||||
[Redis_Cache]
|
||||
host = localhost
|
||||
|
|
Loading…
Reference in New Issue