mirror of https://github.com/CIRCL/AIL-framework
Added a new module for duplicate paste detection. It seems to work but still has a small bug (the same paste is re-checked twice).
parent
0332f23579
commit
14e9850dd6
|
@ -0,0 +1,180 @@
|
||||||
|
#!/usr/bin/env python2
|
||||||
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
|
"""
|
||||||
|
The Duplicate module
|
||||||
|
====================
|
||||||
|
|
||||||
|
In short, this module checks incoming pastes for duplicates.
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
-------------
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
import redis
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import ssdeep
|
||||||
|
from packages import Paste
|
||||||
|
from pubsublogger import publisher
|
||||||
|
from pybloomfilter import BloomFilter
|
||||||
|
|
||||||
|
from Helper import Process
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Entry point of the ssdeep-based duplicate detector.
    #
    # For every paste path pulled from the module queue, the script:
    #   1. computes the paste's ssdeep hash,
    #   2. registers the paste (index -> path, hash -> index) in the Redis
    #      database of the paste's month,
    #   3. compares the hash against every hash stored in the known monthly
    #      "dico" JSON files,
    #   4. stores the list of (path, similarity) matches on the paste.
    #
    # The current month's dico is kept in memory and flushed to disk at most
    # every `saved_dico_and_reload` minutes to save I/O.
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Duplicates'
    saved_dico_and_reload = 1  # minutes between two flushes of the dico
    threshold_dup = 10  # minimal ssdeep score to report a duplicate
    time_1 = time.time()
    flag_reload = True
    flag_to_disk = False

    p = Process(config_section)

    # REDIS #
    # DB OBJECT & HASHS ( DISK )
    # One connection per "yyyymm" key: the year selects the port and the
    # month selects the db number. db 00 of a year holds the index counter.
    # FIXME increase flexibility
    dico_redis = {}
    for year in xrange(2013, datetime.date.today().year+1):
        for month in xrange(0, 16):
            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
                host=p.config.get("Redis_Level_DB", "host"), port=year,
                db=month)

    # FUNCTIONS #
    publisher.info("Script duplicate started")

    dicopath = os.path.join(os.environ['AIL_HOME'],
                            p.config.get("Directories", "dicofilters"))

    # Paths of every monthly dico file seen since the script started.
    dico_path_set = set()
    while True:
        try:
            hash_dico = {}
            dupl = []

            x = time.time()

            message = p.get_from_set()
            if message is None:
                publisher.debug("Script Attribute is idling 10s")
                time.sleep(10)
                continue

            path = message
            PST = Paste.Paste(path)
            PST._set_p_hash_kind("ssdeep")

            # Select the Redis connection matching the paste's month.
            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]

            # Build the dico file name: dicoyyyymm
            filedicopath = os.path.join(dicopath, 'dico' + PST.p_date.year +
                                        PST.p_date.month)
            filedicopath_today = filedicopath

            # Save I/O: only write the dico back once the delay has elapsed.
            if time.time() - time_1 > saved_dico_and_reload*60:
                flag_to_disk = True

            if os.path.exists(filedicopath):
                if flag_reload:
                    flag_reload = False
                    print('Reloading')
                    time_1 = time.time()
                    with open(filedicopath, 'r') as fp:
                        today_dico = json.load(fp)
                # NOTE(review): when the file exists and no reload is due,
                # today_dico keeps whatever month was loaded last, even if
                # this paste belongs to another month -- confirm intended.
            else:
                # First paste of this month: create an empty dico on disk.
                time_1 = time.time()
                today_dico = {}
                with open(filedicopath, 'w') as fp:
                    json.dump(today_dico, fp)

            # For now, just use monthly dico
            dico_path_set.add(filedicopath)

            # UNIQUE INDEX HASHS TABLE
            yearly_index = str(datetime.date.today().year)+'00'
            r_serv0 = dico_redis[yearly_index]
            # incr() returns the new counter value; using it directly avoids
            # racing with another writer between the increment and a get().
            index = str(r_serv0.incr("current_index"))+str(PST.p_date)
            # HASHTABLES PER MONTH (because of r_serv1 changing db)
            r_serv1.set(index, PST.p_path)
            r_serv1.sadd("INDEX", index)

            # Open every known dico. The current one is taken from memory
            # only; re-reading it from disk as well would make every hash be
            # compared twice.
            opened_dico = []
            for dico in dico_path_set:
                if dico == filedicopath_today:
                    opened_dico.append([dico, today_dico])
                else:
                    with open(dico, 'r') as fp:
                        opened_dico.append([dico, json.load(fp)])

            # Retrieve the hash of the paste
            paste_hash = PST._get_p_hash()
            # Adding the hash in Redis
            r_serv1.set(paste_hash, index)
            r_serv1.sadd("HASHS", paste_hash)

            # Go through the hashes of every opened dico
            for dico_name, dico in opened_dico:
                # The last 6 characters of the dico path are its "yyyymm"
                # key, i.e. the Redis db where its hashes were registered.
                r_serv_dico = dico_redis[dico_name[-6:]]
                for dico_key, dico_hash in dico.items():
                    percent = ssdeep.compare(dico_hash, paste_hash)
                    if percent <= threshold_dup:
                        continue

                    # Resolve hash -> index -> path of the matching paste.
                    index_current = r_serv_dico.get(dico_hash)
                    paste_path = None
                    if index_current is not None:
                        paste_path = r_serv_dico.get(index_current)

                    print('comparing: ' + str(dico_hash[:20]) + ' and ' + str(paste_hash[:20]) + ' percentage: ' + str(percent))
                    if paste_path is not None:
                        hash_dico[dico_hash] = (paste_path, percent)
                        print(' '+ PST.p_path[44:] +', '+ paste_path[44:])

            ##################### Similarity found #######################

            # If at least one similar paste was found
            if hash_dico:
                # Each value is a (paste_path, percent) tuple.
                dupl.extend(hash_dico.values())

                # Creating the object attribute and save it.
                to_print = 'Duplicate;{};{};{};'.format(
                    PST.p_source, PST.p_date, PST.p_name)
                if dupl:
                    PST.__setattr__("p_duplicate", dupl)
                    PST.save_attribute_redis("p_duplicate", dupl)
                    publisher.info('{}Detected {}'.format(to_print, len(dupl)))
                    print('{}Detected {}'.format(to_print, len(dupl)))

                y = time.time()

                publisher.debug('{}Processed in {} sec'.format(to_print, y-x))

            # Adding the hash in the dico of the month
            today_dico[index] = paste_hash

            if flag_to_disk:
                flag_to_disk = False
                flag_reload = True
                with open(filedicopath, 'w') as fp:
                    json.dump(today_dico, fp)
        except IOError:
            to_print = 'Duplicate;{};{};{};'.format(
                PST.p_source, PST.p_date, PST.p_name)
            print('CRC Checksum Failed on : {}'.format(PST.p_path))
            publisher.error('{}CRC Checksum Failed'.format(to_print))
|
|
@ -105,7 +105,7 @@ function launching_scripts {
|
||||||
|
|
||||||
screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
|
screen -S "Script" -X screen -t "Global" bash -c './Global.py; read x'
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate.py; read x'
|
screen -S "Script" -X screen -t "Duplicate" bash -c './Duplicate_ssdeep.py; read x'
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
|
screen -S "Script" -X screen -t "Attribute" bash -c './Attribute.py; read x'
|
||||||
sleep 0.1
|
sleep 0.1
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import hashlib
|
import hashlib
|
||||||
import crcmod
|
import crcmod
|
||||||
import mmh3
|
import mmh3
|
||||||
import simhash
|
import ssdeep
|
||||||
|
|
||||||
|
|
||||||
class Hash(object):
|
class Hash(object):
|
||||||
|
@ -33,7 +33,7 @@ class Hash(object):
|
||||||
elif self.name == "murmur":
|
elif self.name == "murmur":
|
||||||
hash = mmh3.hash(string)
|
hash = mmh3.hash(string)
|
||||||
|
|
||||||
elif self.name == "simhash":
|
elif self.name == "ssdeep":
|
||||||
hash = Simhash(string)
|
hash = ssdeep.hash(string)
|
||||||
|
|
||||||
return hash
|
return hash
|
||||||
|
|
Loading…
Reference in New Issue