Added SimHash library

pull/63/head
Mokaddem 2016-07-15 08:56:16 +02:00
parent 60552bca4d
commit 0332f23579
3 changed files with 9 additions and 3 deletions

View File

@ -74,9 +74,9 @@ if __name__ == "__main__":
# Creating the bloom filter name: bloomyyyymm
filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year +
PST.p_date.month)
if os.path.exists(filebloompath):
bloom = BloomFilter.open(filebloompath)
bloop_path_set.add(filebloompath)
else:
bloom = BloomFilter(100000000, 0.01, filebloompath)
bloop_path_set.add(filebloompath)
@ -94,7 +94,6 @@ if __name__ == "__main__":
for bloo in bloop_path_set:
# Opening blooms
opened_bloom.append(BloomFilter.open(bloo))
# For each hash of the paste
for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
nb_hash_current += 1
@ -105,7 +104,6 @@ if __name__ == "__main__":
r_serv1.sadd("HASHS", line_hash)
# Adding the hash in the bloom of the month
bloom.add(line_hash)
# Go through the Database of the bloom filter (of the month)
for bloo in opened_bloom:
if line_hash in bloo:
@ -148,6 +146,8 @@ if __name__ == "__main__":
percentage = round((count/float(nb_hash_current))*100, 2)
if percentage >= 50:
dupl.append((paste, percentage))
else:
print 'percentage: ' + str(percentage)
# Creating the object attribute and save it.
to_print = 'Duplicate;{};{};{};'.format(
@ -156,6 +156,7 @@ if __name__ == "__main__":
PST.__setattr__("p_duplicate", dupl)
PST.save_attribute_redis("p_duplicate", dupl)
publisher.info('{}Detected {}'.format(to_print, len(dupl)))
print '{}Detected {}'.format(to_print, len(dupl))
y = time.time()

View File

@ -1,6 +1,7 @@
import hashlib
import crcmod
import mmh3
import simhash
class Hash(object):
@ -32,4 +33,7 @@ class Hash(object):
elif self.name == "murmur":
hash = mmh3.hash(string)
elif self.name == "simhash":
hash = Simhash(string)
return hash

View File

@ -17,6 +17,7 @@ nltk
# Hashlib
crcmod
mmh3
simhash
#Others
python-magic