Added SimHash library

pull/63/head
Mokaddem 2016-07-15 08:56:16 +02:00
parent 60552bca4d
commit 0332f23579
3 changed files with 9 additions and 3 deletions

View File

@ -74,9 +74,9 @@ if __name__ == "__main__":
# Creating the bloom filter name: bloomyyyymm # Creating the bloom filter name: bloomyyyymm
filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year + filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year +
PST.p_date.month) PST.p_date.month)
if os.path.exists(filebloompath): if os.path.exists(filebloompath):
bloom = BloomFilter.open(filebloompath) bloom = BloomFilter.open(filebloompath)
bloop_path_set.add(filebloompath)
else: else:
bloom = BloomFilter(100000000, 0.01, filebloompath) bloom = BloomFilter(100000000, 0.01, filebloompath)
bloop_path_set.add(filebloompath) bloop_path_set.add(filebloompath)
@ -94,7 +94,6 @@ if __name__ == "__main__":
for bloo in bloop_path_set: for bloo in bloop_path_set:
# Opening blooms # Opening blooms
opened_bloom.append(BloomFilter.open(bloo)) opened_bloom.append(BloomFilter.open(bloo))
# For each hash of the paste # For each hash of the paste
for line_hash in PST._get_hash_lines(min=5, start=1, jump=0): for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
nb_hash_current += 1 nb_hash_current += 1
@ -105,7 +104,6 @@ if __name__ == "__main__":
r_serv1.sadd("HASHS", line_hash) r_serv1.sadd("HASHS", line_hash)
# Adding the hash in the bloom of the month # Adding the hash in the bloom of the month
bloom.add(line_hash) bloom.add(line_hash)
# Go through the Database of the bloom filter (of the month) # Go through the Database of the bloom filter (of the month)
for bloo in opened_bloom: for bloo in opened_bloom:
if line_hash in bloo: if line_hash in bloo:
@ -148,6 +146,8 @@ if __name__ == "__main__":
percentage = round((count/float(nb_hash_current))*100, 2) percentage = round((count/float(nb_hash_current))*100, 2)
if percentage >= 50: if percentage >= 50:
dupl.append((paste, percentage)) dupl.append((paste, percentage))
else:
print 'percentage: ' + str(percentage)
# Creating the object attribute and save it. # Creating the object attribute and save it.
to_print = 'Duplicate;{};{};{};'.format( to_print = 'Duplicate;{};{};{};'.format(
@ -156,6 +156,7 @@ if __name__ == "__main__":
PST.__setattr__("p_duplicate", dupl) PST.__setattr__("p_duplicate", dupl)
PST.save_attribute_redis("p_duplicate", dupl) PST.save_attribute_redis("p_duplicate", dupl)
publisher.info('{}Detected {}'.format(to_print, len(dupl))) publisher.info('{}Detected {}'.format(to_print, len(dupl)))
print '{}Detected {}'.format(to_print, len(dupl))
y = time.time() y = time.time()

View File

@ -1,6 +1,7 @@
import hashlib import hashlib
import crcmod import crcmod
import mmh3 import mmh3
import simhash
class Hash(object): class Hash(object):
@ -32,4 +33,7 @@ class Hash(object):
elif self.name == "murmur": elif self.name == "murmur":
hash = mmh3.hash(string) hash = mmh3.hash(string)
elif self.name == "simhash":
hash = Simhash(string)
return hash return hash

View File

@ -17,6 +17,7 @@ nltk
# Hashlib # Hashlib
crcmod crcmod
mmh3 mmh3
simhash
#Others #Others
python-magic python-magic