From 0332f23579ae7dee01c7f42db72aff070c8aa019 Mon Sep 17 00:00:00 2001 From: Mokaddem Date: Fri, 15 Jul 2016 08:56:16 +0200 Subject: [PATCH] Added SimHash library --- bin/Duplicate.py | 7 ++++--- bin/packages/Hash.py | 4 ++++ pip_packages_requirement.txt | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/bin/Duplicate.py b/bin/Duplicate.py index a7a41dc1..59610f83 100755 --- a/bin/Duplicate.py +++ b/bin/Duplicate.py @@ -74,9 +74,9 @@ if __name__ == "__main__": # Creating the bloom filter name: bloomyyyymm filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year + PST.p_date.month) - if os.path.exists(filebloompath): bloom = BloomFilter.open(filebloompath) + bloop_path_set.add(filebloompath) else: bloom = BloomFilter(100000000, 0.01, filebloompath) bloop_path_set.add(filebloompath) @@ -94,7 +94,6 @@ if __name__ == "__main__": for bloo in bloop_path_set: # Opening blooms opened_bloom.append(BloomFilter.open(bloo)) - # For each hash of the paste for line_hash in PST._get_hash_lines(min=5, start=1, jump=0): nb_hash_current += 1 @@ -105,7 +104,6 @@ if __name__ == "__main__": r_serv1.sadd("HASHS", line_hash) # Adding the hash in the bloom of the month bloom.add(line_hash) - # Go throught the Database of the bloom filter (of the month) for bloo in opened_bloom: if line_hash in bloo: @@ -148,6 +146,8 @@ if __name__ == "__main__": percentage = round((count/float(nb_hash_current))*100, 2) if percentage >= 50: dupl.append((paste, percentage)) + else: + print 'percentage: ' + str(percentage) # Creating the object attribute and save it. 
to_print = 'Duplicate;{};{};{};'.format( @@ -156,6 +156,7 @@ if __name__ == "__main__": PST.__setattr__("p_duplicate", dupl) PST.save_attribute_redis("p_duplicate", dupl) publisher.info('{}Detected {}'.format(to_print, len(dupl))) + print '{}Detected {}'.format(to_print, len(dupl)) y = time.time() diff --git a/bin/packages/Hash.py b/bin/packages/Hash.py index f8dcac0f..d46abcba 100644 --- a/bin/packages/Hash.py +++ b/bin/packages/Hash.py @@ -1,6 +1,7 @@ import hashlib import crcmod import mmh3 +import simhash class Hash(object): @@ -32,4 +33,7 @@ class Hash(object): elif self.name == "murmur": hash = mmh3.hash(string) + elif self.name == "simhash": + hash = simhash.Simhash(string) + return hash diff --git a/pip_packages_requirement.txt b/pip_packages_requirement.txt index 40dcda8e..db2f23c5 100644 --- a/pip_packages_requirement.txt +++ b/pip_packages_requirement.txt @@ -17,6 +17,7 @@ nltk # Hashlib crcmod mmh3 +simhash #Others python-magic