AIL-framework/bin/Duplicate.py

#!/usr/bin/env python2
# -*-coding:UTF-8 -*

"""
The Duplicate module
====================

In short, this large module checks pastes for duplicates.

Requirements:
-------------


"""
import redis
import os
import time
from packages import Paste
from pubsublogger import publisher
from pybloomfilter import BloomFilter

from Helper import Process

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'Duplicates'

    p = Process(config_section)

    # REDIS #
    # DB OBJECT & HASHS ( DISK )
    # FIXME increase flexibility
    dico_redis = {}
    for year in xrange(2013, 2015):
        for month in xrange(0, 16):
            dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(
                host=p.config.get("Redis_Level_DB", "host"), port=year,
                db=month)

    # FUNCTIONS #
    publisher.info("Script duplicate started")

    set_limit = 100
    bloompath = os.path.join(os.environ['AIL_HOME'],
                             p.config.get("Directories", "bloomfilters"))

    bloop_path_set = set()
    while True:
        try:
            super_dico = {}
            hash_dico = {}
            dupl = []
            nb_hash_current = 0

            x = time.time()

            message = p.get_from_set()
            if message is not None:
                path = message
                PST = Paste.Paste(path)
            else:
                publisher.debug("Script Attribute is idling 10s")
                time.sleep(10)
                continue

            PST._set_p_hash_kind("md5")

            # Assignate the correct redis connexion
            r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]

            # Creating the bloom filter name: bloomyyyymm
            filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year +
                                         PST.p_date.month)

            if os.path.exists(filebloompath):
                bloom = BloomFilter.open(filebloompath)
            else:
                bloom = BloomFilter(100000000, 0.01, filebloompath)
                bloop_path_set.add(filebloompath)

            # UNIQUE INDEX HASHS TABLE
            r_serv0 = dico_redis["201300"]
            r_serv0.incr("current_index")
            index = r_serv0.get("current_index")+str(PST.p_date)
            # HASHTABLES PER MONTH (because of r_serv1 changing db)
            r_serv1.set(index, PST.p_path)
            r_serv1.sadd("INDEX", index)

            # For each bloom filter
            opened_bloom = []
            for bloo in bloop_path_set:
                # Opening blooms
                opened_bloom.append(BloomFilter.open(bloo))

            # For each hash of the paste
            for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):
                nb_hash_current += 1

                # Adding the hash in Redis & limiting the set
                if r_serv1.scard(line_hash) <= set_limit:
                    r_serv1.sadd(line_hash, index)
                    r_serv1.sadd("HASHS", line_hash)
                # Adding the hash in the bloom of the month
                bloom.add(line_hash)

                # Go throught the Database of the bloom filter (of the month)
                for bloo in opened_bloom:
                    if line_hash in bloo:
                        db = bloo.name[-6:]
                        # Go throught the Database of the bloom filter (month)
                        r_serv_bloom = dico_redis[db]

                        # set of index paste: set([1,2,4,65])
                        hash_current = r_serv_bloom.smembers(line_hash)
                        # removing itself from the list
                        hash_current = hash_current - set([index])

                        # if the hash is present at least in 1 files
                        # (already processed)
                        if len(hash_current) != 0:
                            hash_dico[line_hash] = hash_current

                        # if there is data in this dictionnary
                        if len(hash_dico) != 0:
                            super_dico[index] = hash_dico

    ###########################################################################

            # if there is data in this dictionnary
            if len(super_dico) != 0:
                # current = current paste, phash_dico = {hash: set, ...}
                occur_dico = {}
                for current, phash_dico in super_dico.items():
                    # phash = hash, pset = set([ pastes ...])
                    for phash, pset in hash_dico.items():

                        for p_fname in pset:
                            occur_dico.setdefault(p_fname, 0)
                            # Count how much hash is similar per file occuring
                            # in the dictionnary
                            if occur_dico[p_fname] >= 0:
                                occur_dico[p_fname] = occur_dico[p_fname] + 1

                for paste, count in occur_dico.items():
                    percentage = round((count/float(nb_hash_current))*100, 2)
                    if percentage >= 50:
                        dupl.append((paste, percentage))

                # Creating the object attribute and save it.
                to_print = 'Duplicate;{};{};{};'.format(
                    PST.p_source, PST.p_date, PST.p_name)
                if dupl != []:
                    PST.__setattr__("p_duplicate", dupl)
                    PST.save_attribute_redis("p_duplicate", dupl)
                    publisher.info('{}Detected {}'.format(to_print, len(dupl)))

                y = time.time()

                publisher.debug('{}Processed in {} sec'.format(to_print, y-x))
        except IOError:
            print "CRC Checksum Failed on :", PST.p_path
            publisher.error('{}CRC Checksum Failed'.format(to_print))
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`#!/usr/bin/env python2`
			`# -*-coding:UTF-8 -*`

			`"""`
			`The Duplicate module`
			`====================`

			`This huge module is, in short term, checking duplicates.`

			`Requirements:`
			`-------------`


			`"""`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`import redis`
			`import os`
			`import time`
			`from packages import Paste`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`from pubsublogger import publisher`
			`from pybloomfilter import BloomFilter`

Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`from Helper import Process`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`if __name__ == "__main__":`
Small fixes to make the refactoring production ready * the port for the logging is 6380 * use os.environ properly * fix typos 2014-08-22 17:35:40 +02:00			`publisher.port = 6380`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`publisher.channel = "Script"`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`config_section = 'Duplicates'`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`p = Process(config_section)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`# REDIS #`
			`# DB OBJECT & HASHS ( DISK )`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`# FIXME increase flexibility`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`dico_redis = {}`
			`for year in xrange(2013, 2015):`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`for month in xrange(0, 16):`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`dico_redis[str(year)+str(month).zfill(2)] = redis.StrictRedis(`
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`host=p.config.get("Redis_Level_DB", "host"), port=year,`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`db=month)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`# FUNCTIONS #`
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`publisher.info("Script duplicate started")`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`set_limit = 100`
Small fixes to make the refactoring production ready * the port for the logging is 6380 * use os.environ properly * fix typos 2014-08-22 17:35:40 +02:00			`bloompath = os.path.join(os.environ['AIL_HOME'],`
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`p.config.get("Directories", "bloomfilters"))`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`bloop_path_set = set()`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`while True:`
			`try:`
			`super_dico = {}`
			`hash_dico = {}`
			`dupl = []`
			`nb_hash_current = 0`

			`x = time.time()`

Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`message = p.get_from_set()`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`if message is not None:`
Big refactoring, make the queues more flexible 2014-08-29 19:37:56 +02:00			`path = message`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`PST = Paste.Paste(path)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`else:`
			`publisher.debug("Script Attribute is idling 10s")`
			`time.sleep(10)`
			`continue`

			`PST._set_p_hash_kind("md5")`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Assignate the correct redis connexion`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`r_serv1 = dico_redis[PST.p_date.year + PST.p_date.month]`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Creating the bloom filter name: bloomyyyymm`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`filebloompath = os.path.join(bloompath, 'bloom' + PST.p_date.year +`
			`PST.p_date.month)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`if os.path.exists(filebloompath):`
			`bloom = BloomFilter.open(filebloompath)`
			`else:`
			`bloom = BloomFilter(100000000, 0.01, filebloompath)`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`bloop_path_set.add(filebloompath)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`# UNIQUE INDEX HASHS TABLE`
			`r_serv0 = dico_redis["201300"]`
			`r_serv0.incr("current_index")`
			`index = r_serv0.get("current_index")+str(PST.p_date)`
			`# HASHTABLES PER MONTH (because of r_serv1 changing db)`
			`r_serv1.set(index, PST.p_path)`
			`r_serv1.sadd("INDEX", index)`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# For each bloom filter`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`opened_bloom = []`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`for bloo in bloop_path_set:`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Opening blooms`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`opened_bloom.append(BloomFilter.open(bloo))`

			`# For each hash of the paste`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`for line_hash in PST._get_hash_lines(min=5, start=1, jump=0):`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`nb_hash_current += 1`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Adding the hash in Redis & limiting the set`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`if r_serv1.scard(line_hash) <= set_limit:`
			`r_serv1.sadd(line_hash, index)`
			`r_serv1.sadd("HASHS", line_hash)`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Adding the hash in the bloom of the month`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`bloom.add(line_hash)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# Go throught the Database of the bloom filter (of the month)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`for bloo in opened_bloom:`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`if line_hash in bloo:`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`db = bloo.name[-6:]`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`# Go throught the Database of the bloom filter (month)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`r_serv_bloom = dico_redis[db]`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# set of index paste: set([1,2,4,65])`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`hash_current = r_serv_bloom.smembers(line_hash)`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# removing itself from the list`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`hash_current = hash_current - set([index])`

completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`# if the hash is present at least in 1 files`
			`# (already processed)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`if len(hash_current) != 0:`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`hash_dico[line_hash] = hash_current`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# if there is data in this dictionnary`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`if len(hash_dico) != 0:`
			`super_dico[index] = hash_dico`

completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`###########################################################################`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`# if there is data in this dictionnary`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`if len(super_dico) != 0:`
			`# current = current paste, phash_dico = {hash: set, ...}`
			`occur_dico = {}`
			`for current, phash_dico in super_dico.items():`
			`# phash = hash, pset = set([ pastes ...])`
			`for phash, pset in hash_dico.items():`

			`for p_fname in pset:`
			`occur_dico.setdefault(p_fname, 0)`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`# Count how much hash is similar per file occuring`
			`# in the dictionnary`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`if occur_dico[p_fname] >= 0:`
			`occur_dico[p_fname] = occur_dico[p_fname] + 1`

			`for paste, count in occur_dico.items():`
			`percentage = round((count/float(nb_hash_current))*100, 2)`
			`if percentage >= 50:`
			`dupl.append((paste, percentage))`

			`# Creating the object attribute and save it.`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`to_print = 'Duplicate;{};{};{};'.format(`
			`PST.p_source, PST.p_date, PST.p_name)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`if dupl != []:`
			`PST.__setattr__("p_duplicate", dupl)`
move Redis_Data_Merging to Paste 2014-08-21 12:22:07 +02:00			`PST.save_attribute_redis("p_duplicate", dupl)`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`publisher.info('{}Detected {}'.format(to_print, len(dupl)))`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`y = time.time()`

Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`publisher.debug('{}Processed in {} sec'.format(to_print, y-x))`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`except IOError:`
			`print "CRC Checksum Failed on :", PST.p_path`
Big cleanup, pep8 2014-08-14 17:55:18 +02:00			`publisher.error('{}CRC Checksum Failed'.format(to_print))`