# AIL-framework/bin/packages/lib_redis_insert.py

import sys, hashlib, os, os.path, gzip, string, glob, itertools, copy, shutil
import redis, crcmod, mmh3, time, fileinput
import crcmod, mmh3

from operator import itemgetter, attrgetter
from pubsublogger import publisher


def listdirectory(path):
    """Recursively list the files found under a directory.

    :param path: -- The pathname of the directory to walk.

    The directory tree rooted at ``path`` is walked with ``os.walk`` and
    every file name is joined to the directory it was found in. The result
    is a list of those joined paths, in the order ``os.walk`` yields them.
    Directories themselves are not included.

    """
    return [
        os.path.join(current_dir, file_name)
        for current_dir, _subdirs, file_names in os.walk(path)
        for file_name in file_names
    ]


def clean(dirty):
    """Filter out non-printable characters from the string it receives.

    :param dirty: -- (str) The string to sanitize.

    Only the characters contained in ``string.printable`` are kept; they are
    returned joined together in their original order.

    """
    return ''.join(char for char in dirty if char in string.printable)


def select_hash(hashkind, line):
    """Select the kind of hashing for the line.

    :param hashkind: -- (str) The name of the hash: "md5", "sha1", "crc"
        or "murmur".
    :param line: -- (str) The string to hash.

    :returns: -- (str) The hexadecimal digest for "md5", "sha1" and "crc";
        for "murmur", the string form of the integer returned by
        ``mmh3.hash``.
    :raises ValueError: -- if ``hashkind`` is not one of the supported names.

    This function is a kind of hash selector which will use the hash passed
    in argument to hash the string also passed in argument.

    """
    # The hash functions work on byte strings: encode text (unicode) input so
    # that it can be hashed too. Byte strings are passed through untouched.
    if isinstance(line, type(u'')):
        line = line.encode('utf-8')

    if hashkind == "md5":
        hashline = hashlib.md5(line).hexdigest()

    elif hashkind == "sha1":
        hashline = hashlib.sha1(line).hexdigest()

    elif hashkind == "crc":
        # 0x104c11db7 is the CRC-32 generator polynomial.
        crc32 = crcmod.Crc(0x104c11db7, initCrc=0, xorOut=0xFFFFFFFF)
        crc32.update(line)
        hashline = crc32.hexdigest()

    elif hashkind == "murmur":
        hashline = mmh3.hash(line)

    else:
        # Without this branch an unknown name would surface later as an
        # obscure UnboundLocalError on ``hashline``.
        raise ValueError("unsupported hashkind: {0!r}".format(hashkind))

    return str(hashline)


def redis_populate(pipe, folder, minline, hashkind, jmp, insert_type):
    """Feed the lines of gzip files to "insert_redis", optionally sampling.

    :param pipe: -- Redis pipe
    :param folder: -- iterable of path names of the gzip files to process
        (despite its name, it is iterated and each item is opened as a file)
    :param minline: -- the minimum length of line to hash
    :param hashkind: -- the hash to use
    :param jmp: -- (int) sampling step. With 1, or a falsy value such as
        0/False, every line is inserted. With a value n > 1, only the lines
        whose 1-based number satisfies ``num % n == 1`` are inserted
        (lines 1, n+1, 2n+1, ...).
    :param insert_type: -- which kind of datastructure to create in redis.

    Each file is read line by line (line numbers start at 1) and the
    selected lines are handed to "insert_redis". The pipe is executed once
    per file, after all of its lines have been queued.

    """
    # A falsy step used to raise ZeroDivisionError on ``num % jmp``; it is
    # treated as "no jumping", exactly like a step of 1.
    take_every_line = (not jmp) or jmp == 1

    for filename in folder:

        with gzip.open(filename, 'rb') as gz_file:

            for num, line in enumerate(gz_file, 1):

                if take_every_line or (num % jmp) == 1:
                    insert_redis(filename,
                                 line,
                                 pipe,
                                 minline,
                                 hashkind,
                                 num,
                                 insert_type)

            # Flush the commands queued for this file.
            pipe.execute()


def insert_redis(filename, line, pipe, minline, hashkind, num, insert_type):
    """Insert hashed line in redis.

    :param filename: -- the path name of the file the line comes from
    :param line: -- the clear line which will be hashed.
    :param pipe: -- Redis pipe
    :param minline: -- the minimum length of line to hash
    :param hashkind: -- the hash to use
    :param num: -- (int) the number of the line in its file (1 = first line)
    :param insert_type: -- (int) Choose the datastructure used in redis.

    Lines shorter than ``minline`` and unknown ``insert_type`` values are
    ignored. Below, "hashedline" is the hash of the line and "shortname" is
    the last component of ``filename`` split on '/'.

    insert_type 2: ALLIN
    "hash"[hashedline][occurence] => sorted set, score incremented by 1
    "hashedline"[shortname] => to associate the file to its hashedline
    "L:hashedline"[clearline] => for the correspondence hash/cleaned line
    "shortname"[firstline] => first 80 characters of line 1, cleaned

    insert_type 1: SORTED SET (for the ./top.py script)
    "hash"[hashedline][occurence] => sorted set, score incremented by 1
    "hashedline"[clearline] => the cleaned line itself

    insert_type 0: BASIC SET (for ./Graph.py)
    "hash"[hashedline] => to index all different hashes (without scores)
    "hashedline"[shortname] => to associate the file to its hashedline
    "shortname"[firstline] => first 80 characters of line 1, cleaned

    """
    if insert_type not in (0, 1, 2) or len(line) < minline:
        return

    # Hash the line once; every key below is derived from this value.
    hashedline = select_hash(hashkind, line)

    # NOTE(review): zincrby is called as (name, value, amount); redis-py 3.x
    # expects (name, amount, value) -- confirm the client version in use.
    if insert_type == 1:  # SORTED SET FOR TOP100.py
        pipe.zincrby("hash", hashedline, 1)
        pipe.sadd(hashedline, clean(line))
        return

    # maxsplit=20 is kept from the original code: with more than 20 '/' in
    # the path, the remainder after the 20th one is used as the name.
    shortname = filename.split('/', 20)[-1]

    if insert_type == 2:  # ALLIN
        pipe.zincrby("hash", hashedline, 1)
        pipe.sadd(hashedline, shortname)
        pipe.sadd("L:" + hashedline, clean(line))
    else:  # insert_type == 0: SET FOR THE GRAPH
        pipe.sadd("hash", hashedline)
        pipe.sadd(hashedline, shortname)

    if num == 1:
        # Keep a preview of the file's first line for human reading.
        pipe.sadd(shortname, clean(line[0:80]))


def remove_pure_doppelganger(r_serv, nb):
    """Remove identical pastes.

    :param r_serv: -- Redis connection database
    :param nb: -- (int) Maximum number of files to pop and check

    Up to ``nb`` file names are popped from the redis list "filelist". The
    md5 of each whole file is compared with the hashes already seen during
    this call: if the hash is already known the file is deleted, otherwise
    the hash is remembered. The function stops early when "filelist" is
    empty.

    """
    # A set gives constant-time membership tests, whatever the value of nb.
    seen_hashes = set()

    # range (rather than xrange) and print(...) with a single argument behave
    # the same on Python 2 and are also valid on Python 3.
    for _ in range(nb):
        filename = r_serv.lpop("filelist")

        if filename is None:
            # lpop returns None on an empty list: nothing left to process.
            break

        with open(filename, 'rb') as paste:
            hashline = hashlib.md5(paste.read()).hexdigest()

        print(len(seen_hashes))

        if hashline in seen_hashes:
            # The file is closed at this point, so it can be removed safely.
            os.remove(filename)
            publisher.debug("{0} removed".format(filename))
            print("{0} removed".format(filename))
        else:
            seen_hashes.add(hashline)