mirror of https://github.com/CIRCL/AIL-framework
import sys
import gzip

import redis

import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams

from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from lib_redis_insert import clean, listdirectory
from lib_jobs import *

from pubsublogger import publisher

import calendar as cal
from datetime import date, timedelta
from dateutil.rrule import rrule, DAILY

from packages import *

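# --------------------------------------------------------------------------
# Usage sketch (not part of the original module): one plausible way to build
# the redis connection and pipeline that the helpers below expect. Host,
# port and db values are assumptions; adapt them to your configuration.
def _example_setup():
    r_serv = redis.StrictRedis(host='localhost', port=6379, db=0)
    # A non-transactional pipeline: redis_zincr_words buffers its zincrby
    # calls in it and flushes them in one round trip with pipe.execute().
    pipe = r_serv.pipeline(False)
    return r_serv, pipe
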
def redis_words_ranking(pipe, r_serv, nb, minlength, maxlength):
    """Looping function

    :param pipe: -- Redis pipe.
    :param r_serv: -- Redis connection database
    :param nb: -- (int) Number of pastes processed by the function
    :param minlength: -- (int) passed to the next function
    :param maxlength: -- (int) passed to the next function

    """
    try:
        for n in xrange(0, nb):
            # pop the next paste path from the work queue
            path = r_serv.lpop("filelist")

            if path is not None:
                set_listof_pid(r_serv, path, sys.argv[0])

                redis_zincr_words(pipe, path, minlength, maxlength)

                update_listof_pid(r_serv)

                # keep track of the pastes already processed
                r_serv.lpush("processed", path)

                publisher.debug(path)
            else:
                publisher.debug("Empty list")
                break
    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")

def redis_zincr_words(pipe, filename, minlength, maxlength):
    """Create a new sorted set in redis.

    :param minlength: -- (int) Minimum word length inserted
    :param maxlength: -- (int) Maximum word length inserted
    :param filename: -- The absolute path of the file.gz to process.

    Representation of the set in redis:

    +------------+------------+-----------+
    | Keys       | Members    | Scores    |
    +============+============+===========+
    | 20131001   | word1      | 142       |
    +------------+------------+-----------+
    | ...        | word2      | 120       |
    +------------+------------+-----------+
    | 20131002   | ...        | ...       |
    +------------+------------+-----------+

    This function stores all words between minlength and maxlength in redis.
    Redis also counts how many times each word appears per day:
    the cardinality.

    """
    tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                gaps=True, discard_empty=True)

    with gzip.open(filename, 'rb') as F:
        blob = TextBlob(clean(F.read()), tokenizer=tokenizer)

        for word in blob.tokens:
            if (len(word) >= minlength) and (len(word) <= maxlength):
                # the day (YYYYMMDD) extracted from the paste path is the key
                pipe.zincrby(filename[-22:-12].replace('/', ''), word, 1)

            if (len(word) >= maxlength):
                publisher.info("word bigger than {0} detected at {1}".format(maxlength, filename))
                publisher.info(word)

    pipe.execute()

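# Sketch (assumes the per-day sorted sets named "YYYYMMDD" built above):
# reading the ranking back, e.g. the ten most frequent words of 2013-10-01,
# highest score first, using the standard redis-py zrevrange call.
def _example_top_words(r_serv, day="20131001", topn=10):
    # returns [(word, count), ...]
    return [(w, int(s)) for w, s in
            r_serv.zrevrange(day, 0, topn - 1, withscores=True)]
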
def classify_token_paste(r_serv, listname, choicedatastruct, nb, r_set):
    """Tokenizing on word category

    :param r_serv: -- Redis database connection
    :param listname: -- (str) path to the file containing the list of paths of category files
    :param choicedatastruct: -- (bool) Changing the index of the data structure
    :param nb: -- (int) Number of pastes processed by the function

    The Redis data structure can be chosen as follows:

    +---------------+------------+-----------+
    | Keys          | Members    | Scores    |
    +===============+============+===========+
    | mails_categ   | filename   | 25000     |
    +---------------+------------+-----------+
    | ...           | filename2  | 2400      |
    +---------------+------------+-----------+
    | web_categ     | ...        | ...       |
    +---------------+------------+-----------+

    Or

    +--------------+-------------+-----------+
    | Keys         | Members     | Scores    |
    +==============+=============+===========+
    | filename     | mails_categ | 100000    |
    +--------------+-------------+-----------+
    | ...          | web_categ   | 24050     |
    +--------------+-------------+-----------+
    | filename2    | ...         | ...       |
    +--------------+-------------+-----------+

    This function tokenises on all special characters like: @^\|[{#~}]!:;$^=
    and inserts data in redis when a token matches one of the keywords in a
    previously created list.
    These keyword lists can contain anything you want, but it is better
    to create "categories" of keywords.

    """
    try:
        for n in xrange(0, nb):
            filename = r_serv.lpop(r_set)

            if filename is not None:
                tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
                                            gaps=True, discard_empty=True)
                set_listof_pid(r_serv, filename, sys.argv[0])

                with open(listname, 'rb') as L:
                    # for each "categ" listed in the file
                    for num, fname in enumerate(L):
                        # contains the keywords of one categ
                        tmp_list = []
                        # for each keyword
                        with open(fname[:-1], 'rb') as LS:
                            for num, kword in enumerate(LS):
                                tmp_list.append(kword[:-1])

                        # for each paste
                        with gzip.open(filename, 'rb') as F:
                            blob = TextBlob(clean(F.read()),
                                            tokenizer=tokenizer)

                            # for each paste token
                            for word in blob.tokens.lower():
                                if word in tmp_list:
                                    # choosing between the two data structures
                                    if choicedatastruct:
                                        r_serv.zincrby(filename,
                                                       fname.split('/')[-1][:-1],
                                                       1)
                                    else:
                                        r_serv.zincrby(fname.split('/')[-1][:-1],
                                                       filename,
                                                       1)

                update_listof_pid(r_serv)
            else:
                publisher.debug("Empty list")
                #r_serv.save()
                break
    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")

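# Sketch: invoking the classifier above. "categ_files.txt" is a hypothetical
# file holding one path per line, each pointing to a keyword list such as
# "mails_categ". With choicedatastruct=True the paste filename becomes the
# key (second table above); with False the category does (first table).
def _example_classify(r_serv):
    classify_token_paste(r_serv, 'categ_files.txt', True, 100, 'filelist')
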
def dectect_longlines(r_serv, r_key, store=False, maxlength=500):
    """Store the line numbers of long lines in redis

    :param r_serv: -- The redis connection database
    :param r_key: -- (str) The key name in redis
    :param store: -- (bool) Store the line numbers or not.
    :param maxlength: -- The limit between "short lines" and "long lines"

    This function consumes a redis list of filenames (paste filenames),
    opens each paste and checks whether it contains lines whose length
    is >= maxlength.
    If so, the paste is "tagged" as containing long lines in another
    redis structure, and the line numbers (of the long lines) can be
    stored in addition when the store argument is True.

    """
    try:
        while True:
            #r_key_list (categ)
            filename = r_serv.lpop(r_key)

            if filename is not None:
                set_listof_pid(r_serv, filename, sys.argv[0])

                # for each paste
                with gzip.open(filename, 'rb') as F:
                    var = True
                    for num, line in enumerate(F):
                        if len(line) >= maxlength:
                            #publisher.debug("Longline:{0}".format(line))
                            # tag the paste only once
                            if var:
                                r_serv.rpush("longlines", filename)
                                var = False

                            if store:
                                r_serv.sadd(filename, num)
                            else:
                                publisher.debug("Line numbers of long lines not stored")

                update_listof_pid(r_serv)
            else:
                publisher.debug("Empty list")
                return False
    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")

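# Sketch: tagging every paste queued under "filelist" that contains at
# least one line of 500+ characters, storing the offending line numbers as
# well (store=True). Tagged filenames accumulate in the "longlines" list.
def _example_detect_longlines(r_serv):
    dectect_longlines(r_serv, "filelist", store=True, maxlength=500)
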
# NOT USED RIGHT NOW #
def recovering_longlines(r_serv, nb):
    """Get long lines with their line numbers

    :param r_serv: -- The redis connection database
    :param nb: -- (int) Number of pastes processed by the function

    """
    try:
        for n in xrange(0, nb):
            filename = r_serv.lpop("longlines")

            if filename is not None:
                # for each value in redis (the long lines' line numbers)
                for numline in r_serv.smembers(filename):

                    with gzip.open(filename, 'rb') as F:

                        for num, line in enumerate(F):
                            # when corresponding
                            if int(num) == int(numline):
                                pass
                                # TREATMENT
            else:
                publisher.debug("Empty list")
                r_serv.save()
                break

    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")

def remove_longline_from_categ(r_serv, r_key, delete, store, maxlength):
    """Remove files with long lines from a set.

    :param r_serv: -- The redis connection database
    :param r_key: -- (str) The key name in redis
    :param delete: -- (bool) If true, delete the used key from redis.
    :param store: -- (bool) Store the line numbers or not.
    :param maxlength: -- The limit between "short lines" and "long lines"

    """
    publisher.info("Number of files before:{0}".format(r_serv.zcard(r_key)))

    # create a list of files to process (1)
    for filename in r_serv.zrange(r_key, 0, -1):
        r_serv.rpush(r_key+"_list", filename)

    # detecting long lines in pastes
    dectect_longlines(r_serv, r_key+"_list", store, maxlength)

    # remove false positive members
    while True:
        fp_filename = r_serv.lpop("longlines")

        if fp_filename is None:
            break
        else:
            # if wanted, also delete the set with line numbers (created with store)
            if delete:
                r_serv.zrem(r_key, fp_filename)
                r_serv.delete(fp_filename)
            else:
                # remove the file with long lines from the r_key zset
                r_serv.zrem(r_key, fp_filename)

    publisher.info("Longline file removed from {0}, {1} Files remaining".format(r_key, r_serv.zcard(r_key)))

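# Sketch: pruning a category zset ("mails_categ" is an assumed key name) of
# pastes with long lines, deleting their stored line-number sets as well
# (delete=True, store=True).
def _example_prune_categ(r_serv):
    remove_longline_from_categ(r_serv, "mails_categ", True, True, 500)
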
def detect_longline_from_list(r_serv, nb):
    try:
        for n in xrange(0, nb):
            if not dectect_longlines(r_serv, "filelist", True):
                break

    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")

def create_dirfile(r_serv, directory, overwrite):
    """Create a list of file paths.

    :param r_serv: -- connection to the redis database
    :param directory: -- The folder in which to list the .gz files

    This function creates a list in redis containing the absolute paths
    of all the pastes that need to be processed by functions working in
    parallel (like redis_words_ranking).

    """
    if overwrite:
        r_serv.delete("filelist")

        for x in listdirectory(directory):
            r_serv.rpush("filelist", x)

        publisher.info("The list was overwritten")

    else:
        if r_serv.llen("filelist") == 0:
            for x in listdirectory(directory):
                r_serv.rpush("filelist", x)

            publisher.info("New list created")
        else:
            for x in listdirectory(directory):
                r_serv.rpush("filelist", x)

            publisher.info("The list was updated with new elements")

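# Sketch: queueing every .gz paste found under a directory (the path is an
# assumption) into the "filelist" list consumed by the workers above.
def _example_queue_pastes(r_serv):
    create_dirfile(r_serv, '/opt/AIL/PASTES', True)
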
def redis_interbargraph_set(r_serv, year, month, overwrite):
    """Create a Redis sorted set.

    :param r_serv: -- connection to the redis database
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param overwrite: -- (bool) trigger the overwrite mode

    This function creates inside redis the intersection of all days in
    a month, two by two.
    Example:
    For a month of 31 days it will create 30 sorted sets between each
    day and day+1 until the last day.
    The overwrite mode deletes the intersection sets and re-creates them.

    """
    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])

    if overwrite:
        r_serv.delete("InterSet")

        for dt in rrule(DAILY, dtstart=a, until=b - timedelta(1)):
            dayafter = dt + timedelta(1)

            r_serv.delete(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")))

            r_serv.zinterstore(
                str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")),
                {str(dt.strftime("%Y%m%d")): 1,
                 str(dayafter.strftime("%Y%m%d")): -1})

            r_serv.zadd(
                "InterSet",
                1,
                str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")))
    else:
        for dt in rrule(DAILY, dtstart=a, until=b - timedelta(1)):
            dayafter = dt + timedelta(1)

            if r_serv.zcard(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))) == 0:

                r_serv.zinterstore(
                    str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")),
                    {str(dt.strftime("%Y%m%d")): 1,
                     str(dayafter.strftime("%Y%m%d")): -1})

                r_serv.zadd(
                    "InterSet",
                    1,
                    str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d")))

                publisher.info(str(dt.strftime("%Y%m%d"))+str(dayafter.strftime("%Y%m%d"))+" Intersection Created")

            else:
                publisher.warning("Data already exist, operation aborted.")

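# Sketch: building the day-to-day intersections for October 2013, then
# reading the cardinality of the first pair (20131001/20131002), i.e. how
# many words the two days share under the weights used above.
def _example_intersections(r_serv):
    redis_interbargraph_set(r_serv, 2013, 10, True)
    return r_serv.zcard("2013100120131002")
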
def word_bar_graph(r_serv, year, month, filename):
    """Create a histogram.

    :param r_serv: -- connection to the redis database
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param filename: -- The absolute path where to save the figure.png

    This function uses matplotlib to create a histogram.
    The redis database obviously needs to be populated first,
    with the functions redis_words_ranking and redis_interbargraph_set.

    """
    lw = []
    adate = []
    inter = [0]
    rcParams['figure.figsize'] = 15, 10

    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])

    # daily word cardinalities and day labels
    for dt in rrule(DAILY, dtstart=a, until=b):
        lw.append(r_serv.zcard(dt.strftime("%Y%m%d")))
        adate.append(dt.strftime("%d"))

    # cardinality of each day-to-day intersection
    for x in r_serv.zrange("InterSet", 0, 31):
        inter.append(r_serv.zcard(x))

    n_groups = len(lw)
    card_words = tuple(lw)
    card_interword = tuple(inter)

    index = np.arange(n_groups)
    bar_width = 0.5
    opacity = 0.6

    words = plt.bar(index, card_words, bar_width,
                    alpha=opacity,
                    color='g',
                    label='Words/day')

    lwords = plt.bar(index - 0.5, card_interword, bar_width,
                     alpha=opacity,
                     color='r',
                     label='Intersection')

    plt.plot(tuple(inter), 'b--')
    plt.xlabel(str(year)+'/'+str(month)+' Days')
    plt.ylabel('Words')
    plt.title('Words Cardinality & Intersection Histogram')
    plt.xticks(index + bar_width/2, tuple(adate))

    plt.legend()
    plt.grid()

    plt.tight_layout()

    plt.savefig(filename+".png", dpi=None, facecolor='w', edgecolor='b',
                orientation='portrait', papertype=None, format="png",
                transparent=False, bbox_inches=None, pad_inches=0.1,
                frameon=True)

    publisher.info(filename+".png"+" saved!")

def create_data_words_curve(r_serv, r_serv2, year, month, filename):
    """Create Redis hashes.

    :param r_serv: -- connection to the redis database (read)
    :param r_serv2: -- connection to the redis database (write)
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param filename: -- the path to the file which contains a list of words.

    The redis hash is created as follows:

    +------------+------------+-----------+
    | Keys       | Field      | Values    |
    +============+============+===========+
    | word1      | 20131001   | 150       |
    +------------+------------+-----------+
    | ...        | 20131002   | 145       |
    +------------+------------+-----------+
    | word2      | ...        | ...       |
    +------------+------------+-----------+

    The file needs to be a list of words separated by carriage returns,
    with an empty line at the end.
    This function creates the data used by the function
    create_curve_with_word_file, which creates a csv file.

    """
    stop = stopwords.words('english')  # currently unused
    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])

    with open(filename, 'rb') as F:
        for line in F:
            for dt in rrule(DAILY, dtstart=a, until=b):
                if r_serv.zscore(dt.strftime("%Y%m%d"), line[:-1]) is not None:
                    # optionally, check whether it already exists and add a WARNING log
                    r_serv2.hmset(line[:-1], {str(dt.strftime("%Y%m%d")): r_serv.zscore(dt.strftime("%Y%m%d"), line[:-1])})
                else:
                    pass

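# Sketch: feeding the per-word hashes from the daily rankings. Two databases
# are involved; the db numbers and the "wordfile" path are assumptions (one
# word per line, trailing empty line, as the docstring above requires).
def _example_build_word_hashes():
    r_serv = redis.StrictRedis(host='localhost', port=6379, db=0)   # read
    r_serv2 = redis.StrictRedis(host='localhost', port=6379, db=1)  # write
    create_data_words_curve(r_serv, r_serv2, 2013, 10, 'wordfile')
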
def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
    """Create a csv file used with dygraph.

    :param r_serv: -- connection to the redis database
    :param csvfilename: -- the path of the .csv file to create
    :param feederfilename: -- the path to the file which contains a list of words.
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    This function creates a .csv file using the data in redis.
    It checks whether the words contained in feederfilename have
    their respective values for each day. If a value is missing
    (word not present during a day) it automatically inserts a 0
    to keep the timeline of the curve correct.

    """
    a = date(year, month, 1)
    b = date(year, month, cal.monthrange(year, month)[1])
    days = {}
    words = []

    with open(feederfilename, 'rb') as F:
        for word in F:  # words of the file
            words.append(word[:-1])  # list of words (sorted as in the file)

    for dt in rrule(DAILY, dtstart=a, until=b):  # for each day
        mot = []
        mot1 = []
        mot2 = []

        days[dt.strftime("%Y%m%d")] = ''
        for word in sorted(words):  # from the 1st word of the list to the last
            if r_serv.hexists(word, dt.strftime("%Y%m%d")):  # if the word has a value for that day
                mot1.append(str(word))
                mot2.append(r_serv.hget(word, dt.strftime("%Y%m%d")))

                mot = zip(mot1, mot2)

                days[dt.strftime("%Y%m%d")] = mot
            else:
                mot1.append(str(word))
                mot2.append(0)

                mot = zip(mot1, mot2)

                days[dt.strftime("%Y%m%d")] = mot

    with open(csvfilename+".csv", 'wb') as F:
        F.write("Date," + ",".join(sorted(words)) + '\n')

        # iterate in date order so the csv timeline stays sorted
        for x, s in sorted(days.items()):
            val = []
            for y in s:
                val.append(y[1])

            F.write(x + ',' + str(val) + '\n')

    # strip the python list syntax ([ ] and quotes) left by str(val)
    with open(csvfilename+".csv", 'rb') as F:
        h = F.read()
        h = h.replace("[", "")
        h = h.replace("]", "")
        h = h.replace('\'', "")

    with open(csvfilename+".csv", 'wb') as F:
        F.write(h)

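# Sketch: the last step of the chain, under the same assumptions as the
# example above (word hashes already populated in db 1). Writes "curve.csv",
# ready to be plotted with dygraph.
def _example_month_report():
    r_serv2 = redis.StrictRedis(host='localhost', port=6379, db=1)
    create_curve_with_word_file(r_serv2, 'curve', 'wordfile', 2013, 10)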