# AIL-framework/bin/RegexForTermsFrequency.py

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
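This module tracks term frequency for regular expressions: each incoming
word is matched against the tracked regexes stored in Redis, and the
matching regexes' counters are incremented.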

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
import os
import datetime
import calendar
import re

from Helper import Process

# Config Variables
# Names of the Redis keys holding the blacklisted terms, the tracked terms
# and the tracked regular expressions.
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"

# Max cardinality of the term-frequency sets
top_term_freq_max_set_cardinality = 20

# Number of seconds in one day
oneDay = 24 * 60 * 60

# Each entry is [redis key (prefix), integer]; the integer is presumably the
# length of the period in days -- TODO confirm against the consumers.
top_termFreq_setName_day = [
    "TopTermFreq_set_day_",
    1,
]
top_termFreq_setName_week = [
    "TopTermFreq_set_week",
    7,
]
top_termFreq_setName_month = [
    "TopTermFreq_set_month",
    31,
]
top_termFreq_set_array = [
    top_termFreq_setName_day,
    top_termFreq_setName_week,
    top_termFreq_setName_month,
]


if __name__ == "__main__":
    # Worker loop: read "<filename> <timestamp> <word>" messages from the
    # module queue, match the word against every tracked regex and update
    # the per-regex frequency counters stored in Redis.
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'RegexForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_TermFreq", "host"),
        port=p.config.get("Redis_Level_DB_TermFreq", "port"),
        db=p.config.get("Redis_Level_DB_TermFreq", "db"))

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # Compile the tracked regexes once at start-up. Regexes added to the
    # Redis set afterwards are not picked up until the script is restarted.
    dico_regex = {}
    for regex_str in server_term.smembers(TrackedRegexSet_Name):
        dico_regex[regex_str] = re.compile(regex_str)

    # Regex Frequency
    while True:
        message = p.get_from_set()

        if message is None:
            # Nothing queued: idle for a while before polling again.
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)
            continue

        filename, timestamp, word = message.split()

        # Key names depend only on the message, so build them once instead
        # of once per regex.
        curr_set = top_termFreq_setName_day[0] + str(timestamp)
        per_paste_hash = "per_paste_" + str(timestamp)
        per_paste_set = "per_paste_" + curr_set

        # Try every tracked regex against the word.
        for regex_str, compiled_regex in dico_regex.items():
            # match() anchors at the beginning of the word only.
            matched = compiled_regex.match(word)
            if matched is None:
                continue

            matched_text = matched.group(0)

            # Update the regex tracking data only if the matched text is not
            # blacklisted. SISMEMBER tests membership on the server instead
            # of transferring the whole blacklist for every match.
            if not server_term.sismember(BlackListTermsSet_Name, matched_text):
                # sadd returns 1 when filename was not yet in the set, i.e.
                # this is the first match of this regex in that file.
                new_to_the_set = server_term.sadd('regex_' + regex_str, filename) == 1

                # Count every occurrence of the regex for this timestamp.
                server_term.hincrby(timestamp, regex_str, int(1))

                # Count the regex at most once per paste.
                if new_to_the_set:
                    server_term.hincrby(per_paste_hash, regex_str, int(1))
                    server_term.zincrby(per_paste_set, regex_str, float(1))

            # NOTE(review): this increment runs for every match, including
            # blacklisted ones (it sits outside the blacklist check in the
            # original code) -- confirm this is intended.
            # NOTE(review): argument order is (name, value, amount); redis-py
            # 3.x reportedly expects (name, amount, value) -- confirm against
            # the installed redis-py version.
            server_term.zincrby(curr_set, regex_str, float(1))