AIL-framework/bin/SetForTermsFrequency.py

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and test the sets
supplied in  the term webpage.

"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast

from Helper import Process

# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]

def add_quote_inside_tab(tab):
    quoted_tab = "["
    for elem in tab[1:-1].split(','):
        elem = elem.lstrip().strip()
        quoted_tab += "\'{}\', ".format(elem)
    quoted_tab = quoted_tab[:-2] #remove trailing ,
    quoted_tab += "]"
    return str(quoted_tab)

if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'SetForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_TermFreq", "host"),
        port=p.config.get("Redis_Level_DB_TermFreq", "port"),
        db=p.config.get("Redis_Level_DB_TermFreq", "db"))

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    #get the dico and matching percent
    dico_percent = {}
    dico_set_tab = {}
    dico_setname_to_redis = {}
    for set_str in server_term.smembers(TrackedSetSet_Name):
        tab_set = set_str[1:-1]
        tab_set = add_quote_inside_tab(tab_set)
        perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
        if perc_finder is not None:
            match_percent = perc_finder.group(0)[1:-1]
            dico_percent[tab_set] = float(match_percent)
            dico_set_tab[tab_set] = ast.literal_eval(tab_set)
            dico_setname_to_redis[tab_set] = set_str
        else:
            continue


    message = p.get_from_set()

    while True:

        if message is not None:
            filename = message
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            content = Paste.Paste(filename).get_p_content()

            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            #iterate over the words of the file
            match_dico = {}
            for word in content.split():
                for cur_set, array_set in dico_set_tab.items():
                    for w_set in array_set[:-1]: #avoid the percent matching
                        if word == w_set:
                            try:
                                match_dico[str(array_set)] += 1
                            except KeyError:
                                match_dico[str(array_set)] = 1

            #compute matching %
            for the_set, matchingNum in match_dico.items():
                eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching
                if eff_percent >= dico_percent[the_set]:
                    print(the_set, "matched in", filename)
                    set_name = 'set_' + dico_setname_to_redis[the_set]
                    new_to_the_set = server_term.sadd(set_name, filename)
                    new_to_the_set = True if new_to_the_set == 1 else False
                    

                    #consider the num of occurence of this set
                    set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))

                    # FIXME - avoid using per paste as a set is checked over the entire paste
                    #1 term per paste
                    if new_to_the_set:
                        set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
                        server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
                server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))


        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print "sleeping"
            time.sleep(5)
        message = p.get_from_set()
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00			`#!/usr/bin/env python2`
			`# --coding:UTF-8 -`
			`"""`
			`This Module is used for term frequency.`
Improved description of modules inside the scripts 2017-05-09 11:13:16 +02:00			`It processes every paste coming from the global module and test the sets`
			`supplied in the term webpage.`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00
			`"""`
			`import redis`
			`import time`
			`from pubsublogger import publisher`
			`from packages import lib_words`
			`from packages import Paste`
			`import os`
			`import datetime`
			`import calendar`
			`import re`
			`import ast`

			`from Helper import Process`

			`# Config Variables`
			`BlackListTermsSet_Name = "BlackListSetTermSet"`
			`TrackedTermsSet_Name = "TrackedSetTermSet"`
			`TrackedRegexSet_Name = "TrackedRegexSet"`
			`TrackedSetSet_Name = "TrackedSetSet"`
			`top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set`
			`oneDay = 606024`
			`top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]`
			`top_termFreq_setName_week = ["TopTermFreq_set_week", 7]`
			`top_termFreq_setName_month = ["TopTermFreq_set_month", 31]`
			`top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]`

			`def add_quote_inside_tab(tab):`
			`quoted_tab = "["`
			`for elem in tab[1:-1].split(','):`
			`elem = elem.lstrip().strip()`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`quoted_tab += "\'{}\', ".format(elem)`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00			`quoted_tab = quoted_tab[:-2] #remove trailing ,`
			`quoted_tab += "]"`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`return str(quoted_tab)`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00
			`if __name__ == "__main__":`
			`publisher.port = 6380`
			`publisher.channel = "Script"`

			`config_section = 'SetForTermsFrequency'`
			`p = Process(config_section)`

			`# REDIS #`
			`server_term = redis.StrictRedis(`
			`host=p.config.get("Redis_Level_DB_TermFreq", "host"),`
			`port=p.config.get("Redis_Level_DB_TermFreq", "port"),`
			`db=p.config.get("Redis_Level_DB_TermFreq", "db"))`

			`# FUNCTIONS #`
			`publisher.info("RegexForTermsFrequency script started")`

			`#get the dico and matching percent`
			`dico_percent = {}`
			`dico_set_tab = {}`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`dico_setname_to_redis = {}`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00			`for set_str in server_term.smembers(TrackedSetSet_Name):`
			`tab_set = set_str[1:-1]`
			`tab_set = add_quote_inside_tab(tab_set)`
			`perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)`
			`if perc_finder is not None:`
			`match_percent = perc_finder.group(0)[1:-1]`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`dico_percent[tab_set] = float(match_percent)`
			`dico_set_tab[tab_set] = ast.literal_eval(tab_set)`
			`dico_setname_to_redis[tab_set] = set_str`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00			`else:`
			`continue`


			`message = p.get_from_set()`

			`while True:`

			`if message is not None:`
			`filename = message`
			`temp = filename.split('/')`
			`timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))`
			`content = Paste.Paste(filename).get_p_content()`

			`curr_set = top_termFreq_setName_day[0] + str(timestamp)`

			`#iterate over the words of the file`
			`match_dico = {}`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`for word in content.split():`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00			`for cur_set, array_set in dico_set_tab.items():`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`for w_set in array_set[:-1]: #avoid the percent matching`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00			`if word == w_set:`
			`try:`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`match_dico[str(array_set)] += 1`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00			`except KeyError:`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`match_dico[str(array_set)] = 1`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00
			`#compute matching %`
			`for the_set, matchingNum in match_dico.items():`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching`
			`if eff_percent >= dico_percent[the_set]:`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00			`print(the_set, "matched in", filename)`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`set_name = 'set_' + dico_setname_to_redis[the_set]`
			`new_to_the_set = server_term.sadd(set_name, filename)`
			`new_to_the_set = True if new_to_the_set == 1 else False`

New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00
			`#consider the num of occurence of this set`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00
			`# FIXME - avoid using per paste as a set is checked over the entire paste`
			`#1 term per paste`
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...) 2017-04-18 15:28:21 +02:00			`if new_to_the_set:`
			`set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))`
			`server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))`
			`server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))`
New feature in Terms: regex and set of words support (draft) 2017-03-28 17:42:44 +02:00

			`else:`
			`publisher.debug("Script RegexForTermsFrequency is Idling")`
			`print "sleeping"`
			`time.sleep(5)`
			`message = p.get_from_set()`