AIL-framework/bin/SetForTermsFrequency.py

123 lines
4.5 KiB
Python
Raw Normal View History

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process
# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
def add_quote_inside_tab(tab):
quoted_tab = "["
for elem in tab[1:-1].split(','):
elem = elem.lstrip().strip()
quoted_tab += "\'{}\', ".format(elem)
quoted_tab = quoted_tab[:-2] #remove trailing ,
quoted_tab += "]"
return str(quoted_tab)
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'SetForTermsFrequency'
p = Process(config_section)
# REDIS #
server_term = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_TermFreq", "host"),
port=p.config.get("Redis_Level_DB_TermFreq", "port"),
db=p.config.get("Redis_Level_DB_TermFreq", "db"))
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
#get the dico and matching percent
dico_percent = {}
dico_set_tab = {}
dico_setname_to_redis = {}
for set_str in server_term.smembers(TrackedSetSet_Name):
tab_set = set_str[1:-1]
tab_set = add_quote_inside_tab(tab_set)
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
if perc_finder is not None:
match_percent = perc_finder.group(0)[1:-1]
dico_percent[tab_set] = float(match_percent)
dico_set_tab[tab_set] = ast.literal_eval(tab_set)
dico_setname_to_redis[tab_set] = set_str
else:
continue
message = p.get_from_set()
while True:
if message is not None:
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
content = Paste.Paste(filename).get_p_content()
curr_set = top_termFreq_setName_day[0] + str(timestamp)
#iterate over the words of the file
match_dico = {}
for word in content.split():
for cur_set, array_set in dico_set_tab.items():
for w_set in array_set[:-1]: #avoid the percent matching
if word == w_set:
try:
match_dico[str(array_set)] += 1
except KeyError:
match_dico[str(array_set)] = 1
#compute matching %
for the_set, matchingNum in match_dico.items():
eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching
if eff_percent >= dico_percent[the_set]:
print(the_set, "matched in", filename)
set_name = 'set_' + dico_setname_to_redis[the_set]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
#consider the num of occurence of this set
set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))
# FIXME - avoid using per paste as a set is checked over the entire paste
#1 term per paste
if new_to_the_set:
set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
print "sleeping"
time.sleep(5)
message = p.get_from_set()