Removed previously copied files

pull/183/head
Philipp Schmied 2018-02-27 16:01:07 +01:00
parent 5b1f0b0212
commit 02d587134f
2 changed files with 0 additions and 257 deletions

View File

@ -1,123 +0,0 @@
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and test the regexs
supplied in the term webpage.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
from os import environ
import datetime
import calendar
import re
from Helper import Process
# Email notifications
from NotificationHelper import *
# Config Variables
DICO_REFRESH_TIME = 60 #s
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
def refresh_dicos():
dico_regex = {}
dico_regexname_to_redis = {}
for regex_str in server_term.smembers(TrackedRegexSet_Name):
dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1])
dico_regexname_to_redis[regex_str[1:-1]] = regex_str
return dico_regex, dico_regexname_to_redis
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'RegexForTermsFrequency'
p = Process(config_section)
# REDIS #
server_term = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_TermFreq", "host"),
port=p.config.get("Redis_Level_DB_TermFreq", "port"),
db=p.config.get("Redis_Level_DB_TermFreq", "db"))
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
#compile the regex
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
message = p.get_from_set()
# Regex Frequency
while True:
if message is not None:
if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
print('dico got refreshed')
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
curr_set = top_termFreq_setName_day[0] + str(timestamp)
content = Paste.Paste(filename).get_p_content()
#iterate the word with the regex
for regex_str, compiled_regex in dico_regex.items():
matched = compiled_regex.search(content)
if matched is not None: #there is a match
print('regex matched {}'.format(regex_str))
matched = matched.group(0)
# Add in Regex track set only if term is not in the blacklist
if matched not in server_term.smembers(BlackListTermsSet_Name):
# Send a notification only when the member is in the set
if matched in server_term.smembers(TrackedTermsNotificationEnabled_Name):
# Send to every associated email adress
for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + matched):
sendEmailNotification(email, matched)
set_name = 'regex_' + dico_regexname_to_redis[regex_str]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
#consider the num of occurence of this term
regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
#1 term per paste
if new_to_the_set:
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
else:
pass
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
print "sleeping"
time.sleep(5)
message = p.get_from_set()

View File

@ -1,134 +0,0 @@
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and test the sets
supplied in the term webpage.
"""
import redis
import time
from pubsublogger import publisher
from packages import lib_words
from packages import Paste
import os
import datetime
import calendar
import re
import ast
from Helper import Process
# Email notifications
from NotificationHelper import *
# Config Variables
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
def add_quote_inside_tab(tab):
quoted_tab = "["
for elem in tab[1:-1].split(','):
elem = elem.lstrip().strip()
quoted_tab += "\'{}\', ".format(elem)
quoted_tab = quoted_tab[:-2] #remove trailing ,
quoted_tab += "]"
return str(quoted_tab)
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'SetForTermsFrequency'
p = Process(config_section)
# REDIS #
server_term = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_TermFreq", "host"),
port=p.config.get("Redis_Level_DB_TermFreq", "port"),
db=p.config.get("Redis_Level_DB_TermFreq", "db"))
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
#get the dico and matching percent
dico_percent = {}
dico_set_tab = {}
dico_setname_to_redis = {}
for set_str in server_term.smembers(TrackedSetSet_Name):
tab_set = set_str[1:-1]
tab_set = add_quote_inside_tab(tab_set)
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
if perc_finder is not None:
match_percent = perc_finder.group(0)[1:-1]
dico_percent[tab_set] = float(match_percent)
dico_set_tab[tab_set] = ast.literal_eval(tab_set)
dico_setname_to_redis[tab_set] = set_str
else:
continue
message = p.get_from_set()
while True:
if message is not None:
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
content = Paste.Paste(filename).get_p_content()
curr_set = top_termFreq_setName_day[0] + str(timestamp)
#iterate over the words of the file
match_dico = {}
for word in content.split():
for cur_set, array_set in dico_set_tab.items():
for w_set in array_set[:-1]: #avoid the percent matching
if word == w_set:
try:
match_dico[str(array_set)] += 1
except KeyError:
match_dico[str(array_set)] = 1
#compute matching %
for the_set, matchingNum in match_dico.items():
eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching
if eff_percent >= dico_percent[the_set]:
# Send a notification only when the member is in the set
if the_set in server_term.smembers(TrackedTermsNotificationEnabled_Name):
# Send to every associated email adress
for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + the_set):
sendEmailNotification(email, the_set)
print(the_set, "matched in", filename)
set_name = 'set_' + dico_setname_to_redis[the_set]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
#consider the num of occurence of this set
set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))
# FIXME - avoid using per paste as a set is checked over the entire paste
#1 term per paste
if new_to_the_set:
set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
print "sleeping"
time.sleep(5)
message = p.get_from_set()