From 02d587134f5c1cccf48ab02730cb0434c8503790 Mon Sep 17 00:00:00 2001 From: Philipp Schmied Date: Tue, 27 Feb 2018 16:01:07 +0100 Subject: [PATCH] Removed previously copied files --- RegexForTermsFrequency.py | 123 ---------------------------------- SetForTermsFrequency.py | 134 -------------------------------------- 2 files changed, 257 deletions(-) delete mode 100755 RegexForTermsFrequency.py delete mode 100755 SetForTermsFrequency.py diff --git a/RegexForTermsFrequency.py b/RegexForTermsFrequency.py deleted file mode 100755 index b5570ea9..00000000 --- a/RegexForTermsFrequency.py +++ /dev/null @@ -1,123 +0,0 @@ -#!/usr/bin/env python2 -# -*-coding:UTF-8 -* -""" -This Module is used for term frequency. -It processes every paste coming from the global module and test the regexs -supplied in the term webpage. - -""" -import redis -import time -from pubsublogger import publisher -from packages import lib_words -from packages import Paste -import os -from os import environ -import datetime -import calendar -import re -from Helper import Process - -# Email notifications -from NotificationHelper import * - -# Config Variables -DICO_REFRESH_TIME = 60 #s - -BlackListTermsSet_Name = "BlackListSetTermSet" -TrackedTermsSet_Name = "TrackedSetTermSet" -TrackedRegexSet_Name = "TrackedRegexSet" - -top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set -oneDay = 60*60*24 -top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] -top_termFreq_setName_week = ["TopTermFreq_set_week", 7] -top_termFreq_setName_month = ["TopTermFreq_set_month", 31] -top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] - - -def refresh_dicos(): - dico_regex = {} - dico_regexname_to_redis = {} - for regex_str in server_term.smembers(TrackedRegexSet_Name): - dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1]) - dico_regexname_to_redis[regex_str[1:-1]] = regex_str - - return dico_regex, dico_regexname_to_redis - -if __name__ == "__main__": - publisher.port = 6380 - publisher.channel = "Script" - - config_section = 'RegexForTermsFrequency' - p = Process(config_section) - - # REDIS # - server_term = redis.StrictRedis( - host=p.config.get("Redis_Level_DB_TermFreq", "host"), - port=p.config.get("Redis_Level_DB_TermFreq", "port"), - db=p.config.get("Redis_Level_DB_TermFreq", "db")) - - # FUNCTIONS # - publisher.info("RegexForTermsFrequency script started") - - #compile the regex - dico_refresh_cooldown = time.time() - dico_regex, dico_regexname_to_redis = refresh_dicos() - - message = p.get_from_set() - - # Regex Frequency - while True: - - if message is not None: - if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME: - dico_refresh_cooldown = time.time() - dico_regex, dico_regexname_to_redis = refresh_dicos() - print('dico got refreshed') - - filename = message - temp = filename.split('/') - timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) - - curr_set = top_termFreq_setName_day[0] + str(timestamp) - content = Paste.Paste(filename).get_p_content() - - #iterate the word with the regex - for regex_str, compiled_regex in dico_regex.items(): - matched = compiled_regex.search(content) - - if matched is not None: #there is a match - print('regex matched {}'.format(regex_str)) - matched = matched.group(0) - - # Add in Regex track set only if term is not in the blacklist - if matched not in server_term.smembers(BlackListTermsSet_Name): - - # Send a notification only when the member is in the set - if matched in server_term.smembers(TrackedTermsNotificationEnabled_Name): - - # Send to every associated email adress - for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + matched): - sendEmailNotification(email, matched) - - - set_name = 'regex_' + dico_regexname_to_redis[regex_str] - new_to_the_set = server_term.sadd(set_name, filename) - new_to_the_set = True if new_to_the_set == 1 else False - - #consider the num of occurence of this term - regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1))) - #1 term per paste - if new_to_the_set: - regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1))) - server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1)) - server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1)) - else: - pass - - else: - publisher.debug("Script RegexForTermsFrequency is Idling") - print "sleeping" - time.sleep(5) - message = p.get_from_set() diff --git a/SetForTermsFrequency.py b/SetForTermsFrequency.py deleted file mode 100755 index d6e9bef9..00000000 --- a/SetForTermsFrequency.py +++ /dev/null @@ -1,134 +0,0 @@ -#!/usr/bin/env python2 -# -*-coding:UTF-8 -* -""" -This Module is used for term frequency. -It processes every paste coming from the global module and test the sets -supplied in the term webpage. - -""" -import redis -import time -from pubsublogger import publisher -from packages import lib_words -from packages import Paste -import os -import datetime -import calendar -import re -import ast -from Helper import Process - -# Email notifications -from NotificationHelper import * - -# Config Variables -BlackListTermsSet_Name = "BlackListSetTermSet" -TrackedTermsSet_Name = "TrackedSetTermSet" -TrackedRegexSet_Name = "TrackedRegexSet" -TrackedSetSet_Name = "TrackedSetSet" - -top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set -oneDay = 60*60*24 -top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] -top_termFreq_setName_week = ["TopTermFreq_set_week", 7] -top_termFreq_setName_month = ["TopTermFreq_set_month", 31] -top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] - -def add_quote_inside_tab(tab): - quoted_tab = "[" - for elem in tab[1:-1].split(','): - elem = elem.lstrip().strip() - quoted_tab += "\'{}\', ".format(elem) - quoted_tab = quoted_tab[:-2] #remove trailing , - quoted_tab += "]" - return str(quoted_tab) - -if __name__ == "__main__": - publisher.port = 6380 - publisher.channel = "Script" - - config_section = 'SetForTermsFrequency' - p = Process(config_section) - - # REDIS # - server_term = redis.StrictRedis( - host=p.config.get("Redis_Level_DB_TermFreq", "host"), - port=p.config.get("Redis_Level_DB_TermFreq", "port"), - db=p.config.get("Redis_Level_DB_TermFreq", "db")) - - # FUNCTIONS # - publisher.info("RegexForTermsFrequency script started") - - #get the dico and matching percent - dico_percent = {} - dico_set_tab = {} - dico_setname_to_redis = {} - for set_str in server_term.smembers(TrackedSetSet_Name): - tab_set = set_str[1:-1] - tab_set = add_quote_inside_tab(tab_set) - perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set) - if perc_finder is not None: - match_percent = perc_finder.group(0)[1:-1] - dico_percent[tab_set] = float(match_percent) - dico_set_tab[tab_set] = ast.literal_eval(tab_set) - dico_setname_to_redis[tab_set] = set_str - else: - continue - - - message = p.get_from_set() - - while True: - - if message is not None: - filename = message - temp = filename.split('/') - timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) - content = Paste.Paste(filename).get_p_content() - - curr_set = top_termFreq_setName_day[0] + str(timestamp) - - #iterate over the words of the file - match_dico = {} - for word in content.split(): - for cur_set, array_set in dico_set_tab.items(): - for w_set in array_set[:-1]: #avoid the percent matching - if word == w_set: - try: - match_dico[str(array_set)] += 1 - except KeyError: - match_dico[str(array_set)] = 1 - - #compute matching % - for the_set, matchingNum in match_dico.items(): - eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching - if eff_percent >= dico_percent[the_set]: - - # Send a notification only when the member is in the set - if the_set in server_term.smembers(TrackedTermsNotificationEnabled_Name): - - # Send to every associated email adress - for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + the_set): - sendEmailNotification(email, the_set) - - print(the_set, "matched in", filename) - set_name = 'set_' + dico_setname_to_redis[the_set] - new_to_the_set = server_term.sadd(set_name, filename) - new_to_the_set = True if new_to_the_set == 1 else False - - #consider the num of occurence of this set - set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1))) - - # FIXME - avoid using per paste as a set is checked over the entire paste - #1 term per paste - if new_to_the_set: - set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1))) - server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1)) - server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1)) - - - else: - publisher.debug("Script RegexForTermsFrequency is Idling") - print "sleeping" - time.sleep(5) - message = p.get_from_set()