Removed previously copied files

2018-02-27 16:01:07 +01:00 · 2018-02-27 16:01:07 +01:00 · 02d587134f
parent 5b1f0b0212
commit 02d587134f
2 changed files with 0 additions and 257 deletions
--- a/RegexForTermsFrequency.py
+++ b/RegexForTermsFrequency.py
@ -1,123 +0,0 @@
-#!/usr/bin/env python2
-# -*-coding:UTF-8 -*
-"""
-This Module is used for term frequency.
-It processes every paste coming from the global module and tests the regexes
-supplied in the term webpage.
-
-"""
-import redis
-import time
-from pubsublogger import publisher
-from packages import lib_words
-from packages import Paste
-import os
-from os import environ
-import datetime
-import calendar
-import re
-from Helper import Process
-
-# Email notifications
-from NotificationHelper import *
-
-# Config Variables
-DICO_REFRESH_TIME = 60 #s, minimum age of the compiled regexes before the main loop reloads them
-
-BlackListTermsSet_Name = "BlackListSetTermSet"  # Redis set of matched strings that must not be tracked
-TrackedTermsSet_Name = "TrackedSetTermSet"  # not referenced anywhere else in this script
-TrackedRegexSet_Name = "TrackedRegexSet"  # Redis set of the tracked regexes, read by refresh_dicos()
-
-top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequencies set (not referenced anywhere else in this script)
-oneDay = 60*60*24  # 86400, the number of seconds in a day (not referenced anywhere else in this script)
-top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]  # [key prefix, presumably the period length in days - TODO confirm]; the main loop appends the day timestamp to the prefix
-top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
-top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
-top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
-
-
-def refresh_dicos():  # Rebuild both regex lookup dicts from the TrackedRegexSet_Name Redis set; takes no argument and reads the module-level server_term
-    dico_regex = {}  # pattern text (first and last character removed) -> compiled pattern
-    dico_regexname_to_redis = {}  # same stripped pattern text -> the member exactly as stored in Redis
-    for regex_str in server_term.smembers(TrackedRegexSet_Name):
-        dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1])  # [1:-1] drops one character at each end (presumably delimiters around the pattern - TODO confirm); an invalid pattern makes re.compile raise
-        dico_regexname_to_redis[regex_str[1:-1]] = regex_str
-
-    return dico_regex, dico_regexname_to_redis  # always a 2-tuple, in this order
-
-if __name__ == "__main__":  # script entry point: poll for paste paths and record tracked-regex matches in Redis
-    publisher.port = 6380
-    publisher.channel = "Script"
-
-    config_section = 'RegexForTermsFrequency'
-    p = Process(config_section)
-
-    # REDIS: connection used for every set / hash / sorted-set call below
-    server_term = redis.StrictRedis(
-        host=p.config.get("Redis_Level_DB_TermFreq", "host"),
-        port=p.config.get("Redis_Level_DB_TermFreq", "port"),
-        db=p.config.get("Redis_Level_DB_TermFreq", "db"))
-
-    # FUNCTIONS #
-    publisher.info("RegexForTermsFrequency script started")
-
-    # Compile the tracked regexes once at start-up and remember when that was done
-    dico_refresh_cooldown = time.time()
-    dico_regex, dico_regexname_to_redis = refresh_dicos()
-
-    message = p.get_from_set()
-
-    # Regex Frequency: endless polling loop, one message handled per iteration
-    while True:
-
-        if message is not None:
-            if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:  # reload only when a message arrives and the last load is older than DICO_REFRESH_TIME seconds
-                dico_refresh_cooldown = time.time()
-                dico_regex, dico_regexname_to_redis = refresh_dicos()
-                print('dico got refreshed')
-
-            filename = message  # the message itself is used as the paste path
-            temp = filename.split('/')
-            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))  # epoch seconds at 00:00:00 UTC of the date read from the path; assumes the path ends in <year>/<month>/<day>/<name> - TODO confirm
-
-            curr_set = top_termFreq_setName_day[0] + str(timestamp)  # per-day key: "TopTermFreq_set_day_" followed by the timestamp
-            content = Paste.Paste(filename).get_p_content()
-
-            # Test every tracked regex against the paste content; search() yields the first match only
-            for regex_str, compiled_regex in dico_regex.items():
-                matched = compiled_regex.search(content)
-
-                if matched is not None: #there is a match
-                    print('regex matched {}'.format(regex_str))
-                    matched = matched.group(0)  # from here on, matched is the matched text, not the match object
-                    
-                    # Add in Regex track set only if term is not in the blacklist
-                    if matched not in server_term.smembers(BlackListTermsSet_Name):
-                        
-                        # Send a notification only when the member is in the set
-                        if matched in server_term.smembers(TrackedTermsNotificationEnabled_Name):  # name is not defined in this file; presumably provided by the NotificationHelper star import - TODO confirm
-                            
-                            # Send to every associated email address
-                            for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + matched):
-                                sendEmailNotification(email, matched)
-
-                        
-                        set_name = 'regex_' + dico_regexname_to_redis[regex_str]
-                        new_to_the_set = server_term.sadd(set_name, filename)
-                        new_to_the_set = True if new_to_the_set == 1 else False  # True when sadd reported one added member, i.e. this paste was not yet stored for this regex
-
-                        # Count this occurrence in the hash keyed by the day timestamp (the returned value is not used afterwards)
-                        regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
-                        # Per-paste counters: incremented only the first time this paste is stored for this regex
-                        if new_to_the_set:
-                            regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
-                            server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
-                    server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))  # NOTE(review): sits outside the blacklist check, so it runs for every match, blacklisted or not - confirm this is intended
-                else:
-                    pass
-
-        else:
-            publisher.debug("Script RegexForTermsFrequency is Idling")
-            print "sleeping"
-            time.sleep(5)
-        message = p.get_from_set()
--- a/SetForTermsFrequency.py
+++ b/SetForTermsFrequency.py
@ -1,134 +0,0 @@
-#!/usr/bin/env python2
-# -*-coding:UTF-8 -*
-"""
-This Module is used for term frequency.
-It processes every paste coming from the global module and tests the sets
-supplied in the term webpage.
-
-"""
-import redis
-import time
-from pubsublogger import publisher
-from packages import lib_words
-from packages import Paste
-import os
-import datetime
-import calendar
-import re
-import ast
-from Helper import Process
-
-# Email notifications
-from NotificationHelper import *
-
-# Config Variables
-BlackListTermsSet_Name = "BlackListSetTermSet"  # not referenced anywhere else in this script
-TrackedTermsSet_Name = "TrackedSetTermSet"  # not referenced anywhere else in this script
-TrackedRegexSet_Name = "TrackedRegexSet"  # not referenced anywhere else in this script
-TrackedSetSet_Name = "TrackedSetSet"  # Redis set of the tracked word sets, read once at start-up
-
-top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequencies set (not referenced anywhere else in this script)
-oneDay = 60*60*24  # 86400, the number of seconds in a day (not referenced anywhere else in this script)
-top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]  # [key prefix, presumably the period length in days - TODO confirm]; the main loop appends the day timestamp to the prefix
-top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
-top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
-top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
-
-def add_quote_inside_tab(tab):  # Return tab, a string such as "[a, b]", with every comma-separated element single-quoted: "['a', 'b']"
-    quoted_tab = "["
-    for elem in tab[1:-1].split(','):  # drop the first and last character (the brackets in the example above), then split on commas
-        elem = elem.lstrip().strip()  # trim surrounding whitespace; strip() alone already covers what lstrip() does
-        quoted_tab += "\'{}\', ".format(elem)  # the element is inserted unescaped, so a quote inside it unbalances the quoting
-    quoted_tab = quoted_tab[:-2] #remove the trailing ", " left by the last element
-    quoted_tab += "]"
-    return str(quoted_tab)
-
-if __name__ == "__main__":  # script entry point: poll for paste paths and record tracked word-set matches in Redis
-    publisher.port = 6380
-    publisher.channel = "Script"
-
-    config_section = 'SetForTermsFrequency'
-    p = Process(config_section)
-
-    # REDIS: connection used for every set / hash / sorted-set call below
-    server_term = redis.StrictRedis(
-        host=p.config.get("Redis_Level_DB_TermFreq", "host"),
-        port=p.config.get("Redis_Level_DB_TermFreq", "port"),
-        db=p.config.get("Redis_Level_DB_TermFreq", "db"))
-
-    # FUNCTIONS #
-    publisher.info("RegexForTermsFrequency script started")  # NOTE(review): the message names the Regex module although config_section here is 'SetForTermsFrequency' - looks like a copy-paste leftover
-
-    # Load the tracked sets once, before the loop (never reloaded inside it): threshold, word list and original Redis member per set
-    dico_percent = {}
-    dico_set_tab = {}
-    dico_setname_to_redis = {}
-    for set_str in server_term.smembers(TrackedSetSet_Name):
-        tab_set = set_str[1:-1]  # drop the first and last character of the stored member
-        tab_set = add_quote_inside_tab(tab_set)
-        perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)  # first "[N]" group with 1 to 3 digits
-        if perc_finder is not None:
-            match_percent = perc_finder.group(0)[1:-1]  # the digits without the surrounding brackets
-            dico_percent[tab_set] = float(match_percent)
-            dico_set_tab[tab_set] = ast.literal_eval(tab_set)  # parse the quoted string into a Python list
-            dico_setname_to_redis[tab_set] = set_str
-        else:
-            continue
-
-
-    message = p.get_from_set()
-
-    while True:
-
-        if message is not None:
-            filename = message  # the message itself is used as the paste path
-            temp = filename.split('/')
-            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))  # epoch seconds at 00:00:00 UTC of the date read from the path; assumes the path ends in <year>/<month>/<day>/<name> - TODO confirm
-            content = Paste.Paste(filename).get_p_content()
-
-            curr_set = top_termFreq_setName_day[0] + str(timestamp)  # per-day key: "TopTermFreq_set_day_" followed by the timestamp
-
-            #iterate over the whitespace-separated words of the paste; every occurrence counts, repeats included
-            match_dico = {}
-            for word in content.split():
-                for cur_set, array_set in dico_set_tab.items():
-                    for w_set in array_set[:-1]: #skip the last element (the matching percent entry)
-                        if word == w_set:
-                            try:
-                                match_dico[str(array_set)] += 1
-                            except KeyError:
-                                match_dico[str(array_set)] = 1
-
-            #compute matching %
-            for the_set, matchingNum in match_dico.items():
-                eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 because the last element is the percent entry, not a word
-                if eff_percent >= dico_percent[the_set]:  # the lookup relies on str(array_set) being identical to the tab_set key built at start-up
-                    
-                    # Send a notification only when the member is in the set
-                    if the_set in server_term.smembers(TrackedTermsNotificationEnabled_Name):  # name is not defined in this file; presumably provided by the NotificationHelper star import - TODO confirm
-                        
-                        # Send to every associated email address
-                        for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + the_set):
-                            sendEmailNotification(email, the_set)
-                    
-                    print(the_set, "matched in", filename)  # NOTE(review): the file uses the Python 2 print statement elsewhere, so this presumably prints a tuple - confirm
-                    set_name = 'set_' + dico_setname_to_redis[the_set]
-                    new_to_the_set = server_term.sadd(set_name, filename)
-                    new_to_the_set = True if new_to_the_set == 1 else False  # True when sadd reported one added member, i.e. this paste was not yet stored for this set
-                    
-                    #consider the num of occurrence of this set (the returned value is not used afterwards)
-                    set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))
-
-                    # FIXME - avoid using per paste as a set is checked over the entire paste
-                    #per-paste counters: incremented only the first time this paste is stored for this set
-                    if new_to_the_set:
-                        set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
-                        server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
-                server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))  # NOTE(review): sits outside the threshold check, so it runs for every set with at least one matching word - confirm this is intended
-
-
-        else:
-            publisher.debug("Script RegexForTermsFrequency is Idling")  # NOTE(review): names the Regex module inside the Set script - looks like a copy-paste leftover
-            print "sleeping"
-            time.sleep(5)
-        message = p.get_from_set()