mirror of https://github.com/CIRCL/AIL-framework
Removed previously copied files
parent
5b1f0b0212
commit
02d587134f
|
@ -1,123 +0,0 @@
|
|||
#!/usr/bin/env python2
|
||||
# -*-coding:UTF-8 -*
|
||||
"""
|
||||
This Module is used for term frequency.
|
||||
It processes every paste coming from the global module and tests the regexes
|
||||
supplied in the term webpage.
|
||||
|
||||
"""
|
||||
import redis
|
||||
import time
|
||||
from pubsublogger import publisher
|
||||
from packages import lib_words
|
||||
from packages import Paste
|
||||
import os
|
||||
from os import environ
|
||||
import datetime
|
||||
import calendar
|
||||
import re
|
||||
from Helper import Process
|
||||
|
||||
# Email notifications
|
||||
from NotificationHelper import *
|
||||
|
||||
# Config Variables
|
||||
# Minimum number of seconds between two reloads of the tracked regexes.
DICO_REFRESH_TIME = 60

# Keys of the Redis sets read through the term-frequency connection.
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"

# Max cardinality of the term-frequency sets.
top_term_freq_max_set_cardinality = 20
# Number of seconds in one day.
oneDay = 60 * 60 * 24

# Each entry is [Redis key (or key prefix), N]; N is presumably the number of
# days the set covers -- TODO confirm against the code consuming these.
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [
    top_termFreq_setName_day,
    top_termFreq_setName_week,
    top_termFreq_setName_month,
]
|
||||
|
||||
|
||||
def refresh_dicos():
    """Reload the tracked regexes from Redis and compile them.

    Every member of the ``TrackedRegexSet_Name`` set is read through the
    module-level ``server_term`` connection. The first and last characters of
    each member are dropped before compiling (presumably a pair of delimiters
    around the pattern -- TODO confirm against the code that fills the set).

    A member whose pattern does not compile is reported through ``publisher``
    and skipped, so that one malformed entry cannot stop the whole module.

    Returns:
        A ``(dico_regex, dico_regexname_to_redis)`` tuple. Both dicts are
        keyed by the stripped pattern string: the first maps it to the
        compiled regex, the second to the original member as stored in Redis.
    """
    dico_regex = {}
    dico_regexname_to_redis = {}
    for regex_str in server_term.smembers(TrackedRegexSet_Name):
        pattern = regex_str[1:-1]
        try:
            compiled = re.compile(pattern)
        except re.error as err:
            publisher.warning(
                'Tracked regex {} is invalid and was skipped: {}'.format(regex_str, err))
            continue
        dico_regex[pattern] = compiled
        dico_regexname_to_redis[pattern] = regex_str

    return dico_regex, dico_regexname_to_redis
|
||||
|
||||
if __name__ == "__main__":
    # Entry point of the RegexForTermsFrequency module: pull paste paths from
    # the processing queue, test every tracked regex against the paste content
    # and record the matches in the term-frequency Redis database.
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'RegexForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_TermFreq", "host"),
        port=p.config.get("Redis_Level_DB_TermFreq", "port"),
        db=p.config.get("Redis_Level_DB_TermFreq", "db"))

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # Compile the tracked regexes once; they are reloaded in the loop below
    # at most every DICO_REFRESH_TIME seconds.
    dico_refresh_cooldown = time.time()
    dico_regex, dico_regexname_to_redis = refresh_dicos()

    message = p.get_from_set()

    # Regex Frequency
    while True:

        if message is not None:
            if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
                dico_refresh_cooldown = time.time()
                dico_regex, dico_regexname_to_redis = refresh_dicos()
                print('dico got refreshed')

            filename = message
            # The 4th, 3rd and 2nd path components from the end are used as
            # year, month and day; the timestamp is midnight UTC of that day.
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))

            curr_set = top_termFreq_setName_day[0] + str(timestamp)
            content = Paste.Paste(filename).get_p_content()

            # Test every tracked regex against the paste content
            for regex_str, compiled_regex in dico_regex.items():
                matched = compiled_regex.search(content)
                if matched is None:
                    continue

                print('regex matched {}'.format(regex_str))
                matched = matched.group(0)

                # Add in Regex track set only if term is not in the blacklist.
                # SISMEMBER asks the server directly instead of downloading
                # the whole set for a single membership test.
                if server_term.sismember(BlackListTermsSet_Name, matched):
                    continue

                # Send a notification only when the member is in the set
                # (the notification names are presumably provided by the
                # NotificationHelper star import -- TODO confirm).
                if server_term.sismember(TrackedTermsNotificationEnabled_Name, matched):
                    # Send to every associated email address
                    for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + matched):
                        sendEmailNotification(email, matched)

                redis_name = dico_regexname_to_redis[regex_str]
                set_name = 'regex_' + redis_name
                # SADD returns 1 only when the paste was not yet in the set
                new_to_the_set = server_term.sadd(set_name, filename) == 1

                # consider the number of occurrences of this term
                server_term.hincrby(timestamp, redis_name, int(1))
                # 1 term per paste
                if new_to_the_set:
                    server_term.hincrby("per_paste_" + str(timestamp), redis_name, int(1))
                    server_term.zincrby("per_paste_" + curr_set, redis_name, float(1))
                # NOTE(review): nesting was rebuilt from a listing without
                # indentation; confirm that this increment belongs under the
                # blacklist guard rather than directly under the match test.
                server_term.zincrby(curr_set, redis_name, float(1))

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)

        # Fetch the next message whether or not one was just processed
        message = p.get_from_set()
|
|
@ -1,134 +0,0 @@
|
|||
#!/usr/bin/env python2
|
||||
# -*-coding:UTF-8 -*
|
||||
"""
|
||||
This Module is used for term frequency.
|
||||
It processes every paste coming from the global module and tests the sets
|
||||
supplied in the term webpage.
|
||||
|
||||
"""
|
||||
import redis
|
||||
import time
|
||||
from pubsublogger import publisher
|
||||
from packages import lib_words
|
||||
from packages import Paste
|
||||
import os
|
||||
import datetime
|
||||
import calendar
|
||||
import re
|
||||
import ast
|
||||
from Helper import Process
|
||||
|
||||
# Email notifications
|
||||
from NotificationHelper import *
|
||||
|
||||
# Config Variables
|
||||
# Keys of the Redis sets read through the term-frequency connection.
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
TrackedSetSet_Name = "TrackedSetSet"

# Max cardinality of the term-frequency sets.
top_term_freq_max_set_cardinality = 20
# Number of seconds in one day.
oneDay = 60 * 60 * 24

# Each entry is [Redis key (or key prefix), N]; N is presumably the number of
# days the set covers -- TODO confirm against the code consuming these.
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [
    top_termFreq_setName_day,
    top_termFreq_setName_week,
    top_termFreq_setName_month,
]
|
||||
|
||||
def add_quote_inside_tab(tab):
    """Turn an unquoted, bracketed list string into a Python list literal.

    The first and last characters of ``tab`` (the enclosing brackets) are
    dropped, the remainder is split on commas, and each element is stripped
    of surrounding whitespace. For example ``"[a, b, [50]]"`` becomes
    ``"['a', 'b', '[50]']"``.

    The literal is produced by ``str()`` on the list of elements, so quotes
    and backslashes inside an element are escaped and the result can always
    be parsed back with ``ast.literal_eval``.

    Args:
        tab: the bracketed, comma-separated string to convert.

    Returns:
        The list literal as a string.
    """
    elements = [elem.strip() for elem in tab[1:-1].split(',')]
    return str(elements)
|
||||
|
||||
if __name__ == "__main__":
    # Entry point of the SetForTermsFrequency module: pull paste paths from
    # the processing queue, count how many words of each tracked set appear
    # in the paste and record the sets that reach their matching percentage.
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'SetForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_TermFreq", "host"),
        port=p.config.get("Redis_Level_DB_TermFreq", "port"),
        db=p.config.get("Redis_Level_DB_TermFreq", "db"))

    # FUNCTIONS #
    publisher.info("SetForTermsFrequency script started")

    # Get the tracked sets and their matching percent. All three dicts are
    # keyed by the quoted list literal built from the Redis member.
    dico_percent = {}           # literal -> required matching percent
    dico_set_tab = {}           # literal -> parsed list (percent is the last item)
    dico_setname_to_redis = {}  # literal -> member as stored in Redis
    percent_regex = re.compile(r"\[[0-9]{1,3}\]")
    for set_str in server_term.smembers(TrackedSetSet_Name):
        # Drop the first and last characters of the stored member before
        # quoting (presumably delimiters -- TODO confirm with the writer side)
        tab_set = add_quote_inside_tab(set_str[1:-1])
        perc_finder = percent_regex.search(tab_set)
        if perc_finder is None:
            # No "[NNN]" percent element: the set cannot be evaluated
            continue
        dico_percent[tab_set] = float(perc_finder.group(0)[1:-1])
        dico_set_tab[tab_set] = ast.literal_eval(tab_set)
        dico_setname_to_redis[tab_set] = set_str

    message = p.get_from_set()

    while True:

        if message is not None:
            filename = message
            # The 4th, 3rd and 2nd path components from the end are used as
            # year, month and day; the timestamp is midnight UTC of that day.
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
            content = Paste.Paste(filename).get_p_content()

            curr_set = top_termFreq_setName_day[0] + str(timestamp)

            # Count each word of the paste once up front, so every tracked
            # set is scored with dictionary lookups instead of a full rescan.
            word_counts = {}
            for word in content.split():
                word_counts[word] = word_counts.get(word, 0) + 1

            for the_set, array_set in dico_set_tab.items():
                terms = array_set[:-1]  # the last element is the percent
                matching_num = sum(word_counts.get(term, 0) for term in terms)
                if matching_num == 0:
                    continue

                # compute matching %
                eff_percent = float(matching_num) / float(len(terms)) * 100
                if eff_percent < dico_percent[the_set]:
                    continue

                # Send a notification only when the member is in the set
                # (the notification names are presumably provided by the
                # NotificationHelper star import -- TODO confirm).
                if server_term.sismember(TrackedTermsNotificationEnabled_Name, the_set):
                    # Send to every associated email address
                    for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + the_set):
                        sendEmailNotification(email, the_set)

                print(the_set, "matched in", filename)
                redis_name = dico_setname_to_redis[the_set]
                set_name = 'set_' + redis_name
                # SADD returns 1 only when the paste was not yet in the set
                new_to_the_set = server_term.sadd(set_name, filename) == 1

                # consider the number of occurrences of this set
                server_term.hincrby(timestamp, redis_name, int(1))

                # FIXME - avoid using per paste as a set is checked over the entire paste
                # 1 term per paste
                if new_to_the_set:
                    server_term.hincrby("per_paste_" + str(timestamp), redis_name, int(1))
                    server_term.zincrby("per_paste_" + curr_set, redis_name, float(1))
                # NOTE(review): nesting was rebuilt from a listing without
                # indentation; confirm that this increment belongs under the
                # percent-threshold guard.
                server_term.zincrby(curr_set, redis_name, float(1))

        else:
            publisher.debug("Script SetForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)

        # Fetch the next message whether or not one was just processed
        message = p.get_from_set()
|
Loading…
Reference in New Issue