AIL-framework/bin/RegexForTermsFrequency.py

151 lines
5.6 KiB
Python
Raw Normal View History

2018-05-04 13:53:29 +02:00
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
This Module is used for term frequency.
It processes every paste coming from the global module and test the regexs
supplied in the term webpage.
"""
import redis
import time
from pubsublogger import publisher
from packages import Paste
import calendar
import re
import signal
import time
from Helper import Process
# Email notifications
from NotificationHelper import *
class TimeoutException(Exception):
pass
def timeout_handler(signum, frame):
raise TimeoutException
signal.signal(signal.SIGALRM, timeout_handler)
# Config Variables
DICO_REFRESH_TIME = 60 # s
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
oneDay = 60*60*24
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [top_termFreq_setName_day, top_termFreq_setName_week, top_termFreq_setName_month]
2018-07-16 15:51:37 +02:00
# create direct link in mail
2018-07-17 15:11:25 +02:00
full_paste_url = "/showsavedpaste/?paste="
def refresh_dicos():
dico_regex = {}
dico_regexname_to_redis = {}
for regex_str in server_term.smembers(TrackedRegexSet_Name):
dico_regex[regex_str[1:-1]] = re.compile(regex_str[1:-1])
dico_regexname_to_redis[regex_str[1:-1]] = regex_str
return dico_regex, dico_regexname_to_redis
if __name__ == "__main__":
publisher.port = 6380
publisher.channel = "Script"
config_section = 'RegexForTermsFrequency'
p = Process(config_section)
2018-10-09 11:30:04 +02:00
max_execution_time = p.config.getint(config_section, "max_execution_time")
# REDIS #
server_term = redis.StrictRedis(
2018-05-07 14:50:40 +02:00
host=p.config.get("ARDB_TermFreq", "host"),
port=p.config.get("ARDB_TermFreq", "port"),
db=p.config.get("ARDB_TermFreq", "db"),
2018-05-04 13:53:29 +02:00
decode_responses=True)
# FUNCTIONS #
publisher.info("RegexForTermsFrequency script started")
2018-07-17 15:11:25 +02:00
# create direct link in mail
full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url
# compile the regex
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
message = p.get_from_set()
# Regex Frequency
while True:
if message is not None:
if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
dico_refresh_cooldown = time.time()
dico_regex, dico_regexname_to_redis = refresh_dicos()
print('dico got refreshed')
filename = message
temp = filename.split('/')
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
curr_set = top_termFreq_setName_day[0] + str(timestamp)
paste = Paste.Paste(filename)
content = paste.get_p_content()
# iterate the word with the regex
for regex_str, compiled_regex in dico_regex.items():
signal.alarm(max_execution_time)
try:
matched = compiled_regex.search(content)
except TimeoutException:
2018-11-02 16:07:27 +01:00
print ("{0} processing timeout".format(paste.p_rel_path))
continue
else:
signal.alarm(0)
if matched is not None: # there is a match
print('regex matched {}'.format(regex_str))
matched = matched.group(0)
regex_str_complete = "/" + regex_str + "/"
# Add in Regex track set only if term is not in the blacklist
if regex_str_complete not in server_term.smembers(BlackListTermsSet_Name):
# Send a notification only when the member is in the set
if regex_str_complete in server_term.smembers(TrackedTermsNotificationEnabled_Name):
2018-07-16 15:51:37 +02:00
# create mail body
mail_body = ("AIL Framework,\n"
"New occurrence for regex: " + regex_str + "\n"
''+full_paste_url + filename)
2018-07-16 15:51:37 +02:00
# Send to every associated email adress
for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete):
2018-07-16 15:51:37 +02:00
sendEmailNotification(email, 'Term', mail_body)
set_name = 'regex_' + dico_regexname_to_redis[regex_str]
new_to_the_set = server_term.sadd(set_name, filename)
new_to_the_set = True if new_to_the_set == 1 else False
# consider the num of occurence of this term
regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
# 1 term per paste
if new_to_the_set:
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
else:
pass
else:
publisher.debug("Script RegexForTermsFrequency is Idling")
2018-04-16 14:50:04 +02:00
print("sleeping")
time.sleep(5)
message = p.get_from_set()