2018-05-04 13:53:29 +02:00
|
|
|
#!/usr/bin/env python3
|
2017-03-28 17:42:44 +02:00
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
"""
|
|
|
|
This Module is used for term frequency.
|
2017-05-09 11:13:16 +02:00
|
|
|
It processes every paste coming from the global module and tests the regexes
|
|
|
|
supplied in the term webpage.
|
2017-03-28 17:42:44 +02:00
|
|
|
|
|
|
|
"""
|
|
|
|
import redis
|
|
|
|
import time
|
|
|
|
from pubsublogger import publisher
|
2017-04-18 15:28:21 +02:00
|
|
|
from packages import Paste
|
2017-03-28 17:42:44 +02:00
|
|
|
import calendar
|
|
|
|
import re
|
2018-10-08 11:25:32 +02:00
|
|
|
import signal
|
|
|
|
import time
|
2017-03-28 17:42:44 +02:00
|
|
|
from Helper import Process
|
2018-02-27 15:12:02 +01:00
|
|
|
# Email notifications
|
|
|
|
from NotificationHelper import *
|
|
|
|
|
2018-10-08 11:25:32 +02:00
|
|
|
|
|
|
|
class TimeoutException(Exception):
    """Signal that a guarded operation ran past its time budget.

    Raised from the SIGALRM handler so that a long-running regex search
    can be interrupted and skipped.
    """
|
|
|
|
|
|
|
|
|
|
|
|
def timeout_handler(signum, frame):
    """SIGALRM callback: abort the interrupted code with TimeoutException.

    Both arguments are the standard signal-handler parameters (signal
    number and current stack frame); neither is used.
    """
    raise TimeoutException()


# Route alarm expiry (armed with signal.alarm) to the handler above.
signal.signal(signal.SIGALRM, timeout_handler)
|
|
|
|
|
2017-03-28 17:42:44 +02:00
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Minimum delay, in seconds, between two reloads of the regex dictionaries.
DICO_REFRESH_TIME = 60

# Names of the Redis sets holding blacklisted / tracked terms and regexes.
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"

# Max cardinality of the term frequencies set.
top_term_freq_max_set_cardinality = 20

# Number of seconds in one day.
oneDay = 24 * 60 * 60

# Each entry is [redis set name (or name prefix), number of days].
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
top_termFreq_set_array = [
    top_termFreq_setName_day,
    top_termFreq_setName_week,
    top_termFreq_setName_month,
]

# Prefix of the Redis set listing the tags attached to a tracked term.
TrackedTermsNotificationTagsPrefix_Name = "TrackedNotificationTags_"

# URL path used to build a direct link to the paste in notification mails.
full_paste_url = "/showsavedpaste/?paste="
|
2017-03-28 17:42:44 +02:00
|
|
|
|
2018-10-08 11:25:32 +02:00
|
|
|
|
2017-04-18 15:28:21 +02:00
|
|
|
def refresh_dicos():
    """Rebuild the tracked-regex lookup tables from Redis.

    Every member of the ``TrackedRegexSet_Name`` set is read through the
    module-level ``server_term`` connection. The first and last character
    of each member are stripped to obtain the pattern that gets compiled
    (the main loop rebuilds the stored form as ``"/" + pattern + "/"``).

    A member whose pattern does not compile is reported and skipped, so a
    single bad entry cannot stop the module.

    Returns:
        tuple: ``(dico_regex, dico_regexname_to_redis)`` where
        ``dico_regex`` maps the stripped pattern to its compiled regex and
        ``dico_regexname_to_redis`` maps the stripped pattern back to the
        exact string stored in Redis.
    """
    dico_regex = {}
    dico_regexname_to_redis = {}
    for redis_member in server_term.smembers(TrackedRegexSet_Name):
        pattern = redis_member[1:-1]
        try:
            compiled = re.compile(pattern)
        except re.error as err:
            # Skip the broken entry instead of crashing the whole module.
            print('invalid tracked regex {}: {}'.format(redis_member, err))
            publisher.warning(
                'Invalid tracked regex {}: {}'.format(redis_member, err))
            continue
        dico_regex[pattern] = compiled
        dico_regexname_to_redis[pattern] = redis_member

    return dico_regex, dico_regexname_to_redis
|
|
|
|
|
2017-03-28 17:42:44 +02:00
|
|
|
if __name__ == "__main__":
    # Script entry point: consume paste paths from the module queue, run
    # every tracked regex against each paste and update the frequency
    # counters, notifications and tags in Redis.
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'RegexForTermsFrequency'
    p = Process(config_section)

    # Time budget, in seconds, granted to a single regex search.
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("ARDB_TermFreq", "host"),
        port=p.config.get("ARDB_TermFreq", "port"),
        db=p.config.get("ARDB_TermFreq", "db"),
        decode_responses=True)

    # FUNCTIONS #
    publisher.info("RegexForTermsFrequency script started")

    # create direct link in mail
    full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url

    # compile the regex
    dico_refresh_cooldown = time.time()
    dico_regex, dico_regexname_to_redis = refresh_dicos()

    message = p.get_from_set()

    # Regex Frequency
    while True:

        if message is not None:
            # Reload the tracked regexes at most once per DICO_REFRESH_TIME.
            if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
                dico_refresh_cooldown = time.time()
                dico_regex, dico_regexname_to_redis = refresh_dicos()
                print('dico got refreshed')

            filename = message
            # The three path components preceding the file name are parsed
            # as year / month / day to get the day's UTC midnight timestamp.
            temp = filename.split('/')
            timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))

            curr_set = top_termFreq_setName_day[0] + str(timestamp)
            paste = Paste.Paste(filename)
            content = paste.get_p_content()

            # iterate the word with the regex
            for regex_str, compiled_regex in dico_regex.items():

                # Arm the alarm so a pathological regex cannot hang the module.
                signal.alarm(max_execution_time)
                try:
                    matched = compiled_regex.search(content)
                except TimeoutException:
                    print("{0} processing timeout".format(paste.p_path))
                    continue
                finally:
                    # Always disarm, including when an unexpected exception
                    # propagates, so no stale alarm fires later.
                    signal.alarm(0)

                if matched is None:
                    continue

                # there is a match
                print('regex matched {}'.format(regex_str))
                regex_str_complete = "/" + regex_str + "/"
                redis_regex_name = dico_regexname_to_redis[regex_str]

                # Add in Regex track set only if term is not in the blacklist
                # (SISMEMBER avoids transferring the whole set for one test).
                if not server_term.sismember(BlackListTermsSet_Name, regex_str_complete):
                    # Send a notification only when the member is in the set
                    if server_term.sismember(TrackedTermsNotificationEnabled_Name, regex_str_complete):

                        # create mail body
                        mail_body = ("AIL Framework,\n"
                                     "New occurrence for regex: " + regex_str + "\n"
                                     + full_paste_url + filename)

                        # Send to every associated email address
                        for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + regex_str_complete):
                            sendEmailNotification(email, 'Term', mail_body)

                    # tag paste
                    # NOTE(review): placed outside the notification-enabled
                    # check; the nesting was reconstructed, please confirm.
                    for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + regex_str_complete):
                        msg = '{};{}'.format(tag, filename)
                        p.populate_set_out(msg, 'Tags')

                    set_name = 'regex_' + redis_regex_name
                    # SADD returns 1 only when the paste was not yet recorded.
                    new_to_the_set = server_term.sadd(set_name, filename) == 1

                    # consider the num of occurrence of this term
                    server_term.hincrby(timestamp, redis_regex_name, 1)

                    # 1 term per paste
                    if new_to_the_set:
                        server_term.hincrby("per_paste_" + str(timestamp), redis_regex_name, 1)
                        server_term.zincrby("per_paste_" + curr_set, redis_regex_name, float(1))

                # NOTE(review): kept outside the blacklist check; the nesting
                # was reconstructed, please confirm against the repository.
                server_term.zincrby(curr_set, redis_regex_name, float(1))

        else:
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)
        message = p.get_from_set()
|