2018-05-04 13:53:29 +02:00
|
|
|
#!/usr/bin/env python3
|
2014-08-06 11:43:40 +02:00
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
"""
|
2014-08-07 14:46:43 +02:00
|
|
|
This module is consuming the Redis-list created by the ZMQ_Sub_Curve_Q Module.
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2014-08-20 15:14:57 +02:00
|
|
|
This module updates a .csv file used to draw curves representing selected
|
|
|
|
words and their occurrence per day.
|
2014-08-06 11:43:40 +02:00
|
|
|
|
|
|
|
.. note:: The channel will have the name of the file created.
|
|
|
|
|
|
|
|
.. note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
|
|
|
|
the same Subscriber name in both of them.
|
|
|
|
|
2016-08-19 16:53:46 +02:00
|
|
|
|
2016-08-22 20:59:56 +02:00
|
|
|
This Module is also used for term frequency.
|
2016-08-19 16:53:46 +02:00
|
|
|
|
2016-08-22 20:59:56 +02:00
|
|
|
/!\ Top set management is done in the module Curve_manage_top_set
|
2016-08-19 16:53:46 +02:00
|
|
|
|
|
|
|
|
2014-08-06 11:43:40 +02:00
|
|
|
Requirements
|
|
|
|
------------
|
|
|
|
|
|
|
|
*Need running Redis instances. (Redis)
|
|
|
|
*Categories files of words in /files/ need to be created
|
|
|
|
*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.
|
|
|
|
|
|
|
|
"""
|
2014-08-14 17:55:18 +02:00
|
|
|
import redis
|
|
|
|
import time
|
2014-08-06 11:43:40 +02:00
|
|
|
from pubsublogger import publisher
|
|
|
|
from packages import lib_words
|
2014-08-20 16:00:56 +02:00
|
|
|
import os
|
2014-09-02 18:20:28 +02:00
|
|
|
import datetime
|
2016-08-18 15:34:19 +02:00
|
|
|
import calendar
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2014-08-29 19:37:56 +02:00
|
|
|
from Helper import Process
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2018-02-27 15:12:02 +01:00
|
|
|
# Email notifications
|
|
|
|
from NotificationHelper import *
|
|
|
|
|
2016-08-18 15:34:19 +02:00
|
|
|
# Config Variables
|
2016-08-22 13:35:49 +02:00
|
|
|
BlackListTermsSet_Name = "BlackListSetTermSet"
|
|
|
|
TrackedTermsSet_Name = "TrackedSetTermSet"
|
2016-08-19 13:34:02 +02:00
|
|
|
top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set
|
2016-08-22 13:35:49 +02:00
|
|
|
oneDay = 60*60*24
|
|
|
|
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
|
|
|
|
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
|
|
|
|
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
|
|
|
|
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
|
|
|
|
|
2018-07-16 15:51:37 +02:00
|
|
|
# create direct link in mail
|
2018-07-17 15:11:25 +02:00
|
|
|
full_paste_url = "/showsavedpaste/?paste="
|
2018-07-16 15:51:37 +02:00
|
|
|
|
2016-08-22 13:35:49 +02:00
|
|
|
def check_if_tracked_term(term, path):
|
2018-05-04 13:53:29 +02:00
|
|
|
if term in server_term.smembers(TrackedTermsSet_Name):
|
2016-08-22 13:35:49 +02:00
|
|
|
#add_paste to tracked_word_set
|
|
|
|
set_name = "tracked_" + term
|
2016-08-22 16:01:42 +02:00
|
|
|
server_term.sadd(set_name, path)
|
2018-04-16 14:50:04 +02:00
|
|
|
print(term, 'addded', set_name, '->', path)
|
2016-08-22 13:35:49 +02:00
|
|
|
p.populate_set_out("New Term added", 'CurveManageTopSets')
|
2016-08-18 15:34:19 +02:00
|
|
|
|
2018-02-27 15:12:02 +01:00
|
|
|
# Send a notification only when the member is in the set
|
|
|
|
if term in server_term.smembers(TrackedTermsNotificationEnabled_Name):
|
2018-04-16 14:50:04 +02:00
|
|
|
|
2018-07-16 15:51:37 +02:00
|
|
|
# create mail body
|
|
|
|
mail_body = ("AIL Framework,\n"
|
|
|
|
"New occurrence for term: " + term + "\n"
|
|
|
|
''+full_paste_url + path)
|
|
|
|
|
2018-02-27 15:12:02 +01:00
|
|
|
# Send to every associated email adress
|
|
|
|
for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + term):
|
2018-07-16 15:51:37 +02:00
|
|
|
sendEmailNotification(email, 'Term', mail_body)
|
2018-02-27 15:12:02 +01:00
|
|
|
|
2016-08-18 15:34:19 +02:00
|
|
|
|
|
|
|
def getValueOverRange(word, startDate, num_day):
|
|
|
|
to_return = 0
|
|
|
|
for timestamp in range(startDate, startDate - num_day*oneDay, -oneDay):
|
|
|
|
value = server_term.hget(timestamp, word)
|
|
|
|
to_return += int(value) if value is not None else 0
|
|
|
|
return to_return
|
|
|
|
|
|
|
|
|
|
|
|
|
2014-08-20 15:14:57 +02:00
|
|
|
if __name__ == "__main__":
|
2014-08-22 17:35:40 +02:00
|
|
|
publisher.port = 6380
|
2014-08-20 15:14:57 +02:00
|
|
|
publisher.channel = "Script"
|
2014-08-14 17:55:18 +02:00
|
|
|
|
2014-08-29 19:37:56 +02:00
|
|
|
config_section = 'Curve'
|
|
|
|
p = Process(config_section)
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2014-08-20 15:14:57 +02:00
|
|
|
# REDIS #
|
2014-08-06 11:43:40 +02:00
|
|
|
r_serv1 = redis.StrictRedis(
|
2018-05-07 14:50:40 +02:00
|
|
|
host=p.config.get("ARDB_Curve", "host"),
|
|
|
|
port=p.config.get("ARDB_Curve", "port"),
|
|
|
|
db=p.config.get("ARDB_Curve", "db"),
|
2018-05-04 13:53:29 +02:00
|
|
|
decode_responses=True)
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2016-08-18 15:34:19 +02:00
|
|
|
server_term = redis.StrictRedis(
|
2018-05-07 14:50:40 +02:00
|
|
|
host=p.config.get("ARDB_TermFreq", "host"),
|
|
|
|
port=p.config.get("ARDB_TermFreq", "port"),
|
|
|
|
db=p.config.get("ARDB_TermFreq", "db"),
|
2018-05-04 13:53:29 +02:00
|
|
|
decode_responses=True)
|
2016-08-18 15:34:19 +02:00
|
|
|
|
2014-08-06 11:43:40 +02:00
|
|
|
# FUNCTIONS #
|
2014-08-29 19:37:56 +02:00
|
|
|
publisher.info("Script Curve started")
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2018-07-17 15:11:25 +02:00
|
|
|
# create direct link in mail
|
|
|
|
full_paste_url = p.config.get("Notifications", "ail_domain") + full_paste_url
|
|
|
|
|
2014-08-11 11:33:18 +02:00
|
|
|
# FILE CURVE SECTION #
|
2014-08-22 17:35:40 +02:00
|
|
|
csv_path = os.path.join(os.environ['AIL_HOME'],
|
2014-08-29 19:37:56 +02:00
|
|
|
p.config.get("Directories", "wordtrending_csv"))
|
2014-08-22 17:35:40 +02:00
|
|
|
wordfile_path = os.path.join(os.environ['AIL_HOME'],
|
2014-08-29 19:37:56 +02:00
|
|
|
p.config.get("Directories", "wordsfile"))
|
2014-08-11 11:33:18 +02:00
|
|
|
|
2014-08-29 19:37:56 +02:00
|
|
|
message = p.get_from_set()
|
2014-08-06 11:43:40 +02:00
|
|
|
prec_filename = None
|
2014-09-05 10:42:01 +02:00
|
|
|
generate_new_graph = False
|
2016-08-19 13:34:02 +02:00
|
|
|
|
|
|
|
# Term Frequency
|
2016-08-22 13:35:49 +02:00
|
|
|
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
|
2016-08-19 13:34:02 +02:00
|
|
|
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
|
|
|
|
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
|
|
|
|
|
2014-08-06 11:43:40 +02:00
|
|
|
while True:
|
2016-08-19 13:34:02 +02:00
|
|
|
|
2014-08-14 17:55:18 +02:00
|
|
|
if message is not None:
|
2014-09-02 18:20:28 +02:00
|
|
|
generate_new_graph = True
|
|
|
|
|
2014-08-29 19:37:56 +02:00
|
|
|
filename, word, score = message.split()
|
2014-09-02 18:20:28 +02:00
|
|
|
temp = filename.split('/')
|
|
|
|
date = temp[-4] + temp[-3] + temp[-2]
|
2016-08-19 13:34:02 +02:00
|
|
|
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
|
2016-08-22 16:01:42 +02:00
|
|
|
curr_set = top_termFreq_setName_day[0] + str(timestamp)
|
2016-08-19 13:34:02 +02:00
|
|
|
|
|
|
|
|
2014-09-02 18:20:28 +02:00
|
|
|
low_word = word.lower()
|
2016-08-22 20:59:56 +02:00
|
|
|
#Old curve with words in file
|
2016-08-18 15:34:19 +02:00
|
|
|
r_serv1.hincrby(low_word, date, int(score))
|
|
|
|
|
|
|
|
# Update redis
|
2017-02-15 16:29:02 +01:00
|
|
|
#consider the num of occurence of this term
|
2016-08-18 15:34:19 +02:00
|
|
|
curr_word_value = int(server_term.hincrby(timestamp, low_word, int(score)))
|
2017-02-15 16:29:02 +01:00
|
|
|
#1 term per paste
|
2017-02-28 15:01:48 +01:00
|
|
|
curr_word_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), low_word, int(1)))
|
2016-08-22 20:59:56 +02:00
|
|
|
|
|
|
|
# Add in set only if term is not in the blacklist
|
2018-05-04 13:53:29 +02:00
|
|
|
if low_word not in server_term.smembers(BlackListTermsSet_Name):
|
2017-02-15 16:29:02 +01:00
|
|
|
#consider the num of occurence of this term
|
2016-08-22 16:01:42 +02:00
|
|
|
server_term.zincrby(curr_set, low_word, float(score))
|
2017-02-15 16:29:02 +01:00
|
|
|
#1 term per paste
|
2017-02-28 15:01:48 +01:00
|
|
|
server_term.zincrby("per_paste_" + curr_set, low_word, float(1))
|
2018-04-16 14:50:04 +02:00
|
|
|
|
2016-08-22 13:35:49 +02:00
|
|
|
#Add more info for tracked terms
|
|
|
|
check_if_tracked_term(low_word, filename)
|
2016-08-18 15:34:19 +02:00
|
|
|
|
2017-03-28 17:42:44 +02:00
|
|
|
#send to RegexForTermsFrequency
|
|
|
|
to_send = "{} {} {}".format(filename, timestamp, word)
|
|
|
|
p.populate_set_out(to_send, 'RegexForTermsFrequency')
|
|
|
|
|
2014-08-06 11:43:40 +02:00
|
|
|
else:
|
2016-08-22 20:59:56 +02:00
|
|
|
|
2014-09-02 18:20:28 +02:00
|
|
|
if generate_new_graph:
|
|
|
|
generate_new_graph = False
|
2018-04-16 14:50:04 +02:00
|
|
|
print('Building graph')
|
2014-09-02 18:20:28 +02:00
|
|
|
today = datetime.date.today()
|
|
|
|
year = today.year
|
|
|
|
month = today.month
|
2018-04-16 14:50:04 +02:00
|
|
|
|
2014-09-02 18:20:28 +02:00
|
|
|
lib_words.create_curve_with_word_file(r_serv1, csv_path,
|
|
|
|
wordfile_path, year,
|
|
|
|
month)
|
|
|
|
|
2014-08-06 11:43:40 +02:00
|
|
|
publisher.debug("Script Curve is Idling")
|
2018-04-16 14:50:04 +02:00
|
|
|
print("sleeping")
|
2014-09-02 18:20:28 +02:00
|
|
|
time.sleep(10)
|
2014-08-29 19:37:56 +02:00
|
|
|
message = p.get_from_set()
|