From bb6d3a6a263eb3d6c55e32f3b9b2c25558669708 Mon Sep 17 00:00:00 2001 From: Terrtia Date: Wed, 7 Aug 2019 12:08:24 +0200 Subject: [PATCH] chg: [Term tracker] add term tracker module (word + set) + API: add new term to track (word + set + regex) --- bin/NotificationHelper.py | 7 -- bin/TermTrackerMod.py | 88 ++++++++++++------ bin/packages/Term.py | 113 +++++++++++++++++++---- bin/packages/config.cfg.sample | 2 +- doc/README.md | 2 +- var/www/modules/restApi/Flask_restApi.py | 20 ++-- 6 files changed, 170 insertions(+), 62 deletions(-) diff --git a/bin/NotificationHelper.py b/bin/NotificationHelper.py index 1bccd314..4007e56f 100755 --- a/bin/NotificationHelper.py +++ b/bin/NotificationHelper.py @@ -20,13 +20,6 @@ configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg') publisher.port = 6380 publisher.channel = "Script" -# notifications enabled/disabled -TrackedTermsNotificationEnabled_Name = "TrackedNotifications" - -# associated notification email addresses for a specific term` -# Keys will be e.g. TrackedNotificationEmails -TrackedTermsNotificationEmailsPrefix_Name = "TrackedNotificationEmails_" - def sendEmailNotification(recipient, alert_name, content): if not os.path.exists(configfile): diff --git a/bin/TermTrackerMod.py b/bin/TermTrackerMod.py index 2d0458b5..fe60640e 100755 --- a/bin/TermTrackerMod.py +++ b/bin/TermTrackerMod.py @@ -9,50 +9,84 @@ import os import sys import time +from Helper import Process +from pubsublogger import publisher + +import NotificationHelper + from packages import Paste from packages import Term sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules')) import Flask_config -r_serv_term = Flask_config.r_serv_term +full_item_url = "/showsavedpaste/?paste=" + +mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\nitem id: {}\nurl: {}{}" # loads tracked words list_tracked_words = Term.get_tracked_words_list() set_tracked_words_list = Term.get_set_tracked_words_list() -def new_term_found(term, term_type): - uuid_list = get_term_uuid_list() - email_notification = [] - tags = [] +def new_term_found(term, term_type, item_id): + uuid_list = Term.get_term_uuid_list(term) for term_uuid in uuid_list: - pass + Term.add_tracked_item(term_uuid, item_id) + + tags_to_add = Term.get_term_tags(term_uuid) + for tag in tags_to_add: + msg = '{};{}'.format(tag, item_id) + p.populate_set_out(msg, 'Tags') + + mail_to_notify = Term.get_term_mails(term_uuid) + if mail_to_notify: + mail_body = mail_body_template.format(term, item_id, full_item_url, item_id) + for mail in mail_to_notify: + NotificationHelper.sendEmailNotification(mail, 'Term Tracker', mail_body) if __name__ == "__main__": - item_id = 'submitted/2019/08/02/cc1900ed-6051-473a-ba7a-850a17d0cc02.gz' - #item_id = 'submitted/2019/08/02/0a52d82d-a89d-4004-9535-8a0bc9c1ce49.gz' - paste = Paste.Paste(item_id) - res = Term.parse_tracked_term_to_add('test zorro meroio apple weert', 'word') + publisher.port = 6380 + publisher.channel = "Script" + publisher.info("Script TermTrackerMod started") - ''' - dict_words_freq = Term.get_text_word_frequency(paste.get_p_content()) + #config_section = 'TermTrackerMod' + config_section = 'Curve' + p = Process(config_section) - # check solo words - for word in list_tracked_words: - if word in dict_words_freq: - pass - # tag + get uuids ... + full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url - # check words set - for list_words, nb_words_threshold in set_tracked_words_list: - nb_uniq_word = 0 - for word in list_words: - if word in dict_words_freq: - nb_uniq_word += 1 - if nb_uniq_word > nb_words_threshold: - # tag + get uuid - pass - ''' + while True: + + item_id = p.get_from_set() + item_id = 'submitted/2019/08/02/cc1900ed-6051-473a-ba7a-850a17d0cc02.gz' + #item_id = 'submitted/2019/08/02/0a52d82d-a89d-4004-9535-8a0bc9c1ce49.gz' + + if message is not None: + + paste = Paste.Paste(item_id) + + dict_words_freq = Term.get_text_word_frequency(paste.get_p_content()) + + # check solo words + for word in list_tracked_words: + if word in dict_words_freq: + new_term_found(word, 'word', item_id) + + # check words set + for elem in set_tracked_words_list: + list_words = elem[0] + nb_words_threshold = elem[1] + word_set = elem[2] + nb_uniq_word = 0 + + for word in list_words: + if word in dict_words_freq: + nb_uniq_word += 1 + if nb_uniq_word >= nb_words_threshold: + new_term_found(word_set, 'set', item_id) + + else: + time.sleep(5) diff --git a/bin/packages/Term.py b/bin/packages/Term.py index 0902d56f..312ed7bd 100755 --- a/bin/packages/Term.py +++ b/bin/packages/Term.py @@ -2,6 +2,7 @@ # -*-coding:UTF-8 -* import os +import re import sys import uuid import redis @@ -16,6 +17,7 @@ sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules')) import Flask_config r_serv_term = Flask_config.r_serv_term +email_regex = Flask_config.email_regex special_characters = set('[<>~!?@#$%^&*|()_-+={}":;,.\'\n\r\t]/\\') special_characters.add('\\s') @@ -24,6 +26,26 @@ special_characters.add('\\s') tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+', gaps=True, discard_empty=True) +def is_valid_mail(email): + result = email_regex.match(email) + if result: + return True + else: + return False + +def verify_mail_list(mail_list): + for mail in mail_list: + if not is_valid_mail(mail): + return ({'status': 'error', 'reason': 'Invalid email', 'value': mail}, 400) + return None + +def is_valid_regex(term_regex): + try: + re.compile(term_regex) + return True + except: + return False + def get_text_word_frequency(item_content, filtering=True): item_content = item_content.lower() words_dict = defaultdict(int) @@ -34,7 +56,6 @@ def get_text_word_frequency(item_content, filtering=True): blob = TextBlob(item_content) for word in blob.tokens: words_dict[word] += 1 - print(words_dict) return words_dict # # TODO: create all tracked words @@ -45,28 +66,40 @@ def get_set_tracked_words_list(): set_list = r_serv_term.smembers('all:tracked_term:set') all_set_list = [] for elem in set_list: - elem = elem.split(';') - num_words = int(elem[1]) - ter_set = elem[0].split(',') - all_set_list.append((ter_set, num_words)) + res = elem.split(';') + num_words = int(res[1]) + ter_set = res[0].split(',') + all_set_list.append((ter_set, num_words, elem)) + return all_set_list -def parse_json_term_to_add(dict_input): +def is_term_tracked_in_global_level(term): + res = r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term)) + if res: + for elem_uuid in res: + if r_serv_term.hget('tracked_term:{}'.format(elem_uuid), 'level')=='1': + return True + return False + +def parse_json_term_to_add(dict_input, user_id): term = dict_input.get('term', None) if not term: return ({"status": "error", "reason": "Term not provided"}, 400) - term_type = dict_input.get('term', None) + term_type = dict_input.get('type', None) if not term_type: return ({"status": "error", "reason": "Term type not provided"}, 400) nb_words = dict_input.get('nb_words', 1) res = parse_tracked_term_to_add(term , term_type, nb_words=nb_words) - if res['status']=='error': + if res[1]!=200: return res + term = res[0]['term'] + term_type = res[0]['type'] - # get user_id tags = dict_input.get('tags', []) mails = dict_input.get('mails', []) - ## TODO: verify mail integrity + res = verify_mail_list(mails) + if res: + return res ## TODO: add dashboard key level = dict_input.get('level', 1) @@ -77,17 +110,20 @@ def parse_json_term_to_add(dict_input): except: level = 1 + # check if term already tracked in global + if level==1: + if is_term_tracked_in_global_level(term): + return ({"status": "error", "reason": "Term already tracked"}, 409) + term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails) - return ({'term': term, 'uuid': term_uuid}, 200) + return ({'term': term, 'type': term_type, 'uuid': term_uuid}, 200) def parse_tracked_term_to_add(term , term_type, nb_words=1): - - # todo verify regex format if term_type=='regex': - # TODO: verify regex integrity - pass + if not is_valid_regex(term): + return ({"status": "error", "reason": "Invalid regex"}, 400) elif term_type=='word' or term_type=='set': # force lowercase term = term.lower() @@ -97,7 +133,7 @@ def parse_tracked_term_to_add(term , term_type, nb_words=1): return ({"status": "error", "reason": "special character not allowed", "message": "Please use a regex or remove all special characters"}, 400) words = term.split() # not a word - if term_type=='word' and words: + if term_type=='word' and len(words)>1: term_type = 'set' # ouput format: term1,term2,term3;2 @@ -106,19 +142,21 @@ def parse_tracked_term_to_add(term , term_type, nb_words=1): nb_words = int(nb_words) except: nb_words = 1 + if nb_words==0: + nb_words = 1 words_set = set(words) words_set = sorted(words_set) + term = ",".join(words_set) term = "{};{}".format(term, nb_words) - print(term) - print(term_type) - - return ({"status": "success", "term": term, "type": term_type}, 200) + if nb_words > len(words_set): + nb_words = len(words_set) else: return ({"status": "error", "reason": "Incorrect type"}, 400) + return ({"status": "success", "term": term, "type": term_type}, 200) def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0): @@ -154,9 +192,44 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0) return term_uuid +def delete_term(term_uuid): + term = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'tracked') + term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'type') + term_level = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'level') + r_serv_term.srem('all:tracked_term_uuid:{}'.format(term), term_uuid) + r_serv_term.srem('all:tracked_term:{}'.format(term_type), term_uuid) + + + if level == 0: # user only + user_id = term_type = r_serv_term.hget('tracked_term:{}'.format(term_uuid), 'user_id') + r_serv_term.srem('user:tracked_term:{}'.format(user_id), term_uuid) + elif level == 1: # global + r_serv_term.srem('gobal:tracked_term', term_uuid) + + # delete metatadata + r_serv_term.delete('tracked_term:{}'.format(term_uuid)) + + # remove tags + r_serv_term.delete('tracked_term:tags:{}'.format(term_uuid)) + + # remove mails + r_serv_term.delete('tracked_term:mail:{}'.format(term_uuid)) + + # remove item set + r_serv_term.delete('tracked_term:item:{}'.format(term_uuid)) + def get_term_uuid_list(term): return list(r_serv_term.smembers('all:tracked_term_uuid:{}'.format(term))) +def get_term_tags(term_uuid): + return list(r_serv_term.smembers('tracked_term:tags:{}'.format(term_uuid))) + +def get_term_mails(term_uuid): + return list(r_serv_term.smembers('tracked_term:mail:{}'.format(term_uuid))) + +def add_tracked_item(term_uuid, item_id): + r_serv_term.sadd('tracked_term:item:{}'.format(term_uuid), item_id) + diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample index ea0ea55c..09e05ddf 100644 --- a/bin/packages/config.cfg.sample +++ b/bin/packages/config.cfg.sample @@ -23,7 +23,7 @@ sentiment_lexicon_file = sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon ##### Notifications ###### [Notifications] -ail_domain = http://localhost:7000 +ail_domain = https://localhost:7000 sender = sender@example.com sender_host = smtp.example.com sender_port = 1337 diff --git a/doc/README.md b/doc/README.md index 52768cd0..31f13cc3 100644 --- a/doc/README.md +++ b/doc/README.md @@ -600,7 +600,7 @@ Add term tracker - `term` - term to add - *str - word(s)* - - default: `text` + - mandatory - `nb_words` - number of words in set - *int* diff --git a/var/www/modules/restApi/Flask_restApi.py b/var/www/modules/restApi/Flask_restApi.py index f951ef9f..864e7ed3 100644 --- a/var/www/modules/restApi/Flask_restApi.py +++ b/var/www/modules/restApi/Flask_restApi.py @@ -17,6 +17,7 @@ import Import_helper import Item import Paste import Tag +import Term from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response from flask_login import login_required @@ -55,8 +56,11 @@ def verify_token(token): else: return False +def get_user_from_token(token): + return r_serv_db.hget('user:tokens', token) + def verify_user_role(role, token): - user_id = r_serv_db.hget('user:tokens', token) + user_id = get_user_from_token(token) if user_id: if is_in_role(user_id, role): return True @@ -308,13 +312,17 @@ def get_all_tags(): return Response(json.dumps(res, indent=2, sort_keys=True), mimetype='application/json'), 200 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # -# # # # # # # # # # # # # # TAGS # # # # # # # # # # # # # # # # # +# # # # # # # # # # # # # # TRACKER # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # -@restApi.route("api/v1/add/tracker/term", methods=['POST']) -#@token_required('analyst') +@restApi.route("api/v1/add/tracker/term", methods=['GET']) +@token_required('analyst') def add_tracker_term(): - data = request.get_json() - + #data = request.get_json() + data = {"term": "pi", 'type' : "word"} + user_token = get_auth_from_header() + user_id = get_user_from_token(user_token) + res = Term.parse_json_term_to_add(data, user_id) + return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1] # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # IMPORT # # # # # # # # # # # # # # # # # #