mirror of https://github.com/CIRCL/AIL-framework
chg: [Term tracker] add term tracker module (word + set) + API: add new term to track (word + set + regex)
parent
28320a32a6
commit
bb6d3a6a26
|
@ -20,13 +20,6 @@ configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
|
||||||
publisher.port = 6380
|
publisher.port = 6380
|
||||||
publisher.channel = "Script"
|
publisher.channel = "Script"
|
||||||
|
|
||||||
# notifications enabled/disabled
|
|
||||||
TrackedTermsNotificationEnabled_Name = "TrackedNotifications"
|
|
||||||
|
|
||||||
# associated notification email addresses for a specific term`
|
|
||||||
# Keys will be e.g. TrackedNotificationEmails<TERMNAME>
|
|
||||||
TrackedTermsNotificationEmailsPrefix_Name = "TrackedNotificationEmails_"
|
|
||||||
|
|
||||||
def sendEmailNotification(recipient, alert_name, content):
|
def sendEmailNotification(recipient, alert_name, content):
|
||||||
|
|
||||||
if not os.path.exists(configfile):
|
if not os.path.exists(configfile):
|
||||||
|
|
|
@ -9,50 +9,84 @@ import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
from Helper import Process
|
||||||
|
from pubsublogger import publisher
|
||||||
|
|
||||||
|
import NotificationHelper
|
||||||
|
|
||||||
from packages import Paste
|
from packages import Paste
|
||||||
from packages import Term
|
from packages import Term
|
||||||
|
|
||||||
sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
|
sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
|
||||||
import Flask_config
|
import Flask_config
|
||||||
|
|
||||||
r_serv_term = Flask_config.r_serv_term
|
full_item_url = "/showsavedpaste/?paste="
|
||||||
|
|
||||||
|
mail_body_template = "AIL Framework,\nNew occurrence for term tracked term: {}\nitem id: {}\nurl: {}{}"
|
||||||
|
|
||||||
# loads tracked words
|
# loads tracked words
|
||||||
list_tracked_words = Term.get_tracked_words_list()
|
list_tracked_words = Term.get_tracked_words_list()
|
||||||
set_tracked_words_list = Term.get_set_tracked_words_list()
|
set_tracked_words_list = Term.get_set_tracked_words_list()
|
||||||
|
|
||||||
def new_term_found(term, term_type, item_id):
    """Record a tracked-term hit on an item, then tag and notify.

    For every tracker UUID registered for ``term`` this:
      * stores ``item_id`` in the tracker's item set,
      * pushes one ``"<tag>;<item_id>"`` message to the 'Tags' queue per
        tag configured on the tracker,
      * sends a 'Term Tracker' e-mail to every address configured on the
        tracker.

    ``term_type`` is accepted for the callers' benefit but is not read here.
    Relies on the module-level ``p`` (Process), ``mail_body_template`` and
    ``full_item_url``.
    """
    for tracker_uuid in Term.get_term_uuid_list(term):
        Term.add_tracked_item(tracker_uuid, item_id)

        # one tagging message per configured tag
        for tag in Term.get_term_tags(tracker_uuid):
            p.populate_set_out('{};{}'.format(tag, item_id), 'Tags')

        recipients = Term.get_term_mails(tracker_uuid)
        if not recipients:
            continue

        # the body is identical for all recipients of this tracker
        mail_body = mail_body_template.format(term, item_id, full_item_url, item_id)
        for recipient in recipients:
            NotificationHelper.sendEmailNotification(recipient, 'Term Tracker', mail_body)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Entry point of the term tracker module: pull item ids from the input
    # queue and report every tracked word / word set found in the item.

    publisher.port = 6380
    publisher.channel = "Script"
    publisher.info("Script TermTrackerMod started")

    # NOTE(review): the original leaves 'TermTrackerMod' commented out and
    # reads the 'Curve' section instead; kept as-is, confirm before release.
    config_section = 'Curve'
    p = Process(config_section)

    # prefix the item URL with the configured AIL domain (used in mail bodies)
    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    while True:
        item_id = p.get_from_set()

        # The original tested the undefined name `message` here and replaced
        # the queue value with a hard-coded debug item id; both are fixed.
        # Presumably get_from_set() returns None when the queue is empty.
        if item_id is None:
            time.sleep(5)
            continue

        paste = Paste.Paste(item_id)
        dict_words_freq = Term.get_text_word_frequency(paste.get_p_content())

        # check solo words
        for word in list_tracked_words:
            if word in dict_words_freq:
                new_term_found(word, 'word', item_id)

        # check words set: each entry is (words, threshold, raw set string)
        for list_words, nb_words_threshold, word_set in set_tracked_words_list:
            nb_uniq_word = sum(1 for word in list_words if word in dict_words_freq)
            if nb_uniq_word >= nb_words_threshold:
                new_term_found(word_set, 'set', item_id)
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
# -*-coding:UTF-8 -*
|
# -*-coding:UTF-8 -*
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import uuid
|
import uuid
|
||||||
import redis
|
import redis
|
||||||
|
@ -16,6 +17,7 @@ sys.path.append(os.path.join(os.environ['AIL_FLASK'], 'modules'))
|
||||||
import Flask_config
|
import Flask_config
|
||||||
|
|
||||||
r_serv_term = Flask_config.r_serv_term
|
r_serv_term = Flask_config.r_serv_term
|
||||||
|
email_regex = Flask_config.email_regex
|
||||||
|
|
||||||
special_characters = set('[<>~!?@#$%^&*|()_-+={}":;,.\'\n\r\t]/\\')
|
special_characters = set('[<>~!?@#$%^&*|()_-+={}":;,.\'\n\r\t]/\\')
|
||||||
special_characters.add('\\s')
|
special_characters.add('\\s')
|
||||||
|
@ -24,6 +26,26 @@ special_characters.add('\\s')
|
||||||
tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
|
tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
|
||||||
gaps=True, discard_empty=True)
|
gaps=True, discard_empty=True)
|
||||||
|
|
||||||
|
def is_valid_mail(email):
    """Tell whether ``email`` matches the configured e-mail pattern.

    The check is ``email_regex.match(email)`` (``email_regex`` comes from
    ``Flask_config``). Returns ``True`` on a match, ``False`` otherwise.
    """
    return bool(email_regex.match(email))
|
||||||
|
|
||||||
|
def verify_mail_list(mail_list):
    """Validate every address in ``mail_list`` with ``is_valid_mail``.

    Returns ``None`` when all addresses are valid; otherwise an
    ``(error_dict, 400)`` tuple naming the first invalid address.
    """
    for candidate in mail_list:
        if is_valid_mail(candidate):
            continue
        # stop at the first offender and report it
        return ({'status': 'error', 'reason': 'Invalid email', 'value': candidate}, 400)
    return None
|
||||||
|
|
||||||
|
def is_valid_regex(term_regex):
    """Return True if ``term_regex`` compiles as a regular expression.

    Only the exceptions ``re.compile`` raises for unusable input are treated
    as "invalid": a malformed pattern (``re.error``), a non-string argument
    (``TypeError``), or a pattern too large/deep to compile. Anything else
    (e.g. KeyboardInterrupt, SystemExit) propagates instead of being
    silently swallowed by a bare ``except``.
    """
    try:
        re.compile(term_regex)
    except (re.error, TypeError, ValueError, OverflowError, RecursionError):
        return False
    return True
|
||||||
|
|
||||||
def get_text_word_frequency(item_content, filtering=True):
|
def get_text_word_frequency(item_content, filtering=True):
|
||||||
item_content = item_content.lower()
|
item_content = item_content.lower()
|
||||||
words_dict = defaultdict(int)
|
words_dict = defaultdict(int)
|
||||||
|
@ -34,7 +56,6 @@ def get_text_word_frequency(item_content, filtering=True):
|
||||||
blob = TextBlob(item_content)
|
blob = TextBlob(item_content)
|
||||||
for word in blob.tokens:
|
for word in blob.tokens:
|
||||||
words_dict[word] += 1
|
words_dict[word] += 1
|
||||||
print(words_dict)
|
|
||||||
return words_dict
|
return words_dict
|
||||||
|
|
||||||
# # TODO: create all tracked words
|
# # TODO: create all tracked words
|
||||||
|
@ -45,28 +66,40 @@ def get_set_tracked_words_list():
|
||||||
set_list = r_serv_term.smembers('all:tracked_term:set')
|
set_list = r_serv_term.smembers('all:tracked_term:set')
|
||||||
all_set_list = []
|
all_set_list = []
|
||||||
for elem in set_list:
|
for elem in set_list:
|
||||||
elem = elem.split(';')
|
res = elem.split(';')
|
||||||
num_words = int(elem[1])
|
num_words = int(res[1])
|
||||||
ter_set = elem[0].split(',')
|
ter_set = res[0].split(',')
|
||||||
all_set_list.append((ter_set, num_words))
|
all_set_list.append((ter_set, num_words, elem))
|
||||||
|
return all_set_list
|
||||||
|
|
||||||
def parse_json_term_to_add(dict_input):
|
def is_term_tracked_in_global_level(term):
    """Tell whether ``term`` already has a tracker whose level is '1'.

    Looks up every tracker UUID stored under
    ``all:tracked_term_uuid:<term>`` and checks the 'level' field of its
    ``tracked_term:<uuid>`` hash. Returns ``True`` as soon as one equals
    ``'1'``, ``False`` otherwise (including when no UUID is registered).
    """
    uuid_set_key = 'all:tracked_term_uuid:{}'.format(term)
    return any(
        r_serv_term.hget('tracked_term:{}'.format(tracker_uuid), 'level') == '1'
        for tracker_uuid in r_serv_term.smembers(uuid_set_key)
    )
|
||||||
|
|
||||||
|
def parse_json_term_to_add(dict_input, user_id):
|
||||||
term = dict_input.get('term', None)
|
term = dict_input.get('term', None)
|
||||||
if not term:
|
if not term:
|
||||||
return ({"status": "error", "reason": "Term not provided"}, 400)
|
return ({"status": "error", "reason": "Term not provided"}, 400)
|
||||||
term_type = dict_input.get('term', None)
|
term_type = dict_input.get('type', None)
|
||||||
if not term_type:
|
if not term_type:
|
||||||
return ({"status": "error", "reason": "Term type not provided"}, 400)
|
return ({"status": "error", "reason": "Term type not provided"}, 400)
|
||||||
nb_words = dict_input.get('nb_words', 1)
|
nb_words = dict_input.get('nb_words', 1)
|
||||||
|
|
||||||
res = parse_tracked_term_to_add(term , term_type, nb_words=nb_words)
|
res = parse_tracked_term_to_add(term , term_type, nb_words=nb_words)
|
||||||
if res['status']=='error':
|
if res[1]!=200:
|
||||||
return res
|
return res
|
||||||
|
term = res[0]['term']
|
||||||
|
term_type = res[0]['type']
|
||||||
|
|
||||||
# get user_id
|
|
||||||
tags = dict_input.get('tags', [])
|
tags = dict_input.get('tags', [])
|
||||||
mails = dict_input.get('mails', [])
|
mails = dict_input.get('mails', [])
|
||||||
## TODO: verify mail integrity
|
res = verify_mail_list(mails)
|
||||||
|
if res:
|
||||||
|
return res
|
||||||
|
|
||||||
## TODO: add dashboard key
|
## TODO: add dashboard key
|
||||||
level = dict_input.get('level', 1)
|
level = dict_input.get('level', 1)
|
||||||
|
@ -77,17 +110,20 @@ def parse_json_term_to_add(dict_input):
|
||||||
except:
|
except:
|
||||||
level = 1
|
level = 1
|
||||||
|
|
||||||
|
# check if term already tracked in global
|
||||||
|
if level==1:
|
||||||
|
if is_term_tracked_in_global_level(term):
|
||||||
|
return ({"status": "error", "reason": "Term already tracked"}, 409)
|
||||||
|
|
||||||
term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails)
|
term_uuid = add_tracked_term(term , term_type, user_id, level, tags, mails)
|
||||||
|
|
||||||
return ({'term': term, 'uuid': term_uuid}, 200)
|
return ({'term': term, 'type': term_type, 'uuid': term_uuid}, 200)
|
||||||
|
|
||||||
|
|
||||||
def parse_tracked_term_to_add(term , term_type, nb_words=1):
|
def parse_tracked_term_to_add(term , term_type, nb_words=1):
|
||||||
|
|
||||||
# todo verify regex format
|
|
||||||
if term_type=='regex':
|
if term_type=='regex':
|
||||||
# TODO: verify regex integrity
|
if not is_valid_regex(term):
|
||||||
pass
|
return ({"status": "error", "reason": "Invalid regex"}, 400)
|
||||||
elif term_type=='word' or term_type=='set':
|
elif term_type=='word' or term_type=='set':
|
||||||
# force lowercase
|
# force lowercase
|
||||||
term = term.lower()
|
term = term.lower()
|
||||||
|
@ -97,7 +133,7 @@ def parse_tracked_term_to_add(term , term_type, nb_words=1):
|
||||||
return ({"status": "error", "reason": "special character not allowed", "message": "Please use a regex or remove all special characters"}, 400)
|
return ({"status": "error", "reason": "special character not allowed", "message": "Please use a regex or remove all special characters"}, 400)
|
||||||
words = term.split()
|
words = term.split()
|
||||||
# not a word
|
# not a word
|
||||||
if term_type=='word' and words:
|
if term_type=='word' and len(words)>1:
|
||||||
term_type = 'set'
|
term_type = 'set'
|
||||||
|
|
||||||
# ouput format: term1,term2,term3;2
|
# ouput format: term1,term2,term3;2
|
||||||
|
@ -106,19 +142,21 @@ def parse_tracked_term_to_add(term , term_type, nb_words=1):
|
||||||
nb_words = int(nb_words)
|
nb_words = int(nb_words)
|
||||||
except:
|
except:
|
||||||
nb_words = 1
|
nb_words = 1
|
||||||
|
if nb_words==0:
|
||||||
|
nb_words = 1
|
||||||
|
|
||||||
words_set = set(words)
|
words_set = set(words)
|
||||||
words_set = sorted(words_set)
|
words_set = sorted(words_set)
|
||||||
|
|
||||||
term = ",".join(words_set)
|
term = ",".join(words_set)
|
||||||
term = "{};{}".format(term, nb_words)
|
term = "{};{}".format(term, nb_words)
|
||||||
|
|
||||||
print(term)
|
if nb_words > len(words_set):
|
||||||
print(term_type)
|
nb_words = len(words_set)
|
||||||
|
|
||||||
return ({"status": "success", "term": term, "type": term_type}, 200)
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
return ({"status": "error", "reason": "Incorrect type"}, 400)
|
return ({"status": "error", "reason": "Incorrect type"}, 400)
|
||||||
|
return ({"status": "success", "term": term, "type": term_type}, 200)
|
||||||
|
|
||||||
def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0):
|
def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0):
|
||||||
|
|
||||||
|
@ -154,9 +192,44 @@ def add_tracked_term(term , term_type, user_id, level, tags, mails, dashboard=0)
|
||||||
|
|
||||||
return term_uuid
|
return term_uuid
|
||||||
|
|
||||||
|
def delete_term(term_uuid):
    """Delete a tracker and every Redis key attached to its UUID.

    Reads the tracker's term, type and level from ``tracked_term:<uuid>``,
    unregisters the UUID from the lookup sets, then deletes the metadata
    hash and the tags / mail / item sets.

    Fixes over the previous version:
      * the level branch used the undefined name ``level`` (NameError);
      * the level was compared to ints although it is read back from Redis
        (the sibling ``is_term_tracked_in_global_level`` compares to '1');
      * ``user_id = term_type = ...`` overwrote ``term_type``;
      * a UUID was removed from ``all:tracked_term:<type>``, a set whose
        members are term strings.
    """
    meta_key = 'tracked_term:{}'.format(term_uuid)
    term = r_serv_term.hget(meta_key, 'tracked')
    term_type = r_serv_term.hget(meta_key, 'type')
    term_level = r_serv_term.hget(meta_key, 'level')

    # normalise the stored level to text so '0' / '1' compare reliably
    if isinstance(term_level, bytes):
        term_level = term_level.decode()
    term_level = str(term_level)

    if term is not None:
        uuid_set_key = 'all:tracked_term_uuid:{}'.format(term)
        r_serv_term.srem(uuid_set_key, term_uuid)
        # 'all:tracked_term:<type>' holds the term strings themselves (this
        # is how get_set_tracked_words_list parses the 'set' variant;
        # presumably the same for 'word' - confirm against add_tracked_term).
        # Only drop the term once no other tracker still references it.
        if not r_serv_term.exists(uuid_set_key):
            r_serv_term.srem('all:tracked_term:{}'.format(term_type), term)

    if term_level == '0':  # user only
        user_id = r_serv_term.hget(meta_key, 'user_id')
        r_serv_term.srem('user:tracked_term:{}'.format(user_id), term_uuid)
    elif term_level == '1':  # global
        # NOTE(review): key is spelled 'gobal' in the original; left unchanged
        # because it must match whatever add_tracked_term writes - verify.
        r_serv_term.srem('gobal:tracked_term', term_uuid)

    # delete metadata
    r_serv_term.delete(meta_key)

    # remove tags
    r_serv_term.delete('tracked_term:tags:{}'.format(term_uuid))

    # remove mails
    r_serv_term.delete('tracked_term:mail:{}'.format(term_uuid))

    # remove item set
    r_serv_term.delete('tracked_term:item:{}'.format(term_uuid))
|
||||||
|
|
||||||
def get_term_uuid_list(term):
    """Return, as a list, the members of ``all:tracked_term_uuid:<term>``."""
    uuid_set_key = 'all:tracked_term_uuid:{}'.format(term)
    return list(r_serv_term.smembers(uuid_set_key))
|
||||||
|
|
||||||
|
def get_term_tags(term_uuid):
    """Return, as a list, the members of ``tracked_term:tags:<term_uuid>``."""
    tags_key = 'tracked_term:tags:{}'.format(term_uuid)
    return list(r_serv_term.smembers(tags_key))
|
||||||
|
|
||||||
|
def get_term_mails(term_uuid):
    """Return, as a list, the members of ``tracked_term:mail:<term_uuid>``."""
    mails_key = 'tracked_term:mail:{}'.format(term_uuid)
    return list(r_serv_term.smembers(mails_key))
|
||||||
|
|
||||||
|
def add_tracked_item(term_uuid, item_id):
    """Add ``item_id`` to the set ``tracked_term:item:<term_uuid>``."""
    items_key = 'tracked_term:item:{}'.format(term_uuid)
    r_serv_term.sadd(items_key, item_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,7 @@ sentiment_lexicon_file = sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon
|
||||||
|
|
||||||
##### Notifications ######
|
##### Notifications ######
|
||||||
[Notifications]
|
[Notifications]
|
||||||
ail_domain = http://localhost:7000
|
ail_domain = https://localhost:7000
|
||||||
sender = sender@example.com
|
sender = sender@example.com
|
||||||
sender_host = smtp.example.com
|
sender_host = smtp.example.com
|
||||||
sender_port = 1337
|
sender_port = 1337
|
||||||
|
|
|
@ -600,7 +600,7 @@ Add term tracker
|
||||||
- `term`
|
- `term`
|
||||||
- term to add
|
- term to add
|
||||||
- *str - word(s)*
|
- *str - word(s)*
|
||||||
- default: `text`
|
- mandatory
|
||||||
- `nb_words`
|
- `nb_words`
|
||||||
- number of words in set
|
- number of words in set
|
||||||
- *int*
|
- *int*
|
||||||
|
|
|
@ -17,6 +17,7 @@ import Import_helper
|
||||||
import Item
|
import Item
|
||||||
import Paste
|
import Paste
|
||||||
import Tag
|
import Tag
|
||||||
|
import Term
|
||||||
|
|
||||||
from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response
|
from flask import Flask, render_template, jsonify, request, Blueprint, redirect, url_for, Response
|
||||||
from flask_login import login_required
|
from flask_login import login_required
|
||||||
|
@ -55,8 +56,11 @@ def verify_token(token):
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def get_user_from_token(token):
    """Return the value stored for ``token`` in the 'user:tokens' hash.

    This is the raw ``hget`` result, used by callers as the user id.
    """
    user_id = r_serv_db.hget('user:tokens', token)
    return user_id
|
||||||
|
|
||||||
def verify_user_role(role, token):
|
def verify_user_role(role, token):
|
||||||
user_id = r_serv_db.hget('user:tokens', token)
|
user_id = get_user_from_token(token)
|
||||||
if user_id:
|
if user_id:
|
||||||
if is_in_role(user_id, role):
|
if is_in_role(user_id, role):
|
||||||
return True
|
return True
|
||||||
|
@ -308,13 +312,17 @@ def get_all_tags():
|
||||||
return Response(json.dumps(res, indent=2, sort_keys=True), mimetype='application/json'), 200
|
return Response(json.dumps(res, indent=2, sort_keys=True), mimetype='application/json'), 200
|
||||||
|
|
||||||
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
||||||
# # # # # # # # # # # # # # TAGS # # # # # # # # # # # # # # # # #
|
# # # # # # # # # # # # # # TRACKER # # # # # # # # # # # # # # # # #
|
||||||
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
||||||
@restApi.route("api/v1/add/tracker/term", methods=['POST'])
@token_required('analyst')
def add_tracker_term():
    """REST endpoint: register a new tracked term for the calling user.

    Expects a JSON body (``term``, ``type`` and the optional fields handled
    by ``Term.parse_json_term_to_add``). The user is resolved from the auth
    token in the request header. Responds with the JSON payload and HTTP
    status returned by ``Term.parse_json_term_to_add``.

    The previous revision shipped debug code: the body was replaced by a
    hard-coded ``{"term": "pi", "type": "word"}`` and the route only
    accepted GET, so client input was ignored. Both are restored here.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        # parse_json_term_to_add calls .get() on the payload; reject
        # anything that is not a JSON object instead of crashing with a 500
        res = ({"status": "error", "reason": "Malformed JSON"}, 400)
    else:
        user_token = get_auth_from_header()
        user_id = get_user_from_token(user_token)
        res = Term.parse_json_term_to_add(data, user_id)
    return Response(json.dumps(res[0], indent=2, sort_keys=True), mimetype='application/json'), res[1]
|
||||||
|
|
||||||
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
|
||||||
# # # # # # # # # # # # # IMPORT # # # # # # # # # # # # # # # # # #
|
# # # # # # # # # # # # # IMPORT # # # # # # # # # # # # # # # # # #
|
||||||
|
|
Loading…
Reference in New Issue