diff --git a/bin/Curve.py b/bin/Curve.py index e6c0bb05..14392a47 100755 --- a/bin/Curve.py +++ b/bin/Curve.py @@ -132,6 +132,10 @@ if __name__ == "__main__": #Add more info for tracked terms check_if_tracked_term(low_word, filename) + #send to RegexForTermsFrequency + to_send = "{} {} {}".format(filename, timestamp, word) + p.populate_set_out(to_send, 'RegexForTermsFrequency') + else: if generate_new_graph: diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index e082b7f0..da660e3e 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -149,6 +149,10 @@ function launching_scripts { sleep 0.1 screen -S "Script" -X screen -t "CurveManageTopSets" bash -c './CurveManageTopSets.py; read x' sleep 0.1 + screen -S "Script" -X screen -t "RegexForTermsFrequency" bash -c './RegexForTermsFrequency.py; read x' + sleep 0.1 + screen -S "Script" -X screen -t "SetForTermsFrequency" bash -c './SetForTermsFrequency.py; read x' + sleep 0.1 screen -S "Script" -X screen -t "Indexer" bash -c './Indexer.py; read x' sleep 0.1 screen -S "Script" -X screen -t "Keys" bash -c './Keys.py; read x' diff --git a/bin/RegexForTermsFrequency.py b/bin/RegexForTermsFrequency.py new file mode 100755 index 00000000..b9710a2c --- /dev/null +++ b/bin/RegexForTermsFrequency.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python2 +# -*-coding:UTF-8 -* +""" +This Module is used for term frequency. 
+ +""" +import redis +import time +from pubsublogger import publisher +from packages import lib_words +import os +import datetime +import calendar +import re + +from Helper import Process + +# Config Variables +BlackListTermsSet_Name = "BlackListSetTermSet" +TrackedTermsSet_Name = "TrackedSetTermSet" +TrackedRegexSet_Name = "TrackedRegexSet" +top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set +oneDay = 60*60*24 +top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] +top_termFreq_setName_week = ["TopTermFreq_set_week", 7] +top_termFreq_setName_month = ["TopTermFreq_set_month", 31] +top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] + + +if __name__ == "__main__": + publisher.port = 6380 + publisher.channel = "Script" + + config_section = 'RegexForTermsFrequency' + p = Process(config_section) + + # REDIS # + server_term = redis.StrictRedis( + host=p.config.get("Redis_Level_DB_TermFreq", "host"), + port=p.config.get("Redis_Level_DB_TermFreq", "port"), + db=p.config.get("Redis_Level_DB_TermFreq", "db")) + + # FUNCTIONS # + publisher.info("RegexForTermsFrequency script started") + + #compile the regex + dico_regex = {} + for regex_str in server_term.smembers(TrackedRegexSet_Name): + dico_regex[regex_str] = re.compile(regex_str) + + + message = p.get_from_set() + + # Regex Frequency + while True: + + if message is not None: + filename, timestamp, word = message.split() + curr_set = top_termFreq_setName_day[0] + str(timestamp) + + #iterate the word with the regex + for regex_str, compiled_regex in dico_regex.items(): + matched = compiled_regex.match(word) + if word == "amzinggg": + print("matched") + server_term.incr("thisistest") + + if matched is not None: #there is a match + matched = matched.group(0) + # Add in Regex track set only if term is not in the blacklist + if matched not in server_term.smembers(BlackListTermsSet_Name): + set_name = 'regex_' + regex_str + new_to_the_set = 
server_term.sadd(set_name, filename) + new_to_the_set = True if new_to_the_set == 1 else False + + #consider the num of occurence of this term + regex_value = int(server_term.hincrby(timestamp, regex_str, int(1))) + #1 term per paste + if new_to_the_set: + regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), regex_str, int(1))) + server_term.zincrby("per_paste_" + curr_set, regex_str, float(1)) + server_term.zincrby(curr_set, regex_str, float(1)) + + + else: + publisher.debug("Script RegexForTermsFrequency is Idling") + print "sleeping" + time.sleep(5) + message = p.get_from_set() diff --git a/bin/SetForTermsFrequency.py b/bin/SetForTermsFrequency.py new file mode 100755 index 00000000..626a79a9 --- /dev/null +++ b/bin/SetForTermsFrequency.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python2 +# -*-coding:UTF-8 -* +""" +This Module is used for term frequency. + +""" +import redis +import time +from pubsublogger import publisher +from packages import lib_words +from packages import Paste +import os +import datetime +import calendar +import re +import ast + +from Helper import Process + +# Config Variables +BlackListTermsSet_Name = "BlackListSetTermSet" +TrackedTermsSet_Name = "TrackedSetTermSet" +TrackedRegexSet_Name = "TrackedRegexSet" +TrackedSetSet_Name = "TrackedSetSet" +top_term_freq_max_set_cardinality = 20 # Max cardinality of the terms frequences set +oneDay = 60*60*24 +top_termFreq_setName_day = ["TopTermFreq_set_day_", 1] +top_termFreq_setName_week = ["TopTermFreq_set_week", 7] +top_termFreq_setName_month = ["TopTermFreq_set_month", 31] +top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] + +def add_quote_inside_tab(tab): + quoted_tab = "[" + for elem in tab[1:-1].split(','): + elem = elem.lstrip().strip() + quoted_tab += "\"{}\", ".format(elem) + quoted_tab = quoted_tab[:-2] #remove trailing , + quoted_tab += "]" + return quoted_tab + +if __name__ == "__main__": + publisher.port = 6380 
+ publisher.channel = "Script" + + config_section = 'SetForTermsFrequency' + p = Process(config_section) + + # REDIS # + server_term = redis.StrictRedis( + host=p.config.get("Redis_Level_DB_TermFreq", "host"), + port=p.config.get("Redis_Level_DB_TermFreq", "port"), + db=p.config.get("Redis_Level_DB_TermFreq", "db")) + + # FUNCTIONS # + publisher.info("RegexForTermsFrequency script started") + + #get the dico and matching percent + dico_percent = {} + dico_set_tab = {} + for set_str in server_term.smembers(TrackedSetSet_Name): + tab_set = set_str[1:-1] + tab_set = add_quote_inside_tab(tab_set) + perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set) + if perc_finder is not None: + match_percent = perc_finder.group(0)[1:-1] + dico_percent[str(set_str)] = match_percent + tab_set = '["IoT", "mirai", "botnet", [50]]' + dico_set_tab[str(set_str)] = ast.literal_eval(tab_set)[:-1] + else: + continue + + + message = p.get_from_set() + + while True: + + if message is not None: + filename = message + temp = filename.split('/') + timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) + content = Paste.Paste(filename).get_p_content() + + curr_set = top_termFreq_setName_day[0] + str(timestamp) + + #iterate over the words of the file + match_dico = {} + for word in content: + for cur_set, array_set in dico_set_tab.items(): + for w_set in array_set: + if word == w_set: + try: + match_dico[curr_set] += 1 + except KeyError: + match_dico[curr_set] = 1 + + #compute matching % + for the_set, matchingNum in match_dico.items(): + eff_percent = matchingNum / len(dico_set_tab[str(the_set)]) + if eff_percent >= dico_percent[str(set_str)]: + print(the_set, "matched in", filename) + set_name = 'set_' + the_set + server_term.sadd(set_name, filename) + + #consider the num of occurence of this set + set_value = int(server_term.hincrby(timestamp, the_set, int(1))) + + # FIXME - avoid using per paste as a set is checked over the entire paste + #1 term per paste + 
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), the_set, int(1))) + server_term.zincrby("per_paste_" + curr_set, the_set, float(1)) + server_term.zincrby(curr_set, the_set, float(1)) + + + else: + publisher.debug("Script RegexForTermsFrequency is Idling") + print "sleeping" + time.sleep(5) + message = p.get_from_set() diff --git a/bin/packages/modules.cfg b/bin/packages/modules.cfg index c82f5c6b..73c51a24 100644 --- a/bin/packages/modules.cfg +++ b/bin/packages/modules.cfg @@ -32,7 +32,13 @@ publish = Redis_Words [Curve] subscribe = Redis_Words -publish = Redis_CurveManageTopSets +publish = Redis_CurveManageTopSets,Redis_RegexForTermsFrequency + +[RegexForTermsFrequency] +subscribe = Redis_RegexForTermsFrequency + +[SetForTermsFrequency] +subscribe = Redis_Global [CurveManageTopSets] subscribe = Redis_CurveManageTopSets diff --git a/doc/module-data-flow.png b/doc/module-data-flow.png index 373fa44d..65c58bce 100644 Binary files a/doc/module-data-flow.png and b/doc/module-data-flow.png differ diff --git a/var/www/Flasks/Flask_terms.py b/var/www/Flasks/Flask_terms.py index fad048bd..75be5c58 100644 --- a/var/www/Flasks/Flask_terms.py +++ b/var/www/Flasks/Flask_terms.py @@ -9,7 +9,7 @@ import datetime import calendar import flask from flask import Flask, render_template, jsonify, request - +import re import Paste # ============ VARIABLES ============ @@ -18,6 +18,22 @@ import Flask_config app = Flask_config.app cfg = Flask_config.cfg r_serv_term = Flask_config.r_serv_term + +DEFAULT_MATCH_PERCENT = 50 + +#tracked +TrackedTermsSet_Name = "TrackedSetTermSet" +TrackedTermsDate_Name = "TrackedTermDate" +#black +BlackListTermsDate_Name = "BlackListTermDate" +BlackListTermsSet_Name = "BlackListSetTermSet" +#regex +TrackedRegexSet_Name = "TrackedRegexSet" +TrackedRegexDate_Name = "TrackedRegexDate" +#set +TrackedSetSet_Name = "TrackedSetSet" +TrackedSetDate_Name = "TrackedSetDate" + # ============ FUNCTIONS ============ def 
Term_getValueOverRange(word, startDate, num_day, per_paste=""): @@ -47,15 +63,43 @@ def terms_management(): per_paste_text = "" per_paste = 0 - TrackedTermsSet_Name = "TrackedSetTermSet" - BlackListTermsSet_Name = "BlackListSetTermSet" - TrackedTermsDate_Name = "TrackedTermDate" - BlackListTermsDate_Name = "BlackListTermDate" - today = datetime.datetime.now() today = today.replace(hour=0, minute=0, second=0, microsecond=0) today_timestamp = calendar.timegm(today.timetuple()) + #Regex + trackReg_list = [] + trackReg_list_values = [] + trackReg_list_num_of_paste = [] + for tracked_regex in r_serv_term.smembers(TrackedRegexSet_Name): + trackReg_list.append(tracked_regex) + value_range = Term_getValueOverRange(tracked_regex, today_timestamp, [1, 7, 31], per_paste=per_paste_text) + + term_date = r_serv_term.hget(TrackedRegexDate_Name, tracked_regex) + + set_paste_name = "regex_" + tracked_regex + trackReg_list_num_of_paste.append(r_serv_term.scard(set_paste_name)) + term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded" + value_range.append(term_date) + trackReg_list_values.append(value_range) + + #Set + trackSet_list = [] + trackSet_list_values = [] + trackSet_list_num_of_paste = [] + for tracked_set in r_serv_term.smembers(TrackedSetSet_Name): + trackSet_list.append(tracked_set) + value_range = Term_getValueOverRange(tracked_regex, today_timestamp, [1, 7, 31], per_paste=per_paste_text) + + term_date = r_serv_term.hget(TrackedSetDate_Name, tracked_set) + + set_paste_name = "set_" + tracked_set + trackSet_list_num_of_paste.append(r_serv_term.scard(set_paste_name)) + term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded" + value_range.append(term_date) + trackSet_list_values.append(value_range) + + #Tracked terms track_list = [] track_list_values = [] track_list_num_of_paste = [] @@ -72,23 +116,36 @@ def terms_management(): 
track_list_values.append(value_range) + #blacklist terms black_list = [] for blacked_term in r_serv_term.smembers(BlackListTermsSet_Name): term_date = r_serv_term.hget(BlackListTermsDate_Name, blacked_term) term_date = datetime.datetime.utcfromtimestamp(int(term_date)) if term_date is not None else "No date recorded" black_list.append([blacked_term, term_date]) - return render_template("terms_management.html", black_list=black_list, track_list=track_list, track_list_values=track_list_values, track_list_num_of_paste=track_list_num_of_paste, per_paste=per_paste) + return render_template("terms_management.html", + black_list=black_list, track_list=track_list, trackReg_list=trackReg_list, trackSet_list=trackSet_list, + track_list_values=track_list_values, track_list_num_of_paste=track_list_num_of_paste, + trackReg_list_values=trackReg_list_values, trackReg_list_num_of_paste=trackReg_list_num_of_paste, + trackSet_list_values=trackSet_list_values, trackSet_list_num_of_paste=trackSet_list_num_of_paste, + per_paste=per_paste) @app.route("/terms_management_query_paste/") def terms_management_query_paste(): term = request.args.get('term') - TrackedTermsSet_Name = "TrackedSetTermSet" paste_info = [] - set_paste_name = "tracked_" + term - track_list_path = r_serv_term.smembers(set_paste_name) + # check if regex or not + if term.startswith('/') and term.endswith('/'): + set_paste_name = "regex_" + term + track_list_path = r_serv_term.smembers(set_paste_name) + elif term.startswith('\\') and term.endswith('\\'): + set_paste_name = "set_" + term + track_list_path = r_serv_term.smembers(set_paste_name) + else: + set_paste_name = "tracked_" + term + track_list_path = r_serv_term.smembers(set_paste_name) for path in track_list_path: paste = Paste.Paste(path) @@ -131,11 +188,6 @@ def terms_management_query(): @app.route("/terms_management_action/", methods=['GET']) def terms_management_action(): - TrackedTermsSet_Name = "TrackedSetTermSet" - TrackedTermsDate_Name = "TrackedTermDate" 
- BlackListTermsDate_Name = "BlackListTermDate" - BlackListTermsSet_Name = "BlackListSetTermSet" - today = datetime.datetime.now() today = today.replace(microsecond=0) today_timestamp = calendar.timegm(today.timetuple()) @@ -149,10 +201,42 @@ def terms_management_action(): else: if section == "followTerm": if action == "add": - r_serv_term.sadd(TrackedTermsSet_Name, term.lower()) - r_serv_term.hset(TrackedTermsDate_Name, term, today_timestamp) + # check if regex/set or simple term + #regex + if term.startswith('/') and term.endswith('/'): + r_serv_term.sadd(TrackedRegexSet_Name, term) + r_serv_term.hset(TrackedRegexDate_Name, term, today_timestamp) + + #set + elif term.startswith('\\') and term.endswith('\\'): + tab_term = term[1:-1] + perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_term) + if perc_finder is not None: + match_percent = perc_finder.group(0)[1:-1] + set_to_add = term + else: + match_percent = DEFAULT_MATCH_PERCENT + set_to_add = "\\" + tab_term[:-1] + ", [{}]]\\".format(match_percent) + r_serv_term.sadd(TrackedSetSet_Name, set_to_add) + r_serv_term.hset(TrackedSetDate_Name, set_to_add, today_timestamp) + + #simple term + else: + r_serv_term.sadd(TrackedTermsSet_Name, term.lower()) + r_serv_term.hset(TrackedTermsDate_Name, term.lower(), today_timestamp) + #del action else: - r_serv_term.srem(TrackedTermsSet_Name, term.lower()) + if term.startswith('/') and term.endswith('/'): + r_serv_term.srem(TrackedRegexSet_Name, term) + r_serv_term.hdel(TrackedRegexDate_Name, term) + elif term.startswith('\\') and term.endswith('\\'): + r_serv_term.srem(TrackedSetSet_Name, term) + print(term) + r_serv_term.hdel(TrackedSetDate_Name, term) + else: + r_serv_term.srem(TrackedTermsSet_Name, term.lower()) + r_serv_term.hdel(TrackedTermsDate_Name, term.lower()) + elif section == "blacklistTerm": if action == "add": r_serv_term.sadd(BlackListTermsSet_Name, term.lower()) diff --git a/var/www/templates/terms_management.html b/var/www/templates/terms_management.html 
index 22cbbc51..9f25d68d 100644 --- a/var/www/templates/terms_management.html +++ b/var/www/templates/terms_management.html @@ -105,9 +105,16 @@
Regex: surround the term by '/'. | /([a-z])\w+([a-z])\n/ |
Set of terms: surround the list by '\'. | \[term1, term2, ...]\ |
- To set a custom matching threshold (default=50), append it at the end as an inner list '[thresh]'. | \[term1, term2, ..., [75]]\ |
+ + +
+ + +