mirror of https://github.com/CIRCL/AIL-framework
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...)
parent
2da4c572c7
commit
c8baabd882
|
@ -1,26 +0,0 @@
|
||||||
Global
|
|
||||||
Duplicates
|
|
||||||
Indexer
|
|
||||||
Attributes
|
|
||||||
Lines
|
|
||||||
DomClassifier
|
|
||||||
Tokenize
|
|
||||||
Curve
|
|
||||||
CurveManageTopSets
|
|
||||||
Categ
|
|
||||||
CreditCards
|
|
||||||
Mail
|
|
||||||
Onion
|
|
||||||
DumpValidOnion
|
|
||||||
Web
|
|
||||||
WebStats
|
|
||||||
SQLInjectionDetection
|
|
||||||
ModuleStats
|
|
||||||
Browse_warning_paste
|
|
||||||
SentimentAnalysis
|
|
||||||
Release
|
|
||||||
Credential
|
|
||||||
Cve
|
|
||||||
Phone
|
|
||||||
SourceCode
|
|
||||||
Keys
|
|
|
@ -8,6 +8,7 @@ import redis
|
||||||
import time
|
import time
|
||||||
from pubsublogger import publisher
|
from pubsublogger import publisher
|
||||||
from packages import lib_words
|
from packages import lib_words
|
||||||
|
from packages import Paste
|
||||||
import os
|
import os
|
||||||
import datetime
|
import datetime
|
||||||
import calendar
|
import calendar
|
||||||
|
@ -16,6 +17,8 @@ import re
|
||||||
from Helper import Process
|
from Helper import Process
|
||||||
|
|
||||||
# Config Variables
|
# Config Variables
|
||||||
|
DICO_REFRESH_TIME = 60 #s
|
||||||
|
|
||||||
BlackListTermsSet_Name = "BlackListSetTermSet"
|
BlackListTermsSet_Name = "BlackListSetTermSet"
|
||||||
TrackedTermsSet_Name = "TrackedSetTermSet"
|
TrackedTermsSet_Name = "TrackedSetTermSet"
|
||||||
TrackedRegexSet_Name = "TrackedRegexSet"
|
TrackedRegexSet_Name = "TrackedRegexSet"
|
||||||
|
@ -27,6 +30,15 @@ top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
|
||||||
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
|
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
|
||||||
|
|
||||||
|
|
||||||
|
def refresh_dicos():
    """Rebuild the tracked-regex lookup tables from Redis.

    Reads every member of the ``TrackedRegexSet_Name`` set through
    ``server_term``. The first and last characters of each member are
    dropped before compiling (presumably surrounding delimiters -- confirm
    against the code that fills the set).

    Members whose pattern does not compile are reported through
    ``publisher`` and skipped, so a single malformed entry cannot stop the
    module during start-up or during a periodic refresh.

    Returns:
        A ``(dico_regex, dico_regexname_to_redis)`` tuple.
        ``dico_regex`` maps the stripped pattern to its compiled regex;
        ``dico_regexname_to_redis`` maps the stripped pattern back to the
        exact string stored in Redis, which is the name used for the Redis
        keys written by the caller.
    """
    dico_regex = {}
    dico_regexname_to_redis = {}
    for regex_str in server_term.smembers(TrackedRegexSet_Name):
        # Strip the first and last character once and reuse the result.
        pattern = regex_str[1:-1]
        try:
            compiled = re.compile(pattern)
        except re.error as err:
            # Skip the bad entry instead of crashing the whole module.
            publisher.warning(
                'Invalid tracked regex {} skipped: {}'.format(regex_str, err))
            continue
        dico_regex[pattern] = compiled
        dico_regexname_to_redis[pattern] = regex_str

    return dico_regex, dico_regexname_to_redis
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
publisher.port = 6380
|
publisher.port = 6380
|
||||||
publisher.channel = "Script"
|
publisher.channel = "Script"
|
||||||
|
@ -44,10 +56,8 @@ if __name__ == "__main__":
|
||||||
publisher.info("RegexForTermsFrequency script started")
|
publisher.info("RegexForTermsFrequency script started")
|
||||||
|
|
||||||
#compile the regex
|
#compile the regex
|
||||||
dico_regex = {}
|
dico_refresh_cooldown = time.time()
|
||||||
for regex_str in server_term.smembers(TrackedRegexSet_Name):
|
dico_regex, dico_regexname_to_redis = refresh_dicos()
|
||||||
dico_regex[regex_str] = re.compile(regex_str)
|
|
||||||
|
|
||||||
|
|
||||||
message = p.get_from_set()
|
message = p.get_from_set()
|
||||||
|
|
||||||
|
@ -55,32 +65,40 @@ if __name__ == "__main__":
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
if message is not None:
|
if message is not None:
|
||||||
filename, timestamp, word = message.split()
|
if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
|
||||||
|
dico_refresh_cooldown = time.time()
|
||||||
|
dico_regex, dico_regexname_to_redis = refresh_dicos()
|
||||||
|
print('dico got refreshed')
|
||||||
|
|
||||||
|
filename = message
|
||||||
|
temp = filename.split('/')
|
||||||
|
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
|
||||||
|
|
||||||
curr_set = top_termFreq_setName_day[0] + str(timestamp)
|
curr_set = top_termFreq_setName_day[0] + str(timestamp)
|
||||||
|
content = Paste.Paste(filename).get_p_content()
|
||||||
|
|
||||||
#iterate the word with the regex
|
#iterate the word with the regex
|
||||||
for regex_str, compiled_regex in dico_regex.items():
|
for regex_str, compiled_regex in dico_regex.items():
|
||||||
matched = compiled_regex.match(word)
|
matched = compiled_regex.search(content)
|
||||||
if word == "amzinggg":
|
|
||||||
print("matched")
|
|
||||||
server_term.incr("thisistest")
|
|
||||||
|
|
||||||
if matched is not None: #there is a match
|
if matched is not None: #there is a match
|
||||||
|
print('regex matched {}'.format(regex_str))
|
||||||
matched = matched.group(0)
|
matched = matched.group(0)
|
||||||
# Add in Regex track set only if term is not in the blacklist
|
# Add in Regex track set only if term is not in the blacklist
|
||||||
if matched not in server_term.smembers(BlackListTermsSet_Name):
|
if matched not in server_term.smembers(BlackListTermsSet_Name):
|
||||||
set_name = 'regex_' + regex_str
|
set_name = 'regex_' + dico_regexname_to_redis[regex_str]
|
||||||
new_to_the_set = server_term.sadd(set_name, filename)
|
new_to_the_set = server_term.sadd(set_name, filename)
|
||||||
new_to_the_set = True if new_to_the_set == 1 else False
|
new_to_the_set = True if new_to_the_set == 1 else False
|
||||||
|
|
||||||
#consider the num of occurrence of this term
|
#consider the num of occurrence of this term
|
||||||
regex_value = int(server_term.hincrby(timestamp, regex_str, int(1)))
|
regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
|
||||||
#1 term per paste
|
#1 term per paste
|
||||||
if new_to_the_set:
|
if new_to_the_set:
|
||||||
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), regex_str, int(1)))
|
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
|
||||||
server_term.zincrby("per_paste_" + curr_set, regex_str, float(1))
|
server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
|
||||||
server_term.zincrby(curr_set, regex_str, float(1))
|
server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
else:
|
else:
|
||||||
publisher.debug("Script RegexForTermsFrequency is Idling")
|
publisher.debug("Script RegexForTermsFrequency is Idling")
|
||||||
|
|
|
@ -33,10 +33,10 @@ def add_quote_inside_tab(tab):
|
||||||
quoted_tab = "["
|
quoted_tab = "["
|
||||||
for elem in tab[1:-1].split(','):
|
for elem in tab[1:-1].split(','):
|
||||||
elem = elem.lstrip().strip()
|
elem = elem.lstrip().strip()
|
||||||
quoted_tab += "\"{}\", ".format(elem)
|
quoted_tab += "\'{}\', ".format(elem)
|
||||||
quoted_tab = quoted_tab[:-2] #remove trailing ,
|
quoted_tab = quoted_tab[:-2] #remove trailing ,
|
||||||
quoted_tab += "]"
|
quoted_tab += "]"
|
||||||
return quoted_tab
|
return str(quoted_tab)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
publisher.port = 6380
|
publisher.port = 6380
|
||||||
|
@ -57,15 +57,16 @@ if __name__ == "__main__":
|
||||||
#get the dico and matching percent
|
#get the dico and matching percent
|
||||||
dico_percent = {}
|
dico_percent = {}
|
||||||
dico_set_tab = {}
|
dico_set_tab = {}
|
||||||
|
dico_setname_to_redis = {}
|
||||||
for set_str in server_term.smembers(TrackedSetSet_Name):
|
for set_str in server_term.smembers(TrackedSetSet_Name):
|
||||||
tab_set = set_str[1:-1]
|
tab_set = set_str[1:-1]
|
||||||
tab_set = add_quote_inside_tab(tab_set)
|
tab_set = add_quote_inside_tab(tab_set)
|
||||||
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
|
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
|
||||||
if perc_finder is not None:
|
if perc_finder is not None:
|
||||||
match_percent = perc_finder.group(0)[1:-1]
|
match_percent = perc_finder.group(0)[1:-1]
|
||||||
dico_percent[str(set_str)] = match_percent
|
dico_percent[tab_set] = float(match_percent)
|
||||||
tab_set = '["IoT", "mirai", "botnet", [50]]'
|
dico_set_tab[tab_set] = ast.literal_eval(tab_set)
|
||||||
dico_set_tab[str(set_str)] = ast.literal_eval(tab_set)[:-1]
|
dico_setname_to_redis[tab_set] = set_str
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -84,31 +85,34 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
#iterate over the words of the file
|
#iterate over the words of the file
|
||||||
match_dico = {}
|
match_dico = {}
|
||||||
for word in content:
|
for word in content.split():
|
||||||
for cur_set, array_set in dico_set_tab.items():
|
for cur_set, array_set in dico_set_tab.items():
|
||||||
for w_set in array_set:
|
for w_set in array_set[:-1]: #avoid the percent matching
|
||||||
if word == w_set:
|
if word == w_set:
|
||||||
try:
|
try:
|
||||||
match_dico[curr_set] += 1
|
match_dico[str(array_set)] += 1
|
||||||
except KeyError:
|
except KeyError:
|
||||||
match_dico[curr_set] = 1
|
match_dico[str(array_set)] = 1
|
||||||
|
|
||||||
#compute matching %
|
#compute matching %
|
||||||
for the_set, matchingNum in match_dico.items():
|
for the_set, matchingNum in match_dico.items():
|
||||||
eff_percent = matchingNum / len(dico_set_tab[str(the_set)])
|
eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching
|
||||||
if eff_percent >= dico_percent[str(set_str)]:
|
if eff_percent >= dico_percent[the_set]:
|
||||||
print(the_set, "matched in", filename)
|
print(the_set, "matched in", filename)
|
||||||
set_name = 'set_' + the_set
|
set_name = 'set_' + dico_setname_to_redis[the_set]
|
||||||
server_term.sadd(set_name, filename)
|
new_to_the_set = server_term.sadd(set_name, filename)
|
||||||
|
new_to_the_set = True if new_to_the_set == 1 else False
|
||||||
|
|
||||||
|
|
||||||
#consider the num of occurrence of this set
|
#consider the num of occurrence of this set
|
||||||
set_value = int(server_term.hincrby(timestamp, the_set, int(1)))
|
set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))
|
||||||
|
|
||||||
# FIXME - avoid using per paste as a set is checked over the entire paste
|
# FIXME - avoid using per paste as a set is checked over the entire paste
|
||||||
#1 term per paste
|
#1 term per paste
|
||||||
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), the_set, int(1)))
|
if new_to_the_set:
|
||||||
server_term.zincrby("per_paste_" + curr_set, the_set, float(1))
|
set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
|
||||||
server_term.zincrby(curr_set, the_set, float(1))
|
server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
|
||||||
|
server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))
|
||||||
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -32,10 +32,10 @@ publish = Redis_Words
|
||||||
|
|
||||||
[Curve]
|
[Curve]
|
||||||
subscribe = Redis_Words
|
subscribe = Redis_Words
|
||||||
publish = Redis_CurveManageTopSets,Redis_RegexForTermsFrequency
|
publish = Redis_CurveManageTopSets
|
||||||
|
|
||||||
[RegexForTermsFrequency]
|
[RegexForTermsFrequency]
|
||||||
subscribe = Redis_RegexForTermsFrequency
|
subscribe = Redis_Global
|
||||||
|
|
||||||
[SetForTermsFrequency]
|
[SetForTermsFrequency]
|
||||||
subscribe = Redis_Global
|
subscribe = Redis_Global
|
||||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 178 KiB After Width: | Height: | Size: 188 KiB |
|
@ -89,7 +89,7 @@ def terms_management():
|
||||||
trackSet_list_num_of_paste = []
|
trackSet_list_num_of_paste = []
|
||||||
for tracked_set in r_serv_term.smembers(TrackedSetSet_Name):
|
for tracked_set in r_serv_term.smembers(TrackedSetSet_Name):
|
||||||
trackSet_list.append(tracked_set)
|
trackSet_list.append(tracked_set)
|
||||||
value_range = Term_getValueOverRange(tracked_regex, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
|
value_range = Term_getValueOverRange(tracked_set, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
|
||||||
|
|
||||||
term_date = r_serv_term.hget(TrackedSetDate_Name, tracked_set)
|
term_date = r_serv_term.hget(TrackedSetDate_Name, tracked_set)
|
||||||
|
|
||||||
|
|
|
@ -289,7 +289,7 @@
|
||||||
//console.log(data);
|
//console.log(data);
|
||||||
event.preventDefault();
|
event.preventDefault();
|
||||||
var the_modal=$(this);
|
var the_modal=$(this);
|
||||||
var url = "{{ url_for('terms_management_query_paste') }}?term=" + $(this).attr('data-term');
|
var url = "{{ url_for('terms_management_query_paste') }}?term=" + encodeURIComponent($(this).attr('data-term'));
|
||||||
$.getJSON(url, function (data) {
|
$.getJSON(url, function (data) {
|
||||||
if (data.length != 0) {
|
if (data.length != 0) {
|
||||||
var html_to_add = "";
|
var html_to_add = "";
|
||||||
|
|
Loading…
Reference in New Issue