mirror of https://github.com/CIRCL/AIL-framework
Updated TermsFrequency related modules + Fixed bugs (encoding, behaviors, ...)
parent
2da4c572c7
commit
c8baabd882
|
@ -1,26 +0,0 @@
|
|||
Global
|
||||
Duplicates
|
||||
Indexer
|
||||
Attributes
|
||||
Lines
|
||||
DomClassifier
|
||||
Tokenize
|
||||
Curve
|
||||
CurveManageTopSets
|
||||
Categ
|
||||
CreditCards
|
||||
Mail
|
||||
Onion
|
||||
DumpValidOnion
|
||||
Web
|
||||
WebStats
|
||||
SQLInjectionDetection
|
||||
ModuleStats
|
||||
Browse_warning_paste
|
||||
SentimentAnalysis
|
||||
Release
|
||||
Credential
|
||||
Cve
|
||||
Phone
|
||||
SourceCode
|
||||
Keys
|
|
@ -8,6 +8,7 @@ import redis
|
|||
import time
|
||||
from pubsublogger import publisher
|
||||
from packages import lib_words
|
||||
from packages import Paste
|
||||
import os
|
||||
import datetime
|
||||
import calendar
|
||||
|
@ -16,6 +17,8 @@ import re
|
|||
from Helper import Process
|
||||
|
||||
# Config Variables
|
||||
DICO_REFRESH_TIME = 60 #s
|
||||
|
||||
BlackListTermsSet_Name = "BlackListSetTermSet"
|
||||
TrackedTermsSet_Name = "TrackedSetTermSet"
|
||||
TrackedRegexSet_Name = "TrackedRegexSet"
|
||||
|
@ -27,6 +30,15 @@ top_termFreq_setName_month = ["TopTermFreq_set_month", 31]
|
|||
top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month]
|
||||
|
||||
|
||||
def refresh_dicos():
    """Rebuild the tracked-regex lookup tables from Redis.

    Reads every member of the ``TrackedRegexSet_Name`` set through
    ``server_term`` and, for each one, drops its first and last character
    to obtain the pattern text (presumably the delimiters wrapped around
    the stored regex -- TODO confirm against the code that fills the set).

    Returns:
        A 2-tuple ``(compiled_by_pattern, redis_name_by_pattern)``:
        the first maps the stripped pattern text to its compiled regex,
        the second maps the same stripped text back to the exact member
        string as stored in Redis.
    """
    compiled_by_pattern = {}
    redis_name_by_pattern = {}

    for stored_name in server_term.smembers(TrackedRegexSet_Name):
        # Strip the leading and trailing character once and reuse the result
        # as the key of both tables.
        pattern = stored_name[1:-1]
        compiled_by_pattern[pattern] = re.compile(pattern)
        redis_name_by_pattern[pattern] = stored_name

    return compiled_by_pattern, redis_name_by_pattern
|
||||
|
||||
if __name__ == "__main__":
|
||||
publisher.port = 6380
|
||||
publisher.channel = "Script"
|
||||
|
@ -44,10 +56,8 @@ if __name__ == "__main__":
|
|||
publisher.info("RegexForTermsFrequency script started")
|
||||
|
||||
#compile the regex
|
||||
dico_regex = {}
|
||||
for regex_str in server_term.smembers(TrackedRegexSet_Name):
|
||||
dico_regex[regex_str] = re.compile(regex_str)
|
||||
|
||||
dico_refresh_cooldown = time.time()
|
||||
dico_regex, dico_regexname_to_redis = refresh_dicos()
|
||||
|
||||
message = p.get_from_set()
|
||||
|
||||
|
@ -55,32 +65,40 @@ if __name__ == "__main__":
|
|||
while True:
|
||||
|
||||
if message is not None:
|
||||
filename, timestamp, word = message.split()
|
||||
if time.time() - dico_refresh_cooldown > DICO_REFRESH_TIME:
|
||||
dico_refresh_cooldown = time.time()
|
||||
dico_regex, dico_regexname_to_redis = refresh_dicos()
|
||||
print('dico got refreshed')
|
||||
|
||||
filename = message
|
||||
temp = filename.split('/')
|
||||
timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0))
|
||||
|
||||
curr_set = top_termFreq_setName_day[0] + str(timestamp)
|
||||
content = Paste.Paste(filename).get_p_content()
|
||||
|
||||
#iterate the word with the regex
|
||||
for regex_str, compiled_regex in dico_regex.items():
|
||||
matched = compiled_regex.match(word)
|
||||
if word == "amzinggg":
|
||||
print("matched")
|
||||
server_term.incr("thisistest")
|
||||
matched = compiled_regex.search(content)
|
||||
|
||||
if matched is not None: #there is a match
|
||||
print('regex matched {}'.format(regex_str))
|
||||
matched = matched.group(0)
|
||||
# Add in Regex track set only if term is not in the blacklist
|
||||
if matched not in server_term.smembers(BlackListTermsSet_Name):
|
||||
set_name = 'regex_' + regex_str
|
||||
set_name = 'regex_' + dico_regexname_to_redis[regex_str]
|
||||
new_to_the_set = server_term.sadd(set_name, filename)
|
||||
new_to_the_set = True if new_to_the_set == 1 else False
|
||||
|
||||
#consider the number of occurrences of this term
|
||||
regex_value = int(server_term.hincrby(timestamp, regex_str, int(1)))
|
||||
regex_value = int(server_term.hincrby(timestamp, dico_regexname_to_redis[regex_str], int(1)))
|
||||
#1 term per paste
|
||||
if new_to_the_set:
|
||||
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), regex_str, int(1)))
|
||||
server_term.zincrby("per_paste_" + curr_set, regex_str, float(1))
|
||||
server_term.zincrby(curr_set, regex_str, float(1))
|
||||
|
||||
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_regexname_to_redis[regex_str], int(1)))
|
||||
server_term.zincrby("per_paste_" + curr_set, dico_regexname_to_redis[regex_str], float(1))
|
||||
server_term.zincrby(curr_set, dico_regexname_to_redis[regex_str], float(1))
|
||||
else:
|
||||
pass
|
||||
|
||||
else:
|
||||
publisher.debug("Script RegexForTermsFrequency is Idling")
|
||||
|
|
|
@ -33,10 +33,10 @@ def add_quote_inside_tab(tab):
|
|||
quoted_tab = "["
|
||||
for elem in tab[1:-1].split(','):
|
||||
elem = elem.lstrip().strip()
|
||||
quoted_tab += "\"{}\", ".format(elem)
|
||||
quoted_tab += "\'{}\', ".format(elem)
|
||||
quoted_tab = quoted_tab[:-2] #remove trailing ,
|
||||
quoted_tab += "]"
|
||||
return quoted_tab
|
||||
return str(quoted_tab)
|
||||
|
||||
if __name__ == "__main__":
|
||||
publisher.port = 6380
|
||||
|
@ -57,15 +57,16 @@ if __name__ == "__main__":
|
|||
#get the dico and matching percent
|
||||
dico_percent = {}
|
||||
dico_set_tab = {}
|
||||
dico_setname_to_redis = {}
|
||||
for set_str in server_term.smembers(TrackedSetSet_Name):
|
||||
tab_set = set_str[1:-1]
|
||||
tab_set = add_quote_inside_tab(tab_set)
|
||||
perc_finder = re.compile("\[[0-9]{1,3}\]").search(tab_set)
|
||||
if perc_finder is not None:
|
||||
match_percent = perc_finder.group(0)[1:-1]
|
||||
dico_percent[str(set_str)] = match_percent
|
||||
tab_set = '["IoT", "mirai", "botnet", [50]]'
|
||||
dico_set_tab[str(set_str)] = ast.literal_eval(tab_set)[:-1]
|
||||
dico_percent[tab_set] = float(match_percent)
|
||||
dico_set_tab[tab_set] = ast.literal_eval(tab_set)
|
||||
dico_setname_to_redis[tab_set] = set_str
|
||||
else:
|
||||
continue
|
||||
|
||||
|
@ -84,31 +85,34 @@ if __name__ == "__main__":
|
|||
|
||||
#iterate over the words of the file
|
||||
match_dico = {}
|
||||
for word in content:
|
||||
for word in content.split():
|
||||
for cur_set, array_set in dico_set_tab.items():
|
||||
for w_set in array_set:
|
||||
for w_set in array_set[:-1]: #avoid the percent matching
|
||||
if word == w_set:
|
||||
try:
|
||||
match_dico[curr_set] += 1
|
||||
match_dico[str(array_set)] += 1
|
||||
except KeyError:
|
||||
match_dico[curr_set] = 1
|
||||
match_dico[str(array_set)] = 1
|
||||
|
||||
#compute matching %
|
||||
for the_set, matchingNum in match_dico.items():
|
||||
eff_percent = matchingNum / len(dico_set_tab[str(the_set)])
|
||||
if eff_percent >= dico_percent[str(set_str)]:
|
||||
eff_percent = float(matchingNum) / float((len(ast.literal_eval(the_set))-1)) * 100 #-1 bc if the percent matching
|
||||
if eff_percent >= dico_percent[the_set]:
|
||||
print(the_set, "matched in", filename)
|
||||
set_name = 'set_' + the_set
|
||||
server_term.sadd(set_name, filename)
|
||||
set_name = 'set_' + dico_setname_to_redis[the_set]
|
||||
new_to_the_set = server_term.sadd(set_name, filename)
|
||||
new_to_the_set = True if new_to_the_set == 1 else False
|
||||
|
||||
|
||||
#consider the number of occurrences of this set
|
||||
set_value = int(server_term.hincrby(timestamp, the_set, int(1)))
|
||||
set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1)))
|
||||
|
||||
# FIXME - avoid using per paste as a set is checked over the entire paste
|
||||
#1 term per paste
|
||||
regex_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), the_set, int(1)))
|
||||
server_term.zincrby("per_paste_" + curr_set, the_set, float(1))
|
||||
server_term.zincrby(curr_set, the_set, float(1))
|
||||
if new_to_the_set:
|
||||
set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1)))
|
||||
server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1))
|
||||
server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))
|
||||
|
||||
|
||||
else:
|
||||
|
|
|
@ -32,10 +32,10 @@ publish = Redis_Words
|
|||
|
||||
[Curve]
|
||||
subscribe = Redis_Words
|
||||
publish = Redis_CurveManageTopSets,Redis_RegexForTermsFrequency
|
||||
publish = Redis_CurveManageTopSets
|
||||
|
||||
[RegexForTermsFrequency]
|
||||
subscribe = Redis_RegexForTermsFrequency
|
||||
subscribe = Redis_Global
|
||||
|
||||
[SetForTermsFrequency]
|
||||
subscribe = Redis_Global
|
||||
|
|
Binary file not shown.
Before Width: | Height: | Size: 178 KiB After Width: | Height: | Size: 188 KiB |
|
@ -89,7 +89,7 @@ def terms_management():
|
|||
trackSet_list_num_of_paste = []
|
||||
for tracked_set in r_serv_term.smembers(TrackedSetSet_Name):
|
||||
trackSet_list.append(tracked_set)
|
||||
value_range = Term_getValueOverRange(tracked_regex, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
|
||||
value_range = Term_getValueOverRange(tracked_set, today_timestamp, [1, 7, 31], per_paste=per_paste_text)
|
||||
|
||||
term_date = r_serv_term.hget(TrackedSetDate_Name, tracked_set)
|
||||
|
||||
|
|
|
@ -289,7 +289,7 @@
|
|||
//console.log(data);
|
||||
event.preventDefault();
|
||||
var the_modal=$(this);
|
||||
var url = "{{ url_for('terms_management_query_paste') }}?term=" + $(this).attr('data-term');
|
||||
var url = "{{ url_for('terms_management_query_paste') }}?term=" + encodeURIComponent($(this).attr('data-term'));
|
||||
$.getJSON(url, function (data) {
|
||||
if (data.length != 0) {
|
||||
var html_to_add = "";
|
||||
|
|
Loading…
Reference in New Issue