diff --git a/bin/Curve.py b/bin/Curve.py index 4c3378ea..cd641e54 100755 --- a/bin/Curve.py +++ b/bin/Curve.py @@ -28,9 +28,24 @@ from pubsublogger import publisher from packages import lib_words import os import datetime +import calendar from Helper import Process +# Config Variables +top_term_freq_max_set_cardinality = 50 # Max cardinality of the terms frequences set + + +def getValueOverRange(word, startDate, num_day): + oneDay = 60*60*24 + to_return = 0 + for timestamp in range(startDate, startDate - num_day*oneDay, -oneDay): + value = server_term.hget(timestamp, word) + to_return += int(value) if value is not None else 0 + return to_return + + + if __name__ == "__main__": publisher.port = 6380 publisher.channel = "Script" @@ -44,6 +59,11 @@ if __name__ == "__main__": port=p.config.get("Redis_Level_DB_Curve", "port"), db=p.config.get("Redis_Level_DB_Curve", "db")) + server_term = redis.StrictRedis( + host=p.config.get("Redis_Level_DB_TermFreq", "host"), + port=p.config.get("Redis_Level_DB_TermFreq", "port"), + db=p.config.get("Redis_Level_DB_TermFreq", "db")) + # FUNCTIONS # publisher.info("Script Curve started") @@ -56,6 +76,7 @@ if __name__ == "__main__": message = p.get_from_set() prec_filename = None generate_new_graph = False + iii = 0 while True: if message is not None: generate_new_graph = True @@ -65,11 +86,49 @@ if __name__ == "__main__": date = temp[-4] + temp[-3] + temp[-2] low_word = word.lower() - prev_score = r_serv1.hget(low_word, date) - if prev_score is not None: - r_serv1.hset(low_word, date, int(prev_score) + int(score)) + r_serv1.hincrby(low_word, date, int(score)) + + # Term Frequency + top_termFreq_setName_day = ["TopTermFreq_set_day", 1] + top_termFreq_setName_week = ["TopTermFreq_set_week", 7] + top_termFreq_setName_month = ["TopTermFreq_set_month", 31] + top_termFreq_set_array = [top_termFreq_setName_day,top_termFreq_setName_week, top_termFreq_setName_month] + timestamp = calendar.timegm((int(temp[-4]), int(temp[-3]), int(temp[-2]), 0, 0, 0)) + + # Update redis + curr_word_value = int(server_term.hincrby(timestamp, low_word, int(score))) + +# print '+----------------------------------------------------------------' + # Manage Top set + for curr_set, curr_num_day in top_termFreq_set_array: + + if server_term.scard(curr_set) < top_term_freq_max_set_cardinality: + server_term.sadd(curr_set, low_word) + elif server_term.sismember(curr_set, low_word): + continue + + else: + top_termFreq = server_term.smembers(curr_set) + sorted_top_termFreq_set = [] + for word in top_termFreq: + word_value = getValueOverRange(word, timestamp, curr_num_day) + sorted_top_termFreq_set.append((word, word_value)) + + sorted_top_termFreq_set.sort(key=lambda tup: tup[1]) +# if curr_num_day == 1: +# print sorted_top_termFreq_set + curr_word_value = getValueOverRange(low_word, timestamp, curr_num_day) + + if curr_word_value > int(sorted_top_termFreq_set[0][1]): + print str(curr_num_day)+':', low_word, curr_word_value, '\t', sorted_top_termFreq_set[0][0], sorted_top_termFreq_set[0][1], '\t', curr_word_value > sorted_top_termFreq_set[0][1] + #print sorted_top_termFreq_set + server_term.srem(curr_set, sorted_top_termFreq_set[0][0]) + server_term.sadd(curr_set, low_word) + if iii == 2: + iii-=1 else: - r_serv1.hset(low_word, date, score) + iii+=1 + else: if generate_new_graph: diff --git a/bin/LAUNCH.sh b/bin/LAUNCH.sh index b2f4029a..d4a457cd 100755 --- a/bin/LAUNCH.sh +++ b/bin/LAUNCH.sh @@ -58,6 +58,10 @@ function launching_redis { screen -S "Redis" -X screen -t "6380" bash -c 'redis-server '$conf_dir'6380.conf ; read x' sleep 0.1 screen -S "Redis" -X screen -t "6381" bash -c 'redis-server '$conf_dir'6381.conf ; read x' + + # For Words and curves + sleep 0.1 + screen -S "Redis" -X screen -t "6382" bash -c 'redis-server '$conf_dir'6382.conf ; read x' } function launching_lvldb { diff --git a/bin/ModuleStats.py b/bin/ModuleStats.py index 10da3f7c..b85c2ae5 100755 --- a/bin/ModuleStats.py +++ b/bin/ModuleStats.py @@ -30,14 +30,10 @@ def get_date_range(num_day): def compute_most_posted(server, message): module, num, keyword, paste_date = message.split(';') - redis_progression_name_set = 'top_'+ module +'_set' + redis_progression_name_set = 'top_'+ module +'_set_' + paste_date # Add/Update in Redis - prev_score = server.hget(paste_date, module+'-'+keyword) - if prev_score is not None: - ok = server.hset(paste_date, module+'-'+keyword, int(prev_score) + int(num)) - else: - ok = server.hset(paste_date, module+'-'+keyword, int(num)) + server.hincrby(paste_date, module+'-'+keyword, int(num)) # Compute Most Posted date = get_date_range(0)[0] @@ -47,44 +43,64 @@ def compute_most_posted(server, message): curr_value = server.hget(date, module+'-'+keyword) keyword_total_sum += int(curr_value) if curr_value is not None else 0 - if keyword in server.smembers(redis_progression_name_set): # if it is already in the set - return + if server.zcard(redis_progression_name_set) < max_set_cardinality: + server.zadd(redis_progression_name_set, float(keyword_total_sum), keyword) - if (server.scard(redis_progression_name_set) < max_set_cardinality): - server.sadd(redis_progression_name_set, keyword) + else: # not in set + member_set = server.zrangebyscore(redis_progression_name_set, '-inf', '+inf', withscores=True, start=0, num=1) + # Member set is a list of (value, score) pairs + if int(member_set[0][1]) < keyword_total_sum: + #remove min from set and add the new one + print module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')' + server.zrem(redis_progression_name_set, member_set[0][0]) + server.zadd(redis_progression_name_set, float(keyword_total_sum), keyword) + print redis_progression_name_set - else: #not in the set - #Check value for all members - member_set = [] - for keyw in server.smembers(redis_progression_name_set): - keyw_value = server.hget(paste_date, module+'-'+keyw) - if keyw_value is not None: - member_set.append((keyw, int(keyw_value))) - else: #No data for this set for today - member_set.append((keyw, int(0))) - member_set.sort(key=lambda tup: tup[1]) - if len(member_set) > 0: - if member_set[0][1] < keyword_total_sum: - #remove min from set and add the new one - print module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')' - server.srem(redis_progression_name_set, member_set[0][0]) - server.sadd(redis_progression_name_set, keyword) +# if keyword in server.smembers(redis_progression_name_set): # if it is already in the set +# return +# +# if (server.scard(redis_progression_name_set) < max_set_cardinality): +# server.sadd(redis_progression_name_set, keyword) + +# else: #not in the set +# #Check value for all members +# member_set = [] +# for keyw in server.smembers(redis_progression_name_set): +# keyw_value = server.hget(paste_date, module+'-'+keyw) +# if keyw_value is not None: +# member_set.append((keyw, int(keyw_value))) +# else: #No data for this set for today +# member_set.append((keyw, int(0))) +# member_set.sort(key=lambda tup: tup[1]) +# if len(member_set) > 0: +# if member_set[0][1] < keyword_total_sum: +# #remove min from set and add the new one +# print module + ': adding ' +keyword+ '(' +str(keyword_total_sum)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')' +# server.srem(redis_progression_name_set, member_set[0][0]) +# server.sadd(redis_progression_name_set, keyword) def compute_provider_info(server, path): redis_all_provider = 'all_provider_set' - redis_avg_size_name_set = 'top_size_set' - redis_providers_name_set = 'providers_set' paste = Paste.Paste(path) paste_size = paste._get_p_size() paste_provider = paste.p_source - paste_date = paste._get_p_date() - new_avg = paste_size + paste_date = str(paste._get_p_date()) + redis_sum_size_set = 'top_size_set_' + paste_date + redis_avg_size_name_set = 'top_avg_size_set_' + paste_date + redis_providers_name_set = 'providers_set_' + paste_date # Add/Update in Redis server.sadd(redis_all_provider, paste_provider) + + num_paste = int(server.hincrby(paste_provider+'_num', paste_date, 1)) + sum_size = float(server.hincrbyfloat(paste_provider+'_size', paste_date, paste_size)) + new_avg = float(sum_size) / float(num_paste) + server.hset(paste_provider +'_avg', paste_date, new_avg) + + ''' prev_num_paste = server.hget(paste_provider+'_num', paste_date) if prev_num_paste is not None: ok = server.hset(paste_provider+'_num', paste_date, int(prev_num_paste)+1) @@ -99,12 +115,28 @@ def compute_provider_info(server, path): else: ok = server.hset(paste_provider+'_num', paste_date, 1) prev_num_paste = 0 + ''' # # Compute Most Posted # # Size + if server.zcard(redis_sum_size_set) < max_set_cardinality or server.zscore(redis_sum_size_set, paste_provider) != "nil": + server.zadd(redis_sum_size_set, float(num_paste), paste_provider) + server.zadd(redis_avg_size_name_set, float(new_avg), paste_provider) + else: #set full capacity + member_set = server.zrangebyscore(redis_sum_size_set, '-inf', '+inf', withscores=True, start=0, num=1) + # Member set is a list of (value, score) pairs + if float(member_set[0][1]) < new_avg: + #remove min from set and add the new one + print 'Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')' + server.zrem(redis_sum_size_set, member_set[0][0]) + server.zadd(redis_sum_size_set, float(sum_size), paste_provider) + server.zrem(redis_avg_size_name_set, member_set[0][0]) + server.zadd(redis_avg_size_name_set, float(new_avg), paste_provider) + + ''' if paste_provider not in server.smembers(redis_avg_size_name_set): # if it is already in the set if (server.scard(redis_avg_size_name_set) < max_set_cardinality): server.sadd(redis_avg_size_name_set, paste_provider) @@ -125,8 +157,22 @@ def compute_provider_info(server, path): print 'Size - adding ' +paste_provider+ '(' +str(new_avg)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')' server.srem(redis_avg_size_name_set, member_set[0][0]) server.sadd(redis_avg_size_name_set, paste_provider) + ''' # Num + # if set not full or provider already present + if server.zcard(redis_providers_name_set) < max_set_cardinality or server.zscore(redis_providers_name_set, paste_provider) != "nil": + server.zadd(redis_providers_name_set, float(num_paste), paste_provider) + else: #set at full capacity + member_set = server.zrangebyscore(redis_providers_name_set, '-inf', '+inf', withscores=True, start=0, num=1) + # Member set is a list of (value, score) pairs + if int(member_set[0][1]) < num_paste: + #remove min from set and add the new one + print 'Num - adding ' +paste_provider+ '(' +str(num_paste)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')' + server.zrem(member_set[0][0]) + server.zadd(redis_providers_name_set, float(num_paste), paste_provider) + + ''' if paste_provider not in server.smembers(redis_providers_name_set): # if it is already in the set if (server.scard(redis_providers_name_set) < max_set_cardinality): server.sadd(redis_providers_name_set, paste_provider) @@ -146,6 +192,7 @@ def compute_provider_info(server, path): print 'Num - adding ' +paste_provider+ '(' +str(int(prev_num_paste)+1)+') in set and removing '+member_set[0][0]+'('+str(member_set[0][1])+')' server.srem(redis_providers_name_set, member_set[0][0]) server.sadd(redis_providers_name_set, paste_provider) + ''' if __name__ == '__main__': # If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh) diff --git a/bin/WebStats.py b/bin/WebStats.py index 243c59a7..6fdd9ee3 100755 --- a/bin/WebStats.py +++ b/bin/WebStats.py @@ -23,16 +23,11 @@ num_day_to_look = 5 # the detection of the progression start num_day_to_lo def analyse(server, field_name, date, url_parsed): field = url_parsed[field_name] if field is not None: - prev_score = server.hget(field, date) - if prev_score is not None: - server.hset(field, date, int(prev_score) + 1) - - else: - server.hset(field, date, 1) - if field_name == "domain": #save domain in a set for the monthly plot - domain_set_name = "domain_set_" + date[0:6] - server.sadd(domain_set_name, field) - print "added in " + domain_set_name +": "+ field + server.hincrby(field, date, 1) + if field_name == "domain": #save domain in a set for the monthly plot + domain_set_name = "domain_set_" + date[0:6] + server.sadd(domain_set_name, field) + print "added in " + domain_set_name +": "+ field def get_date_range(num_day): curr_date = datetime.date.today() @@ -84,7 +79,7 @@ def compute_progression(server, field_name, num_day, url_parsed): member_set.append((keyw, int(server.hget(redis_progression_name, keyw)))) print member_set member_set.sort(key=lambda tup: tup[1]) - if member_set[0] < keyword_increase: + if member_set[0][1] < keyword_increase: #remove min from set and add the new one server.srem(redis_progression_name_set, member_set[0]) server.sadd(redis_progression_name_set, keyword) @@ -107,11 +102,6 @@ if __name__ == '__main__': publisher.info("Makes statistics about valid URL") # REDIS # - r_serv1 = redis.StrictRedis( - host=p.config.get("Redis_Level_DB", "host"), - port=p.config.get("Redis_Level_DB", "port"), - db=p.config.get("Redis_Level_DB", "db")) - r_serv_trend = redis.StrictRedis( host=p.config.get("Redis_Level_DB_Trending", "host"), port=p.config.get("Redis_Level_DB_Trending", "port"), diff --git a/bin/empty_queue.py b/bin/empty_queue.py index a5ccae68..f1b3c453 100755 --- a/bin/empty_queue.py +++ b/bin/empty_queue.py @@ -24,6 +24,7 @@ if __name__ == "__main__": publisher.channel = "Script" config_section = ['Global', 'Duplicates', 'Indexer', 'Attributes', 'Lines', 'DomClassifier', 'Tokenize', 'Curve', 'Categ', 'CreditCards', 'Mail', 'Onion', 'DumpValidOnion', 'Web', 'WebStats', 'Release', 'Credential', 'Cve', 'Phone', 'SourceCode', 'Keys'] + config_section = ['Curve'] for queue in config_section: print 'dropping: ' + queue diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py index 63a8bb4a..2a50be6b 100755 --- a/var/www/Flask_server.py +++ b/var/www/Flask_server.py @@ -188,23 +188,31 @@ def get_date_range(num_day): # Iterate over elements in the module provided and return the today data or the last data # return format: [('passed_days', num_of_passed_days), ('elem_name1', elem_value1), ('elem_name2', elem_value2)]] def get_top_relevant_data(server, module_name): - redis_progression_name_set = 'top_'+ module_name +'_set' days = 0 for date in get_date_range(15): - member_set = [] - for keyw in server.smembers(redis_progression_name_set): - redis_progression_name = module_name+'-'+keyw - keyw_value = server.hget(date ,redis_progression_name) - keyw_value = keyw_value if keyw_value is not None else 0 - member_set.append((keyw, int(keyw_value))) - member_set.sort(key=lambda tup: tup[1], reverse=True) - if member_set[0][1] == 0: #No data for this date + redis_progression_name_set = 'top_'+ module_name +'_set_' + date + member_set = server.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True) + #print member_set + if len(member_set) == 0: #No data for this date days += 1 - continue else: member_set.insert(0, ("passed_days", days)) return member_set +# member_set = [] +# for keyw in server.smembers(redis_progression_name_set): +# redis_progression_name = module_name+'-'+keyw +# keyw_value = server.hget(date ,redis_progression_name) +# keyw_value = keyw_value if keyw_value is not None else 0 +# member_set.append((keyw, int(keyw_value))) +# member_set.sort(key=lambda tup: tup[1], reverse=True) +# if member_set[0][1] == 0: #No data for this date +# days += 1 +# continue +# else: +# member_set.insert(0, ("passed_days", days)) +# return member_set + # ========= CACHE CONTROL ======== @app.after_request def add_header(response): @@ -300,10 +308,12 @@ def providersChart(): for date in date_range: curr_value_size = r_serv_charts.hget(keyword_name+'_'+'size', date) curr_value_num = r_serv_charts.hget(keyword_name+'_'+'num', date) + curr_value_size_avg = r_serv_charts.hget(keyword_name+'_'+'avg', date) if module_name == "size": - curr_value_num = curr_value_num if curr_value_num is not None else 0 - curr_value_num = curr_value_num if int(curr_value_num) != 0 else 10000000000 - curr_value = float(curr_value_size if curr_value_size is not None else 0.0) / float(curr_value_num) + curr_value = float(curr_value_size_avg if curr_value_size_avg is not None else 0) + #curr_value_num = curr_value_num if curr_value_num is not None else 0 + #curr_value_num = curr_value_num if int(curr_value_num) != 0 else 10000000000 + #curr_value = float(curr_value_size if curr_value_size is not None else 0.0) / float(curr_value_num) else: curr_value = float(curr_value_num if curr_value_num is not None else 0.0) @@ -312,8 +322,18 @@ def providersChart(): return jsonify(bar_values) else: - redis_provider_name_set = 'top_size_set' if module_name == "size" else 'providers_set' + #redis_provider_name_set = 'top_size_set' if module_name == "size" else 'providers_set' + redis_provider_name_set = 'top_avg_size_set_' if module_name == "size" else 'providers_set_' + redis_provider_name_set = redis_provider_name_set + get_date_range(0)[0] + + member_set = r_serv_charts.zrangebyscore(redis_provider_name_set, '-inf', '+inf', withscores=True, start=0, num=8) + # Member set is a list of (value, score) pairs + if len(member_set) == 0: + member_set.append(("No relevant data", float(100))) + return jsonify(member_set) + +''' # Iterate over element in top_x_set and retreive their value member_set = [] for keyw in r_serv_charts.smembers(redis_provider_name_set): @@ -339,7 +359,7 @@ def providersChart(): if len(member_set) == 0: member_set.append(("No relevant data", float(100))) return jsonify(member_set) - +''' @app.route("/search", methods=['POST']) @@ -465,18 +485,18 @@ def sentiment_analysis_getplotdata(): dateStart_timestamp = calendar.timegm(dateStart.timetuple()) to_return = {} - for cur_provider in r_serv_charts.smembers('providers_set'): - cur_provider_name = cur_provider + '_' - list_date = {} - for cur_timestamp in range(int(dateStart_timestamp), int(dateStart_timestamp)-sevenDays-oneHour, -oneHour): - cur_set_name = cur_provider_name + str(cur_timestamp) + for cur_provider in r_serv_charts.zrangebyscore('providers_set_'+ get_date_range(0)[0], '-inf', '+inf', start=0, num=8): + cur_provider_name = cur_provider + '_' + list_date = {} + for cur_timestamp in range(int(dateStart_timestamp), int(dateStart_timestamp)-sevenDays-oneHour, -oneHour): + cur_set_name = cur_provider_name + str(cur_timestamp) - list_value = [] - for cur_id in r_serv_sentiment.smembers(cur_set_name): - cur_value = r_serv_sentiment.get(cur_id) - list_value.append(cur_value) - list_date[cur_timestamp] = list_value - to_return[cur_provider] = list_date + list_value = [] + for cur_id in r_serv_sentiment.smembers(cur_set_name): + cur_value = r_serv_sentiment.get(cur_id) + list_value.append(cur_value) + list_date[cur_timestamp] = list_value + to_return[cur_provider] = list_date return jsonify(to_return) @@ -532,6 +552,37 @@ def sentiment_analysis_plot_tool_getdata(): return jsonify(to_return) +@app.route("/test/") #completely shows the paste in a new tab +def test(): + + server = redis.StrictRedis( + host=cfg.get("Redis_Level_DB_TermFreq", "host"), + port=cfg.getint("Redis_Level_DB_TermFreq", "port"), + db=cfg.getint("Redis_Level_DB_TermFreq", "db")) + + array1 = [] + for w in server.smembers('TopTermFreq_set_day'): + val = server.hget('1471478400', w) + val = val if val is not None else 0 + val2 = server.hget('1471392000', w) + val2 = val2 if val2 is not None else 0 + array1.append((w, (int(val), int(val2)))) + +# array2 = [] +# for w in server.smembers('TopTermFreq_set_week'): +# array2.append((w, int(server.hget('1471478400', w)))) + + array1.sort(key=lambda tup: tup[1][0]+tup[1][1]) + stri = "

day

" + for e in array1: + stri += "

"+ e[0] + "\t" + str(e[1]) +"

" +# stri += "

week

" +# for e in array2: +# stri += "

"+ e[0] + "\t" + str(e[1]) +"

" + + return stri + + @app.route("/showsavedpaste/") #completely shows the paste in a new tab def showsavedpaste(): diff --git a/var/www/static/js/sentiment_trending.js b/var/www/static/js/sentiment_trending.js index 2872c502..1d69f6ed 100644 --- a/var/www/static/js/sentiment_trending.js +++ b/var/www/static/js/sentiment_trending.js @@ -42,7 +42,7 @@ $.getJSON("/sentiment_analysis_getplotdata/", function(data) { - //console.log(data); + console.log(data); var all_data = []; var plot_data = []; var graph_avg = []; @@ -136,7 +136,8 @@ $.getJSON("/sentiment_analysis_getplotdata/", var placeholder = '.sparkLineStatsWeek' + num; $(placeholder).sparkline(plot_data[graphNum], sparklineOptions); $(placeholder+'t').text(curr_provider); - $(placeholder+'s').text(curr_avg.toFixed(5)); + var curr_avg_text = isNaN(curr_avg) ? "No data" : curr_avg.toFixed(5); + $(placeholder+'s').text(curr_avg_text); sparklineOptions.barWidth = 18; sparklineOptions.tooltipFormat = ' Avg: {{value}} ' @@ -144,11 +145,23 @@ $.getJSON("/sentiment_analysis_getplotdata/", sparklineOptions.tooltipFormat = ' {{offset:names}}, {{value}} ' sparklineOptions.barWidth = 2; - sparklineOptions.tooltipValueLookups = { names: offset_to_time}; sparklineOptions.chartRangeMax = max_value_day; sparklineOptions.chartRangeMin = -max_value_day; + var avgName = ".pannelWeek" + num; + if (curr_avg > 0) { + $(avgName).addClass("panel-success") + } else if(curr_avg < 0) { + $(avgName).addClass("panel-danger") + } else if(isNaN(curr_avg)) { + $(avgName).addClass("panel-info") + } else { + $(avgName).addClass("panel-warning") + } + + + // print today var data_length = plot_data[graphNum].length; var data_today = plot_data[graphNum].slice(data_length-24, data_length); @@ -170,6 +183,17 @@ $.getJSON("/sentiment_analysis_getplotdata/", sparklineOptions.barWidth = 2; $(placeholder+'s').text(day_avg_text); + avgName = ".pannelToday" + num; + if (day_avg > 0) { + $(avgName).addClass("panel-success") + } else if(day_avg < 0) { + $(avgName).addClass("panel-danger") + } else if(isNaN(day_sum/day_sum_elem)) { + $(avgName).addClass("panel-info") + } else { + $(avgName).addClass("panel-warning") + } + }//for loop @@ -338,6 +362,7 @@ $.getJSON("/sentiment_analysis_getplotdata/", chart_canvas2.render(); + } ); diff --git a/var/www/templates/sentiment_analysis_trending.html b/var/www/templates/sentiment_analysis_trending.html index 709eb694..89b4f675 100644 --- a/var/www/templates/sentiment_analysis_trending.html +++ b/var/www/templates/sentiment_analysis_trending.html @@ -18,6 +18,17 @@