diff --git a/bin/WebStats.py b/bin/WebStats.py
index 5573b8fd..eba2c0c4 100755
--- a/bin/WebStats.py
+++ b/bin/WebStats.py
@@ -15,24 +15,12 @@ from Helper import Process
 from pyfaup.faup import Faup
 
 # Config Var
-threshold_need_to_look = 50
-range_to_look = 10
-threshold_to_plot = 1 # 500%
-to_plot = set()
-clean_frequency = 10  # minutes
+threshold_total_sum = 200 # Above this value, a keyword is eligible for a progression
+threshold_increase = 1.0 # The minimum increase ratio of the keyword occurrence over num_day_to_look days
+max_set_cardinality = 10 # The cardinality of the progression set
+num_day_to_look = 5 # The detection of the progression starts num_day_to_look days in the past
 
-
-def analyse(server, field_name):
-    field = url_parsed[field_name]
-    if field is not None:
-        prev_score = server.hget(field, date)
-        if prev_score is not None:
-            server.hset(field, date, int(prev_score) + 1)
-        else:
-            server.hset(field, date, 1)
-
-
-def analyse_and_progression(server, field_name):
+def analyse(server, field_name, date, url_parsed):
     field = url_parsed[field_name]
     if field is not None:
         prev_score = server.hget(field, date)
@@ -44,28 +31,57 @@ def analyse_and_progression(server, field_name):
-                if check_for_progression(server, field, date):
-                    to_plot.add(field)
         else:
             server.hset(field, date, 1)
+        if field_name == "domain": # save domain in a set for the monthly plot
+            domain_set_name = "domain_set_" + date[0:6]
+            server.sadd(domain_set_name, field)
+            print "added in " + domain_set_name + ": " + field
+
+
+def get_date_range(num_day):
+    curr_date = datetime.date.today()
+    date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
+    date_list = []
+
-def check_for_progression(server, field, date):
-    previous_data = set()
-    tot_sum = 0
-    for i in range(0, range_to_look):
-        curr_value = server.hget(field, Date(date).substract_day(i))
-        if curr_value is None:  # no further data
-            break
-        else:
-            curr_value = int(curr_value)
-            previous_data.add(curr_value)
-            tot_sum += curr_value
-            if i == 0:
-                today_val = curr_value
+    for i in range(0, num_day+1):
+        date_list.append(date.substract_day(i))
+    return date_list
-
-    print 'totsum=' + str(tot_sum)
-    print 'div=' + str(tot_sum / today_val)
-    if tot_sum / today_val >= threshold_to_plot:
-        return True
-    else:
-        return False
+
+
+def compute_progression(server, field_name, num_day, url_parsed):
+    redis_progression_name = 'top_progression_'+field_name
+    redis_progression_name_set = 'top_progression_'+field_name+'_set'
+
+    keyword = url_parsed[field_name]
+    if keyword is not None:
+        date_range = get_date_range(num_day)
+
+        # check if this keyword is eligible for progression
+        keyword_total_sum = 0
+        value_list = []
+        for date in date_range:
+            curr_value = server.hget(keyword, date)
+            value_list.append(int(curr_value if curr_value is not None else 0))
+            keyword_total_sum += int(curr_value) if curr_value is not None else 0
+        oldest_value = value_list[-1] if value_list[-1] != 0 else 1 # avoid zero division
+        keyword_increase = value_list[0] / oldest_value # ratio of today's count to the oldest day's
+
+        # filter
+        if (keyword_total_sum > threshold_total_sum) and (keyword_increase > threshold_increase):
+
+            if server.sismember(redis_progression_name_set, keyword): # keyword already tracked
+                server.hset(redis_progression_name, keyword, keyword_increase) # update its value
+
+            elif (server.scard(redis_progression_name_set) < max_set_cardinality):
+                server.sadd(redis_progression_name_set, keyword)
+                server.hset(redis_progression_name, keyword, keyword_increase) # record its value so later hgets do not return None
+
+            else: # not in the set
+                # check the value of all members to find the current minimum
+                member_set = []
+                for keyw in server.smembers(redis_progression_name_set):
+                    member_set.append((keyw, int(server.hget(redis_progression_name, keyw))))
+                member_set.sort(key=lambda tup: tup[1])
+                if member_set[0][1] < keyword_increase:
+                    # remove the minimum from the set and add the new keyword
+                    server.srem(redis_progression_name_set, member_set[0][0])
+                    server.sadd(redis_progression_name_set, keyword)
+                    server.hset(redis_progression_name, keyword, keyword_increase)
 
 if __name__ == '__main__':
@@ -89,18 +104,18 @@ if __name__ == '__main__':
         host=p.config.get("Redis_Level_DB", "host"),
         port=p.config.get("Redis_Level_DB", "port"),
         db=p.config.get("Redis_Level_DB", "db"))
-
-    r_serv2 = redis.StrictRedis(
-        host=p.config.get("Redis_Level_DB_Domain", "host"),
-        port=p.config.get("Redis_Level_DB_Domain", "port"),
-        db=p.config.get("Redis_Level_DB_Domain", "db"))
+
+    r_serv_trend = redis.StrictRedis(
+        host=p.config.get("Redis_Level_DB_Trending", "host"),
+        port=p.config.get("Redis_Level_DB_Trending", "port"),
+        db=p.config.get("Redis_Level_DB_Trending", "db"))
 
     # FILE CURVE SECTION #
     csv_path_proto = os.path.join(os.environ['AIL_HOME'],
                                   p.config.get("Directories", "protocolstrending_csv"))
     protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
-                                  p.config.get("Directories", "protocolsfile"))
-
+                                      p.config.get("Directories", "protocolsfile"))
+
     csv_path_tld = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "tldstrending_csv"))
     tldsfile_path = os.path.join(os.environ['AIL_HOME'],
@@ -119,26 +134,29 @@ if __name__ == '__main__':
         if message is None:
             if generate_new_graph:
                 generate_new_graph = False
-                print 'Building graph'
                 today = datetime.date.today()
                 year = today.year
                 month = today.month
-                lib_words.create_curve_with_word_file(r_serv1, csv_path_proto,
+                print 'Building protocol graph'
+                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_proto,
                                                       protocolsfile_path, year,
                                                       month)
 
-                lib_words.create_curve_with_word_file(r_serv1, csv_path_tld,
+                print 'Building tld graph'
+                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_tld,
                                                       tldsfile_path, year,
                                                       month)
 
-                lib_words.create_curve_with_list(r_serv2, csv_path_domain,
-                                                 to_plot, year, month)
+                print 'Building domain graph'
+                lib_words.create_curve_from_redis_set(r_serv_trend, csv_path_domain,
+                                                      "domain", year,
+                                                      month)
                 print 'end building'
 
             publisher.debug("{} queue is empty, waiting".format(config_section))
             print 'sleeping'
-            time.sleep(5)
+            time.sleep(5*60)
             continue
 
         else:
@@ -147,7 +165,10 @@ if __name__ == '__main__':
             url, date = message.split()
             faup.decode(url)
             url_parsed = faup.get()
-
-            analyse(r_serv1, 'scheme')  # Scheme analysis
-            analyse(r_serv1, 'tld')  # Tld analysis
-            analyse_and_progression(r_serv2, 'domain')  # Domain analysis
+
+            analyse(r_serv_trend, 'scheme', date, url_parsed)  # Scheme analysis
+            analyse(r_serv_trend, 'tld', date, url_parsed)     # Tld analysis
+            analyse(r_serv_trend, 'domain', date, url_parsed)  # Domain analysis
+            compute_progression(r_serv_trend, 'scheme', num_day_to_look, url_parsed)
+            compute_progression(r_serv_trend, 'tld', num_day_to_look, url_parsed)
+            compute_progression(r_serv_trend, 'domain', num_day_to_look, url_parsed)
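For reference, a minimal in-memory sketch of the filter that compute_progression applies, with plain Python structures standing in for the two Redis keys (the top_progression_<field> hash and the top_progression_<field>_set set). All names and sample values here are illustrative, not part of the patch:

```python
# In-memory sketch of the progression filter; top_values mimics the
# 'top_progression_<field>' hash, top_set the '<...>_set' capped set.
threshold_total_sum = 200
threshold_increase = 1.0
max_set_cardinality = 10

def update_top_progression(top_values, top_set, keyword, daily_counts):
    # daily_counts[0] is today's count, daily_counts[-1] the oldest day's
    total = sum(daily_counts)
    oldest = daily_counts[-1] if daily_counts[-1] != 0 else 1  # avoid zero division
    increase = daily_counts[0] / oldest

    if total <= threshold_total_sum or increase <= threshold_increase:
        return  # keyword not eligible for a progression

    if keyword in top_set or len(top_set) < max_set_cardinality:
        top_set.add(keyword)
        top_values[keyword] = increase
    else:
        # evict the weakest member only if the new keyword beats it
        weakest = min(top_set, key=lambda k: top_values[k])
        if top_values[weakest] < increase:
            top_set.discard(weakest)
            del top_values[weakest]
            top_set.add(keyword)
            top_values[keyword] = increase

top_values, top_set = {}, set()
update_top_progression(top_values, top_set, 'example.com', [120, 80, 40, 10, 5, 2])
print top_set  # set(['example.com'])
```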
diff --git a/bin/packages/config.cfg.sample b/bin/packages/config.cfg.sample
index 718596e5..12330629 100644
--- a/bin/packages/config.cfg.sample
+++ b/bin/packages/config.cfg.sample
@@ -62,10 +62,10 @@ host = localhost
 port = 2016
 db = 0
 
-[Redis_Level_DB_Domain]
+[Redis_Level_DB_Trending]
 host = localhost
 port = 2016
-db = 3
+db = 0
 
 [Redis_Level_DB_Hashs]
 host = localhost
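With both sections now pointing at the same instance (localhost:2016, db 0), all the trending structures live in one keyspace. A sketch of the keys the code above reads and writes, assuming an instance is listening on the configured port; the sample values are illustrative:

```python
# Sketch of the trending keyspace (key names from the patch, values illustrative).
import redis

r = redis.StrictRedis(host='localhost', port=2016, db=0)

# per-keyword daily counters: one hash per scheme/tld/domain, one field per day
r.hset('example.com', '20160607', 3)          # written by analyse()

# monthly membership set feeding the domain curve CSV
r.sadd('domain_set_201606', 'example.com')    # analyse(), field_name == "domain"

# top-progression ranking maintained by compute_progression()
r.sadd('top_progression_domain_set', 'example.com')
r.hset('top_progression_domain', 'example.com', 4)

print r.hget('example.com', '20160607')       # '3'
print r.smembers('domain_set_201606')         # set(['example.com'])
```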
diff --git a/bin/packages/lib_words.py b/bin/packages/lib_words.py
index b2cf418b..e98609d7 100644
--- a/bin/packages/lib_words.py
+++ b/bin/packages/lib_words.py
@@ -88,7 +88,7 @@ def create_curve_with_word_file(r_serv, csvfilename,
                                 feederfilename, year, month):
 
     with open(feederfilename, 'rb') as f:
         # words of the files
-        words = sorted([word.strip() for word in f if word.strip()[0:2]!='//' ])
+        words = sorted([word.strip() for word in f if word.strip()[0:2]!='//' and word.strip()!='' ])
 
     headers = ['Date'] + words
@@ -112,7 +112,7 @@ def create_curve_with_word_file(r_serv, csvfilename,
                 row.append(value)
             writer.writerow(row)
 
-def create_curve_with_list(server, csvfilename, to_plot, year, month):
+def create_curve_from_redis_set(server, csvfilename, set_to_plot, year, month):
     """Create a csv file used with dygraph.
 
     :param r_serv: -- connexion to redis database
@@ -122,15 +122,17 @@ def create_curve_from_redis_set(server, csvfilename, set_to_plot, year, month):
     :param month: -- (integer) The month to process
 
     This function create a .csv file using datas in redis.
-    It's checking if the words contained in to_plot and
+    It's checking if the words contained in set_to_plot and
     their respectives values by days exists.
 
     """
     first_day = date(year, month, 01)
     last_day = date(year, month, calendar.monthrange(year, month)[1])
-    words = sorted(to_plot)
-
+
+    redis_set_name = set_to_plot + "_set_" + str(year) + str(month).zfill(2)
+    words = list(server.smembers(redis_set_name))
+
     headers = ['Date'] + words
     with open(csvfilename+'.csv', 'wb') as f:
         writer = csv.writer(f)
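create_curve_from_redis_set keeps the dygraph CSV layout of create_curve_with_word_file but takes its column list from the monthly Redis set instead of a feeder file. A rough sketch of the write loop that the unchanged tail of the function performs (that part is hidden by the hunk context, so this is an assumption based on the keyspace described earlier):

```python
# Assumed shape of the CSV build: one row per day of the month,
# one column per member of <set_to_plot>_set_YYYYMM.
import csv
import calendar
from datetime import date

def sketch_curve_csv(server, csvfilename, set_to_plot, year, month):
    redis_set_name = set_to_plot + "_set_" + str(year) + str(month).zfill(2)
    words = list(server.smembers(redis_set_name))

    last_day = calendar.monthrange(year, month)[1]
    with open(csvfilename + '.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(['Date'] + words)
        for day in range(1, last_day + 1):
            curr_date = date(year, month, day)
            field = curr_date.strftime("%Y%m%d")  # matches the daily hash fields
            row = [curr_date.isoformat()]
            for word in words:
                value = server.hget(word, field)
                row.append(int(value) if value is not None else 0)
            writer.writerow(row)
```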
diff --git a/var/www/Flask_server.py b/var/www/Flask_server.py
index 08ea0675..4bdee047 100755
--- a/var/www/Flask_server.py
+++ b/var/www/Flask_server.py
@@ -4,12 +4,14 @@
 import redis
 import ConfigParser
 import json
+import datetime
 from flask import Flask, render_template, jsonify, request
 import flask
 import os
 import sys
 sys.path.append(os.path.join(os.environ['AIL_BIN'], 'packages/'))
 import Paste
+from Date import Date
 
 # CONFIG #
 configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
@@ -35,6 +37,11 @@ r_serv_log = redis.StrictRedis(
     port=cfg.getint("Redis_Log", "port"),
     db=cfg.getint("Redis_Log", "db"))
 
+r_serv_charts = redis.StrictRedis(
+    host=cfg.get("Redis_Level_DB_Trending", "host"),
+    port=cfg.getint("Redis_Level_DB_Trending", "port"),
+    db=cfg.getint("Redis_Level_DB_Trending", "db"))
+
 app = Flask(__name__, static_url_path='/static/')
 
@@ -100,6 +107,20 @@ def showpaste(content_range):
     return render_template("show_saved_paste.html", date=p_date, source=p_source, encoding=p_encoding, language=p_language, size=p_size, mime=p_mime, lineinfo=p_lineinfo, content=p_content, initsize=len(p_content), duplicate_list = p_duplicate_list, simil_list = p_simil_list)
 
+
+def get_date_range(num_day):
+    curr_date = datetime.date.today()
+    date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))
+    date_list = []
+
+    for i in range(0, num_day+1):
+        date_list.append(date.substract_day(i))
+    return date_list
+
+
+# ============ ROUTES ============
+
 @app.route("/_logs")
 def logs():
@@ -110,6 +131,38 @@ def logs():
 def stuff():
     return jsonify(row1=get_queues(r_serv))
 
+@app.route("/_progressionCharts", methods=['GET'])
+def progressionCharts():
+    # To be used later
+    attribute_name = request.args.get('attributeName')
+    trending_name = request.args.get('trendingName')
+    bar_requested = True if request.args.get('bar') == "true" else False
+
+    if (bar_requested):
+        num_day = int(request.args.get('days'))
+        bar_values = []
+
+        date_range = get_date_range(num_day)
+        # retrieve all data from the last num_day days
+        for date in date_range:
+            curr_value = r_serv_charts.hget(attribute_name, date)
+            bar_values.append([date[0:4]+'/'+date[4:6]+'/'+date[6:8], int(curr_value if curr_value is not None else 0)])
+        return jsonify(bar_values)
+
+    else:
+        redis_progression_name = 'top_progression_'+trending_name
+        redis_progression_name_set = 'top_progression_'+trending_name+'_set'
+
+        member_set = []
+        for keyw in r_serv_charts.smembers(redis_progression_name_set):
+            keyw_value = r_serv_charts.hget(redis_progression_name, keyw)
+            keyw_value = keyw_value if keyw_value is not None else 0
+            member_set.append((keyw, int(keyw_value)))
+        member_set.sort(key=lambda tup: tup[1], reverse=True)
+        if len(member_set) == 0:
+            member_set.append(("No relevant data", int(100)))
+        return jsonify(member_set)
+
 @app.route("/search", methods=['POST'])
 def search():
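The route serves both chart types: with bar=true it returns per-day counts for one attribute, otherwise the ranked top-progression members. A hypothetical client call to illustrate the two response shapes (requests and the port are assumptions; the AIL pages query this route via AJAX):

```python
# Hypothetical client for the new route; URL and port are illustrative.
import requests

base = 'http://localhost:7000/_progressionCharts'

# per-day values for one attribute over the last 5 days,
# e.g. [["2016/06/07", 3], ["2016/06/06", 0], ...]
bars = requests.get(base, params={'attributeName': 'example.com',
                                  'bar': 'true', 'days': 5}).json()

# ranked progression members for a trending name,
# e.g. [["example.com", 4]] or [["No relevant data", 100]] when empty
top = requests.get(base, params={'trendingName': 'domain'}).json()
```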
diff --git a/var/www/templates/Trending.html b/var/www/templates/Trending.html
index f99a88f9..7c46cacc 100644
--- a/var/www/templates/Trending.html
+++ b/var/www/templates/Trending.html
[The three hunks of this template (@@ -15,7 +15,10 @@, @@ -58,26 +61,34 @@ and
@@ -86,6 +97,24 @@) lost their markup during extraction and cannot be fully
reconstructed. What survives shows the graph panels being rearranged: the
Tldstrending and Domainstrending includes stay in place, while
{% include 'trending_graphs/Wordstrending.html' %} moves below
{% include 'trending_graphs/Protocolstrending.html' %}. Roughly twenty new
lines are also added at the end of the template, presumably the markup for
the progression charts backed by the /_progressionCharts route.]