AIL-framework/bin/WebStats.py

#!/usr/bin/env python2
# -*-coding:UTF-8 -*

"""
The WebStats Module
======================

This module computes statistics on the URLs collected by the web module.
It considers the TLD, the domain and the protocol (scheme) of each URL.

"""

import time
import datetime
import redis
import os
from packages import lib_words
from packages.Date import Date
from pubsublogger import publisher
from Helper import Process
from pyfaup.faup import Faup

# Config Var
# A keyword enters the top-progression zset only when its summed count over
# the look-back window is strictly above this value.
threshold_total_sum = 200
# Minimum progression score, compared with a strict '>'. The score is the sum
# of the day-over-day ratios computed in compute_progression_word, not a
# percentage.
threshold_increase = 1.0
# Nominal cardinality of the progression set; compute_progression re-scores
# the first 2*max_set_cardinality members of the zset on every call.
max_set_cardinality = 10
# Number of past days inspected when computing a progression (today is
# included on top of these, see get_date_range).
num_day_to_look = 5

def analyse(server, field_name, date, url_parsed):
    """Count one occurrence of a URL component for the given date.

    The value found under ``field_name`` in ``url_parsed`` is used as the name
    of a Redis hash whose ``date`` field is incremented by one. Domains are
    additionally recorded in a set named ``domain_set_`` followed by the first
    six characters of ``date``.

    Args:
        server: Redis client; ``hincrby`` and ``sadd`` are called on it.
        field_name: key to read from ``url_parsed`` (the script uses
            'scheme', 'tld' and 'domain').
        date: date string; presumably formatted YYYYMMDD so that its first
            six characters identify the month -- confirm against the producer.
        url_parsed: mapping of URL components; a ``None`` value is ignored.
    """
    field = url_parsed[field_name]
    if field is None:
        # Component absent from this URL: nothing to count.
        return

    server.hincrby(field, date, 1)
    if field_name != "domain":
        return

    # Save the domain in a set for the monthly plot.
    domain_set_name = "domain_set_" + date[:6]
    server.sadd(domain_set_name, field)
    print("added in " + domain_set_name + ": " + field)

def get_date_range(num_day):
    """Build the list of dates covering today and the ``num_day`` days before.

    A ``Date`` is created from today's date rendered as YYYYMMDD, and
    ``substract_day(i)`` is called on it for every ``i`` from 0 to ``num_day``
    inclusive, in that order.

    Args:
        num_day: number of days to go back.

    Returns:
        A list of ``num_day + 1`` items, each being whatever
        ``Date.substract_day`` returns (presumably a YYYYMMDD string, most
        recent first -- confirm in packages/Date.py).
    """
    today = datetime.date.today()
    anchor = Date("{0}{1:02d}{2:02d}".format(today.year, today.month, today.day))
    return [anchor.substract_day(offset) for offset in range(num_day + 1)]

def compute_progression_word(server, num_day, keyword):
    """Compute the progression score and the total count for one keyword.

    The per-day counters of the keyword are read from the Redis hash named
    ``keyword`` (one field per date returned by ``get_date_range``). The
    progression is the sum, over consecutive days taken from the oldest to
    the most recent, of the ratio ``value[i] / value[i-1]``.

    Args:
        server: Redis client; only ``hget(keyword, date)`` is called.
        num_day: number of days to look back, forwarded to ``get_date_range``.
        keyword: name of the Redis hash holding the daily counters.

    Returns:
        A tuple ``(keyword_increase, keyword_total_sum)`` where
        ``keyword_increase`` is the summed ratios as a float and
        ``keyword_total_sum`` is the sum of the daily counters as an int.
    """
    # Missing days count as 0.
    value_list = []
    for date in get_date_range(num_day):
        curr_value = server.hget(keyword, date)
        value_list.append(int(curr_value) if curr_value is not None else 0)
    keyword_total_sum = sum(value_list)

    # The date range is walked backwards so that each value is compared with
    # the one that precedes it in time (assumes get_date_range lists the most
    # recent day first -- confirm in packages/Date.py).
    value_list.reverse()

    keyword_increase = 0.0
    for previous, current in zip(value_list, value_list[1:]):
        divisor = previous if previous != 0 else 1  # Avoid zero division
        # True division: under Python 2 an int / int would truncate every
        # ratio (3/2 -> 1, 1/2 -> 0) and distort the score.
        keyword_increase += float(current) / divisor

    return (keyword_increase, keyword_total_sum)


def compute_progression(server, field_name, num_day, url_parsed):
    """Recompute the ``z_top_progression_<field_name>`` sorted set.

    Steps:
        - compute the progression of the current field value;
        - re-compute the progression of the first 2*max_set_cardinality
          members of the sorted set (highest score first): each one is
          removed, then added back with its fresh score only if it still
          passes both thresholds;
        - add the current field value if it passes both thresholds.

    Args:
        server: Redis client; ``zrevrangebyscore``, ``zrem`` and ``zadd`` are
            called on it (``zadd`` with the ``(name, score, member)``
            argument order).
        field_name: key to read from ``url_parsed``; also the suffix of the
            sorted-set name.
        num_day: look-back window forwarded to ``compute_progression_word``.
        url_parsed: mapping of URL components; nothing happens when the value
            under ``field_name`` is ``None``.
    """
    zset_name = "z_top_progression_" + field_name

    keyword = url_parsed[field_name]
    if keyword is None:
        return

    # Score the incoming keyword before touching the sorted set.
    keyword_increase, keyword_total_sum = compute_progression_word(server, num_day, keyword)

    # Refresh the current leaders; their stored score is ignored and
    # recomputed from the daily counters.
    leaders = server.zrevrangebyscore(zset_name, '+inf', '-inf', withscores=True, start=0, num=2*max_set_cardinality)
    for member, _stored_score in leaders:
        member_increase, member_total_sum = compute_progression_word(server, num_day, member)
        server.zrem(zset_name, member)
        still_eligible = member_total_sum > threshold_total_sum and member_increase > threshold_increase
        if still_eligible:
            server.zadd(zset_name, float(member_increase), member)

    # Filter before adding the current keyword.
    keyword_eligible = keyword_total_sum > threshold_total_sum and keyword_increase > threshold_increase
    if keyword_eligible:
        server.zadd(zset_name, float(keyword_increase), keyword)


if __name__ == '__main__':
    # Entry point of the WebStats module: consume "url date path" messages
    # from the input queue, update the scheme/tld/domain counters and the
    # top-progression sets in Redis, and rebuild the CSV curves whenever the
    # queue runs dry after new data has been processed.

    # If you wish to use another port or channel, do not forget to run a
    # subscriber accordingly (see launch_logs.sh).
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'WebStats'

    # Setup the I/O queues
    p = Process(config_section)

    # Send a description of the module to the logging
    publisher.info("Makes statistics about valid URL")

    # REDIS #
    r_serv_trend = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_Trending", "host"),
        port=p.config.get("Redis_Level_DB_Trending", "port"),
        db=p.config.get("Redis_Level_DB_Trending", "db"))

    # FILE CURVE SECTION #
    # Every path is resolved relative to the AIL_HOME environment variable.
    csv_path_proto = os.path.join(os.environ['AIL_HOME'],
                                  p.config.get("Directories", "protocolstrending_csv"))
    protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                                      p.config.get("Directories", "protocolsfile"))

    csv_path_tld = os.path.join(os.environ['AIL_HOME'],
                                p.config.get("Directories", "tldstrending_csv"))
    tldsfile_path = os.path.join(os.environ['AIL_HOME'],
                                 p.config.get("Directories", "tldsfile"))

    csv_path_domain = os.path.join(os.environ['AIL_HOME'],
                                   p.config.get("Directories", "domainstrending_csv"))

    faup = Faup()
    # Set when at least one message was processed since the last graph build,
    # so the curves are only regenerated when there is new data.
    generate_new_graph = False
    # Endless loop getting messages from the input queue
    while True:
        # Get one message from the input queue
        message = p.get_from_set()

        if message is None:
            if generate_new_graph:
                generate_new_graph = False
                today = datetime.date.today()
                year = today.year
                month = today.month

                print('Building protocol graph')
                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_proto,
                                                      protocolsfile_path, year,
                                                      month)

                print('Building tld graph')
                lib_words.create_curve_with_word_file(r_serv_trend, csv_path_tld,
                                                      tldsfile_path, year,
                                                      month)

                print('Building domain graph')
                lib_words.create_curve_from_redis_set(r_serv_trend, csv_path_domain,
                                                      "domain", year,
                                                      month)
                print('end building')

            publisher.debug("{} queue is empty, waiting".format(config_section))
            print('sleeping')
            time.sleep(5*60)
            continue

        generate_new_graph = True
        # A message is three whitespace-separated fields: url, date, path.
        # NOTE(review): any other number of fields raises ValueError here and
        # stops the module -- confirm the producer never emits such messages.
        url, date, path = message.split()
        faup.decode(url)
        url_parsed = faup.get()

        analyse(r_serv_trend, 'scheme', date, url_parsed)  # Scheme analysis
        analyse(r_serv_trend, 'tld', date, url_parsed)  # Tld analysis
        analyse(r_serv_trend, 'domain', date, url_parsed)  # Domain analysis
        compute_progression(r_serv_trend, 'scheme', num_day_to_look, url_parsed)
        compute_progression(r_serv_trend, 'tld', num_day_to_look, url_parsed)
        compute_progression(r_serv_trend, 'domain', num_day_to_look, url_parsed)
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`#!/usr/bin/env python2`
			`# --coding:UTF-8 -`
Improved description of modules inside the scripts 2017-05-09 11:13:16 +02:00
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`"""`
Improved description of modules inside the scripts 2017-05-09 11:13:16 +02:00			`The WebStats Module`
			`======================`

			`This module makes stats on URL recolted from the web module.`
			`It consider the TLD, Domain and protocol.`

Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`"""`

			`import time`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`import datetime`
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`import redis`
			`import os`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`from packages import lib_words`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`from packages.Date import Date`
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`from pubsublogger import publisher`
			`from Helper import Process`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`from pyfaup.faup import Faup`
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`# Config Var`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`threshold_total_sum = 200 # Above this value, a keyword is eligible for a progression`
			`threshold_increase = 1.0 # The percentage representing the keyword occurence since num_day_to_look`
			`max_set_cardinality = 10 # The cardinality of the progression set`
			`num_day_to_look = 5 # the detection of the progression start num_day_to_look in the past`
Fix pep8 2016-07-20 14:12:18 +02:00
Fixed bug introduced in merge-conflict 2016-07-22 09:25:05 +02:00			`def analyse(server, field_name, date, url_parsed):`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`field = url_parsed[field_name]`
			`if field is not None:`
- Modified redis connection (from levelDB to redis). - Added term frequency in curve. - Modified ModuleStats and WebStats to use redis special command (incrby instead of get-set and zset) and Flask to perform the correct queries. - Added panel color in sentiment trending 2016-08-18 15:34:19 +02:00			`server.hincrby(field, date, 1)`
			`if field_name == "domain": #save domain in a set for the monthly plot`
			`domain_set_name = "domain_set_" + date[0:6]`
			`server.sadd(domain_set_name, field)`
			`print "added in " + domain_set_name +": "+ field`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`def get_date_range(num_day):`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`curr_date = datetime.date.today()`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`date = Date(str(curr_date.year)+str(curr_date.month).zfill(2)+str(curr_date.day).zfill(2))`
			`date_list = []`

			`for i in range(0, num_day+1):`
			`date_list.append(date.substract_day(i))`
			`return date_list`

Webstats should correctly updates top_progression_zset (Not fully tested because not enough data. Will be tested latter) 2016-12-08 10:05:07 +01:00			`# Compute the progression for one keyword`
fix invalid variable propagation 2017-01-10 16:46:46 +01:00			`def compute_progression_word(server, num_day, keyword):`
Webstats should correctly updates top_progression_zset (Not fully tested because not enough data. Will be tested latter) 2016-12-08 10:05:07 +01:00			`date_range = get_date_range(num_day)`
			`# check if this keyword is eligible for progression`
			`keyword_total_sum = 0`
			`value_list = []`
			`for date in date_range: # get value up to date_range`
			`curr_value = server.hget(keyword, date)`
			`value_list.append(int(curr_value if curr_value is not None else 0))`
			`keyword_total_sum += int(curr_value) if curr_value is not None else 0`
			`oldest_value = value_list[-1] if value_list[-1] != 0 else 1 #Avoid zero division`

			`# The progression is based on the ratio: value[i] / value[i-1]`
			`keyword_increase = 0`
			`value_list_reversed = value_list[:]`
			`value_list_reversed.reverse()`
			`for i in range(1, len(value_list_reversed)):`
			`divisor = value_list_reversed[i-1] if value_list_reversed[i-1] != 0 else 1`
			`keyword_increase += value_list_reversed[i] / divisor`

			`return (keyword_increase, keyword_total_sum)`


			`'''`
			`recompute the set top_progression zset`
			`- Compute the current field progression`
			`- re-compute the current progression for each first 2*max_set_cardinality fields in the top_progression_zset`
			`'''`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`def compute_progression(server, field_name, num_day, url_parsed):`
Webstats should correctly updates top_progression_zset (Not fully tested because not enough data. Will be tested latter) 2016-12-08 10:05:07 +01:00			`redis_progression_name_set = "z_top_progression_"+field_name`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00
			`keyword = url_parsed[field_name]`
			`if keyword is not None:`
Webstats should correctly updates top_progression_zset (Not fully tested because not enough data. Will be tested latter) 2016-12-08 10:05:07 +01:00
			`#compute the progression of the current word`
fix invalid variable propagation 2017-01-10 16:46:46 +01:00			`keyword_increase, keyword_total_sum = compute_progression_word(server, num_day, keyword)`
Webstats should correctly updates top_progression_zset (Not fully tested because not enough data. Will be tested latter) 2016-12-08 10:05:07 +01:00
			`#re-compute the progression of 2*max_set_cardinality`
			`current_top = server.zrevrangebyscore(redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2*max_set_cardinality)`
fix invalid variable propagation 2017-01-10 16:46:46 +01:00			`for word, value in current_top:`
			`word_inc, word_tot_sum = compute_progression_word(server, num_day, word)`
Webstats should correctly updates top_progression_zset (Not fully tested because not enough data. Will be tested latter) 2016-12-08 10:05:07 +01:00			`server.zrem(redis_progression_name_set, word)`
			`if (word_tot_sum > threshold_total_sum) and (word_inc > threshold_increase):`
			`server.zadd(redis_progression_name_set, float(word_inc), word)`

			`# filter before adding`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`if (keyword_total_sum > threshold_total_sum) and (keyword_increase > threshold_increase):`
Webstats should correctly updates top_progression_zset (Not fully tested because not enough data. Will be tested latter) 2016-12-08 10:05:07 +01:00			`server.zadd(redis_progression_name_set, float(keyword_increase), keyword)`

Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00
			`if __name__ == '__main__':`
			`# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)`
			`# Port of the redis instance used by pubsublogger`
			`publisher.port = 6380`
			`# Script is the default channel used for the modules.`
			`publisher.channel = 'Script'`

			`# Section name in bin/packages/modules.cfg`
			`config_section = 'WebStats'`

			`# Setup the I/O queues`
			`p = Process(config_section)`

			`# Sent to the logging a description of the module`
			`publisher.info("Makes statistics about valid URL")`

			`# REDIS #`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`r_serv_trend = redis.StrictRedis(`
			`host=p.config.get("Redis_Level_DB_Trending", "host"),`
			`port=p.config.get("Redis_Level_DB_Trending", "port"),`
			`db=p.config.get("Redis_Level_DB_Trending", "db"))`
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00
			`# FILE CURVE SECTION #`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`csv_path_proto = os.path.join(os.environ['AIL_HOME'],`
Fix pep8 2016-07-20 14:12:18 +02:00			`p.config.get("Directories", "protocolstrending_csv"))`
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`protocolsfile_path = os.path.join(os.environ['AIL_HOME'],`
			`p.config.get("Directories", "protocolsfile"))`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00
			`csv_path_tld = os.path.join(os.environ['AIL_HOME'],`
Fix pep8 2016-07-20 14:12:18 +02:00			`p.config.get("Directories", "tldstrending_csv"))`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`tldsfile_path = os.path.join(os.environ['AIL_HOME'],`
			`p.config.get("Directories", "tldsfile"))`

Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`csv_path_domain = os.path.join(os.environ['AIL_HOME'],`
Fix pep8 2016-07-20 14:12:18 +02:00			`p.config.get("Directories", "domainstrending_csv"))`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`faup = Faup()`
			`generate_new_graph = False`
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`# Endless loop getting messages from the input queue`
			`while True:`
			`# Get one message from the input queue`
			`message = p.get_from_set()`
Fix pep8 2016-07-20 14:12:18 +02:00
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`if message is None:`
			`if generate_new_graph:`
			`generate_new_graph = False`
			`today = datetime.date.today()`
			`year = today.year`
			`month = today.month`
Optimized create_plot and removed test commemts 2016-07-12 11:47:51 +02:00
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`print 'Building protocol graph'`
			`lib_words.create_curve_with_word_file(r_serv_trend, csv_path_proto,`
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`protocolsfile_path, year,`
			`month)`
Optimized create_plot and removed test commemts 2016-07-12 11:47:51 +02:00
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`print 'Building tld graph'`
			`lib_words.create_curve_with_word_file(r_serv_trend, csv_path_tld,`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`tldsfile_path, year,`
			`month)`
Optimized create_plot and removed test commemts 2016-07-12 11:47:51 +02:00
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`print 'Building domain graph'`
			`lib_words.create_curve_from_redis_set(r_serv_trend, csv_path_domain,`
			`"domain", year,`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`month)`
			`print 'end building'`
Optimized create_plot and removed test commemts 2016-07-12 11:47:51 +02:00
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`publisher.debug("{} queue is empty, waiting".format(config_section))`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`print 'sleeping'`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`time.sleep(5*60)`
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`continue`

Fix pep8 2016-07-20 14:12:18 +02:00			`else:`
Added new modules and started WebTrending web interface 2016-06-30 14:38:28 +02:00			`generate_new_graph = True`
			`# Do something with the message from the queue`
Added SQLInjectionDetection module 2016-08-02 15:43:11 +02:00			`url, date, path = message.split()`
Added template tld. Modified URL using Faup and refactored WebStats. 2016-07-01 16:59:08 +02:00			`faup.decode(url)`
			`url_parsed = faup.get()`

Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`analyse(r_serv_trend, 'scheme', date, url_parsed) #Scheme analysis`
			`analyse(r_serv_trend, 'tld', date, url_parsed) #Tld analysis`
			`analyse(r_serv_trend, 'domain', date, url_parsed) #Domain analysis`
			`compute_progression(r_serv_trend, 'scheme', num_day_to_look, url_parsed)`
			`compute_progression(r_serv_trend, 'tld', num_day_to_look, url_parsed)`
			`compute_progression(r_serv_trend, 'domain', num_day_to_look, url_parsed)`