AIL-framework/bin/packages/lib_words.py

import calendar
import csv
import os
import string
import sys
from datetime import date

from dateutil.rrule import rrule, DAILY
from pubsublogger import publisher


def listdirectory(path):
    """Collect the path of every file found below a directory.

    :param path: -- The pathname of the directory to walk.

    The directory is walked recursively with ``os.walk`` and each file name
    is joined to the directory it was found in. The returned paths are
    therefore absolute only when ``path`` itself is absolute. Directories
    are not listed, only the files they contain.

    :return: -- a list of file paths, in ``os.walk`` order.

    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(path)
        for filename in filenames
    ]

def clean(dirty):
    """Filter out non-printable characters from the string it receives.

    :param dirty: -- the string to filter.

    :return: -- a string made of the characters of ``dirty`` that belong to
        ``string.printable``, in their original order.

    """
    return ''.join(filter(string.printable.__contains__, dirty))


def create_dirfile(r_serv, directory, overwrite):
    """Fill the redis list "filelist" with the files found in a directory.

    :param r_serv: -- connexion to redis database
    :param directory: -- The folder where to launch the listing of the files
    :param overwrite: -- when true, the existing "filelist" key is deleted
        before the paths are pushed

    Every path returned by listdirectory(directory) is pushed with LPUSH on
    the "filelist" key. When ``overwrite`` is false the paths are pushed on
    top of whatever the list already holds; no de-duplication is done.
    The only difference between the three cases (overwritten, created,
    updated) is the message sent to the publisher afterwards.

    """
    # Decide which message to report *before* pushing: the emptiness check
    # has to see the list as it was when the function was called.
    if overwrite:
        r_serv.delete("filelist")
        outcome = "The list was overwritten"
    elif r_serv.llen("filelist") == 0:
        outcome = "New list created"
    else:
        outcome = "The list was updated with new elements"

    for filepath in listdirectory(directory):
        r_serv.lpush("filelist", filepath)

    publisher.info(outcome)


def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month,
                                threshold=50):
    """Create a csv file used with dygraph.

    :param r_serv: -- connexion to redis database; only
        ``hget(word, "YYYYMMDD")`` is called on it
    :param csvfilename: -- the path to the .csv file created, without the
        ``.csv`` extension (it is appended here)
    :param feederfilename: -- the path to the file which contains a list of
        words, one per line. Blank lines and lines starting with ``//`` are
        skipped.
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process
    :param threshold: -- (integer) smallest daily count that is reported;
        anything lower is written as 0

    This function creates a .csv file using data in redis.
    The header row is ``Date`` followed by the sorted words; there is then
    one row per day of the month. If a word has no value for a day, or a
    value below ``threshold``, a 0 is written so that every row keeps one
    cell per word and the timeline of the curve stays correct.

    :raises ValueError: -- if ``year``/``month`` is not a valid date, or if
        redis returns a value that ``int()`` cannot parse.

    """
    # Validate year/month before touching any file.
    first_day = date(year, month, 1)
    days_in_month = calendar.monthrange(year, month)[1]

    # Text mode, so that the comparisons with '//' and '' below are made
    # between strings on Python 3 as well.
    with open(feederfilename, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        words = sorted(word for word in stripped_lines
                       if word and not word.startswith('//'))

    headers = ['Date'] + words

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csv_file = open(csvfilename + '.csv', 'wb')
    else:
        csv_file = open(csvfilename + '.csv', 'w', newline='')

    with csv_file as f:
        writer = csv.writer(f)
        writer.writerow(headers)

        # one row per day, from the first to the last day of the month
        for day in range(1, days_in_month + 1):
            curdate = first_day.replace(day=day).strftime("%Y%m%d")
            row = [curdate]
            for word in words:
                value = r_serv.hget(word, curdate)
                # redis replies with a string (or None); convert it so the
                # comparison with the threshold is numeric.
                count = 0 if value is None else int(value)
                # FIXME Due to performance issues (too many tlds, leads to
                # more than 7s to perform this procedure), a threshold was
                # added. Values under it are written as 0 rather than
                # dropped, otherwise the following cells shift to the wrong
                # column.
                row.append(count if count >= threshold else 0)
            writer.writerow(row)

def create_curve_from_redis_set(server, csvfilename, set_to_plot, year, month):
    """Create a csv file used with dygraph.

    :param server: -- connexion to redis database; ``smembers`` and ``hget``
        are called on it
    :param csvfilename: -- the path to the .csv file created, without the
        ``.csv`` extension (it is appended here)
    :param set_to_plot: -- prefix of the redis set holding the words to plot.
        The set read is ``<set_to_plot>_set_<YYYY><MM>``.
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    This function creates a .csv file using data in redis.
    The header row is ``Date`` followed by the members of the set, in the
    order redis returned them; there is then one row per day of the month.
    For each word the value stored under the day's ``YYYYMMDD`` field is
    written unchanged, or 0 when redis has no value for that day.

    :raises ValueError: -- if ``year``/``month`` is not a valid date.

    """
    # Validate year/month before touching redis or the file system.
    first_day = date(year, month, 1)
    days_in_month = calendar.monthrange(year, month)[1]

    redis_set_name = set_to_plot + "_set_" + str(year) + str(month).zfill(2)
    # NOTE(review): on Python 3 a redis client presumably returns bytes
    # unless it was created with decode_responses=True -- confirm against
    # the caller, otherwise headers and values are written as b'...'.
    words = list(server.smembers(redis_set_name))

    headers = ['Date'] + words

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csv_file = open(csvfilename + '.csv', 'wb')
    else:
        csv_file = open(csvfilename + '.csv', 'w', newline='')

    with csv_file as f:
        writer = csv.writer(f)
        writer.writerow(headers)

        # one row per day, from the first to the last day of the month
        for day in range(1, days_in_month + 1):
            curdate = first_day.replace(day=day).strftime("%Y%m%d")
            row = [curdate]
            for word in words:
                value = server.hget(word, curdate)
                # a missing value becomes 0 so each row has one cell per word
                row.append(0 if value is None else value)
            writer.writerow(row)
Cleanup (remove unused imports, more pep8 compatible) 2014-08-14 14:11:07 +02:00			`import os`
			`import string`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`from pubsublogger import publisher`

Cleanup (remove unused imports, more pep8 compatible) 2014-08-14 14:11:07 +02:00			`import calendar`
			`from datetime import date`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`from dateutil.rrule import rrule, DAILY`
stop killing the disk when creating the word curve 2014-09-02 18:20:28 +02:00			`import csv`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00

maxi cleanup old code :'( 2014-08-14 11:48:46 +02:00			`def listdirectory(path):`
			`"""Path Traversing Function.`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
maxi cleanup old code :'( 2014-08-14 11:48:46 +02:00			`:param path: -- The absolute pathname to a directory.`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
maxi cleanup old code :'( 2014-08-14 11:48:46 +02:00			`This function is returning all the absolute path of the files contained in`
			`the argument directory.`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`"""`
Cleanup (remove unused imports, more pep8 compatible) 2014-08-14 14:11:07 +02:00			`fichier = []`
maxi cleanup old code :'( 2014-08-14 11:48:46 +02:00			`for root, dirs, files in os.walk(path):`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
maxi cleanup old code :'( 2014-08-14 11:48:46 +02:00			`for i in files:`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
maxi cleanup old code :'( 2014-08-14 11:48:46 +02:00			`fichier.append(os.path.join(root, i))`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
maxi cleanup old code :'( 2014-08-14 11:48:46 +02:00			`return fichier`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
maxi cleanup old code :'( 2014-08-14 11:48:46 +02:00			`clean = lambda dirty: ''.join(filter(string.printable.__contains__, dirty))`
			`"""It filters out non-printable characters from the string it receives."""`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00

			`def create_dirfile(r_serv, directory, overwrite):`
			`"""Create a file of path.`

			`:param r_serv: -- connexion to redis database`
			`:param directory: -- The folder where to launch the listing of the .gz files`

			`This function create a list in redis with inside the absolute path`
			`of all the pastes needed to be proceeded by function using parallel`
			`(like redis_words_ranking)`

			`"""`
			`if overwrite:`
			`r_serv.delete("filelist")`

			`for x in listdirectory(directory):`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`r_serv.lpush("filelist", x)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`publisher.info("The list was overwritten")`

			`else:`
			`if r_serv.llen("filelist") == 0:`

			`for x in listdirectory(directory):`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`r_serv.lpush("filelist", x)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`publisher.info("New list created")`
			`else:`

			`for x in listdirectory(directory):`
completely remove ZMQ_PubSub.py 2014-08-20 15:14:57 +02:00			`r_serv.lpush("filelist", x)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
			`publisher.info("The list was updated with new elements")`


			`def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):`
			`"""Create a csv file used with dygraph.`

			`:param r_serv: -- connexion to redis database`
			`:param csvfilename: -- the path to the .csv file created`
			`:param feederfilename: -- the path to the file which contain a list of words.`
			`:param year: -- (integer) The year to process`
			`:param month: -- (integer) The month to process`

			`This function create a .csv file using datas in redis.`
			`It's checking if the words contained in feederfilename and`
			`their respectives values by days exists. If these values are missing`
			`(Word not present during a day) it's will automatically put a 0`
			`to keep the timeline of the curve correct.`

			`"""`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`threshold = 50`
stop killing the disk when creating the word curve 2014-09-02 18:20:28 +02:00			`first_day = date(year, month, 01)`
			`last_day = date(year, month, calendar.monthrange(year, month)[1])`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`words = []`

stop killing the disk when creating the word curve 2014-09-02 18:20:28 +02:00			`with open(feederfilename, 'rb') as f:`
Cleanup (remove unused imports, more pep8 compatible) 2014-08-14 14:11:07 +02:00			`# words of the files`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`words = sorted([word.strip() for word in f if word.strip()[0:2]!='//' and word.strip()!='' ])`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
stop killing the disk when creating the word curve 2014-09-02 18:20:28 +02:00			`headers = ['Date'] + words`
			`with open(csvfilename+'.csv', 'wb') as f:`
			`writer = csv.writer(f)`
			`writer.writerow(headers)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00
stop killing the disk when creating the word curve 2014-09-02 18:20:28 +02:00			`# for each days`
			`for dt in rrule(DAILY, dtstart=first_day, until=last_day):`
			`row = []`
			`curdate = dt.strftime("%Y%m%d")`
			`row.append(curdate)`
Cleanup (remove unused imports, more pep8 compatible) 2014-08-14 14:11:07 +02:00			`# from the 1srt day to the last of the list`
stop killing the disk when creating the word curve 2014-09-02 18:20:28 +02:00			`for word in words:`
			`value = r_serv.hget(word, curdate)`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`if value is None:`
			`row.append(0)`
			`else:`
			`# if the word have a value for the day`
			`# FIXME Due to performance issues (too many tlds, leads to more than 7s to perform this procedure), I added a threshold`
			`if value >= threshold:`
			`row.append(value)`
			`writer.writerow(row)`

Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`def create_curve_from_redis_set(server, csvfilename, set_to_plot, year, month):`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`"""Create a csv file used with dygraph.`

			`:param r_serv: -- connexion to redis database`
			`:param csvfilename: -- the path to the .csv file created`
			`:param to_plot: -- the list which contain a words to plot.`
			`:param year: -- (integer) The year to process`
			`:param month: -- (integer) The month to process`

			`This function create a .csv file using datas in redis.`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00			`It's checking if the words contained in set_to_plot and`
Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`their respectives values by days exists.`

			`"""`

			`first_day = date(year, month, 01)`
			`last_day = date(year, month, calendar.monthrange(year, month)[1])`
Added top_progression chart for tld, domain and scheme + Small modification in config file. 2016-07-21 13:44:22 +02:00
			`redis_set_name = set_to_plot + "_set_" + str(year) + str(month).zfill(2)`
			`words = list(server.smembers(redis_set_name))`

Added DomainTrending seems working. Started search features with related html pages, not finish yet. 2016-07-05 16:53:03 +02:00			`headers = ['Date'] + words`
			`with open(csvfilename+'.csv', 'wb') as f:`
			`writer = csv.writer(f)`
			`writer.writerow(headers)`

			`# for each days`
			`for dt in rrule(DAILY, dtstart=first_day, until=last_day):`
			`row = []`
			`curdate = dt.strftime("%Y%m%d")`
			`row.append(curdate)`
			`# from the 1srt day to the last of the list`
			`for word in words:`
			`value = server.hget(word, curdate)`
stop killing the disk when creating the word curve 2014-09-02 18:20:28 +02:00			`if value is None:`
			`row.append(0)`
Initial import of AIL framework - Analysis Information Leak framework AIL is a modular framework to analyse potential information leak from unstructured data source like pastes from Past ebin or similar services. AIL framework is flexible and can be extended to support other functionalities to mine sen sitive information 2014-08-06 11:43:40 +02:00			`else:`
stop killing the disk when creating the word curve 2014-09-02 18:20:28 +02:00			`# if the word have a value for the day`
			`row.append(value)`
			`writer.writerow(row)`