# AIL-framework/bin/packages/lib_words.py

import calendar
import csv
import os
import string
import sys
from datetime import date

from dateutil.rrule import rrule, DAILY
from pubsublogger import publisher
def listdirectory(path):
    """Path Traversing Function.

    :param path: -- The pathname of the directory to traverse. Pass an
        absolute pathname to get absolute paths back.

    :return: -- A list with one entry per file found under ``path``
        (sub-directories included, recursively). Each entry is the directory
        reported by ``os.walk`` joined with the file name, so entries are
        absolute only when ``path`` itself is absolute. Directories are not
        listed. A ``path`` that does not exist yields an empty list, because
        ``os.walk`` ignores listing errors by default.

    """
    found = []
    for root, _dirs, files in os.walk(path):
        found.extend(os.path.join(root, name) for name in files)
    return found


def clean(dirty):
    """Filter out non-printable characters from the string it receives.

    :param dirty: -- The string to filter.

    :return: -- A new string made of the characters of ``dirty`` that are
        members of ``string.printable``, in their original order.

    """
    return ''.join(char for char in dirty if char in string.printable)


def create_dirfile(r_serv, directory, overwrite):
    """Create a file of path.

    :param r_serv: -- connexion to redis database
    :param directory: -- The folder where to launch the listing of the files
    :param overwrite: -- When true, the existing "filelist" key is deleted
        before the listing is pushed; otherwise the listing is pushed onto
        whatever the list already holds.

    This function fills the redis list "filelist" with the path of every
    file found under ``directory`` (see ``listdirectory``), so that it can be
    consumed by functions running in parallel (like redis_words_ranking).
    One of three messages is logged through ``publisher.info`` depending on
    whether the list was overwritten, newly created, or extended.

    """
    # Decide what will be reported *before* pushing: the "new" vs "updated"
    # distinction depends on the list length prior to this call.
    if overwrite:
        r_serv.delete("filelist")
        message = "The list was overwritten"
    elif r_serv.llen("filelist") == 0:
        message = "New list created"
    else:
        # NOTE(review): every listed path is pushed, including paths that may
        # already be in the list -- confirm duplicates are acceptable here.
        message = "The list was updated with new elements"

    for filepath in listdirectory(directory):
        r_serv.lpush("filelist", filepath)

    publisher.info(message)


def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
    """Create a csv file used with dygraph.

    :param r_serv: -- connexion to redis database
    :param csvfilename: -- the path to the .csv file created, without the
        ".csv" extension (it is appended here)
    :param feederfilename: -- the path to the file which contain a list of
        words, one per line. Blank lines and lines starting with "//" are
        skipped.
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    :raises ValueError: -- if ``year``/``month`` do not form a valid date, or
        if a value stored in redis cannot be converted to an integer.

    This function create a .csv file using datas in redis. The header row is
    "Date" followed by the sorted words; then there is one row per day of the
    month, holding the day as YYYYMMDD and, for each word, the integer found
    with ``r_serv.hget(word, day)``. A 0 is written when the value is missing
    or below the threshold, so that every row keeps one column per word and
    the timeline of the curve stays correct.

    """
    # FIXME Due to performance issues (too many tlds, leads to more than 7s
    # to perform this procedure), values under this threshold are flattened
    # to 0.
    threshold = 50

    # Built up-front so an invalid year/month fails before any file is
    # touched.
    last_day_number = calendar.monthrange(year, month)[1]
    days = [date(year, month, number)
            for number in range(1, last_day_number + 1)]

    # Text mode: the lines must be str to be compared with '//' and to be
    # usable as csv headers on Python 3.
    with open(feederfilename, 'r') as feeder:
        stripped_lines = (line.strip() for line in feeder)
        words = sorted(word for word in stripped_lines
                       if word and not word.startswith('//'))

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csv_file = open(csvfilename + '.csv', 'w', newline='')
    else:
        csv_file = open(csvfilename + '.csv', 'wb')

    with csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Date'] + words)

        for day in days:
            curdate = day.strftime("%Y%m%d")
            row = [curdate]
            for word in words:
                value = r_serv.hget(word, curdate)
                # Redis replies are strings/bytes: convert before comparing,
                # otherwise the threshold test is meaningless.
                count = 0 if value is None else int(value)
                row.append(count if count >= threshold else 0)
            writer.writerow(row)


def create_curve_with_list(server, csvfilename, to_plot, year, month):
    """Create a csv file used with dygraph.

    :param server: -- connexion to redis database
    :param csvfilename: -- the path to the .csv file created, without the
        ".csv" extension (it is appended here)
    :param to_plot: -- the list which contain a words to plot.
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    :raises ValueError: -- if ``year``/``month`` do not form a valid date.

    This function create a .csv file using datas in redis. The header row is
    "Date" followed by the sorted words of ``to_plot``; then there is one row
    per day of the month, holding the day as YYYYMMDD and, for each word, the
    value returned by ``server.hget(word, day)`` or 0 when there is none.
    Values are written exactly as redis returns them.

    """
    # Built up-front so an invalid year/month fails before the csv file is
    # created.
    last_day_number = calendar.monthrange(year, month)[1]
    days = [date(year, month, number)
            for number in range(1, last_day_number + 1)]

    words = sorted(to_plot)

    # The csv module wants a binary file on Python 2 and a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] >= 3:
        csv_file = open(csvfilename + '.csv', 'w', newline='')
    else:
        csv_file = open(csvfilename + '.csv', 'wb')

    with csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Date'] + words)

        for day in days:
            curdate = day.strftime("%Y%m%d")
            row = [curdate]
            for word in words:
                value = server.hget(word, curdate)
                # A missing value is plotted as 0.
                row.append(0 if value is None else value)
            writer.writerow(row)