2014-08-14 14:11:07 +02:00
|
|
|
import os
|
|
|
|
import string
|
2014-08-06 11:43:40 +02:00
|
|
|
|
|
|
|
from pubsublogger import publisher
|
|
|
|
|
2014-08-14 14:11:07 +02:00
|
|
|
import calendar
|
|
|
|
from datetime import date
|
2014-08-06 11:43:40 +02:00
|
|
|
from dateutil.rrule import rrule, DAILY
|
|
|
|
|
|
|
|
|
2014-08-14 11:48:46 +02:00
|
|
|
def listdirectory(path):
    """Collect every file found below a directory.

    :param path: -- The pathname of the directory to walk.

    The tree rooted at ``path`` is walked with ``os.walk`` and each file name
    is joined to the directory that contains it. The resulting list follows
    the order in which ``os.walk`` reports the files; directories themselves
    are never part of the result.

    """
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(path)
        for filename in filenames
    ]
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2014-08-14 11:48:46 +02:00
|
|
|
def clean(dirty):
    """Filter out the non-printable characters of the string it receives.

    :param dirty: -- The string to sanitize.

    Only the characters listed in ``string.printable`` are kept, in their
    original order; everything else is dropped.

    """
    return ''.join(char for char in dirty if char in string.printable)
|
2014-08-06 11:43:40 +02:00
|
|
|
|
|
|
|
|
|
|
|
def create_dirfile(r_serv, directory, overwrite):
    """Fill the redis list "filelist" with the files found under a directory.

    :param r_serv: -- connection to the redis database
    :param directory: -- The folder whose files are listed
    :param overwrite: -- When true, the existing list is deleted first

    Every path returned by ``listdirectory(directory)`` is pushed onto the
    "filelist" redis list, which is meant to be consumed by the functions
    working in parallel (like redis_words_ranking). One info message is
    published afterwards telling whether the list was overwritten, newly
    created, or extended.

    """
    # Only the published message (and the initial delete) differs between the
    # three situations; the push loop is the same for all of them.
    if overwrite:
        r_serv.delete("filelist")
        outcome = "The list was overwritten"
    elif r_serv.llen("filelist") == 0:
        outcome = "New list created"
    else:
        # NOTE(review): every listed path is pushed even though the list
        # already has entries; nothing here filters out paths already queued.
        outcome = "The list was updated with new elements"

    for pathname in listdirectory(directory):
        r_serv.lpush("filelist", pathname)

    publisher.info(outcome)
|
|
|
|
|
|
|
|
|
|
|
|
def _read_feeder_words(feederfilename):
    """Return the non-empty lines of the feeder file, line endings removed."""
    with open(feederfilename, 'r') as feeder:
        # rstrip instead of [:-1]: a last line without a trailing newline
        # keeps its final character, and "\r\n" endings are removed entirely.
        stripped = (line.rstrip('\r\n') for line in feeder)
        return [word for word in stripped if word]


def _format_count(value):
    """Render one redis hash value as a CSV cell, using 0 when it is missing."""
    if value is None:
        return '0'
    # A Python 3 redis client may return bytes; on Python 2 ``bytes`` is
    # ``str`` so this branch is never taken there.
    if isinstance(value, bytes) and not isinstance(value, str):
        value = value.decode('utf-8')
    return str(value)


def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
    """Create a csv file used with dygraph.

    :param r_serv: -- connection to the redis database
    :param csvfilename: -- the path of the csv file to create, without the
        ".csv" extension (it is appended by this function)
    :param feederfilename: -- the path to the file which contains a list of
        words, one per line
    :param year: -- (integer) The year to process
    :param month: -- (integer) The month to process

    This function creates a .csv file using the data stored in redis.
    The first line is "Date," followed by the words of feederfilename sorted
    alphabetically. Then comes one line per day of the month, in
    chronological order, starting with the day formatted as YYYYMMDD and
    followed by the value of each word for that day. For every word, the
    field named after the day is read from the redis hash named after the
    word; when that value is missing (word not present during that day) a 0
    is written instead so that the timeline of the curve stays correct.

    """
    last_day = calendar.monthrange(year, month)[1]
    words = sorted(_read_feeder_words(feederfilename))

    lines = ["Date," + ",".join(words)]
    for day in range(1, last_day + 1):
        day_key = date(year, month, day).strftime("%Y%m%d")
        # hget returns None for a missing field, so a separate hexists
        # round trip is not needed.
        counts = [_format_count(r_serv.hget(word, day_key)) for word in words]
        # The ", " between values is kept on purpose: it is the separator the
        # previous implementation produced for the data rows.
        lines.append(day_key + ',' + ", ".join(counts))

    with open(csvfilename + ".csv", 'w') as csv_file:
        csv_file.write("\n".join(lines) + "\n")
|