# AIL-framework/bin/packages/lib_words.py
# Word-ranking and categorisation helpers operating on gzipped pastes via Redis.
import calendar as cal
import gzip
import sys  # sys.argv is read below but was previously only in scope via star imports
from datetime import date, timedelta

import matplotlib.pyplot as plt
import numpy as np
import redis
from dateutil.rrule import rrule, DAILY
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from pubsublogger import publisher
from pylab import *
from textblob import TextBlob

from lib_jobs import *
from lib_redis_insert import clean, listdirectory
from packages import *
def redis_words_ranking(pipe, r_serv, nb, minlength, maxlength):
    """Pop up to *nb* paste paths from the "filelist" queue and rank their words.

    :param pipe: -- Redis pipeline used by redis_zincr_words.
    :param r_serv: -- Redis database connexion.
    :param nb: -- (int) Maximum number of pastes processed by this call.
    :param minlength: -- (int) passed to redis_zincr_words.
    :param maxlength: -- (int) passed to redis_zincr_words.

    Each processed path is pushed onto the "processed" list so other
    workers do not pick it up again.  On KeyboardInterrupt/SystemExit the
    pid bookkeeping is flushed before the exception is swallowed.
    """
    try:
        # range (not the py2-only xrange) keeps this Python 2/3 compatible.
        for _ in range(nb):
            path = r_serv.lpop("filelist")
            if path is not None:
                set_listof_pid(r_serv, path, sys.argv[0])
                redis_zincr_words(pipe, path, minlength, maxlength)
                update_listof_pid(r_serv)
                r_serv.lpush("processed", path)
                publisher.debug(path)
            else:
                publisher.debug("Empty list")
                break
    except (KeyboardInterrupt, SystemExit):
        # Clean up the pid list so a dead worker is not reported as live.
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
def redis_zincr_words(pipe, filename, minlength, maxlength):
    """Tokenise one gzipped paste and increment per-day word counters in Redis.

    :param pipe: -- Redis pipeline (commands are queued, then executed once).
    :param filename: -- The absolute path to the file.gz to process.
    :param minlength: -- (int) Minimum word length inserted.
    :param maxlength: -- (int) Maximum word length inserted.

    Representation of the set in redis:

    +------------+------------+-----------+
    | Keys       | Members    | Scores    |
    +============+============+===========+
    | 20131001   | word1      | 142       |
    +------------+------------+-----------+
    | ...        | word2      | 120       |
    +------------+------------+-----------+
    | 20131002   | ...        | ...       |
    +------------+------------+-----------+

    Every word whose length is within [minlength, maxlength] is zincr'ed
    under the day key, so Redis accumulates each word's daily cardinality.
    """
    # Raw string avoids invalid-escape warnings; the pattern itself is
    # byte-identical to the original (note \\ == the old non-raw \\\\).
    tokenizer = RegexpTokenizer(
        r'[\&\~\:\;\,\.\(\)\{\}\|\[\]\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
        gaps=True, discard_empty=True)
    with gzip.open(filename, 'rb') as F:
        blob = TextBlob(clean(F.read()), tokenizer=tokenizer)
    # The day key (YYYYMMDD) is carved out of the paste path once, not per word.
    day_key = filename[-22:-12].replace('/', '')
    for word in blob.tokens:
        if minlength <= len(word) <= maxlength:
            pipe.zincrby(day_key, word, 1)
        elif len(word) > maxlength:
            # Was ">= maxlength", which also flagged words of exactly
            # maxlength even though they had just been counted above.
            publisher.info("word bigger than {0} detected at {1}".format(maxlength, filename))
            publisher.info(word)
    pipe.execute()
def classify_token_paste(r_serv, listname, choicedatastruct, nb, r_set):
    """Tokenise pastes and score them against keyword-category lists.

    :param r_serv: -- Redis database connexion.
    :param listname: -- (str) path to the file containing the list of paths of category files.
    :param choicedatastruct: -- (bool) Changes the index of the data structure (see below).
    :param nb: -- (int) Number of pastes processed by this call.
    :param r_set: -- (str) name of the Redis list holding the paste paths.

    Redis data structures can be chosen as follows:

    +---------------+------------+-----------+
    | Keys          | Members    | Scores    |
    +===============+============+===========+
    | mails_categ   | filename   | 25000     |
    +---------------+------------+-----------+
    | ...           | filename2  | 2400      |
    +---------------+------------+-----------+
    | web_categ     | ...        | ...       |
    +---------------+------------+-----------+

    Or (when choicedatastruct is True)

    +--------------+-------------+-----------+
    | Keys         | Members     | Scores    |
    +==============+=============+===========+
    | filename     | mails_categ | 100000    |
    +--------------+-------------+-----------+
    | ...          | web_categ   | 24050     |
    +--------------+-------------+-----------+
    | filename2    | ...         | ...       |
    +--------------+-------------+-----------+

    Tokens are split on special characters, lower-cased, and each token
    present in a category keyword list increments that (key, member) score.
    """
    # The tokenizer is stateless: build it once, not once per paste.
    tokenizer = RegexpTokenizer(
        r'[\&\~\:\;\,\.\(\)\{\}\|\[\]\\/\-/\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
        gaps=True, discard_empty=True)
    try:
        for _ in range(nb):
            filename = r_serv.lpop(r_set)
            if filename is None:
                publisher.debug("Empty list")
                break
            set_listof_pid(r_serv, filename, sys.argv[0])
            # Tokenise the paste ONCE; the original re-read and re-tokenized
            # the same gzip for every category file.
            with gzip.open(filename, 'rb') as F:
                blob = TextBlob(clean(F.read()), tokenizer=tokenizer)
            tokens = blob.tokens.lower()
            with open(listname, 'rb') as categ_index:
                # One line per category file path.
                for categ_line in categ_index:
                    categ_path = categ_line[:-1]
                    categ_name = categ_path.split('/')[-1]
                    # Set membership is O(1); the original scanned a list.
                    with open(categ_path, 'rb') as keyword_file:
                        keywords = set(kw[:-1] for kw in keyword_file)
                    for token in tokens:
                        if token in keywords:
                            # Two possible orientations of the same data.
                            if choicedatastruct:
                                r_serv.zincrby(filename, categ_name, 1)
                            else:
                                r_serv.zincrby(categ_name, filename, 1)
            update_listof_pid(r_serv)
    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
def dectect_longlines(r_serv, r_key, store=False, maxlength=500):
    """Store long lines' line numbers in Redis.

    :param r_serv: -- The redis connexion database.
    :param r_key: -- (str) The key name in redis of the list of paste filenames.
    :param store: -- (bool) Store the line numbers or not.
    :param maxlength: -- The limit between "short lines" and "long lines".
    :return: False once the input list is empty; None if interrupted.

    Pops paste filenames from the *r_key* list; any paste containing a line
    of length >= maxlength is pushed onto the "longlines" list (once), and
    when *store* is True each offending line number is sadd'ed under the
    paste's filename.
    """
    try:
        while True:
            filename = r_serv.lpop(r_key)
            if filename is None:
                publisher.debug("Empty list")
                # The original had an unreachable `break` after this return.
                return False
            set_listof_pid(r_serv, filename, sys.argv[0])
            with gzip.open(filename, 'rb') as F:
                first_hit = True
                for num, line in enumerate(F):
                    if len(line) >= maxlength:
                        if first_hit:
                            # Tag the paste once, on its first long line.
                            r_serv.rpush("longlines", filename)
                            first_hit = False
                        if store:
                            r_serv.sadd(filename, num)
                        else:
                            publisher.debug("Line numbers of longlines not stored")
            update_listof_pid(r_serv)
    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
# NOT USED RIGHT NOW #
def recovering_longlines(r_serv, nb=1):
    """Walk pastes tagged in "longlines" and visit their stored long lines.

    :param r_serv: -- The redis connexion database.
    :param nb: -- (int) Maximum number of pastes processed (new parameter
                  with a default: the original referenced an undefined
                  global ``nb`` and raised NameError when called).

    The per-line TREATMENT is still a placeholder — the function is unused.
    """
    try:
        for _ in range(nb):
            filename = r_serv.lpop("longlines")
            if filename is None:
                publisher.debug("Empty list")
                r_serv.save()
                break
            # Collect the stored line numbers once, then read the paste a
            # single time (the original reopened the gzip per line number).
            numlines = set(int(n) for n in r_serv.smembers(filename))
            with gzip.open(filename, 'rb') as F:
                for num, line in enumerate(F):
                    if num in numlines:
                        pass
                        # TREATMENT
    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
def remove_longline_from_categ(r_serv, r_key, delete, store, maxlength):
    """Remove files containing long lines from a sorted set.

    :param r_serv: -- The redis connexion database.
    :param r_key: -- (str) The key name in redis of the category zset.
    :param delete: -- (bool) If true, also delete the per-file set of line
                      numbers (created when *store* is True).
    :param store: -- (bool) Store the line numbers or not (forwarded).
    :param maxlength: -- The limit between "short lines" and "long lines".
    """
    publisher.info("Number of file before:{0}".format(r_serv.zcard(r_key)))
    # (1) Mirror the zset members into a work list for dectect_longlines.
    for filename in r_serv.zrange(r_key, 0, -1):
        r_serv.rpush(r_key + "_list", filename)
    # (2) Detect the long-line pastes; they end up on the "longlines" list.
    dectect_longlines(r_serv, r_key + "_list", store, maxlength)
    # (3) Drain "longlines" and evict each hit from the category zset.
    while True:
        fp_filename = r_serv.lpop("longlines")
        if fp_filename is None:
            break
        r_serv.zrem(r_key, fp_filename)
        if delete:
            # Also drop the set of stored line numbers for that paste.
            r_serv.delete(fp_filename)
    publisher.info("Longline file removed from {0}, {1} Files remaining".format(r_key, r_serv.zcard(r_key)))
def detect_longline_from_list(r_serv, nb):
    """Run dectect_longlines on the "filelist" queue up to *nb* times.

    :param r_serv: -- The redis connexion database.
    :param nb: -- (int) Maximum number of detection rounds.

    NOTE(review): dectect_longlines only ever returns False or None, so
    this loop always breaks after the first call — confirm intent upstream.
    """
    try:
        for _ in range(nb):
            if not dectect_longlines(r_serv, "filelist", True):
                break
    except (KeyboardInterrupt, SystemExit):
        flush_list_of_pid(r_serv)
        publisher.debug("Pid list flushed")
def create_dirfile(r_serv, directory, overwrite):
    """Populate the "filelist" Redis list with paste paths.

    :param r_serv: -- connexion to redis database.
    :param directory: -- The folder where to launch the listing of the .gz files.
    :param overwrite: -- (bool) delete any existing "filelist" first.

    This function creates a list in redis holding the absolute paths of all
    the pastes to be processed by functions run in parallel
    (like redis_words_ranking).
    """
    # All three branches pushed the same listing; only the log line and the
    # optional delete differ, so the duplication is collapsed to one loop.
    if overwrite:
        r_serv.delete("filelist")
        message = "The list was overwritten"
    elif r_serv.llen("filelist") == 0:
        message = "New list created"
    else:
        message = "The list was updated with new elements"
    for path in listdirectory(directory):
        r_serv.rpush("filelist", path)
    publisher.info(message)
def redis_interbargraph_set(r_serv, year, month, overwrite):
    """Create the day-to-day intersection sorted sets for one month.

    :param r_serv: -- connexion to redis database.
    :param year: -- (integer) The year to process.
    :param month: -- (integer) The month to process.
    :param overwrite: -- (bool) trigger the overwrite mode.

    For a month of 31 days this creates 30 sorted sets, each the weighted
    intersection of a day and the day after, and indexes them in the
    "InterSet" zset.  Overwrite mode deletes and re-creates the intersets.

    Note: ``01`` day literals were replaced by ``1`` — leading-zero integer
    literals are a syntax error on Python 3.
    """
    first = date(year, month, 1)
    last = date(year, month, cal.monthrange(year, month)[1])
    if overwrite:
        r_serv.delete("InterSet")
    for dt in rrule(DAILY, dtstart=first, until=last - timedelta(1)):
        dayafter = dt + timedelta(1)
        # Build the pair key once; the original recomputed the two
        # strftime strings up to six times per day.
        inter_key = dt.strftime("%Y%m%d") + dayafter.strftime("%Y%m%d")
        weights = {dt.strftime("%Y%m%d"): 1,
                   dayafter.strftime("%Y%m%d"): -1}
        if overwrite:
            r_serv.delete(inter_key)
            r_serv.zinterstore(inter_key, weights)
            r_serv.zadd("InterSet", 1, inter_key)
        elif r_serv.zcard(inter_key) == 0:
            r_serv.zinterstore(inter_key, weights)
            r_serv.zadd("InterSet", 1, inter_key)
            publisher.info(inter_key + " Intersection Created")
        else:
            publisher.warning("Data already exist, operation aborted.")
def word_bar_graph(r_serv, year, month, filename):
    """Create a words-per-day / intersection histogram and save it as PNG.

    :param r_serv: -- connexion to redis database.
    :param year: -- (integer) The year to process.
    :param month: -- (integer) The month to process.
    :param filename: -- The absolute path where to save the figure (".png" appended).

    The redis database obviously needs to be populated first with
    redis_words_ranking and redis_interbargraph_set.
    """
    daily_cards = []
    day_labels = []
    # Leading 0 keeps the intersection series one slot behind the day series.
    inter_cards = [0]
    rcParams['figure.figsize'] = 15, 10
    first = date(year, month, 1)  # was the py2-only literal `01`
    last = date(year, month, cal.monthrange(year, month)[1])
    for dt in rrule(DAILY, dtstart=first, until=last):
        daily_cards.append(r_serv.zcard(dt.strftime("%Y%m%d")))
        day_labels.append(dt.strftime("%d"))
    for inter_key in r_serv.zrange("InterSet", 0, 31):
        inter_cards.append(r_serv.zcard(inter_key))
    index = np.arange(len(daily_cards))
    bar_width = 0.5
    opacity = 0.6
    plt.bar(index, tuple(daily_cards), bar_width,
            alpha=opacity,
            color='g',
            label='Words/day')
    # Shift intersections half a slot left so each bar sits between two days.
    plt.bar(index - 0.5, tuple(inter_cards), bar_width,
            alpha=opacity,
            color='r',
            label='Intersection')
    plt.plot(tuple(inter_cards), 'b--')
    plt.xlabel(str(year) + '/' + str(month) + ' Days')
    plt.ylabel('Words')
    plt.title('Words Cardinality & Intersection Histogram')
    plt.xticks(index + bar_width / 2, tuple(day_labels))
    plt.legend()
    plt.grid()
    plt.tight_layout()
    plt.savefig(filename + ".png", dpi=None, facecolor='w', edgecolor='b',
                orientation='portrait', papertype=None, format="png",
                transparent=False, bbox_inches=None, pad_inches=0.1,
                frameon=True)
    publisher.info(filename + ".png" + " saved!")
def create_data_words_curve(r_serv, r_serv2, year, month, filename):
    """Create Redis hashes of per-day scores for a word list.

    :param r_serv: -- connexion to redis database (read).
    :param r_serv2: -- connexion to redis database (write).
    :param year: -- (integer) The year to process.
    :param month: -- (integer) The month to process.
    :param filename: -- the path to the file which contains a list of words.

    The hashes in redis are created as follows:

    +------------+------------+-----------+
    | Keys       | Field      | Values    |
    +============+============+===========+
    | word1      | 20131001   | 150       |
    +------------+------------+-----------+
    | ...        | 20131002   | 145       |
    +------------+------------+-----------+
    | word2      | ...        | ...       |
    +------------+------------+-----------+

    The file needs to be a list of words separated by a carriage return
    with an empty line at the end.  The data produced here feeds
    create_curve_with_word_file, which builds a csv file.
    """
    # Removed the unused `stop = stopwords.words('english')` local.
    first = date(year, month, 1)  # was the py2-only literal `01`
    last = date(year, month, cal.monthrange(year, month)[1])
    with open(filename, 'rb') as F:
        for line in F:
            word = line[:-1]
            for dt in rrule(DAILY, dtstart=first, until=last):
                day = dt.strftime("%Y%m%d")
                # Single zscore lookup; the original queried it twice.
                score = r_serv.zscore(day, word)
                if score is not None:
                    r_serv2.hmset(word, {day: score})
def create_curve_with_word_file(r_serv, csvfilename, feederfilename, year, month):
    """Create a csv file (for dygraph) of per-day scores for a word list.

    :param r_serv: -- connexion to redis database.
    :param csvfilename: -- the path of the .csv file created (".csv" appended).
    :param feederfilename: -- the path of the file containing the word list.
    :param year: -- (integer) The year to process.
    :param month: -- (integer) The month to process.

    For each word in *feederfilename* and each day of the month, the stored
    value is written; missing (word, day) pairs are written as 0 so the
    timeline of the curve stays aligned.
    """
    first = date(year, month, 1)  # was the py2-only literal `01`
    last = date(year, month, cal.monthrange(year, month)[1])
    words = []
    with open(feederfilename, 'rb') as F:
        for line in F:
            words.append(line[:-1])
    # Both the header and the rows use the same sorted order.
    words.sort()
    days = {}
    for dt in rrule(DAILY, dtstart=first, until=last):
        day = dt.strftime("%Y%m%d")
        scores = []
        for word in words:
            if r_serv.hexists(word, day):
                scores.append(r_serv.hget(word, day))
            else:
                scores.append(0)  # pad missing days with 0
        # The original rebuilt zip(mot1, mot2) after EVERY word (O(n^2));
        # only the final pairing mattered, and only the values were used.
        days[day] = scores
    with open(csvfilename + ".csv", 'wb') as F:
        F.write("Date," + ",".join(words) + '\n')
        for day, scores in days.items():
            # ", ".join(...) emits the exact bytes the original produced by
            # str()-ing the list and then stripping "[", "]" and "'" in a
            # second read/replace/rewrite pass over the csv file.
            F.write(day + ',' + ', '.join(str(s) for s in scores) + '\n')