Added dropping of overly long lines in the sentiment-analysis module + added a description of the sentiment module. Also fixed a bug in the sentiment-trending webpage concerning the average and the date range.

pull/68/head
Mokaddem 2016-08-17 09:46:25 +02:00
parent 1084e45f1b
commit 894b9efda9
3 changed files with 86 additions and 54 deletions

View File

@@ -2,12 +2,16 @@
# -*-coding:UTF-8 -*
"""
Sentiment analyser module.
It takes its inputs from 'global'.
Source code is taken into account (in case of comments). If it is only source code,
it will be given a neutral value anyway.
The content analysed comes from the pastes, with lines whose length is
above a defined threshold removed (get_p_content_with_removed_lines).
This is done because the NLTK sentence tokenizer (sent_tokenize) seems to crash
on very long lines (function _slices_from_text, line #1276).
nltk.sentiment.vader module credit:
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""
@@ -25,23 +29,27 @@ from nltk import tokenize
# Config Variables
accepted_Mime_type = ['text/plain']
size_threshold = 250
line_max_length_threshold = 1000
def Analyse(message, server):
#print 'analyzing'
path = message
paste = Paste.Paste(path)
# get the content with over-long lines removed + the number of removed lines
num_line_removed, p_content = paste.get_p_content_with_removed_lines(line_max_length_threshold)
provider = paste.p_source
p_date = str(paste._get_p_date())
p_MimeType = paste._get_p_encoding()
# Perform further analysis
if p_MimeType == "text/plain":
if isJSON(p_content):
p_MimeType = "JSON"
if p_MimeType in accepted_Mime_type:
print 'Processing', path
the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
#print 'pastedate: ', the_date
@@ -53,54 +61,54 @@ def Analyse(message, server):
timestamp = calendar.timegm(combined_datetime.timetuple())
#print 'timestamp: ', timestamp
sentences = tokenize.sent_tokenize(p_content.decode('utf-8', 'ignore'))
#print len(sentences)
if len(sentences) > 0:
avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
neg_line = 0
pos_line = 0
sid = SentimentIntensityAnalyzer()
for sentence in sentences:
ss = sid.polarity_scores(sentence)
for k in sorted(ss):
if k == 'compound':
if ss['neg'] > ss['pos']:
avg_score['compoundNeg'] += ss[k]
neg_line += 1
else:
avg_score['compoundPos'] += ss[k]
pos_line += 1
else:
avg_score[k] += ss[k]
#print('{0}: {1}, '.format(k, ss[k]))
for k in avg_score:
if k == 'compoundPos':
avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
elif k == 'compoundNeg':
avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
else:
avg_score[k] = avg_score[k] / len(sentences)
# In redis-levelDB: {} = set, () = K-V
# {Provider_set -> provider_i}
# {Provider_TimestampInHour_i -> UniqID_i}_j
# (UniqID_i -> PasteValue_i)
server.sadd('Provider_set', provider)
#print 'Provider_set', provider
provider_timestamp = provider + '_' + str(timestamp)
#print provider_timestamp
server.incr('UniqID')
UniqID = server.get('UniqID')
print provider_timestamp, '->', UniqID, 'dropped', num_line_removed, 'lines'
server.sadd(provider_timestamp, UniqID)
server.set(UniqID, avg_score)
#print UniqID, '->', avg_score
else:
print 'Dropped:', p_MimeType
@@ -146,3 +154,4 @@ if __name__ == '__main__':
# Do something with the message from the queue
Analyse(message, server)
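A side note on the redis-levelDB layout sketched in the comments above ({Provider_set}, {Provider_TimestampInHour -> UniqID}, (UniqID -> PasteValue)): the data written by Analyse() can be read back roughly as follows. This is an illustrative sketch only (not part of the commit); the connection settings and the chosen hour bucket are placeholders, and it assumes the scores were stored as the string form of a Python dict, which is what server.set(UniqID, avg_score) produces.

# Illustrative read-back of the structure written by Analyse() (Python 2, like the module).
import ast
import calendar
import datetime
import redis

# Placeholder connection settings; use the sentiment DB configured for the project.
server = redis.StrictRedis(host='localhost', port=6379, db=0)

# Hour bucket to inspect, e.g. the current hour rounded down (hypothetical choice).
now = datetime.datetime.utcnow().replace(minute=0, second=0, microsecond=0)
timestamp = calendar.timegm(now.timetuple())

for provider in server.smembers('Provider_set'):
    provider_timestamp = provider + '_' + str(timestamp)
    for uniq_id in server.smembers(provider_timestamp):
        raw = server.get(uniq_id)
        if raw is None:
            continue
        # avg_score was stored as the repr() of a dict, so parse it back.
        avg_score = ast.literal_eval(raw)
        print('{0} {1} -> {2}'.format(provider_timestamp, uniq_id, avg_score))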

View File

@@ -91,6 +91,7 @@ class Paste(object):
self.p_langage = None
self.p_nb_lines = None
self.p_max_length_line = None
self.array_line_above_threshold = None
self.p_duplicate = None
def get_p_content(self):
@@ -118,6 +119,21 @@ class Paste(object):
def get_p_content_as_file(self):
return cStringIO.StringIO(self.get_p_content())
def get_p_content_with_removed_lines(self, threshold):
num_line_removed = 0
line_length_threshold = threshold
string_content = ""
f = self.get_p_content_as_file()
line_id = 0
for line_id, line in enumerate(f):
length = len(line)
if length < line_length_threshold:
string_content += line
else:
num_line_removed+=1
return (num_line_removed, string_content)
def get_lines_info(self):
"""
Returning and setting the number of lines and the maximum length of the
@@ -136,10 +152,12 @@ class Paste(object):
length = len(line)
if length >= max_length_line:
max_length_line = length
f.close()
self.p_nb_lines = line_id
self.p_max_length_line = max_length_line
return (self.p_nb_lines, self.p_max_length_line, array_line_above_threshold)
def _get_p_encoding(self):
"""

View File

@@ -7,13 +7,14 @@
};
function generate_offset_to_date(day){
day = day-1;
var now = new Date();
var to_ret = {};
for(i=day; i>=0; i--){
for(j=0; j<24; j++){
var t1 = now.getDate()-i + ":";
var t2 = now.getHours()-(23-j)+"h";
to_ret[j+24*(day-i)] = t1+t2;
}
}
return to_ret;
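The date-range fix above comes down to the offset arithmetic: after day has been decremented, hour slot j of the day lying i days in the past must land at offset j + 24*(day-i), so offset 0 labels the oldest hour of the window and the last offset labels the current hour. A small Python check of that mapping (illustrative only, re-expressing the JavaScript loop):

# Quick check of the corrected offset mapping used by generate_offset_to_date().
day = 7 - 1                      # same "day = day-1" as in the JavaScript
offsets = [(i, j, j + 24 * (day - i))
           for i in range(day, -1, -1)   # i days in the past, oldest first
           for j in range(24)]           # 24 hourly slots per day
assert offsets[0] == (day, 0, 0)                    # oldest day, first hour -> offset 0
assert offsets[-1] == (0, 23, 24 * (day + 1) - 1)   # today, last hour -> offset 167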
@@ -53,6 +54,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
var all_graph_day_sum = 0.0;
var all_graph_hour_sum = 0.0;
var all_day_avg = 0.0;
for (graphNum=0; graphNum<8; graphNum++) {
var max_value = 0.0;
@@ -65,7 +67,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
var day_sum_elem = 0.0;
var hour_sum = 0.0;
for(curr_date=dateStart+oneHour; curr_date<=dateStart+oneWeek; curr_date+=oneHour){
var data_array = data[curr_provider][curr_date];
if (data_array.length == 0){
@@ -99,7 +101,7 @@ $.getJSON("/sentiment_analysis_getplotdata/",
curr_sum_elem++;
max_value = Math.abs(pos-neg) > max_value ? Math.abs(pos-neg) : max_value;
if(curr_date >= dateStart+oneWeek-23*oneHour){
day_sum += (pos-neg);
day_sum_elem++;
}
@@ -150,11 +152,13 @@ $.getJSON("/sentiment_analysis_getplotdata/",
sparklineOptions.barWidth = 18;
sparklineOptions.tooltipFormat = '<span style="color: {{color}}">&#9679;</span> Avg: {{value}} </span>'
//var day_avg = day_sum/24;
var day_avg = isNaN(day_sum/day_sum_elem) ? 0 : day_sum/day_sum_elem;
var day_avg_text = isNaN(day_sum/day_sum_elem) ? 'No data' : (day_avg).toFixed(5);
all_day_avg += day_avg;
$(placeholder+'b').sparkline([day_avg], sparklineOptions);
sparklineOptions.tooltipFormat = '<span style="color: {{color}}">&#9679;</span> {{offset:names}}, {{value}} </span>'
sparklineOptions.barWidth = 2;
$(placeholder+'s').text(day_avg_text);
}//for loop
@@ -197,7 +201,8 @@ $.getJSON("/sentiment_analysis_getplotdata/",
gaugeOptions2.appendTo = '#gauge_today_last_days';
gaugeOptions2.dialLabel = 'Today';
gaugeOptions2.elementId = 'gauge2';
//piePercent = (all_graph_day_sum / (8*24)) / max_value;
piePercent = (all_day_avg / 8) / max_value;
gaugeOptions2.inc = piePercent;
var gauge_today_last_days = new FlexGauge(gaugeOptions2);