AIL-framework/bin/SentimentAnalyser.py

149 lines
4.9 KiB
Python
Raw Normal View History

2016-08-11 09:40:42 +02:00
#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
Sentiment analyser module.
It takes its inputs from 'shortLine' and 'longLine'.
Source code is taken into account (in case of comments). If it is only source code,
it will be treated with a neutral value anyway.
2016-08-11 09:40:42 +02:00
nltk.sentiment.vader module:
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""
import time
import datetime
import calendar
import redis
import json
2016-08-11 09:40:42 +02:00
from pubsublogger import publisher
from Helper import Process
from packages import Paste
2016-08-11 09:40:42 +02:00
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
# Config Variables
accepted_Mime_type = ['text/plain']
2016-08-11 09:40:42 +02:00
def Analyse(message, server):
#print 'analyzing'
2016-08-11 09:40:42 +02:00
path = message
paste = Paste.Paste(path)
content = paste.get_p_content()
provider = paste.p_source
p_date = str(paste._get_p_date())
p_MimeType = paste._get_p_encoding()
# Perform further analysis
if p_MimeType == "text/plain":
if isJSON(content):
p_MimeType = "JSON"
if p_MimeType in accepted_Mime_type:
print 'Processing', path
the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
#print 'pastedate: ', the_date
the_time = datetime.datetime.now()
the_time = datetime.time(getattr(the_time, 'hour'), 0, 0)
#print 'now: ', the_time
combined_datetime = datetime.datetime.combine(the_date, the_time)
#print 'combined: ', combined_datetime
timestamp = calendar.timegm(combined_datetime.timetuple())
#print 'timestamp: ', timestamp
sentences = tokenize.sent_tokenize(content.decode('utf-8', 'ignore'))
#print len(sentences)
avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
neg_line = 0
pos_line = 0
sid = SentimentIntensityAnalyzer()
for sentence in sentences:
ss = sid.polarity_scores(sentence)
for k in sorted(ss):
if k == 'compound':
if ss['neg'] > ss['pos']:
avg_score['compoundNeg'] += ss[k]
neg_line += 1
else:
avg_score['compoundPos'] += ss[k]
pos_line += 1
else:
avg_score[k] += ss[k]
#print('{0}: {1}, '.format(k, ss[k]))
for k in avg_score:
if k == 'compoundPos':
avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)
elif k == 'compoundNeg':
avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)
else:
avg_score[k] = avg_score[k] / len(sentences)
# In redis-levelDB: {} = set, () = K-V
# {Provider_set -> provider_i}
# {Provider_TimestampInHour_i -> UniqID_i}_j
# (UniqID_i -> PasteValue_i)
server.sadd('Provider_set', provider)
#print 'Provider_set', provider
provider_timestamp = provider + '_' + str(timestamp)
#print provider_timestamp
server.incr('UniqID')
UniqID = server.get('UniqID')
print provider_timestamp, '->', UniqID
server.sadd(provider_timestamp, UniqID)
server.set(UniqID, avg_score)
print avg_score
#print UniqID, '->', avg_score
else:
print 'Dropped:', p_MimeType
def isJSON(content):
try:
json.loads(content)
return True
except Exception,e:
return False
2016-08-11 09:40:42 +02:00
if __name__ == '__main__':
# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)
# Port of the redis instance used by pubsublogger
publisher.port = 6380
# Script is the default channel used for the modules.
publisher.channel = 'Script'
# Section name in bin/packages/modules.cfg
config_section = 'SentimentAnalysis'
2016-08-11 09:40:42 +02:00
# Setup the I/O queues
p = Process(config_section)
# Sent to the logging a description of the module
publisher.info("<description of the module>")
# REDIS_LEVEL_DB #
server = redis.StrictRedis(
host=p.config.get("Redis_Level_DB_Sentiment", "host"),
port=p.config.get("Redis_Level_DB_Sentiment", "port"),
db=p.config.get("Redis_Level_DB_Sentiment", "db"))
2016-08-11 09:40:42 +02:00
# Endless loop getting messages from the input queue
while True:
# Get one message from the input queue
message = p.get_from_set()
if message is None:
publisher.debug("{} queue is empty, waiting".format(config_section))
time.sleep(1)
continue
# Do something with the message from the queue
Analyse(message, server)