AIL-framework/bin/SentimentAnalysis.py

#!/usr/bin/env python2
# -*-coding:UTF-8 -*
"""
    Sentiment analyser module.
    It takes its inputs from 'global'.

    Lines longer than a defined threshold are dropped before the content is
    analysed (get_p_content_with_removed_lines).
    This is done because the NLTK sentence tokenizer (sent_tokenize) seems to crash
    for long lines (function _slices_from_text line#1276).


    nltk.sentiment.vader module credit:
        Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

"""

import time
import datetime
import calendar
import redis
import json
from pubsublogger import publisher
from Helper import Process
from packages import Paste

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

# Config Variables
# MIME types that Analyse() processes; pastes reporting any other type are dropped.
accepted_Mime_type = ['text/plain']
# NOTE(review): not referenced by the code visible in this file - confirm it is still needed.
size_threshold = 250
# Passed by Analyse() to Paste.get_p_content_with_removed_lines(); presumably the
# maximum line length kept for analysis - TODO confirm against the Paste class.
line_max_length_threshold = 1000

import os
import ConfigParser

# Locate and read the AIL configuration file at import time.
# An unset AIL_BIN is reported with the same hint as a missing file; indexing
# os.environ directly would raise a bare KeyError before the hint could be shown.
if 'AIL_BIN' not in os.environ:
    raise Exception('Unable to find the configuration file. '
                    'Did you set environment variables? '
                    'Or activate the virtualenv.')

configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
if not os.path.exists(configfile):
    # Adjacent string literals are used instead of backslash continuations so
    # that the source indentation does not end up inside the message.
    raise Exception('Unable to find the configuration file. '
                    'Did you set environment variables? '
                    'Or activate the virtualenv.')

cfg = ConfigParser.ConfigParser()
cfg.read(configfile)

# Lexicon path handed to SentimentIntensityAnalyzer.
sentiment_lexicon_file = cfg.get("Directories", "sentiment_lexicon_file")

# VADER analyser shared by every Analyse() call; built on first use.
_sentiment_analyzer = None


def _get_sentiment_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use.

    The analyser is constructed from sentiment_lexicon_file; keeping a single
    instance avoids rebuilding it for every paste.
    """
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer(sentiment_lexicon_file)
    return _sentiment_analyzer


def Analyse(message, server):
    """Compute the average sentiment of one paste and store it in redis.

    message: path of the paste, handed to Paste.Paste().
    server:  redis client receiving the result.

    Pastes whose MIME type is not in accepted_Mime_type (including plain text
    that parses as JSON) are reported as dropped and skipped. Nothing is
    stored when the tokenizer yields no sentence.

    Data written to redis ({} = set, () = key/value):
        {Provider_set -> provider_i}
        {Provider_TimestampInHour_i -> UniqID_i}_j
        (UniqID_i -> PasteValue_i)
    """
    paste = Paste.Paste(message)

    # get content with removed line + number of them
    num_line_removed, p_content = paste.get_p_content_with_removed_lines(line_max_length_threshold)
    provider = paste.p_source
    p_date = str(paste._get_p_date())
    p_MimeType = paste._get_p_encoding()

    # Plain text that parses as JSON is relabelled so the filter below drops it.
    if p_MimeType == "text/plain" and isJSON(p_content):
        p_MimeType = "JSON"

    if p_MimeType not in accepted_Mime_type:
        print('Dropped: {}'.format(p_MimeType))
        return

    # Hour bucket: the paste's date (p_date is sliced as YYYYMMDD) combined
    # with the hour at which the paste is being processed.
    the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
    the_time = datetime.time(datetime.datetime.now().hour, 0, 0)
    combined_datetime = datetime.datetime.combine(the_date, the_time)
    timestamp = calendar.timegm(combined_datetime.timetuple())

    sentences = tokenize.sent_tokenize(p_content.decode('utf-8', 'ignore'))
    if not sentences:
        return

    avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
    neg_line = 0
    pos_line = 0
    sid = _get_sentiment_analyzer()
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        for k in ('neg', 'neu', 'pos'):
            avg_score[k] += ss[k]
        # The compound score is accumulated separately for sentences that
        # lean negative and for all the others.
        if ss['neg'] > ss['pos']:
            avg_score['compoundNeg'] += ss['compound']
            neg_line += 1
        else:
            avg_score['compoundPos'] += ss['compound']
            pos_line += 1

    # neg/neu/pos are averaged over all sentences; each compound sum is
    # averaged over the sentences that contributed to it (divisor 1 if none).
    for k in ('neg', 'neu', 'pos'):
        avg_score[k] = avg_score[k] / len(sentences)
    avg_score['compoundPos'] = avg_score['compoundPos'] / (pos_line if pos_line > 0 else 1)
    avg_score['compoundNeg'] = avg_score['compoundNeg'] / (neg_line if neg_line > 0 else 1)

    server.sadd('Provider_set', provider)

    provider_timestamp = provider + '_' + str(timestamp)
    # Use the value returned by INCR itself: a separate GET could observe an
    # increment made by another worker in between and hand out a duplicate id.
    UniqID = str(server.incr('UniqID'))
    print('{} -> {} dropped {} lines'.format(provider_timestamp, UniqID, num_line_removed))
    server.sadd(provider_timestamp, UniqID)
    # NOTE(review): the dict is passed to redis as-is, so its stored form is
    # whatever the client library serialises it to - confirm with the reader side.
    server.set(UniqID, avg_score)


def isJSON(content):
    """Return True when content can be parsed by json.loads, False otherwise.

    Only the errors json.loads raises for unparsable input are treated as
    "not JSON": ValueError for malformed text, TypeError for a non-string
    argument, and RuntimeError when the document is nested too deeply.
    Anything else, in particular the TimeoutException raised by this module's
    SIGALRM handler, propagates to the caller instead of being swallowed.
    """
    try:
        json.loads(content)
    except (ValueError, TypeError, RuntimeError):
        return False
    return True

import signal

class TimeoutException(Exception):
    """Raised by timeout_handler when the SIGALRM alarm goes off."""

def timeout_handler(signum, frame):
    """Signal handler that aborts the work in progress with TimeoutException.

    signum and frame are the standard signal-handler arguments; neither is used.
    """
    raise TimeoutException()

# Install timeout_handler for SIGALRM at import time, so that an alarm armed
# with signal.alarm() results in a TimeoutException being raised.
signal.signal(signal.SIGALRM, timeout_handler)

if __name__ == '__main__':
    # If you wish to use another port or channel, do not forget to run a
    # subscriber accordingly (see launch_logs.sh).
    # Port of the redis instance used by pubsublogger
    publisher.port = 6380
    # Script is the default channel used for the modules.
    publisher.channel = 'Script'

    # Section name in bin/packages/modules.cfg
    config_section = 'SentimentAnalysis'

    # Setup the I/O queues
    p = Process(config_section)

    # Send a description of the module to the logging channel
    publisher.info("Sentiment analysis module: computes VADER sentiment scores of text pastes")

    # REDIS_LEVEL_DB #
    server = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_Sentiment", "host"),
        port=p.config.get("Redis_Level_DB_Sentiment", "port"),
        db=p.config.get("Redis_Level_DB_Sentiment", "db"))

    while True:
        message = p.get_from_set()
        if message is None:
            publisher.debug("{} queue is empty, waiting".format(config_section))
            time.sleep(1)
            continue

        # Give each message at most 60 seconds; on expiry the SIGALRM handler
        # raises TimeoutException and the loop moves on to the next message.
        signal.alarm(60)
        try:
            Analyse(message, server)
        except TimeoutException:
            print ("{0} processing timeout".format(message))
        finally:
            # Always disarm the alarm, including when Analyse fails with an
            # unrelated error, so it cannot fire outside the guarded call.
            signal.alarm(0)
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00			`#!/usr/bin/env python2`
			`# --coding:UTF-8 -`
			`"""`
Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00			`Sentiment analyser module.`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`It takes its inputs from 'global'.`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
Improved description of modules inside the scripts 2017-05-09 11:13:16 +02:00			`The content is analysed if the length of the line is`
			`above a defined threshold (get_p_content_with_removed_lines).`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`This is done because NLTK sentences tokemnizer (sent_tokenize) seems to crash`
			`for long lines (function _slices_from_text line#1276).`


			`nltk.sentiment.vader module credit:`
			`Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
			`"""`

			`import time`
Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00			`import datetime`
			`import calendar`
			`import redis`
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00			`import json`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00			`from pubsublogger import publisher`
			`from Helper import Process`
Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00			`from packages import Paste`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
			`from nltk.sentiment.vader import SentimentIntensityAnalyzer`
			`from nltk import tokenize`

Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00			`# Config Variables`
			`accepted_Mime_type = ['text/plain']`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`size_threshold = 250`
			`line_max_length_threshold = 1000`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
SentimentAnalysis: use lexicon file path from the ail configuration file 2017-01-11 11:00:36 +01:00			`import os`
			`import ConfigParser`

			`configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')`
			`if not os.path.exists(configfile):`
			`raise Exception('Unable to find the configuration file. \`
			`Did you set environment variables? \`
			`Or activate the virtualenv.')`

			`cfg = ConfigParser.ConfigParser()`
			`cfg.read(configfile)`

			`sentiment_lexicon_file = cfg.get("Directories", "sentiment_lexicon_file")`

Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00			`def Analyse(message, server):`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00			`path = message`
Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00			`paste = Paste.Paste(path)`

Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`# get content with removed line + number of them`
			`num_line_removed, p_content = paste.get_p_content_with_removed_lines(line_max_length_threshold)`
Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00			`provider = paste.p_source`
			`p_date = str(paste._get_p_date())`
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00			`p_MimeType = paste._get_p_encoding()`

			`# Perform further analysis`
			`if p_MimeType == "text/plain":`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`if isJSON(p_content):`
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00			`p_MimeType = "JSON"`

			`if p_MimeType in accepted_Mime_type:`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00			`the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))`
			`the_time = datetime.datetime.now()`
			`the_time = datetime.time(getattr(the_time, 'hour'), 0, 0)`
			`combined_datetime = datetime.datetime.combine(the_date, the_time)`
			`timestamp = calendar.timegm(combined_datetime.timetuple())`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`sentences = tokenize.sent_tokenize(p_content.decode('utf-8', 'ignore'))`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`if len(sentences) > 0:`
			`avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}`
			`neg_line = 0`
			`pos_line = 0`
SentimentAnalysis: use lexicon file path from the ail configuration file 2017-01-11 11:00:36 +01:00			`sid = SentimentIntensityAnalyzer(sentiment_lexicon_file)`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`for sentence in sentences:`
			`ss = sid.polarity_scores(sentence)`
			`for k in sorted(ss):`
			`if k == 'compound':`
			`if ss['neg'] > ss['pos']:`
			`avg_score['compoundNeg'] += ss[k]`
			`neg_line += 1`
			`else:`
			`avg_score['compoundPos'] += ss[k]`
			`pos_line += 1`
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00			`else:`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`avg_score[k] += ss[k]`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00

Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`for k in avg_score:`
			`if k == 'compoundPos':`
			`avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)`
			`elif k == 'compoundNeg':`
			`avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)`
			`else:`
			`avg_score[k] = avg_score[k] / len(sentences)`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00

Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`# In redis-levelDB: {} = set, () = K-V`
			`# {Provider_set -> provider_i}`
			`# {Provider_TimestampInHour_i -> UniqID_i}_j`
			`# (UniqID_i -> PasteValue_i)`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`server.sadd('Provider_set', provider)`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`provider_timestamp = provider + '_' + str(timestamp)`
			`server.incr('UniqID')`
			`UniqID = server.get('UniqID')`
			`print provider_timestamp, '->', UniqID, 'dropped', num_line_removed, 'lines'`
			`server.sadd(provider_timestamp, UniqID)`
			`server.set(UniqID, avg_score)`
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00			`else:`
			`print 'Dropped:', p_MimeType`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00
			`def isJSON(content):`
			`try:`
			`json.loads(content)`
			`return True`

			`except Exception,e:`
			`return False`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00			`import signal`

			`class TimeoutException(Exception):`
			`pass`

			`def timeout_handler(signum, frame):`
			`raise TimeoutException`

			`signal.signal(signal.SIGALRM, timeout_handler)`

Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00			`if __name__ == '__main__':`
			`# If you wish to use an other port of channel, do not forget to run a subscriber accordingly (see launch_logs.sh)`
			`# Port of the redis instance used by pubsublogger`
			`publisher.port = 6380`
			`# Script is the default channel used for the modules.`
			`publisher.channel = 'Script'`

			`# Section name in bin/packages/modules.cfg`
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00			`config_section = 'SentimentAnalysis'`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
			`# Setup the I/O queues`
			`p = Process(config_section)`

			`# Sent to the logging a description of the module`
			`publisher.info("<description of the module>")`

Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00			`# REDIS_LEVEL_DB #`
			`server = redis.StrictRedis(`
			`host=p.config.get("Redis_Level_DB_Sentiment", "host"),`
			`port=p.config.get("Redis_Level_DB_Sentiment", "port"),`
			`db=p.config.get("Redis_Level_DB_Sentiment", "db"))`

Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00			`while True:`
			`message = p.get_from_set()`
			`if message is None:`
			`publisher.debug("{} queue is empty, waiting".format(config_section))`
			`time.sleep(1)`
			`continue`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00			`signal.alarm(60)`
			`try:`
			`Analyse(message, server)`
			`except TimeoutException:`
			`print ("{0} processing timeout".format(message))`
			`continue`
			`else:`
			`signal.alarm(0)`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00