AIL-framework/bin/modules/SentimentAnalysis.py

#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
    Sentiment analyser module.
    It takes its inputs from 'global'.

    Lines whose length reaches a defined threshold are dropped before the
    content is analysed (get_p_content_with_removed_lines).
    This is done because the NLTK sentence tokenizer (sent_tokenize) seems to crash
    for long lines (function _slices_from_text line#1276).


    nltk.sentiment.vader module credit:
        Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

"""

##################################
# Import External packages
##################################
import os
import sys
import time
import datetime
import calendar
import redis
import json
import signal
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize, download

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.objects.Items import Item
from lib import ConfigLoader


class TimeoutException(Exception):
    """Raised by the SIGALRM handler when processing exceeds its time budget."""

def timeout_handler(signum, frame):
    """Signal handler that aborts the interrupted work.

    Both arguments are supplied by the signal machinery and are ignored;
    the handler unconditionally raises TimeoutException.
    """
    raise TimeoutException()

# Register the handler when the module is loaded, so that an expiring
# signal.alarm() raises TimeoutException in the running code.
signal.signal(signal.SIGALRM, timeout_handler)

## TODO: REFACTOR MODULE + CLEAN HISTORY
class SentimentAnalysis(AbstractModule):
    """
    SentimentAnalysis module for AIL framework.

    For every accepted item, the content is split into sentences, each
    sentence is scored with VADER and the averaged scores are stored in
    the "_Sentiment" redis database.
    """

    # Config Variables
    accepted_Mime_type = ['text/plain']
    line_max_length_threshold = 1000
    # Max time (in seconds) allowed to analyse one message
    analysis_timeout = 60

    def __init__(self):
        super(SentimentAnalysis, self).__init__()

        config_loader = ConfigLoader.ConfigLoader()
        self.sentiment_lexicon_file = config_loader.get_config_str("Directories", "sentiment_lexicon_file")

        # REDIS_LEVEL_DB #
        self.db = config_loader.get_redis_conn("_Sentiment")

        self.time1 = time.time()

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 1

        # Send module state to logs
        self.logger.info(f"Module {self.module_name} initialized")

    def compute(self, message):
        """
        Analyse one message, giving up after `analysis_timeout` seconds.

        :param message: item identifier, passed as-is to analyse()
        """
        # Max time to compute one entry
        signal.alarm(self.analysis_timeout)
        try:
            self.analyse(message)
        except TimeoutException:
            self.logger.debug(f"{message} processing timeout")
        finally:
            # Always disarm the alarm: if analyse() fails with another
            # exception, a still-pending alarm would otherwise fire later,
            # outside of this try block.
            signal.alarm(0)

    def get_p_content_with_removed_lines(self, threshold, item_content):
        """
        Drop the lines that are too long for the sentence tokenizer.

        :param threshold: lines whose length (line terminator included) is
                          greater than or equal to this value are dropped
        :param item_content: the content, either as one string or as an
                             iterable of lines
        :return: tuple (number of dropped lines, remaining content as a string)
        """
        if not item_content:
            return 0, ""

        # Iterating over a string yields characters, not lines: split it
        # explicitly and keep the line terminators so that the kept content
        # is unchanged.
        if isinstance(item_content, str):
            lines = item_content.splitlines(keepends=True)
        else:
            lines = item_content

        num_line_removed = 0
        kept_lines = []
        for line in lines:
            if len(line) < threshold:
                kept_lines.append(line)
            else:
                num_line_removed += 1

        return num_line_removed, "".join(kept_lines)

    @staticmethod
    def _hour_timestamp(p_date):
        """
        Build the epoch timestamp made of the item date (a 'YYYYMMDD' string)
        and the current local hour (minutes and seconds set to 0).
        """
        the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))
        the_time = datetime.time(datetime.datetime.now().hour, 0, 0)
        combined_datetime = datetime.datetime.combine(the_date, the_time)
        return calendar.timegm(combined_datetime.timetuple())

    def _average_sentiment(self, sentences):
        """
        Score every sentence and return the averaged scores.

        :param sentences: non-empty list of sentences
        :return: dict with the keys 'neg', 'neu', 'pos' (averaged over all
                 sentences), 'compoundPos' and 'compoundNeg' (compound score
                 averaged over the sentences counted as positive / negative)
        """
        totals = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}
        neg_line = 0
        pos_line = 0
        sid = SentimentIntensityAnalyzer(self.sentiment_lexicon_file)
        for sentence in sentences:
            ss = sid.polarity_scores(sentence)
            for k in ('neg', 'neu', 'pos'):
                totals[k] += ss[k]
            # A sentence counts as negative only when its negative score is
            # strictly higher than its positive one
            if ss['neg'] > ss['pos']:
                totals['compoundNeg'] += ss['compound']
                neg_line += 1
            else:
                totals['compoundPos'] += ss['compound']
                pos_line += 1

        nb_sentences = len(sentences)
        # `or 1` avoids a division by zero when a category has no sentence
        return {
            'neg': totals['neg'] / nb_sentences,
            'neu': totals['neu'] / nb_sentences,
            'pos': totals['pos'] / nb_sentences,
            'compoundPos': totals['compoundPos'] / (pos_line or 1),
            'compoundNeg': totals['compoundNeg'] / (neg_line or 1),
        }

    def analyse(self, message):
        """
        Compute and store the sentiment scores of one item.

        Items whose mimetype is not accepted, or whose content is valid JSON,
        are skipped.

        :param message: item identifier used to build the Item object
        """
        item = Item(message)

        # get content with removed line + number of them
        num_line_removed, p_content = self.get_p_content_with_removed_lines(SentimentAnalysis.line_max_length_threshold,
                                                                            item.get_content())
        provider = item.get_source()
        p_date = item.get_date()
        p_MimeType = item.get_mimetype()

        # Perform further analysis: JSON documents are not analysed
        if p_MimeType == "text/plain" and self.isJSON(p_content):
            p_MimeType = "JSON"

        if p_MimeType not in SentimentAnalysis.accepted_Mime_type:
            self.logger.debug(f'Dropped:{p_MimeType}')
            return

        self.logger.debug(f'Accepted :{p_MimeType}')

        timestamp = self._hour_timestamp(p_date)

        try:
            sentences = tokenize.sent_tokenize(p_content)
        except LookupError:
            # Missing tokenizer data: use the NLTK Downloader to obtain the
            # resource. Only LookupError is caught, so that TimeoutException
            # still reaches compute().
            download('punkt')
            sentences = tokenize.sent_tokenize(p_content)

        if not sentences:
            return

        avg_score = self._average_sentiment(sentences)

        # In redis-levelDB: {} = set, () = K-V
        # {Provider_set -> provider_i}
        # {Provider_TimestampInHour_i -> UniqID_i}_j
        # (UniqID_i -> PasteValue_i)

        self.db.sadd('Provider_set', provider)

        provider_timestamp = provider + '_' + str(timestamp)
        # INCR returns the new value: no separate (non-atomic) GET needed
        UniqID = self.db.incr('UniqID')
        self.logger.debug(f'{provider_timestamp}->{UniqID}dropped{num_line_removed}lines')
        self.db.sadd(provider_timestamp, UniqID)
        # redis-py >= 3 refuses a dict as value, it has to be serialized.
        # NOTE(review): str() is assumed to be the stored format expected by
        # the readers of this DB - confirm before switching to json.dumps().
        self.db.set(UniqID, str(avg_score))

    def isJSON(self, content):
        """
        Check whether `content` is a valid JSON document.

        :param content: string to check
        :return: True if json.loads() accepts the content, False otherwise
        """
        try:
            json.loads(content)
        except (ValueError, TypeError, RecursionError):
            # Only parsing failures are caught, so that TimeoutException
            # still reaches compute().
            return False
        return True


if __name__ == '__main__':
    # Script entry point: build the module and call its run() method.
    SentimentAnalysis().run()
decode with redis connection 2018-05-04 13:53:29 +02:00			`#!/usr/bin/env python3`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00			`# --coding:UTF-8 -`
			`"""`
Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00			`Sentiment analyser module.`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`It takes its inputs from 'global'.`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
Improved description of modules inside the scripts 2017-05-09 11:13:16 +02:00			`The content is analysed if the length of the line is`
			`above a defined threshold (get_p_content_with_removed_lines).`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00			`This is done because NLTK sentences tokemnizer (sent_tokenize) seems to crash`
			`for long lines (function _slices_from_text line#1276).`


			`nltk.sentiment.vader module credit:`
			`Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
			`"""`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00
			`##################################`
			`# Import External packages`
			`##################################`
chg: [core] mv bin/packages/config.cfg configs/core.cfg + use ConfigLoader 2019-11-05 15:18:03 +01:00			`import os`
			`import sys`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00			`import time`
Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00			`import datetime`
			`import calendar`
			`import redis`
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00			`import json`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`import signal`
			`from nltk.sentiment.vader import SentimentIntensityAnalyzer`
			`from nltk import tokenize, download`

chg: [Tracker_Yara module] create module class 2021-06-02 16:04:52 +02:00			`sys.path.append(os.environ['AIL_BIN'])`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`##################################`
			`# Import Project packages`
			`##################################`
chg: [Tracker_Yara module] create module class 2021-06-02 16:04:52 +02:00			`from modules.abstract_module import AbstractModule`
chg: [crawler + core + cve] migrate crawler to lacus + add new CVE object and correlation + migrate core 2022-10-25 16:25:19 +02:00			`from lib.objects.Items import Item`
			`from lib import ConfigLoader`
chg: [core] mv bin/packages/config.cfg configs/core.cfg + use ConfigLoader 2019-11-05 15:18:03 +01:00
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`class TimeoutException(Exception):`
			`pass`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`def timeout_handler(signum, frame):`
			`raise TimeoutException`
SentimentAnalysis: use lexicon file path from the ail configuration file 2017-01-11 11:00:36 +01:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`signal.signal(signal.SIGALRM, timeout_handler)`
Added sentiment analyser module (draft) 2016-08-13 15:24:57 +02:00
chg: [Tracker_Yara module] create module class 2021-06-02 16:04:52 +02:00			`## TODO: REFACTOR MODULE + CLEAN HISTORY`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`class SentimentAnalysis(AbstractModule):`
			`"""`
			`SentimentAnalysis module for AIL framework`
			`"""`
Added draft of filter in sentiment analysis (Discard syntaxical languages) + Added nice tooltip for sparkline. Trending displays avg in function of the number of elements processed and not for the complete week + fixed bug in gauge and canvasjs (was performing avg with only 1 graph instead of all 8). 2016-08-16 16:33:02 +02:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`# Config Variables`
			`accepted_Mime_type = ['text/plain']`
			`line_max_length_threshold = 1000`
Added drop of really long line in sentiment-analysis module + Added description of sentiment module. Also, fixed bug in webpage sentiement-trending concerning avg and date range. 2016-08-17 09:46:25 +02:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`def __init__(self):`
			`super(SentimentAnalysis, self).__init__()`
Added module sentimentAnalyser 2016-08-11 09:40:42 +02:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`self.sentiment_lexicon_file = ConfigLoader.ConfigLoader().get_config_str("Directories", "sentiment_lexicon_file")`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`# REDIS_LEVEL_DB #`
chg: [cleanup] remove ARDB + fix hive case 2023-01-18 16:28:08 +01:00			`self.db = ConfigLoader.ConfigLoader().get_redis_conn("_Sentiment")`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`self.time1 = time.time()`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`# Waiting time in secondes between to message proccessed`
			`self.pending_seconds = 1`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`# Send module state to logs`
chg: [logs] add new logger 2023-05-12 15:29:53 +02:00			`self.logger.info(f"Module {self.module_name} initialized")`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00
			`def compute(self, message):`
			`# Max time to compute one entry`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00			`signal.alarm(60)`
			`try:`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`self.analyse(message)`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00			`except TimeoutException:`
chg: [logs] add new logger 2023-05-12 15:29:53 +02:00			`self.logger.debug(f"{message} processing timeout")`
(partially fix #90 too) using a simple Alarm (SIGNAL) when exec-timeout Introducing a timer (in this case 60 seconds) to ensure that the execution time of the analysis takes less than 60 seconds. This is a simple and standard POSIX signal handler. If the timeout is reached, the module will process the next one. This approach fixes the specific issues we have currently with some inputs where the sentiment analysis takes too much time. This fix should be improved and be more generic: - Introducing statistics of content which timeouts. - Keeping a list/queue to further process those files using a different analysis approach. Maybe a set of "dirty" processes to handle the edge cases and to not impact the overall processing and analysis. - Make the timer configurable per module (at least for this one). 2017-01-26 08:11:18 +01:00			`else:`
			`signal.alarm(0)`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00
chg: [crawler + core + cve] migrate crawler to lacus + add new CVE object and correlation + migrate core 2022-10-25 16:25:19 +02:00			`def get_p_content_with_removed_lines(self, threshold, item_content):`
			`num_line_removed = 0`
			`line_length_threshold = threshold`
			`string_content = ""`
			`f = item_content`
			`for line_id, line in enumerate(f):`
			`length = len(line)`

			`if length < line_length_threshold:`
			`string_content += line`
			`else:`
			`num_line_removed += 1`

			`return num_line_removed, string_content`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00
			`def analyse(self, message):`

chg: [crawler + core + cve] migrate crawler to lacus + add new CVE object and correlation + migrate core 2022-10-25 16:25:19 +02:00			`item = Item(message)`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00
			`# get content with removed line + number of them`
chg: [crawler + core + cve] migrate crawler to lacus + add new CVE object and correlation + migrate core 2022-10-25 16:25:19 +02:00			`num_line_removed, p_content = self.get_p_content_with_removed_lines(SentimentAnalysis.line_max_length_threshold,`
			`item.get_content())`
			`provider = item.get_source()`
			`p_date = item.get_date()`
			`p_MimeType = item.get_mimetype()`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00
			`# Perform further analysis`
			`if p_MimeType == "text/plain":`
			`if self.isJSON(p_content):`
			`p_MimeType = "JSON"`

			`if p_MimeType in SentimentAnalysis.accepted_Mime_type:`
chg: [logs] add new logger 2023-05-12 15:29:53 +02:00			`self.logger.debug(f'Accepted :{p_MimeType}')`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00
			`the_date = datetime.date(int(p_date[0:4]), int(p_date[4:6]), int(p_date[6:8]))`
			`the_time = datetime.datetime.now()`
			`the_time = datetime.time(getattr(the_time, 'hour'), 0, 0)`
			`combined_datetime = datetime.datetime.combine(the_date, the_time)`
			`timestamp = calendar.timegm(combined_datetime.timetuple())`

			`try:`
			`sentences = tokenize.sent_tokenize(p_content)`
			`except:`
			`# use the NLTK Downloader to obtain the resource`
			`download('punkt')`
			`sentences = tokenize.sent_tokenize(p_content)`

			`if len(sentences) > 0:`
			`avg_score = {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compoundPos': 0.0, 'compoundNeg': 0.0}`
			`neg_line = 0`
			`pos_line = 0`
chg: [modules + tests] fix modules + test modules on samples 2021-06-08 16:46:36 +02:00			`sid = SentimentIntensityAnalyzer(self.sentiment_lexicon_file)`
fix: stuck queues and submit paste 2021-04-28 15:24:33 +02:00			`for sentence in sentences:`
			`ss = sid.polarity_scores(sentence)`
			`for k in sorted(ss):`
			`if k == 'compound':`
			`if ss['neg'] > ss['pos']:`
			`avg_score['compoundNeg'] += ss[k]`
			`neg_line += 1`
			`else:`
			`avg_score['compoundPos'] += ss[k]`
			`pos_line += 1`
			`else:`
			`avg_score[k] += ss[k]`


			`for k in avg_score:`
			`if k == 'compoundPos':`
			`avg_score[k] = avg_score[k] / (pos_line if pos_line > 0 else 1)`
			`elif k == 'compoundNeg':`
			`avg_score[k] = avg_score[k] / (neg_line if neg_line > 0 else 1)`
			`else:`
			`avg_score[k] = avg_score[k] / len(sentences)`


			`# In redis-levelDB: {} = set, () = K-V`
			`# {Provider_set -> provider_i}`
			`# {Provider_TimestampInHour_i -> UniqID_i}_j`
			`# (UniqID_i -> PasteValue_i)`

			`self.db.sadd('Provider_set', provider)`

			`provider_timestamp = provider + '_' + str(timestamp)`
			`self.db.incr('UniqID')`
			`UniqID = self.db.get('UniqID')`
			`self.redis_logger.debug(f'{provider_timestamp}->{UniqID}dropped{num_line_removed}lines')`
			`self.db.sadd(provider_timestamp, UniqID)`
			`self.db.set(UniqID, avg_score)`
			`else:`
			`self.redis_logger.debug(f'Dropped:{p_MimeType}')`


			`def isJSON(self, content):`
			`try:`
			`json.loads(content)`
			`return True`

			`except Exception:`
			`return False`


			`if __name__ == '__main__':`

			`module = SentimentAnalysis()`
			`module.run()`