mirror of https://github.com/CIRCL/AIL-framework
90 lines
3.1 KiB
Python
Executable File
90 lines
3.1 KiB
Python
Executable File
#!/usr/bin/env python2
|
|
# -*-coding:UTF-8 -*
|
|
"""
|
|
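This module tracks how often the tracked regular expressions match incoming words.

Each queue message is expected to hold a filename, a timestamp and a word.
Every word matched by a regex from the Redis set "TrackedRegexSet" updates the
frequency counters kept in Redis, unless the matched text is blacklisted.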
|
|
|
|
"""
|
|
import redis
|
|
import time
|
|
from pubsublogger import publisher
|
|
from packages import lib_words
|
|
import os
|
|
import datetime
|
|
import calendar
|
|
import re
|
|
|
|
from Helper import Process
|
|
|
|
# Config Variables

# Redis set names. The blacklist and tracked-regex sets are read by the main
# loop of this script; the tracked-terms name is not used in this file
# (presumably shared with sibling modules -- verify before removing).
BlackListTermsSet_Name = "BlackListSetTermSet"
TrackedTermsSet_Name = "TrackedSetTermSet"
TrackedRegexSet_Name = "TrackedRegexSet"

# Max cardinality of the terms frequencies set
top_term_freq_max_set_cardinality = 20

# Number of seconds in one day
oneDay = 24 * 60 * 60

# [set name (or name prefix), number] pairs. The day entry is used as a prefix
# to which the timestamp is appended; the number is presumably the period
# length in days -- confirm against the code consuming these lists.
top_termFreq_setName_day = ["TopTermFreq_set_day_", 1]
top_termFreq_setName_week = ["TopTermFreq_set_week", 7]
top_termFreq_setName_month = ["TopTermFreq_set_month", 31]

# All the periods above, in increasing length.
top_termFreq_set_array = [
    top_termFreq_setName_day,
    top_termFreq_setName_week,
    top_termFreq_setName_month,
]
|
|
|
|
|
|
def compile_tracked_regexes(regex_strings, on_invalid=None):
    """Compile every tracked regex once so the main loop can reuse the patterns.

    :param regex_strings: iterable of regex source strings.
    :param on_invalid: optional callable ``(regex_str, error)`` invoked for
        each string rejected by ``re.compile``.
    :return: dict mapping each valid regex string to its compiled pattern.
        Invalid strings are left out so that a single malformed entry does
        not abort the whole module.
    """
    compiled = {}
    for regex_str in regex_strings:
        try:
            compiled[regex_str] = re.compile(regex_str)
        except re.error as error:
            if on_invalid is not None:
                on_invalid(regex_str, error)
    return compiled


if __name__ == "__main__":
    publisher.port = 6380
    publisher.channel = "Script"

    config_section = 'RegexForTermsFrequency'
    p = Process(config_section)

    # REDIS #
    server_term = redis.StrictRedis(
        host=p.config.get("Redis_Level_DB_TermFreq", "host"),
        port=p.config.get("Redis_Level_DB_TermFreq", "port"),
        db=p.config.get("Redis_Level_DB_TermFreq", "db"))

    publisher.info("RegexForTermsFrequency script started")

    def report_invalid_regex(regex_str, error):
        """Log a tracked regex that could not be compiled."""
        publisher.warning(
            "RegexForTermsFrequency: ignoring invalid regex {0!r}: {1}".format(
                regex_str, error))

    # Compile the tracked regexes once, at start-up. Regexes added to the
    # Redis set afterwards are not picked up until the script is restarted.
    dico_regex = compile_tracked_regexes(
        server_term.smembers(TrackedRegexSet_Name), report_invalid_regex)

    # Regex Frequency
    while True:
        message = p.get_from_set()

        if message is None:
            # Nothing queued: idle for a while before polling again.
            publisher.debug("Script RegexForTermsFrequency is Idling")
            print("sleeping")
            time.sleep(5)
            continue

        # A message carries three whitespace-separated fields.
        filename, timestamp, word = message.split()
        curr_set = top_termFreq_setName_day[0] + str(timestamp)

        # Try the word against every tracked regex.
        for regex_str, compiled_regex in dico_regex.items():
            matched = compiled_regex.match(word)
            if matched is None:
                continue

            # Add in Regex track set only if term is not in the blacklist.
            # The membership test is done by Redis instead of fetching the
            # whole blacklist set for every match.
            if server_term.sismember(BlackListTermsSet_Name, matched.group(0)):
                continue

            # sadd returns 1 only when this file was not yet recorded for
            # this regex.
            new_to_the_set = server_term.sadd('regex_' + regex_str, filename) == 1

            # Consider the number of occurrences of this term.
            server_term.hincrby(timestamp, regex_str, 1)

            # 1 term per paste: count a file only once per regex.
            if new_to_the_set:
                server_term.hincrby("per_paste_" + str(timestamp), regex_str, 1)
                server_term.zincrby("per_paste_" + curr_set, regex_str, float(1))

            # NOTE(review): the indentation of this statement was lost in the
            # copy under review. It is kept inside the blacklist check so the
            # sorted set stays in step with the hash counter above -- confirm
            # against the upstream file.
            server_term.zincrby(curr_set, regex_str, float(1))
|