AIL-framework/bin/modules/Categ.py

117 lines
3.7 KiB
Python
Raw Normal View History

2018-05-04 13:53:29 +02:00
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The ZMQ_PubSub_Categ Module
============================
Each word file created under /files/ represents a category.
This module takes these files and compares them to
2021-05-19 16:57:20 +02:00
the content of an item.
2021-05-19 16:57:20 +02:00
When a word from an item matches one or more of these word files, the filename of
the item / the item id is published/forwarded to the next modules.
Each category (each file) represents a dynamic channel.
This means that if you create 1000 files under /files/ you'll have 1000 channels
2021-05-19 16:57:20 +02:00
where every time there is a matching word to a category, the item containing
this word will be pushed to this specific channel.
..note:: The channel will have the name of the file created.
Implementing modules can start here, create your own category file,
and then create your own module to treat the specific paste matching this
category.
Requirements
------------
*Need running Redis instances. (Redis)
*Category word files need to be created in /files/
"""
2021-04-02 09:52:05 +02:00
##################################
# Import External packages
##################################
import argparse
import os
2014-09-05 17:05:45 +02:00
import re
import sys
2021-04-02 09:52:05 +02:00
sys.path.append(os.environ['AIL_BIN'])
2021-04-02 09:52:05 +02:00
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
2021-05-19 16:57:20 +02:00
from packages.Item import Item
2014-08-19 19:07:07 +02:00
2021-04-02 09:52:05 +02:00
class Categ(AbstractModule):
    """
    Categ module for AIL framework

    Loads one word file per category from ``categ_files_dir``, compiles each
    word list into a single case-insensitive regex and, for every message
    handled by ``compute``, forwards the item id to the queue named after each
    category whose number of distinct matches reaches ``matchingThreshold``.
    """

    def __init__(self, categ_files_dir='../files/'):
        """
        Init Categ

        :param categ_files_dir: directory containing one word file per category
        """
        super(Categ, self).__init__()

        self.categ_files_dir = categ_files_dir

        # default = 1 string
        # NOTE(review): self.process and self.redis_logger are presumably set up
        # by AbstractModule.__init__ -- confirm against the base class.
        self.matchingThreshold = self.process.config.getint("Categ", "matchingThreshold")

        self.reload_categ_words()
        self.redis_logger.info("Script Categ started")

    # # TODO: trigger reload on change ( save last reload time, ...)
    def reload_categ_words(self):
        """
        (Re)load the category word files and compile one regex per category.

        Every non-blank line of a category file is treated as a literal word
        (it is escaped before being compiled). The result is stored in
        ``self.categ_words`` as an iterable of ``(category, compiled_pattern)``
        pairs.

        Blank lines are ignored, and a category whose file contains no word at
        all is skipped: an empty alternative would compile to a pattern that
        matches the empty string at every position, which would flag every
        item for that category.

        :raises OSError: if a category file cannot be opened
        """
        categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve', 'ApiKey']
        tmp_dict = {}
        for filename in categories:
            bname = os.path.basename(filename)

            with open(os.path.join(self.categ_files_dir, filename), 'r') as f:
                words = (line.strip() for line in f)
                # Drop blank lines so they cannot become an empty regex alternative
                patterns = [re.escape(word) for word in words if word]

            if not patterns:
                # Nothing to search for in this category
                continue

            tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE)

        self.categ_words = tmp_dict.items()

    def compute(self, message, r_result=False):
        """
        Search an item for category words and forward it to matching queues.

        :param message: value used to build the ``Item`` to analyse
        :param r_result: when true, return the list of matched categories
        :return: list of matched category names if ``r_result`` is true,
                 otherwise ``None``
        """
        # Create Item Object
        item = Item(message)
        # Get item content
        content = item.get_content()
        categ_found = []

        # Search for pattern categories in item content
        for categ, pattern in self.categ_words:
            # Distinct matched strings; matches are not case-normalised, so
            # case variants of the same word are counted separately.
            found = set(pattern.findall(content))
            lenfound = len(found)
            if lenfound >= self.matchingThreshold:
                categ_found.append(categ)
                msg = f'{item.get_id()} {lenfound}'

                # Export message to categ queue
                print(msg, categ)
                self.send_message_to_queue(msg, categ)

                self.redis_logger.info(
                    f'Categ;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {lenfound} as {categ};{item.get_id()}')

        if r_result:
            return categ_found
2021-04-02 09:52:05 +02:00
2021-05-19 16:57:20 +02:00
if __name__ == '__main__':

    # Command line: the only option is the directory holding the category files
    arg_parser = argparse.ArgumentParser(description='Start Categ module on files.')
    arg_parser.add_argument(
        '-d',
        action='store',
        type=str,
        default="../files/",
        help='Path to the directory containing the category files.')
    cli_args = arg_parser.parse_args()

    # Build the module on the requested directory and start it
    Categ(categ_files_dir=cli_args.d).run()