mirror of https://github.com/CIRCL/AIL-framework
chg: [Categ] tests + docs
parent 4a9bda2ee8
commit 20727fff77
@@ -43,7 +43,7 @@ class ApiKey(AbstractModule):
         # Send module state to logs
         self.redis_logger.info(f"Module {self.module_name} initialized")
 
-    def compute(self, message, r_match=False):
+    def compute(self, message, r_result=False):
         id, score = message.split()
         item = Item(id)
         item_content = item.get_content()
@@ -82,7 +82,7 @@ class ApiKey(AbstractModule):
             # Send to duplicate
             self.send_message_to_queue('Duplicate', item.get_id())
 
-        if r_match:
+        if r_result:
             return (google_api_key, aws_access_key, aws_secret_key)
 
 if __name__ == "__main__":
bin/Categ.py (95 changed lines)
@@ -4,19 +4,16 @@
 The ZMQ_PubSub_Categ Module
 ============================
 
-This module is consuming the Redis-list created by the ZMQ_PubSub_Tokenize_Q
-Module.
-
 Each words files created under /files/ are representing categories.
 This modules take these files and compare them to
-the stream of data given by the ZMQ_PubSub_Tokenize_Q Module.
+the content of an item.
 
-When a word from a paste match one or more of these words file, the filename of
-the paste is published/forwarded to the next modules.
+When a word from an item match one or more of these words file, the filename of
+the item / the item id is published/forwarded to the next modules.
 
 Each category (each files) are representing a dynamic channel.
 This mean that if you create 1000 files under /files/ you'll have 1000 channels
-where every time there is a matching word to a category, the paste containing
+where every time there is a matching word to a category, the item containing
 this word will be pushed to this specific channel.
 
 ..note:: The channel will have the name of the file created.
@@ -25,15 +22,11 @@ Implementing modules can start here, create your own category file,
 and then create your own module to treat the specific paste matching this
 category.
 
-..note:: Module ZMQ_Something_Q and ZMQ_Something are closely bound, always put
-the same Subscriber name in both of them.
-
 Requirements
 ------------
 
 *Need running Redis instances. (Redis)
 *Categories files of words in /files/ need to be created
-*Need the ZMQ_PubSub_Tokenize_Q Module running to be able to work properly.
 
 """
 
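The docstring above sums up the mechanism: one plain-text word list per category under files/, each list compiled into a single case-insensitive regex, and a category reported once the number of distinct matching words in an item reaches matchingThreshold. A minimal standalone sketch of that scheme, with made-up sample words (not taken from the repository's files/):

import re

def load_category(words):
    # Compile a word list into one alternation pattern, as reload_categ_words() does per file.
    patterns = [re.escape(w.strip()) for w in words if w.strip()]
    return re.compile('|'.join(patterns), re.IGNORECASE)

def count_matches(pattern, content):
    # Count distinct matching words; Categ compares this count against matchingThreshold.
    return len(set(re.findall(pattern, content)))

if __name__ == '__main__':
    credit_cards = load_category(['visa', 'mastercard', 'cvv'])  # stand-in for a files/CreditCards list
    content = 'dump sample: VISA 4111111111111111, cvv included'
    matching_threshold = 1  # "default = 1 string", per the comment added in __init__
    if count_matches(credit_cards, content) >= matching_threshold:
        print('item matches category CreditCards')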
@@ -42,16 +35,13 @@ Requirements
 ##################################
 import os
 import argparse
-import time
 import re
 
 ##################################
 # Import Project packages
 ##################################
 from module.abstract_module import AbstractModule
-from pubsublogger import publisher
-from packages import Paste
-from Helper import Process
+from packages.Item import Item
 
 
 class Categ(AbstractModule):
@@ -59,73 +49,66 @@ class Categ(AbstractModule):
     Categ module for AIL framework
     """
 
-    def __init__(self):
+    def __init__(self, categ_files_dir='../files/'):
         """
         Init Categ
         """
         super(Categ, self).__init__()
 
+        self.categ_files_dir = categ_files_dir
+
+        # default = 1 string
         self.matchingThreshold = self.process.config.getint("Categ", "matchingThreshold")
 
-        # SCRIPT PARSER #
-        parser = argparse.ArgumentParser(description='Start Categ module on files.')
-
-        parser.add_argument(
-            '-d', type=str, default="../files/",
-            help='Path to the directory containing the category files.',
-            action='store')
-
-        args = parser.parse_args()
+        self.reload_categ_words()
 
         self.redis_logger.info("Script Categ started")
 
+    # # TODO: trigger reload on change ( save last reload time, ...)
+    def reload_categ_words(self):
         categories = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve', 'ApiKey']
         tmp_dict = {}
         for filename in categories:
             bname = os.path.basename(filename)
             tmp_dict[bname] = []
-            with open(os.path.join(args.d, filename), 'r') as f:
+            with open(os.path.join(self.categ_files_dir, filename), 'r') as f:
                 patterns = [r'%s' % ( re.escape(s.strip()) ) for s in f]
                 tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE)
+        self.categ_words = tmp_dict.items()
 
-        self.categ_items = tmp_dict.items()
-
-        prec_filename = None
-
-    def compute(self, message):
-        # Cast message as paste
-        paste = Paste.Paste(message)
-        # Get paste content
-        content = paste.get_p_content()
-
-        # init categories found
-        is_categ_found = False
-
-        # Search for pattern categories in paste content
-        for categ, pattern in self.categ_items:
+    def compute(self, message, r_result=False):
+        # Create Item Object
+        item = Item(message)
+        # Get item content
+        content = item.get_content()
+        categ_found = []
+
+        # Search for pattern categories in item content
+        for categ, pattern in self.categ_words:
 
             found = set(re.findall(pattern, content))
             lenfound = len(found)
             if lenfound >= self.matchingThreshold:
-                is_categ_found = True
-                msg = '{} {}'.format(paste.p_rel_path, lenfound)
-
-                self.redis_logger.debug('%s;%s %s'%(self.module_name, msg, categ))
+                categ_found.append(categ)
+                msg = f'{item.get_id()} {lenfound}'
 
                 # Export message to categ queue
-                self.process.populate_set_out(msg, categ)
+                print(msg, categ)
+                self.send_message_to_queue(categ, msg)
 
                 self.redis_logger.info(
-                    'Categ;{};{};{};Detected {} as {};{}'.format(
-                        paste.p_source, paste.p_date, paste.p_name,
-                        lenfound, categ, paste.p_rel_path))
+                    f'Categ;{item.get_source()};{item.get_date()};{item.get_basename()};Detected {lenfound} as {categ};{item.get_id()}')
 
-        if not is_categ_found:
-            self.redis_logger.debug('No %s found in this paste: %s'%(self.module_name, paste.p_name))
+        if r_result:
+            return categ_found
 
 
 if __name__ == '__main__':
 
-    module = Categ()
+    # SCRIPT PARSER #
+    parser = argparse.ArgumentParser(description='Start Categ module on files.')
+    parser.add_argument(
+        '-d', type=str, default="../files/",
+        help='Path to the directory containing the category files.',
+        action='store')
+    args = parser.parse_args()
+
+    module = Categ(categ_files_dir=args.d)
     module.run()
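Net effect of the hunk above: the argparse plumbing moves out of __init__ into the __main__ block, the word lists are compiled once in reload_categ_words(), and compute() now works on an Item and forwards its matches with send_message_to_queue() instead of populate_set_out(). A hedged usage sketch, assuming a configured AIL environment (AIL_BIN on sys.path, Redis running, word files present in files/); the item id is the one the new test below uses:

from Categ import Categ

# Word lists are read from categ_files_dir and compiled once at construction time.
categ = Categ(categ_files_dir='../files/')

# With r_result=True, compute() also returns the list of matched categories,
# in addition to forwarding '<item_id> <match count>' to each category queue.
found = categ.compute('tests/2021/01/01/categ.gz', r_result=True)
print(found)  # e.g. ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve']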
@@ -9,6 +9,7 @@ sys.path.append(os.environ['AIL_BIN'])
 
 # Modules Classes
 from ApiKey import ApiKey
+from Categ import Categ
 from Onion import Onion
 
 # project packages
@@ -25,11 +26,23 @@ class Test_Module_ApiKey(unittest.TestCase):
         aws_access_key = 'AKIAIOSFODNN7EXAMPLE'
         aws_secret_key = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'
 
-        matches = self.module_obj.compute(f'{item_id} 3', r_match=True)
+        matches = self.module_obj.compute(f'{item_id} 3', r_result=True)
         self.assertCountEqual(matches[0], [google_api_key])
         self.assertCountEqual(matches[1], [aws_access_key])
         self.assertCountEqual(matches[2], [aws_secret_key])
 
+class Test_Module_Categ(unittest.TestCase):
+
+    def setUp(self):
+        self.module_obj = Categ()
+
+    def test_module(self):
+        item_id = 'tests/2021/01/01/categ.gz'
+        test_categ = ['CreditCards', 'Mail', 'Onion', 'Web', 'Credential', 'Cve']
+
+        result = self.module_obj.compute(item_id, r_result=True)
+        self.assertCountEqual(result, test_categ)
+
 class Test_Module_Onion(unittest.TestCase):
 
     def setUp(self):
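The new Test_Module_Categ case mirrors the existing ApiKey one: build the module in setUp() and call compute() with r_result=True on a fixture item. To run it in isolation with the standard unittest loader (the test file's name is not shown in this diff, so the import below is an assumption):

import unittest

# Hypothetical module name: the diff does not show the test file's path.
from test_modules import Test_Module_Categ

suite = unittest.defaultTestLoader.loadTestsFromTestCase(Test_Module_Categ)
unittest.TextTestRunner(verbosity=2).run(suite)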