From 3fb281f6c37ed038cfcf209b7ff83d3203817850 Mon Sep 17 00:00:00 2001
From: terrtia
Date: Fri, 11 Oct 2024 14:36:02 +0200
Subject: [PATCH] chg: [module] add CEDetector

---
 bin/lib/objects/Titles.py                    |   1 +
 bin/lib/objects/abstract_daterange_object.py |  10 ++
 bin/modules/CEDetector.py                    | 122 +++++++++++++++++++
 configs/modules.cfg                          |   8 +-
 4 files changed, 140 insertions(+), 1 deletion(-)
 create mode 100755 bin/modules/CEDetector.py

diff --git a/bin/lib/objects/Titles.py b/bin/lib/objects/Titles.py
index f9e0064b..75a7ece9 100755
--- a/bin/lib/objects/Titles.py
+++ b/bin/lib/objects/Titles.py
@@ -96,6 +96,7 @@ def create_title(content):
     title.create(content)
     return title
 
+
 class Titles(AbstractDaterangeObjects):
     """
     Titles Objects
diff --git a/bin/lib/objects/abstract_daterange_object.py b/bin/lib/objects/abstract_daterange_object.py
index c6f2d1aa..519328a7 100755
--- a/bin/lib/objects/abstract_daterange_object.py
+++ b/bin/lib/objects/abstract_daterange_object.py
@@ -193,9 +193,19 @@ class AbstractDaterangeObjects(ABC):
         self.type = obj_type
         self.obj_class = obj_class
 
+    ################################################
+    ################################################
+
     def get_ids(self):
         return r_object.smembers(f'{self.type}:all')
 
+    def get_iterator(self):
+        for obj_id in self.get_ids():
+            yield self.obj_class(obj_id)
+
+    ################################################
+    ################################################
+
     # def get_ids_iterator(self):
     #     return r_object.sscan_iter(r_object, f'{self.type}:all')
 
diff --git a/bin/modules/CEDetector.py b/bin/modules/CEDetector.py
new file mode 100755
index 00000000..b8f25174
--- /dev/null
+++ b/bin/modules/CEDetector.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""
+The CEDetector Module
+============================
+
+This module checks the content of objects (e.g. page titles) against child
+sexual exploitation word lists and tags the objects that match.
+
+Requirements
+------------
+
+*Need running Redis instances. (Redis)
+
+"""
+import os
+import sys
+
+from textblob import TextBlob
+from nltk.tokenize import RegexpTokenizer
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from modules.abstract_module import AbstractModule
+from lib.ConfigLoader import ConfigLoader
+
+class CEDetector(AbstractModule):
+    """docstring for CEDetector module."""
+
+    def __init__(self, queue=True):
+        super(CEDetector, self).__init__(queue=queue)
+
+        config_loader = ConfigLoader()
+        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
+
+        self.csam_words = self.load_words_file('csam_words')
+        self.child_words = self.load_words_file('child_words')
+        self.porn_words = self.load_words_file('porn_words')
+
+        self.ce_tag = 'dark-web:topic="pornography-child-exploitation"'
+        self.tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\//\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
+                                         gaps=True, discard_empty=True)
+
+    def load_words_file(self, path):
+        words = set()
+        try:
+            with open(os.path.join(os.environ['AIL_HOME'], f'files/{path}')) as f:
+                content = f.read()
+        except FileNotFoundError:
+            content = ''
+        content = content.splitlines()
+        for line in content:
+            if line.startswith('#') or not line:
+                continue
+            word = line.split()
+            if word:
+                words.add(word[0])
+        return words
+
+    def compute(self, message):  # TODO LIMIT TO DARKWEB ???
+        to_tag = False
+        content = self.obj.get_content().lower()
+        # print(content)
+
+        is_csam = False
+        is_child_word = False
+        is_porn_word = False
+        words = TextBlob(content, tokenizer=self.tokenizer).tokens
+        words = set(words)
+
+        for word in words:
+            # print(word)
+            if word in self.csam_words:
+                is_csam = True
+            if word in self.child_words:
+                is_child_word = True
+            if word in self.porn_words:
+                is_porn_word = True
+            # PERF ???
+            # if is_child_word and is_porn_word:
+            #     break
+
+        if is_csam:
+            to_tag = True
+        if is_child_word and is_porn_word:
+            to_tag = True
+
+        if to_tag:
+            # print(f'{content} DETECTED')
+            # print()
+            self.add_message_to_queue(message=self.ce_tag, queue='Tags')
+
+        return to_tag
+
+def test_detection():
+    from lib import Tag
+    from lib.objects.Domains import Domain
+    from lib.objects.Titles import Title
+
+    not_detected = set()
+    tag = 'dark-web:topic="pornography-child-exploitation"'
+    tag_key = f'domain::{tag}'
+    for domain in Tag.get_obj_by_tag(tag_key):
+        dom = Domain(domain)
+        is_detected = False
+        for h in dom.get_correlation('title').get('title', []):
+            t = Title(h[1:])
+            # compute() reads the content from self.obj, so pass the Title object
+            module.obj = t
+            if module.compute(''):
+                is_detected = True
+        if not is_detected:
+            not_detected.add(domain)
+    print(not_detected)
+
+
+if __name__ == "__main__":
+    module = CEDetector()
+    module.run()
+    # test_detection()
diff --git a/configs/modules.cfg b/configs/modules.cfg
index a559f626..1639c288 100644
--- a/configs/modules.cfg
+++ b/configs/modules.cfg
@@ -1,7 +1,7 @@
 ######## IMPORTERS ########
 
 [Crawler]
-publish = Importers,Tags,Images
+publish = Importers,Tags,Images,Titles
 
 [ZMQModuleImporter]
 publish = Importers
@@ -172,6 +172,12 @@ publish = Item
 subscribe = Images
 publish = Item,Tags
 
+######## TITLES ########
+
+[CEDetector]
+subscribe = Titles
+publish = Tags
+
 ######## CORE ########
 
 [Tags]
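
Note: below is a minimal, standalone sketch of the tagging heuristic implemented
in CEDetector.compute() above: tokenize the content, then flag it when a CSAM
term is found, or when a child-related term and a pornography-related term occur
together. The word lists here are hypothetical placeholders (the real lists are
loaded from files/csam_words, files/child_words and files/porn_words), the
tokenizer is used directly with a simplified pattern instead of through
TextBlob, and the sketch returns a boolean instead of pushing the dark-web tag
to the Tags queue.

    # Sketch only: placeholder word lists and a simplified tokenizer pattern.
    from nltk.tokenize import RegexpTokenizer

    CSAM_WORDS = {'examplecsamterm'}       # hypothetical placeholder
    CHILD_WORDS = {'child', 'preteen'}     # hypothetical placeholder
    PORN_WORDS = {'porn', 'xxx'}           # hypothetical placeholder

    # gaps=True: the pattern describes the separators, not the tokens
    tokenizer = RegexpTokenizer(r'[\W_]+', gaps=True, discard_empty=True)

    def is_ce_content(content: str) -> bool:
        words = set(tokenizer.tokenize(content.lower()))
        if words & CSAM_WORDS:
            return True
        return bool(words & CHILD_WORDS) and bool(words & PORN_WORDS)

    print(is_ce_content('preteen xxx collection'))  # True with these placeholder lists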