From 3fb281f6c37ed038cfcf209b7ff83d3203817850 Mon Sep 17 00:00:00 2001
From: terrtia
Date: Fri, 11 Oct 2024 14:36:02 +0200
Subject: [PATCH] chg: [module] add CEDetector

---
 bin/lib/objects/Titles.py                    |   1 +
 bin/lib/objects/abstract_daterange_object.py |  10 ++
 bin/modules/CEDetector.py                    | 122 +++++++++++++++++++
 configs/modules.cfg                          |   8 +-
 4 files changed, 140 insertions(+), 1 deletion(-)
 create mode 100755 bin/modules/CEDetector.py

diff --git a/bin/lib/objects/Titles.py b/bin/lib/objects/Titles.py
index f9e0064b..75a7ece9 100755
--- a/bin/lib/objects/Titles.py
+++ b/bin/lib/objects/Titles.py
@@ -96,6 +96,7 @@ def create_title(content):
     title.create(content)
     return title
 
+
 class Titles(AbstractDaterangeObjects):
     """
     Titles Objects
diff --git a/bin/lib/objects/abstract_daterange_object.py b/bin/lib/objects/abstract_daterange_object.py
index c6f2d1aa..519328a7 100755
--- a/bin/lib/objects/abstract_daterange_object.py
+++ b/bin/lib/objects/abstract_daterange_object.py
@@ -193,9 +193,19 @@ class AbstractDaterangeObjects(ABC):
         self.type = obj_type
         self.obj_class = obj_class
 
+    ################################################
+    ################################################
+
     def get_ids(self):
         return r_object.smembers(f'{self.type}:all')
 
+    def get_iterator(self):
+        for obj_id in self.get_ids():
+            yield self.obj_class(obj_id)
+
+    ################################################
+    ################################################
+
     # def get_ids_iterator(self):
     #     return r_object.sscan_iter(r_object, f'{self.type}:all')
 
diff --git a/bin/modules/CEDetector.py b/bin/modules/CEDetector.py
new file mode 100755
index 00000000..b8f25174
--- /dev/null
+++ b/bin/modules/CEDetector.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""
+The CEDetector Module
+============================
+
+This module checks the content of objects (e.g. page titles) against child
+sexual exploitation word lists and tags the objects that match.
+
+Requirements
+------------
+
+*Need running Redis instances. (Redis)
+
+"""
+import os
+import sys
+
+from textblob import TextBlob
+from nltk.tokenize import RegexpTokenizer
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from modules.abstract_module import AbstractModule
+from lib.ConfigLoader import ConfigLoader
+
+class CEDetector(AbstractModule):
+    """docstring for CEDetector module."""
+
+    def __init__(self, queue=True):
+        super(CEDetector, self).__init__(queue=queue)
+
+        config_loader = ConfigLoader()
+        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
+
+        self.csam_words = self.load_words_file('csam_words')
+        self.child_words = self.load_words_file('child_words')
+        self.porn_words = self.load_words_file('porn_words')
+
+        self.ce_tag = 'dark-web:topic="pornography-child-exploitation"'
+        self.tokenizer = RegexpTokenizer('[\&\~\:\;\,\.\(\)\{\}\|\[\]\\\\//\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
+                                         gaps=True, discard_empty=True)
+
+    def load_words_file(self, path):
+        words = set()
+        try:
+            with open(os.path.join(os.environ['AIL_HOME'], f'files/{path}')) as f:
+                content = f.read()
+        except FileNotFoundError:
+            content = ''
+        content = content.splitlines()
+        for line in content:
+            if line.startswith('#') or not line:
+                continue
+            word = line.split()
+            if word:
+                words.add(word[0])
+        return words
+
+    def compute(self, message):  # TODO LIMIT TO DARKWEB ???
+        to_tag = False
+        content = self.obj.get_content().lower()
+        # print(content)
+
+        is_csam = False
+        is_child_word = False
+        is_porn_word = False
+        words = TextBlob(content, tokenizer=self.tokenizer).tokens
+        words = set(words)
+
+        for word in words:
+            # print(word)
+            if word in self.csam_words:
+                is_csam = True
+            if word in self.child_words:
+                is_child_word = True
+            if word in self.porn_words:
+                is_porn_word = True
+            # PERF ???
+            # if is_child_word and is_porn_word:
+            #     break
+
+        if is_csam:
+            to_tag = True
+        if is_child_word and is_porn_word:
+            to_tag = True
+
+        if to_tag:
+            # print(f'{content} DETECTED')
+            # print()
+            self.add_message_to_queue(message=self.ce_tag, queue='Tags')
+
+        return to_tag
+
+def test_detection():
+    from lib import Tag
+    from lib.objects.Domains import Domain
+    from lib.objects.Titles import Title
+
+    not_detected = set()
+    tag = 'dark-web:topic="pornography-child-exploitation"'
+    tag_key = f'domain::{tag}'
+    for domain in Tag.get_obj_by_tag(tag_key):
+        dom = Domain(domain)
+        is_detected = False
+        for h in dom.get_correlation('title').get('title', []):
+            t = Title(h[1:])
+            # compute() reads the content from self.obj, so pass the Title object
+            module.obj = t
+            if module.compute(''):
+                is_detected = True
+        if not is_detected:
+            not_detected.add(domain)
+    print(not_detected)
+
+
+if __name__ == "__main__":
+    module = CEDetector()
+    module.run()
+    # test_detection()
diff --git a/configs/modules.cfg b/configs/modules.cfg
index a559f626..1639c288 100644
--- a/configs/modules.cfg
+++ b/configs/modules.cfg
@@ -1,7 +1,7 @@
 ######## IMPORTERS ########
 
 [Crawler]
-publish = Importers,Tags,Images
+publish = Importers,Tags,Images,Titles
 
 [ZMQModuleImporter]
 publish = Importers
@@ -172,6 +172,12 @@ publish = Item
 subscribe = Images
 publish = Item,Tags
 
+######## TITLES ########
+
+[CEDetector]
+subscribe = Titles
+publish = Tags
+
 ######## CORE ########
 
 [Tags]
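
Note: below is a minimal, standalone sketch of the tagging heuristic implemented
in CEDetector.compute() above: tokenize the content, then flag it when a CSAM
term is found, or when a child-related term and a pornography-related term occur
together. The word lists here are hypothetical placeholders (the real lists are
loaded from files/csam_words, files/child_words and files/porn_words), the
tokenizer is used directly with a simplified pattern instead of through
TextBlob, and the sketch returns a boolean instead of pushing the dark-web tag
to the Tags queue.

    # Sketch only: placeholder word lists and a simplified tokenizer pattern.
    from nltk.tokenize import RegexpTokenizer

    CSAM_WORDS = {'examplecsamterm'}       # hypothetical placeholder
    CHILD_WORDS = {'child', 'preteen'}     # hypothetical placeholder
    PORN_WORDS = {'porn', 'xxx'}           # hypothetical placeholder

    # gaps=True: the pattern describes the separators, not the tokens
    tokenizer = RegexpTokenizer(r'[\W_]+', gaps=True, discard_empty=True)

    def is_ce_content(content: str) -> bool:
        words = set(tokenizer.tokenize(content.lower()))
        if words & CSAM_WORDS:
            return True
        return bool(words & CHILD_WORDS) and bool(words & PORN_WORDS)

    print(is_ce_content('preteen xxx collection'))  # True with these placeholder lists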