mirror of https://github.com/CIRCL/AIL-framework
chg: [module] add CEDetector
parent
2ead8c21aa
commit
3fb281f6c3
|
@ -96,6 +96,7 @@ def create_title(content):
|
||||||
title.create(content)
|
title.create(content)
|
||||||
return title
|
return title
|
||||||
|
|
||||||
|
|
||||||
class Titles(AbstractDaterangeObjects):
|
class Titles(AbstractDaterangeObjects):
|
||||||
"""
|
"""
|
||||||
Titles Objects
|
Titles Objects
|
||||||
|
|
|
@ -193,9 +193,19 @@ class AbstractDaterangeObjects(ABC):
|
||||||
self.type = obj_type
|
self.type = obj_type
|
||||||
self.obj_class = obj_class
|
self.obj_class = obj_class
|
||||||
|
|
||||||
|
################################################
|
||||||
|
################################################
|
||||||
|
|
||||||
def get_ids(self):
|
def get_ids(self):
|
||||||
return r_object.smembers(f'{self.type}:all')
|
return r_object.smembers(f'{self.type}:all')
|
||||||
|
|
||||||
|
def get_iterator(self):
    """Lazily yield one ``obj_class`` instance for every id returned by ``get_ids()``."""
    all_ids = self.get_ids()
    yield from (self.obj_class(identifier) for identifier in all_ids)
|
||||||
|
|
||||||
|
################################################
|
||||||
|
################################################
|
||||||
|
|
||||||
# def get_ids_iterator(self):
|
# def get_ids_iterator(self):
|
||||||
# return r_object.sscan_iter(r_object, f'{self.type}:all')
|
# return r_object.sscan_iter(r_object, f'{self.type}:all')
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*-coding:UTF-8 -*
|
||||||
|
"""
|
||||||
|
The CEDetector Module
|
||||||
|
============================
|
||||||
|
|
||||||
|
This module tokenizes the content of the objects it receives and matches the
|
||||||
|
tokens against word lists. Matching objects are tagged as child exploitation through the Tags queue.
|
||||||
|
|
||||||
|
Requirements
|
||||||
|
------------
|
||||||
|
|
||||||
|
*Need running Redis instances. (Redis)
|
||||||
|
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from textblob import TextBlob
|
||||||
|
from nltk.tokenize import RegexpTokenizer
|
||||||
|
|
||||||
|
sys.path.append(os.environ['AIL_BIN'])
|
||||||
|
##################################
|
||||||
|
# Import Project packages
|
||||||
|
##################################
|
||||||
|
from modules.abstract_module import AbstractModule
|
||||||
|
from lib.ConfigLoader import ConfigLoader
|
||||||
|
|
||||||
|
class CEDetector(AbstractModule):
|
||||||
|
"""docstring for Onion module."""
|
||||||
|
|
||||||
|
def __init__(self, queue=True):
    """Set up the Redis cache connection, the word lists and the tokenizer.

    :param queue: forwarded unchanged to the parent ``__init__``.
    """
    super(CEDetector, self).__init__(queue=queue)

    config_loader = ConfigLoader()
    self.r_cache = config_loader.get_redis_conn("Redis_Cache")

    # One set of words per list; compute() tags on a CSAM word alone,
    # or on a child word together with a porn word.
    self.csam_words = self.load_world_file('csam_words')
    self.child_worlds = self.load_world_file('child_words')
    self.porn_worlds = self.load_world_file('porn_words')

    self.ce_tag = 'dark-web:topic="pornography-child-exploitation"'

    # Raw string: the previous non-raw literal relied on invalid escape
    # sequences (\&, \~, \s, ...), which emit SyntaxWarning on recent Python
    # versions. The character class matches exactly the same separators.
    # gaps=True means the pattern describes the separators, not the tokens.
    self.tokenizer = RegexpTokenizer(
        r'[\&\~\:\;\,\.\(\)\{\}\|\[\]\\//\=\'\"\%\$\?\@\+\#\_\^\<\>\!\*\n\r\t\s]+',
        gaps=True, discard_empty=True)
|
||||||
|
|
||||||
|
def load_world_file(self, path):
    """Load a word list from ``$AIL_HOME/files/<path>``.

    The first whitespace-separated field of every line is kept. Blank lines
    and lines whose first field starts with ``#`` are ignored. Words are
    lower-cased because ``compute`` lower-cases the content before matching.

    :param path: file name relative to ``$AIL_HOME/files``.
    :return: set of lower-cased words; empty if the file does not exist.
    :raises KeyError: if the ``AIL_HOME`` environment variable is not set.
    """
    file_path = os.path.join(os.environ['AIL_HOME'], f'files/{path}')
    try:
        # Explicit encoding so the result does not depend on the host locale.
        with open(file_path, encoding='utf-8') as f:
            lines = f.read().splitlines()
    except FileNotFoundError:
        # A missing list is not an error: that category simply never matches.
        return set()

    words = set()
    for line in lines:
        fields = line.split()
        # Skip blank lines and comments (including indented ones).
        if not fields or fields[0].startswith('#'):
            continue
        words.add(fields[0].lower())
    return words
|
||||||
|
|
||||||
|
def compute(self, message):  # TODO LIMIT TO DARKWEB ???
    """Tag the current object if its content matches the word lists.

    The content of ``self.obj`` is lower-cased and tokenized. The object is
    tagged when a token is in the CSAM list, or when one token is in the
    child list and one token is in the porn list.

    :param message: queue message; not used by the detection itself.
    :return: True if ``self.ce_tag`` was sent to the ``Tags`` queue, else False.
    """
    content = self.obj.get_content().lower()
    # Call the tokenizer directly instead of building a TextBlob just to read
    # its tokens; a set is enough because only membership matters.
    words = set(self.tokenizer.tokenize(content))

    is_csam = not words.isdisjoint(self.csam_words)
    is_child_word = not words.isdisjoint(self.child_worlds)
    is_porn_world = not words.isdisjoint(self.porn_worlds)

    to_tag = is_csam or (is_child_word and is_porn_world)
    if to_tag:
        self.add_message_to_queue(message=self.ce_tag, queue='Tags')

    return to_tag
|
||||||
|
|
||||||
|
def test_detection():
    """Manual check: print the tagged domains that the detector misses.

    Iterates over every domain carrying the child-exploitation tag, runs
    ``compute`` on each correlated title and prints the set of domains for
    which no title was detected.

    Relies on the module-level ``module`` instance created in the
    ``__main__`` guard. ``compute`` still pushes a tag message to the
    ``Tags`` queue for every detected title.
    """
    from lib import Tag
    from lib.objects.Domains import Domain
    from lib.objects.Titles import Title

    not_detected = set()
    tag = 'dark-web:topic="pornography-child-exploitation"'
    tag_key = f'domain::{tag}'
    for domain in Tag.get_obj_by_tag(tag_key):
        dom = Domain(domain)
        is_detected = False
        for h in dom.get_correlation('title').get('title', []):
            # h[1:] drops the first character of the correlation entry,
            # presumably a separator before the title id -- TODO confirm.
            # compute() calls self.obj.get_content(), so it needs the Title
            # object itself, not the content string.
            module.obj = Title(h[1:])
            if module.compute(''):
                is_detected = True
        if not is_detected:
            not_detected.add(domain)
    print(not_detected)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # `module` is deliberately a module-level global: test_detection() reads it.
    module = CEDetector()
    module.run()
    # test_detection()
|
|
@ -1,7 +1,7 @@
|
||||||
######## IMPORTERS ########
|
######## IMPORTERS ########
|
||||||
|
|
||||||
[Crawler]
|
[Crawler]
|
||||||
publish = Importers,Tags,Images
|
publish = Importers,Tags,Images,Titles
|
||||||
|
|
||||||
[ZMQModuleImporter]
|
[ZMQModuleImporter]
|
||||||
publish = Importers
|
publish = Importers
|
||||||
|
@ -172,6 +172,12 @@ publish = Item
|
||||||
subscribe = Images
|
subscribe = Images
|
||||||
publish = Item,Tags
|
publish = Item,Tags
|
||||||
|
|
||||||
|
######## TITLES ########
|
||||||
|
|
||||||
|
[CEDetector]
|
||||||
|
subscribe = Titles
|
||||||
|
publish = Tags
|
||||||
|
|
||||||
######## CORE ########
|
######## CORE ########
|
||||||
|
|
||||||
[Tags]
|
[Tags]
|
||||||
|
|
Loading…
Reference in New Issue