mirror of https://github.com/CIRCL/AIL-framework
113 lines
4.6 KiB
Python
Executable File
113 lines
4.6 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*-coding:UTF-8 -*
|
|
"""
|
|
The Onion Module
|
|
============================
|
|
|
|
This module extracts URLs from items and returns only the ones which are Tor
|
|
related (.onion). All these URLs are sent to the crawler discovery queue.
|
|
|
|
Requirements
|
|
------------
|
|
|
|
*Need running Redis instances. (Redis)
|
|
|
|
"""
|
|
import os
|
|
import sys
|
|
import re
|
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
##################################
|
|
# Import Project packages
|
|
##################################
|
|
from modules.abstract_module import AbstractModule
|
|
from lib.ConfigLoader import ConfigLoader
|
|
from lib.objects.Items import Item
|
|
from lib import crawlers
|
|
|
|
class Onion(AbstractModule):
    """Detect .onion URLs in an object's content.

    For every object processed, the content is scanned with ``onion_regex``.
    Domains accepted by ``crawlers.is_valid_onion_domain`` are either turned
    into crawler tasks (when the crawler is activated) or reported through
    the redis logger, and the object is tagged
    ``infoleak:automatic-detection="onion"``.
    """

    def __init__(self, queue=True):
        """Load the configuration and prepare the onion URL pattern.

        Args:
            queue: forwarded unchanged to ``AbstractModule.__init__``.
        """
        super().__init__(queue=queue)

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        self.pending_seconds = config_loader.get_config_int("Onion", "max_execution_time")
        # regex timeout
        # NOTE(review): presumably seconds, consumed by the inherited regex
        # helpers -- confirm against AbstractModule.
        self.regex_timeout = 30

        self.faup = crawlers.get_faup()

        # activate_crawler = p.config.get("Crawler", "activate_crawler")
        # Default capture options passed to every crawler task created below.
        self.har = config_loader.get_config_boolean('Crawler', 'default_har')
        self.screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')

        # Besides *.onion hosts, the pattern also matches dotted IPv4
        # addresses and "localhost"; callers filter matches afterwards with
        # crawlers.is_valid_onion_domain().
        self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
        # self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
        # The compiled object is discarded on purpose: the call only makes an
        # invalid pattern fail at start-up (re.error) instead of at first use.
        re.compile(self.onion_regex)
        # re.compile(self.i2p_regex)

        self.logger.info(f"Module: {self.module_name} Launched")

        # TEMP var: SAVE I2P Domain (future I2P crawler)
        # self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p")

    def extract(self, obj_id, content, tag):
        """Locate valid onion URLs inside ``content``.

        Args:
            obj_id: identifier passed through to ``self.regex_finditer``.
            content: text to scan.
            tag: tag name embedded in each result as ``'tag:<tag>'``.

        Returns:
            A list of ``[start, end, value, 'tag:<tag>']`` entries, one per
            regex match whose domain passes
            ``crawlers.is_valid_onion_domain``.
        """
        extracted = []
        onions = self.regex_finditer(self.onion_regex, obj_id, content)
        for start, end, value in onions:
            url_unpack = crawlers.unpack_url(value)
            domain = url_unpack['domain']
            if crawlers.is_valid_onion_domain(domain):
                extracted.append([start, end, value, f'tag:{tag}'])
        return extracted

    def compute(self, message):
        """Process the current object: find onion domains, queue or log them, tag it.

        Args:
            message: queue message for the current object. It is not used by
                this method (the previous code only bound it to an unused
                local named ``score``).
        """
        item = self.get_obj()
        item_content = item.get_content()

        domains = []
        # max execution time on regex
        res = self.regex_findall(self.onion_regex, item.get_id(), item_content)
        for x in res:
            # String to tuple
            # NOTE(review): this assumes each result is the string form of a
            # tuple of regex groups, e.g. "('url', 'http', ...)" -- confirm
            # against AbstractModule.regex_findall. Only the first group
            # (the full URL) is used.
            x = x[2:-2].replace(" '", "").split("',")
            url = x[0]
            print(url)

            # TODO Crawl subdomain
            url_unpack = crawlers.unpack_url(url)
            domain = url_unpack['domain']
            if crawlers.is_valid_onion_domain(domain):
                domains.append(domain)

        # Nothing onion-related in this object: no task, no log, no tag.
        if not domains:
            return

        if crawlers.is_crawler_activated():
            for domain in domains:
                task_uuid = crawlers.create_task(domain, parent=item.get_id(), priority=0,
                                                 har=self.har, screenshot=self.screenshot)
                if task_uuid:
                    print(f'{domain} added to crawler queue: {task_uuid}')
        else:
            # Crawler disabled: only report the detection.
            to_print = f'Onion;{item.get_source()};{item.get_date()};{item.get_basename()};'
            detection_msg = f'{to_print}Detected {len(domains)} .onion(s);{self.obj.get_global_id()}'
            print(detection_msg)
            self.redis_logger.warning(detection_msg)

        # TAG Item
        tag = 'infoleak:automatic-detection="onion"'
        self.add_message_to_queue(message=tag, queue='Tags')
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the module, then hand control to its run()
    # method (inherited; not defined in this file).
    onion_module = Onion()
    # Manual debugging example (kept for reference):
    # onion_module.compute('submitted/2022/10/10/submitted_705d1d92-7e9a-4a44-8c21-ccd167bfb7db.gz 9')
    onion_module.run()
|