2018-05-04 13:53:29 +02:00
|
|
|
#!/usr/bin/env python3
|
2014-08-06 11:43:40 +02:00
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
"""
|
2021-05-19 14:54:34 +02:00
|
|
|
The Onion Module
|
2014-08-06 11:43:40 +02:00
|
|
|
============================
|
|
|
|
|
2021-05-19 14:54:34 +02:00
|
|
|
This module extracts URLs from items and keeps only the ones that are Tor
|
|
|
|
related (.onion). All of these URLs are sent to the crawler discovery queue.
|
2014-08-06 11:43:40 +02:00
|
|
|
|
|
|
|
Requirements
|
|
|
|
------------
|
|
|
|
|
|
|
|
* Needs running Redis instances (Redis).
|
|
|
|
|
|
|
|
"""
|
2014-08-31 22:42:12 +02:00
|
|
|
import os
|
2021-05-14 14:42:16 +02:00
|
|
|
import sys
|
2018-08-21 15:54:53 +02:00
|
|
|
import re
|
2014-08-06 11:43:40 +02:00
|
|
|
|
2021-06-02 14:42:23 +02:00
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
|
|
|
from modules.abstract_module import AbstractModule
|
2021-05-14 14:42:16 +02:00
|
|
|
from lib.ConfigLoader import ConfigLoader
|
2022-10-25 16:25:19 +02:00
|
|
|
from lib.objects.Items import Item
|
2021-05-14 14:42:16 +02:00
|
|
|
from lib import crawlers
|
|
|
|
|
|
|
|
class Onion(AbstractModule):
    """Detect Tor hidden-service (.onion) URLs found in items.

    For every item, the content is scanned with ``onion_regex``. Domains
    accepted by ``crawlers.is_valid_onion_domain`` are submitted as crawler
    tasks when the crawler is activated; otherwise the detection is only
    logged. Any item with at least one valid onion URL is tagged
    ``infoleak:automatic-detection="onion"``.
    """

    def __init__(self, queue=True):
        """Load the configuration and prepare the onion URL pattern.

        :param queue: forwarded unchanged to ``AbstractModule.__init__``.
        """
        super().__init__(queue=queue)

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        self.pending_seconds = config_loader.get_config_int("Onion", "max_execution_time")
        # Timeout (presumably seconds -- confirm against the regex helpers)
        self.regex_timeout = 30

        self.faup = crawlers.get_faup()

        # activate_crawler = p.config.get("Crawler", "activate_crawler")
        # Default capture options applied to every crawler task created here.
        self.har = config_loader.get_config_boolean('Crawler', 'default_har')
        self.screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')

        self.onion_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.onion)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
        # self.i2p_regex = r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.i2p)(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"
        # The compiled object is discarded: this call only checks at start-up
        # that the pattern is valid (and lets ``re`` cache it).
        re.compile(self.onion_regex)
        # re.compile(self.i2p_regex)

        self.logger.info(f"Module: {self.module_name} Launched")

        # TEMP var: SAVE I2P Domain (future I2P crawler)
        # self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p")

    def extract(self, obj_id, content, tag):
        """Locate valid onion URLs inside ``content``.

        :param obj_id: object identifier, passed through to ``regex_finditer``.
        :param content: text to scan.
        :param tag: tag name embedded in each result as ``'tag:<tag>'``.
        :return: list of ``[start, end, value, 'tag:<tag>']`` entries, one per
                 match whose domain is a valid onion domain.
        """
        label = f'tag:{tag}'
        return [
            [start, end, value, label]
            for start, end, value in self.regex_finditer(self.onion_regex, obj_id, content)
            if crawlers.is_valid_onion_domain(crawlers.unpack_url(value)['domain'])
        ]

    def compute(self, message):
        """Process one queue message: find onion URLs, crawl or log, then tag.

        :param message: two whitespace-separated fields, ``'<item id> <score>'``;
                        only the item id is used. Any other field count raises
                        ``ValueError`` from the unpacking below.
        """
        item_id, _score = message.split()
        item = Item(item_id)
        content = item.get_content()

        urls = []
        domains = []
        # max execution time on regex
        for raw_match in self.regex_findall(self.onion_regex, item.get_id(), content):
            # String to tuple: each result is handled as the textual form of
            # the regex groups tuple; the first field is the full URL.
            fields = raw_match[2:-2].replace(" '", "").split("',")
            url = fields[0]
            print(url)

            # TODO Crawl subdomain
            domain = crawlers.unpack_url(url)['domain']
            if not crawlers.is_valid_onion_domain(domain):
                continue
            domains.append(domain)
            urls.append(url)

        # Nothing onion-related in this item: no crawl, no log, no tag.
        if not urls:
            return

        if crawlers.is_crawler_activated():
            for domain in domains:
                task_uuid = crawlers.create_task(
                    domain,
                    parent=item.get_id(),
                    priority=0,
                    har=self.har,
                    screenshot=self.screenshot,
                )
                if task_uuid:
                    print(f'{domain} added to crawler queue: {task_uuid}')
        else:
            # Crawler disabled: only report the detection.
            to_print = f'Onion;{item.get_source()};{item.get_date()};{item.get_basename()};'
            report = f'{to_print}Detected {len(domains)} .onion(s);{item.get_id()}'
            print(report)
            self.redis_logger.warning(report)

        # TAG Item
        tag_message = f'infoleak:automatic-detection="onion";{item.get_id()}'
        self.add_message_to_queue(tag_message, 'Tags')
|
2014-08-31 22:42:12 +02:00
|
|
|
|
2021-05-14 14:42:16 +02:00
|
|
|
|
2022-10-25 16:25:19 +02:00
|
|
|
if __name__ == "__main__":
    # Script entry point: build the module and start its processing loop.
    # For manual debugging, a single message can be fed to compute() instead:
    # Onion().compute('submitted/2022/10/10/submitted_705d1d92-7e9a-4a44-8c21-ccd167bfb7db.gz 9')
    Onion().run()
|