mirror of https://github.com/CIRCL/AIL-framework
171 lines
7.0 KiB
Python
Executable File
171 lines
7.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
# -*-coding:UTF-8 -*
|
|
"""
|
|
The Onion Module
|
|
============================
|
|
|
|
This module extracts URLs from items and returns only the ones which are Tor
|
|
related (.onion). All these URLs are sent to the crawler discovery queue.
|
|
|
|
Requirements
|
|
------------
|
|
|
|
*Need running Redis instances. (Redis)
|
|
|
|
"""
|
|
import time
|
|
import datetime
|
|
import os
|
|
import sys
|
|
import re
|
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
##################################
|
|
# Import Project packages
|
|
##################################
|
|
from modules.abstract_module import AbstractModule
|
|
from lib.ConfigLoader import ConfigLoader
|
|
from lib import crawlers
|
|
from lib import regex_helper
|
|
from packages.Item import Item
|
|
|
|
## Manually fetch first page if crawler is disabled
|
|
# import base64
|
|
# import subprocess
|
|
#
|
|
# torclient_host = '127.0.0.1'
|
|
# torclient_port = 9050
|
|
#
|
|
# def fetch(p, r_cache, urls, domains):
|
|
# now = datetime.datetime.now()
|
|
# path = os.path.join('onions', str(now.year).zfill(4),
|
|
# str(now.month).zfill(2),
|
|
# str(now.day).zfill(2),
|
|
# str(int(time.mktime(now.utctimetuple()))))
|
|
# failed = []
|
|
# downloaded = []
|
|
# print('{} Urls to fetch'.format(len(urls)))
|
|
# for url, domain in zip(urls, domains):
|
|
# if r_cache.exists(url) or url in failed:
|
|
# continue
|
|
# to_fetch = base64.standard_b64encode(url.encode('utf8'))
|
|
# print('fetching url: {}'.format(to_fetch))
|
|
# process = subprocess.Popen(["python", './tor_fetcher.py', to_fetch],
|
|
# stdout=subprocess.PIPE)
|
|
# while process.poll() is None:
|
|
# time.sleep(1)
|
|
#
|
|
# if process.returncode == 0:
|
|
# r_cache.setbit(url, 0, 1)
|
|
# r_cache.expire(url, 360000)
|
|
# downloaded.append(url)
|
|
# print('downloaded : {}'.format(downloaded))
|
|
# '''tempfile = process.stdout.read().strip()
|
|
# tempfile = tempfile.decode('utf8')
|
|
# #with open(tempfile, 'r') as f:
|
|
# filename = path + domain + '.gz'
|
|
# fetched = f.read()
|
|
# content = base64.standard_b64decode(fetched)
|
|
# save_path = os.path.join(os.environ['AIL_HOME'],
|
|
# p.config.get("Directories", "pastes"),
|
|
# filename)
|
|
# dirname = os.path.dirname(save_path)
|
|
# if not os.path.exists(dirname):
|
|
# os.makedirs(dirname)
|
|
# with open(save_path, 'w') as ff:
|
|
# ff.write(content)
|
|
# p.populate_set_out(save_path, 'Global')
|
|
# p.populate_set_out(url, 'ValidOnion')
|
|
# p.populate_set_out(fetched, 'FetchedOnion')'''
|
|
# yield url
|
|
# #os.unlink(tempfile)
|
|
# else:
|
|
# r_cache.setbit(url, 0, 0)
|
|
# r_cache.expire(url, 3600)
|
|
# failed.append(url)
|
|
# print('Failed at downloading', url)
|
|
# print(process.stdout.read())
|
|
# print('Failed:', len(failed), 'Downloaded:', len(downloaded))
|
|
|
|
|
|
class Onion(AbstractModule):
    """Detect .onion URLs in items and forward them to the crawler.

    For every message received, the item content is scanned with
    ``url_regex``. Items containing at least one valid onion domain are
    tagged, and the matching URLs are pushed to the crawler discovery queue
    when the crawler is activated.
    """

    def __init__(self):
        super().__init__()

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
        self.r_onion = config_loader.get_redis_conn("ARDB_Onion")

        self.pending_seconds = config_loader.get_config_int("Onion", "max_execution_time")
        # regex timeout
        self.regex_timeout = 30

        self.faup = crawlers.get_faup()
        self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name)

        # activate_crawler = p.config.get("Crawler", "activate_crawler")

        # The onion and i2p patterns only differ by their TLD, so both are
        # assembled from the same head/tail. Raw strings are used so that the
        # regex escapes (``\:``, ``\.``, ``\$`` ...) are not parsed as (invalid)
        # Python string escapes; the resulting pattern text is unchanged.
        url_head = (
            r"((http|https|ftp)?(?:\://)?([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*"
            r"((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\."
            r"(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\."
            r"(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\."
            r"(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])"
            r"|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+"
        )
        url_tail = r"(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?'\\+&%\$#\=~_\-]+))*)"
        self.url_regex = url_head + r"\.onion)" + url_tail
        self.i2p_regex = url_head + r"\.i2p)" + url_tail
        # The compiled objects are discarded on purpose: these calls only
        # make a malformed pattern fail at start-up (re.error).
        re.compile(self.url_regex)
        re.compile(self.i2p_regex)

        self.redis_logger.info(f"Module: {self.module_name} Launched")

        # TEMP var: SAVE I2P Domain (future I2P crawler)
        self.save_i2p = config_loader.get_config_boolean("Onion", "save_i2p")

    def compute(self, message):
        """Extract onion URLs from one item, tag it and queue the URLs.

        :param message: whitespace separated ``"<item id> <score>"`` string;
            the score is not used here.
        :raises ValueError: if ``message`` does not split into exactly two
            fields.
        """
        # list of tuples: (url, subdomains, domain)
        urls_to_crawl = []

        item_id, _score = message.split()
        item = Item(item_id)
        item_content = item.get_content()

        # max execution time on regex
        res = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content)
        for match in res:
            # String to tuple: each result looks like a stringified tuple of
            # regex groups, so strip "('" / "')" and split on the separators.
            fields = match[2:-2].replace(" '", "").split("',")
            url = fields[0]
            # Field 4 presumably maps to the fifth regex group (the host part),
            # assuming the helper returns re.findall-style tuples.
            subdomain = fields[4].lower()
            self.faup.decode(url)
            url_unpack = self.faup.get()

            raw_domain = url_unpack['domain']
            if raw_domain is None:
                # No domain extracted: nothing to validate. Previously this
                # raised AttributeError from inside the fallback branch.
                continue
            try:  ## TODO: # FIXME: check faup version
                domain = raw_domain.decode().lower()
            except AttributeError:
                # Domain is not bytes-like (no .decode()): use it as is.
                domain = raw_domain.lower()

            if crawlers.is_valid_onion_domain(domain):
                urls_to_crawl.append((url, subdomain, domain))

        to_print = f'Onion;{item.get_source()};{item.get_date()};{item.get_basename()};'
        if not urls_to_crawl:
            self.redis_logger.info(f'{to_print}Onion related;{item.get_id()}')
            return

        # TAG Item
        msg = f'infoleak:automatic-detection="onion";{item.get_id()}'
        self.send_message_to_queue(msg, 'Tags')

        if crawlers.is_crawler_activated():
            for url, subdomain, domain in urls_to_crawl:
                print(f'{domain} added to crawler queue: {url}')
                crawlers.add_item_to_discovery_queue('onion', domain, subdomain, url, item.get_id())
        else:
            print(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
            self.redis_logger.warning(f'{to_print}Detected {len(urls_to_crawl)} .onion(s);{item.get_id()}')
            # keep manual fetcher ????
            ## Manually fetch first page if crawler is disabled
            # for url in fetch(p, r_cache, urls, domains_list):
            #     publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_rel_path))
|
|
|
|
if __name__ == "__main__":
    # Script entry point: instantiate the Onion module and call its run().
    Onion().run()
|