From 0cb7431e10388439877aa5c5c269f27b7eae8157 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 21 Aug 2023 15:49:32 +0200
Subject: [PATCH] chg: [modules] crawl pasties domains

---
 bin/lib/ConfigLoader.py        |   1 +
 bin/lib/regex_helper.py        |  28 +++++++
 bin/modules/Pasties.py         | 144 +++++++++++++++++++++++++++++++++
 bin/modules/Zerobins.py        |  71 ----------------
 bin/modules/abstract_module.py |   3 +
 configs/modules.cfg            |   2 +-
 6 files changed, 177 insertions(+), 72 deletions(-)
 create mode 100755 bin/modules/Pasties.py
 delete mode 100755 bin/modules/Zerobins.py

diff --git a/bin/lib/ConfigLoader.py b/bin/lib/ConfigLoader.py
index 5be8f492..6ecd4b02 100755
--- a/bin/lib/ConfigLoader.py
+++ b/bin/lib/ConfigLoader.py
@@ -83,6 +83,7 @@ class ConfigLoader(object):
         else:
             return []
 
+
 # # # # Directory Config # # # #
 
 config_loader = ConfigLoader()
diff --git a/bin/lib/regex_helper.py b/bin/lib/regex_helper.py
index 41ba4e98..6f877823 100755
--- a/bin/lib/regex_helper.py
+++ b/bin/lib/regex_helper.py
@@ -113,6 +113,34 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30):
         proc.terminate()
         sys.exit(0)
 
+def _regex_match(r_key, regex, content):
+    if re.match(regex, content):
+        r_serv_cache.set(r_key, 1)
+        r_serv_cache.expire(r_key, 360)
+
+def regex_match(r_key, regex, item_id, content, max_time=30):
+    proc = Proc(target=_regex_match, args=(r_key, regex, content))
+    try:
+        proc.start()
+        proc.join(max_time)
+        if proc.is_alive():
+            proc.terminate()
+            # Statistics.incr_module_timeout_statistic(r_key)
+            err_mess = f"{r_key}: processing timeout: {item_id}"
+            logger.info(err_mess)
+            return False
+        else:
+            if r_serv_cache.exists(r_key):
+                r_serv_cache.delete(r_key)
+                return True
+            else:
+                r_serv_cache.delete(r_key)
+                return False
+    except KeyboardInterrupt:
+        print("Caught KeyboardInterrupt, terminating regex worker")
+        proc.terminate()
+        sys.exit(0)
+
 def _regex_search(r_key, regex, content):
     if re.search(regex, content):
         r_serv_cache.set(r_key, 1)
diff --git a/bin/modules/Pasties.py b/bin/modules/Pasties.py
new file mode 100755
index 00000000..ce2eff10
--- /dev/null
+++ b/bin/modules/Pasties.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""
+The Pasties Module
+======================
+This module spots domain-pasties services for further processing
+"""
+
+##################################
+# Import External packages
+##################################
+import os
+import sys
+import time
+
+from pyfaup.faup import Faup
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from modules.abstract_module import AbstractModule
+from lib.ConfigLoader import ConfigLoader
+from lib import crawlers
+
+# TODO add url validator
+
+pasties_blocklist_urls = set()
+pasties_domains = {}
+
+class Pasties(AbstractModule):
+    """
+    Pasties module for AIL framework
+    """
+
+    def __init__(self):
+        super(Pasties, self).__init__()
+        self.faup = Faup()
+
+        config_loader = ConfigLoader()
+        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
+
+        self.pasties = {}
+        self.urls_blocklist = set()
+        self.load_pasties_domains()
+
+        # Send module state to logs
+        self.logger.info(f'Module {self.module_name} initialized')
+
+    def load_pasties_domains(self):
+        self.pasties = {}
+        self.urls_blocklist = set()
+
+        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
+        if os.path.exists(domains_pasties):
+            with open(domains_pasties) as f:
+                for line in f:
+                    url = line.strip()
+                    if url:  # TODO validate line
+                        self.faup.decode(url)
+                        url_decoded = self.faup.get()
+                        host = url_decoded['host']
+                        # if url_decoded.get('port', ''):
+                        #     host = f'{host}:{url_decoded["port"]}'
+                        path = url_decoded.get('resource_path', '')
+                        # print(url_decoded)
+                        if path and path != '/':
+                            if path[-1] != '/':
+                                path = f'{path}/'
+                        else:
+                            path = None
+
+                        if host in self.pasties:
+                            if path:
+                                self.pasties[host].add(path)
+                        else:
+                            if path:
+                                self.pasties[host] = {path}
+                            else:
+                                self.pasties[host] = set()
+
+        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
+        if os.path.exists(url_blocklist):
+            with open(url_blocklist) as f:
+                for line in f:
+                    url = line.strip()
+                    self.faup.decode(url)
+                    url_decoded = self.faup.get()
+                    host = url_decoded['host']
+                    # if url_decoded.get('port', ''):
+                    #     host = f'{host}:{url_decoded["port"]}'
+                    path = url_decoded.get('resource_path', '')
+                    url = f'{host}{path}'
+                    if url_decoded['query_string']:
+                        url = url + url_decoded['query_string']
+                    self.urls_blocklist.add(url)
+
+    def send_to_crawler(self, url, obj_id):
+        if not self.r_cache.exists(f'{self.module_name}:url:{url}'):
+            self.r_cache.set(f'{self.module_name}:url:{url}', int(time.time()))
+            self.r_cache.expire(f'{self.module_name}:url:{url}', 86400)
+            crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)
+
+    def compute(self, message):
+        url, item_id = message.split()
+
+        self.faup.decode(url)
+        url_decoded = self.faup.get()
+        # print(url_decoded)
+        url_host = url_decoded['host']
+        # if url_decoded.get('port', ''):
+        #     url_host = f'{url_host}:{url_decoded["port"]}'
+        path = url_decoded.get('resource_path', '')
+        if url_host in self.pasties:
+            if url.startswith('http://'):
+                if url[7:] in self.urls_blocklist:
+                    return None
+            elif url.startswith('https://'):
+                if url[8:] in self.urls_blocklist:
+                    return None
+            else:
+                if url in self.urls_blocklist:
+                    return None
+
+            if not self.pasties[url_host]:
+                if path and path != '/':
+                    print('send to crawler', url_host, url)
+                    self.send_to_crawler(url, item_id)
+            else:
+                if path.endswith('/'):
+                    path_end = path[:-1]
+                else:
+                    path_end = f'{path}/'
+                for url_path in self.pasties[url_host]:
+                    if path.startswith(url_path):
+                        if url_path != path and url_path != path_end:
+                            print('send to crawler', url_path, url)
+                            self.send_to_crawler(url, item_id)
+                        break
+
+
+if __name__ == '__main__':
+    module = Pasties()
+    module.run()
diff --git a/bin/modules/Zerobins.py b/bin/modules/Zerobins.py
deleted file mode 100755
index f3fcea5a..00000000
--- a/bin/modules/Zerobins.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:UTF-8 -*
-"""
-The Zerobins Module
-======================
-This module spots zerobins-like services for further processing
-"""
-
-##################################
-# Import External packages
-##################################
-import os
-import re
-import sys
-
-sys.path.append(os.environ['AIL_BIN'])
-##################################
-# Import Project packages
-##################################
-from modules.abstract_module import AbstractModule
-from lib import crawlers
-
-
-class Zerobins(AbstractModule):
-    """
-    Zerobins module for AIL framework
-    """
-
-    def __init__(self):
-        super(Zerobins, self).__init__()
-
-        binz = [
-            r'^https:\/\/(zerobin||privatebin)\..*$',  # historical ones
-        ]
-
-        self.regex = re.compile('|'.join(binz))
-
-        # Pending time between two computation (computeNone) in seconds
-        self.pending_seconds = 10
-
-        # Send module state to logs
-        self.logger.info(f'Module {self.module_name} initialized')
-
-    def computeNone(self):
-        """
-        Compute when no message in queue
-        """
-        self.logger.debug("No message in queue")
-
-    def compute(self, message):
-        """
-        Compute a message in queue
-        """
-        url, item_id = message.split()
-
-        # Extract zerobins addresses
-        matching_binz = self.regex_findall(self.regex, item_id, url)
-
-        if len(matching_binz) > 0:
-            for bin_url in matching_binz:
-                print(f'send {bin_url} to crawler')
-                # TODO Change priority ???
-                crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
-                                     parent='manual', priority=60)
-
-        self.logger.debug("Compute message in queue")
-
-
-if __name__ == '__main__':
-    module = Zerobins()
-    module.run()
diff --git a/bin/modules/abstract_module.py b/bin/modules/abstract_module.py
index 0a1a12cd..164e77b3 100644
--- a/bin/modules/abstract_module.py
+++ b/bin/modules/abstract_module.py
@@ -92,6 +92,9 @@ class AbstractModule(ABC):
     def get_available_queues(self):
         return self.queue.get_out_queues()
 
+    def regex_match(self, regex, obj_id, content):
+        return regex_helper.regex_match(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
+
     def regex_search(self, regex, obj_id, content):
         return regex_helper.regex_search(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
 
diff --git a/configs/modules.cfg b/configs/modules.cfg
index b0b1f6df..3ce4f0ae 100644
--- a/configs/modules.cfg
+++ b/configs/modules.cfg
@@ -162,7 +162,7 @@ publish = Importers,Tags
 subscribe = Item
 publish = Tags
 
-[Zerobins]
+[Pasties]
 subscribe = Url
 
 # [My_Module_Name]
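
Note, not part of the patch: a rough standalone sketch of the crawl decision implemented in Pasties.compute(), leaving out the URL blocklist and the Redis dedup cache. The hosts and paths below are made-up examples, not entries shipped in files/domains_pasties.

# Simplified illustration (made-up hosts/paths, blocklist and dedup cache
# omitted) of the crawl decision in Pasties.compute().

def would_crawl(pasties, url_host, path):
    """Return True if a URL on url_host with this resource path would be queued."""
    if url_host not in pasties:
        return False
    if not pasties[url_host]:
        # host listed without a path: crawl everything except the root page
        return bool(path) and path != '/'
    path_end = path[:-1] if path.endswith('/') else f'{path}/'
    for url_path in pasties[url_host]:
        if path.startswith(url_path):
            # crawl sub-paths of a registered path, but not the path itself
            return url_path != path and url_path != path_end
    return False

# entries as load_pasties_domains() would store them (made up)
pasties = {'paste.example.com': set(),      # whole host listed
           'bin.example.org': {'/raw/'}}    # only a sub-path listed

assert would_crawl(pasties, 'paste.example.com', '/abcd') is True
assert would_crawl(pasties, 'paste.example.com', '/') is False
assert would_crawl(pasties, 'bin.example.org', '/raw/xyz') is True
assert would_crawl(pasties, 'bin.example.org', '/raw/') is False
assert would_crawl(pasties, 'other.example.net', '/abcd') is False

In the module itself, URLs that pass this check are deduplicated in Redis for 24 hours and queued with crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id).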
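
Note, not part of the patch: the new AbstractModule.regex_match() wrapper mirrors the existing regex_search(), including the process-isolated timeout, but is backed by re.match() instead of re.search(), so the pattern has to match at the start of the content. A minimal stdlib-only reminder of that difference, with a made-up pattern and contents:

import re

# made-up pattern and contents, for illustration only
pattern = r'https?://paste\.example\.org/'

starts_with_url = 'http://paste.example.org/abcd plus trailing text'
mentions_url = 'see http://paste.example.org/abcd for details'

print(bool(re.match(pattern, starts_with_url)))   # True  (anchored at the start)
print(bool(re.match(pattern, mentions_url)))      # False
print(bool(re.search(pattern, mentions_url)))     # True  (matches anywhere)

Inside a module, both wrappers are called the same way, e.g. self.regex_match(regex, obj_id, content), with the timeout taken from max_execution_time.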