From 0cb7431e10388439877aa5c5c269f27b7eae8157 Mon Sep 17 00:00:00 2001
From: Terrtia
Date: Mon, 21 Aug 2023 15:49:32 +0200
Subject: [PATCH] chg: [modules] crawl pasties domains

---
 bin/lib/ConfigLoader.py        |   1 +
 bin/lib/regex_helper.py        |  28 +++++++
 bin/modules/Pasties.py         | 144 +++++++++++++++++++++++++++++++++
 bin/modules/Zerobins.py        |  71 ----------------
 bin/modules/abstract_module.py |   3 +
 configs/modules.cfg            |   2 +-
 6 files changed, 177 insertions(+), 72 deletions(-)
 create mode 100755 bin/modules/Pasties.py
 delete mode 100755 bin/modules/Zerobins.py

diff --git a/bin/lib/ConfigLoader.py b/bin/lib/ConfigLoader.py
index 5be8f492..6ecd4b02 100755
--- a/bin/lib/ConfigLoader.py
+++ b/bin/lib/ConfigLoader.py
@@ -83,6 +83,7 @@ class ConfigLoader(object):
         else:
             return []
 
+
 # # # # Directory Config # # # #
 
 config_loader = ConfigLoader()
diff --git a/bin/lib/regex_helper.py b/bin/lib/regex_helper.py
index 41ba4e98..6f877823 100755
--- a/bin/lib/regex_helper.py
+++ b/bin/lib/regex_helper.py
@@ -113,6 +113,34 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30):
         proc.terminate()
         sys.exit(0)
 
+def _regex_match(r_key, regex, content):
+    if re.match(regex, content):
+        r_serv_cache.set(r_key, 1)
+        r_serv_cache.expire(r_key, 360)
+
+def regex_match(r_key, regex, item_id, content, max_time=30):
+    proc = Proc(target=_regex_match, args=(r_key, regex, content))
+    try:
+        proc.start()
+        proc.join(max_time)
+        if proc.is_alive():
+            proc.terminate()
+            # Statistics.incr_module_timeout_statistic(r_key)
+            err_mess = f"{r_key}: processing timeout: {item_id}"
+            logger.info(err_mess)
+            return False
+        else:
+            if r_serv_cache.exists(r_key):
+                r_serv_cache.delete(r_key)
+                return True
+            else:
+                r_serv_cache.delete(r_key)
+                return False
+    except KeyboardInterrupt:
+        print("Caught KeyboardInterrupt, terminating regex worker")
+        proc.terminate()
+        sys.exit(0)
+
 def _regex_search(r_key, regex, content):
     if re.search(regex, content):
         r_serv_cache.set(r_key, 1)
diff --git a/bin/modules/Pasties.py b/bin/modules/Pasties.py
new file mode 100755
index 00000000..ce2eff10
--- /dev/null
+++ b/bin/modules/Pasties.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# -*-coding:UTF-8 -*
+"""
+The Pasties Module
+======================
+This module spots domain-pasties services for further processing
+"""
+
+##################################
+# Import External packages
+##################################
+import os
+import sys
+import time
+
+from pyfaup.faup import Faup
+
+sys.path.append(os.environ['AIL_BIN'])
+##################################
+# Import Project packages
+##################################
+from modules.abstract_module import AbstractModule
+from lib.ConfigLoader import ConfigLoader
+from lib import crawlers
+
+# TODO add url validator
+
+pasties_blocklist_urls = set()
+pasties_domains = {}
+
+class Pasties(AbstractModule):
+    """
+    Pasties module for AIL framework
+    """
+
+    def __init__(self):
+        super(Pasties, self).__init__()
+        self.faup = Faup()
+
+        config_loader = ConfigLoader()
+        self.r_cache = config_loader.get_redis_conn("Redis_Cache")
+
+        self.pasties = {}
+        self.urls_blocklist = set()
+        self.load_pasties_domains()
+
+        # Send module state to logs
+        self.logger.info(f'Module {self.module_name} initialized')
+
+    def load_pasties_domains(self):
+        self.pasties = {}
+        self.urls_blocklist = set()
+
+        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
+        if os.path.exists(domains_pasties):
+            with open(domains_pasties) as f:
+                for line in f:
+                    url = line.strip()
+                    if url:  # TODO validate line
+                        self.faup.decode(url)
+                        url_decoded = self.faup.get()
+                        host = url_decoded['host']
+                        # if url_decoded.get('port', ''):
+                        #     host = f'{host}:{url_decoded["port"]}'
+                        path = url_decoded.get('resource_path', '')
+                        # print(url_decoded)
+                        if path and path != '/':
+                            if path[-1] != '/':
+                                path = f'{path}/'
+                        else:
+                            path = None
+
+                        if host in self.pasties:
+                            if path:
+                                self.pasties[host].add(path)
+                        else:
+                            if path:
+                                self.pasties[host] = {path}
+                            else:
+                                self.pasties[host] = set()
+
+        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
+        if os.path.exists(url_blocklist):
+            with open(url_blocklist) as f:
+                for line in f:
+                    url = line.strip()
+                    self.faup.decode(url)
+                    url_decoded = self.faup.get()
+                    host = url_decoded['host']
+                    # if url_decoded.get('port', ''):
+                    #     host = f'{host}:{url_decoded["port"]}'
+                    path = url_decoded.get('resource_path', '')
+                    url = f'{host}{path}'
+                    if url_decoded['query_string']:
+                        url = url + url_decoded['query_string']
+                    self.urls_blocklist.add(url)
+
+    def send_to_crawler(self, url, obj_id):
+        if not self.r_cache.exists(f'{self.module_name}:url:{url}'):
+            self.r_cache.set(f'{self.module_name}:url:{url}', int(time.time()))
+            self.r_cache.expire(f'{self.module_name}:url:{url}', 86400)
+            crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)
+
+    def compute(self, message):
+        url, item_id = message.split()
+
+        self.faup.decode(url)
+        url_decoded = self.faup.get()
+        # print(url_decoded)
+        url_host = url_decoded['host']
+        # if url_decoded.get('port', ''):
+        #     url_host = f'{url_host}:{url_decoded["port"]}'
+        path = url_decoded.get('resource_path', '')
+        if url_host in self.pasties:
+            if url.startswith('http://'):
+                if url[7:] in self.urls_blocklist:
+                    return None
+            elif url.startswith('https://'):
+                if url[8:] in self.urls_blocklist:
+                    return None
+            else:
+                if url in self.urls_blocklist:
+                    return None
+
+            if not self.pasties[url_host]:
+                if path and path != '/':
+                    print('send to crawler', url_host, url)
+                    self.send_to_crawler(url, item_id)
+            else:
+                if path.endswith('/'):
+                    path_end = path[:-1]
+                else:
+                    path_end = f'{path}/'
+                for url_path in self.pasties[url_host]:
+                    if path.startswith(url_path):
+                        if url_path != path and url_path != path_end:
+                            print('send to crawler', url_path, url)
+                            self.send_to_crawler(url, item_id)
+                        break
+
+
+if __name__ == '__main__':
+    module = Pasties()
+    module.run()
diff --git a/bin/modules/Zerobins.py b/bin/modules/Zerobins.py
deleted file mode 100755
index f3fcea5a..00000000
--- a/bin/modules/Zerobins.py
+++ /dev/null
@@ -1,71 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:UTF-8 -*
-"""
-The Zerobins Module
-======================
-This module spots zerobins-like services for further processing
-"""
-
-##################################
-# Import External packages
-##################################
-import os
-import re
-import sys
-
-sys.path.append(os.environ['AIL_BIN'])
-##################################
-# Import Project packages
-##################################
-from modules.abstract_module import AbstractModule
-from lib import crawlers
-
-
-class Zerobins(AbstractModule):
-    """
-    Zerobins module for AIL framework
-    """
-
-    def __init__(self):
-        super(Zerobins, self).__init__()
-
-        binz = [
-            r'^https:\/\/(zerobin||privatebin)\..*$',  # historical ones
-        ]
-
-        self.regex = re.compile('|'.join(binz))
-
-        # Pending time between two computation (computeNone) in seconds
-        self.pending_seconds = 10
-
-        # Send module state to logs
-        self.logger.info(f'Module {self.module_name} initialized')
-
-    def computeNone(self):
-        """
-        Compute when no message in queue
-        """
-        self.logger.debug("No message in queue")
-
-    def compute(self, message):
-        """
-        Compute a message in queue
-        """
-        url, item_id = message.split()
-
-        # Extract zerobins addresses
-        matching_binz = self.regex_findall(self.regex, item_id, url)
-
-        if len(matching_binz) > 0:
-            for bin_url in matching_binz:
-                print(f'send {bin_url} to crawler')
-                # TODO Change priority ???
-                crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
-                                     parent='manual', priority=60)
-
-        self.logger.debug("Compute message in queue")
-
-
-if __name__ == '__main__':
-    module = Zerobins()
-    module.run()
diff --git a/bin/modules/abstract_module.py b/bin/modules/abstract_module.py
index 0a1a12cd..164e77b3 100644
--- a/bin/modules/abstract_module.py
+++ b/bin/modules/abstract_module.py
@@ -92,6 +92,9 @@ class AbstractModule(ABC):
     def get_available_queues(self):
         return self.queue.get_out_queues()
 
+    def regex_match(self, regex, obj_id, content):
+        return regex_helper.regex_match(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
+
     def regex_search(self, regex, obj_id, content):
         return regex_helper.regex_search(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
 
diff --git a/configs/modules.cfg b/configs/modules.cfg
index b0b1f6df..3ce4f0ae 100644
--- a/configs/modules.cfg
+++ b/configs/modules.cfg
@@ -162,7 +162,7 @@ publish = Importers,Tags
 subscribe = Item
 publish = Tags
 
-[Zerobins]
+[Pasties]
 subscribe = Url
 
 # [My_Module_Name]
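
Note, not part of the patch: a rough standalone sketch of the crawl decision implemented in Pasties.compute(), leaving out the URL blocklist and the Redis dedup cache. The hosts and paths below are made-up examples, not entries shipped in files/domains_pasties.

# Simplified illustration (made-up hosts/paths, blocklist and dedup cache
# omitted) of the crawl decision in Pasties.compute().

def would_crawl(pasties, url_host, path):
    """Return True if a URL on url_host with this resource path would be queued."""
    if url_host not in pasties:
        return False
    if not pasties[url_host]:
        # host listed without a path: crawl everything except the root page
        return bool(path) and path != '/'
    path_end = path[:-1] if path.endswith('/') else f'{path}/'
    for url_path in pasties[url_host]:
        if path.startswith(url_path):
            # crawl sub-paths of a registered path, but not the path itself
            return url_path != path and url_path != path_end
    return False

# entries as load_pasties_domains() would store them (made up)
pasties = {'paste.example.com': set(),      # whole host listed
           'bin.example.org': {'/raw/'}}    # only a sub-path listed

assert would_crawl(pasties, 'paste.example.com', '/abcd') is True
assert would_crawl(pasties, 'paste.example.com', '/') is False
assert would_crawl(pasties, 'bin.example.org', '/raw/xyz') is True
assert would_crawl(pasties, 'bin.example.org', '/raw/') is False
assert would_crawl(pasties, 'other.example.net', '/abcd') is False

In the module itself, URLs that pass this check are deduplicated in Redis for 24 hours and queued with crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id).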
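
Note, not part of the patch: the new AbstractModule.regex_match() wrapper mirrors the existing regex_search(), including the process-isolated timeout, but is backed by re.match() instead of re.search(), so the pattern has to match at the start of the content. A minimal stdlib-only reminder of that difference, with a made-up pattern and contents:

import re

# made-up pattern and contents, for illustration only
pattern = r'https?://paste\.example\.org/'

starts_with_url = 'http://paste.example.org/abcd plus trailing text'
mentions_url = 'see http://paste.example.org/abcd for details'

print(bool(re.match(pattern, starts_with_url)))   # True  (anchored at the start)
print(bool(re.match(pattern, mentions_url)))      # False
print(bool(re.search(pattern, mentions_url)))     # True  (matches anywhere)

Inside a module, both wrappers are called the same way, e.g. self.regex_match(regex, obj_id, content), with the timeout taken from max_execution_time.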