mirror of https://github.com/CIRCL/AIL-framework

chg: [modules] crawl pasties domains
parent f05c7b6a93
commit 0cb7431e10
@@ -83,6 +83,7 @@ class ConfigLoader(object):
        else:
            return []


# # # # Directory Config # # # #

config_loader = ConfigLoader()
@@ -113,6 +113,34 @@ def regex_finditer(r_key, regex, item_id, content, max_time=30):
        proc.terminate()
        sys.exit(0)

def _regex_match(r_key, regex, content):
    if re.match(regex, content):
        r_serv_cache.set(r_key, 1)
        r_serv_cache.expire(r_key, 360)

def regex_match(r_key, regex, item_id, content, max_time=30):
    proc = Proc(target=_regex_match, args=(r_key, regex, content))
    try:
        proc.start()
        proc.join(max_time)
        if proc.is_alive():
            proc.terminate()
            # Statistics.incr_module_timeout_statistic(r_key)
            err_mess = f"{r_key}: processing timeout: {item_id}"
            logger.info(err_mess)
            return False
        else:
            if r_serv_cache.exists(r_key):
                r_serv_cache.delete(r_key)
                return True
            else:
                r_serv_cache.delete(r_key)
                return False
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating regex worker")
        proc.terminate()
        sys.exit(0)

def _regex_search(r_key, regex, content):
    if re.search(regex, content):
        r_serv_cache.set(r_key, 1)
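
Note (illustration, not part of this commit): the new regex_match helper reuses the worker-plus-timeout pattern already used by regex_search and regex_finditer: the regex runs in a child process, a hit is signalled through a shared cache key, and the parent treats a timeout as "no match". A minimal standalone sketch of that pattern, using a multiprocessing Manager dict in place of r_serv_cache (all names below are hypothetical):

import re
from multiprocessing import Manager, Process

def _match_worker(cache, key, regex, content):
    # worker: flag a hit through the shared cache, mirroring _regex_match above
    if re.match(regex, content):
        cache[key] = 1

def match_with_timeout(regex, content, max_time=5):
    manager = Manager()
    cache = manager.dict()     # stand-in for the Redis cache shared with the worker
    proc = Process(target=_match_worker, args=(cache, 'demo_key', regex, content))
    proc.start()
    proc.join(max_time)
    if proc.is_alive():        # regex ran too long: kill the worker, report no match
        proc.terminate()
        return False
    return cache.pop('demo_key', None) is not None

print(match_with_timeout(r'https?://paste\.', 'https://paste.example.com/Ab3xYz'))  # True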
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Pasties Module
======================
This module spots domain-pasties services for further processing
"""

##################################
# Import External packages
##################################
import os
import sys
import time

from pyfaup.faup import Faup

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib.ConfigLoader import ConfigLoader
from lib import crawlers

# TODO add url validator

pasties_blocklist_urls = set()
pasties_domains = {}

class Pasties(AbstractModule):
    """
    Pasties module for AIL framework
    """

    def __init__(self):
        super(Pasties, self).__init__()
        self.faup = Faup()

        config_loader = ConfigLoader()
        self.r_cache = config_loader.get_redis_conn("Redis_Cache")

        self.pasties = {}
        self.urls_blocklist = set()
        self.load_pasties_domains()

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def load_pasties_domains(self):
        self.pasties = {}
        self.urls_blocklist = set()

        domains_pasties = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties')
        if os.path.exists(domains_pasties):
            with open(domains_pasties) as f:
                for line in f:
                    url = line.strip()
                    if url:  # TODO validate line
                        self.faup.decode(url)
                        url_decoded = self.faup.get()
                        host = url_decoded['host']
                        # if url_decoded.get('port', ''):
                        #     host = f'{host}:{url_decoded["port"]}'
                        path = url_decoded.get('resource_path', '')
                        # print(url_decoded)
                        if path and path != '/':
                            if path[-1] != '/':
                                path = f'{path}/'
                        else:
                            path = None

                        if host in self.pasties:
                            if path:
                                self.pasties[host].add(path)
                        else:
                            if path:
                                self.pasties[host] = {path}
                            else:
                                self.pasties[host] = set()

        url_blocklist = os.path.join(os.environ['AIL_HOME'], 'files/domains_pasties_blacklist')
        if os.path.exists(url_blocklist):
            with open(url_blocklist) as f:
                for line in f:
                    url = line.strip()
                    self.faup.decode(url)
                    url_decoded = self.faup.get()
                    host = url_decoded['host']
                    # if url_decoded.get('port', ''):
                    #     host = f'{host}:{url_decoded["port"]}'
                    path = url_decoded.get('resource_path', '')
                    url = f'{host}{path}'
                    if url_decoded['query_string']:
                        url = url + url_decoded['query_string']
                    self.urls_blocklist.add(url)

    def send_to_crawler(self, url, obj_id):
        if not self.r_cache.exists(f'{self.module_name}:url:{url}'):
            self.r_cache.set(f'{self.module_name}:url:{url}', int(time.time()))
            self.r_cache.expire(f'{self.module_name}:url:{url}', 86400)
            crawlers.create_task(url, depth=0, har=False, screenshot=False, proxy='force_tor', priority=60, parent=obj_id)

    def compute(self, message):
        url, item_id = message.split()

        self.faup.decode(url)
        url_decoded = self.faup.get()
        # print(url_decoded)
        url_host = url_decoded['host']
        # if url_decoded.get('port', ''):
        #     url_host = f'{url_host}:{url_decoded["port"]}'
        path = url_decoded.get('resource_path', '')
        if url_host in self.pasties:
            if url.startswith('http://'):
                if url[7:] in self.urls_blocklist:
                    return None
            elif url.startswith('https://'):
                if url[8:] in self.urls_blocklist:
                    return None
            else:
                if url in self.urls_blocklist:
                    return None

            if not self.pasties[url_host]:
                if path and path != '/':
                    print('send to crawler', url_host, url)
                    self.send_to_crawler(url, item_id)
            else:
                if path.endswith('/'):
                    path_end = path[:-1]
                else:
                    path_end = f'{path}/'
                for url_path in self.pasties[url_host]:
                    if path.startswith(url_path):
                        if url_path != path and url_path != path_end:
                            print('send to crawler', url_path, url)
                            self.send_to_crawler(url, item_id)
                            break


if __name__ == '__main__':
    module = Pasties()
    module.run()
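
Note (illustration, not part of this commit): the crawl decision in Pasties.compute() is driven by the host/path table built by load_pasties_domains() from files/domains_pasties: an empty path set means any non-root path on that host is treated as a paste, otherwise the path must extend one of the registered prefixes. A self-contained sketch of that lookup logic, with a hard-coded table in place of the Faup-parsed config file (the sample hosts and helper name are hypothetical):

# pasties maps a host to the set of paste path prefixes registered for it;
# an empty set means "any non-root path on this host is a paste"
pasties = {
    'paste.example.com': set(),          # hypothetical: whole host is a paste service
    'example.org': {'/paste/'},          # hypothetical: only /paste/<id> URLs are pastes
}

def should_crawl(host, path):
    if host not in pasties:
        return False
    prefixes = pasties[host]
    if not prefixes:
        return bool(path) and path != '/'
    path_end = path[:-1] if path.endswith('/') else f'{path}/'
    # crawl only if the path extends a registered prefix (not the prefix itself)
    return any(path.startswith(p) and p not in (path, path_end) for p in prefixes)

print(should_crawl('paste.example.com', '/Ab3xYz'))   # True
print(should_crawl('example.org', '/paste/'))         # False: listing page, not a paste
print(should_crawl('example.org', '/paste/Ab3xYz'))   # True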
@@ -1,71 +0,0 @@
#!/usr/bin/env python3
# -*-coding:UTF-8 -*
"""
The Zerobins Module
======================
This module spots zerobins-like services for further processing
"""

##################################
# Import External packages
##################################
import os
import re
import sys

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib import crawlers


class Zerobins(AbstractModule):
    """
    Zerobins module for AIL framework
    """

    def __init__(self):
        super(Zerobins, self).__init__()

        binz = [
            r'^https:\/\/(zerobin||privatebin)\..*$',  # historical ones
            ]

        self.regex = re.compile('|'.join(binz))

        # Pending time between two computation (computeNone) in seconds
        self.pending_seconds = 10

        # Send module state to logs
        self.logger.info(f'Module {self.module_name} initialized')

    def computeNone(self):
        """
        Compute when no message in queue
        """
        self.logger.debug("No message in queue")

    def compute(self, message):
        """
        Compute a message in queue
        """
        url, item_id = message.split()

        # Extract zerobins addresses
        matching_binz = self.regex_findall(self.regex, item_id, url)

        if len(matching_binz) > 0:
            for bin_url in matching_binz:
                print(f'send {bin_url} to crawler')
                # TODO Change priority ???
                crawlers.create_task(bin_url, depth=0, har=False, screenshot=False, proxy='force_tor',
                                     parent='manual', priority=60)

        self.logger.debug("Compute message in queue")


if __name__ == '__main__':
    module = Zerobins()
    module.run()
@@ -92,6 +92,9 @@ class AbstractModule(ABC):
    def get_available_queues(self):
        return self.queue.get_out_queues()

    def regex_match(self, regex, obj_id, content):
        return regex_helper.regex_match(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)

    def regex_search(self, regex, obj_id, content):
        return regex_helper.regex_search(self.r_cache_key, regex, obj_id, content, max_time=self.max_execution_time)
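
Note (illustration, not part of this commit): with this addition a module subclass can call self.regex_match() the same way existing modules call self.regex_search(); both delegate to regex_helper with the module's cache key and execution-time limit. A hypothetical compute() snippet inside such a subclass (the patterns and log messages are made up for the example):

    def compute(self, message):
        url, item_id = message.split()
        # regex_match anchors at the start of the string, regex_search matches anywhere;
        # both run in a worker process capped by self.max_execution_time
        if self.regex_match(r'https?://(paste|bin)\.', item_id, url):
            self.logger.info(f'{item_id}: url starts with a paste-like host')
        elif self.regex_search(r'/(paste|bin)/', item_id, url):
            self.logger.info(f'{item_id}: url contains a paste-like path')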
@@ -162,7 +162,7 @@ publish = Importers,Tags
subscribe = Item
publish = Tags

-[Zerobins]
+[Pasties]
subscribe = Url

# [My_Module_Name]