#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import time

sys.path.append(os.environ['AIL_BIN'])
##################################
# Import Project packages
##################################
from modules.abstract_module import AbstractModule
from lib import crawlers
from lib.ConfigLoader import ConfigLoader
from lib.objects.Domains import Domain
from lib.objects import Screenshots


class Crawler(AbstractModule):

    def __init__(self):
        super(Crawler, self).__init__(logger_channel='Crawler')

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 1

        config_loader = ConfigLoader()

        self.r_log_submit = config_loader.get_redis_conn('Redis_Log_submit')

        self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
        self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
        self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit')

        # TODO: LIMIT MAX NUMBERS OF CRAWLED PAGES

        # update hardcoded blacklist
        crawlers.load_blacklist()
        # update captures cache
        crawlers.reload_crawler_captures()

        # LACUS
        self.lacus = crawlers.get_lacus()

        # Capture
        self.har = None
        self.screenshot = None
        self.root_item = None
        self.har_dir = None
        self.items_dir = None
        self.domain = None

        # Send module state to logs
        self.redis_logger.info('Crawler initialized')

    def print_crawler_start_info(self, url, domain, domain_url):
        print()
        print()
        print('\033[92m------------------START CRAWLER------------------\033[0m')
        print(f'crawler type: {domain}')
        print('\033[92m-------------------------------------------------\033[0m')
        print(f'url:        {url}')
        print(f'domain:     {domain}')
        print(f'domain_url: {domain_url}')
        print()

    def get_message(self):
        # Check if a new Capture can be launched
        if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
            task_row = crawlers.get_crawler_task_from_queue()
            if task_row:
                print(task_row)
                task_uuid, priority = task_row
                self.enqueue_capture(task_uuid, priority)

        # Check if a Capture is done
        capture = crawlers.get_crawler_capture()
        if capture:
            print(capture)
            capture_uuid = capture[0][0]
            capture_status = self.lacus.get_capture_status(capture_uuid)
            if capture_status != crawlers.CaptureStatus.DONE:
                # TODO ADD GLOBAL TIMEOUT -> save start time
                crawlers.update_crawler_capture(capture_uuid)
                print(capture_uuid, capture_status, int(time.time()))
            else:
                self.compute(capture_uuid)
                crawlers.remove_crawler_capture(capture_uuid)
                print('capture', capture_uuid, 'completed')

        time.sleep(self.pending_seconds)

    def enqueue_capture(self, task_uuid, priority):
        task = crawlers.get_crawler_task(task_uuid)
        print(task)
        # task = {
        #     'uuid': task_uuid,
        #     'url': 'https://foo.be',
        #     'domain': 'foo.be',
        #     'depth': 1,
        #     'har': True,
        #     'screenshot': True,
        #     'user_agent': crawlers.get_default_user_agent(),
        #     'cookiejar': [],
        #     'header': '',
        #     'proxy': 'force_tor',
        #     'parent': 'manual',
        # }
        url = task['url']
        force = priority != 0

        # TODO unpack cookiejar
        # TODO HEADER

        capture_uuid = self.lacus.enqueue(url=url,
                                          depth=task['depth'],
                                          user_agent=task['user_agent'],
                                          proxy=task['proxy'],
                                          cookies=[],
                                          force=force,
                                          general_timeout_in_sec=90)

        crawlers.add_crawler_capture(task_uuid, capture_uuid)
        print(task_uuid, capture_uuid, 'launched')
        return capture_uuid

    # CRAWL DOMAIN
    # TODO: CATCH ERRORS
    def compute(self, capture_uuid):
        print('saving capture', capture_uuid)

        task_uuid = crawlers.get_crawler_capture_task_uuid(capture_uuid)
        task = crawlers.get_crawler_task(task_uuid)

        print(task['domain'])

        self.domain = Domain(task['domain'])
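
        # Fetch the finished capture from Lacus, save its HTML, HAR and
        # screenshot, then update the domain daterange/history and clear the
        # queued task.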
        # TODO CHANGE EPOCH
        epoch = int(time.time())
        parent_id = task['parent']
        print(task)

        entries = self.lacus.get_capture(capture_uuid)
        print(entries['status'])
        self.har = task['har']
        self.screenshot = task['screenshot']
        str_date = crawlers.get_current_date(separator=True)
        self.har_dir = crawlers.get_date_har_dir(str_date)
        self.items_dir = crawlers.get_date_crawled_items_source(str_date)
        self.root_item = None

        # Save Capture
        self.save_capture_response(parent_id, entries)

        self.domain.update_daterange(str_date.replace('/', ''))
        # Origin + History
        if self.root_item:
            # domain.add_ports(port)
            self.domain.set_last_origin(parent_id)
            self.domain.add_history(epoch, root_item=self.root_item)
        elif self.domain.was_up():
            self.domain.add_history(epoch, root_item=epoch)

        crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
        crawlers.clear_crawler_task(task_uuid, self.domain.get_domain_type())

    def save_capture_response(self, parent_id, entries):
        print(entries.keys())
        if 'error' in entries:
            # TODO IMPROVE ERROR MESSAGE
            self.redis_logger.warning(str(entries['error']))
            print(entries['error'])
        if entries.get('html'):
            print('retrieved content')
            # print(entries.get('html'))

        # TODO LOGS IF != domain
        if 'last_redirected_url' in entries and entries['last_redirected_url']:
            last_url = entries['last_redirected_url']
            unpacked_last_url = crawlers.unpack_url(last_url)
            current_domain = unpacked_last_url['domain']
            # REDIRECTION TODO CHECK IF WEB
            if current_domain != self.domain.id and not self.root_item:
                self.redis_logger.warning(f'External redirection {self.domain.id} -> {current_domain}')
                print(f'External redirection {self.domain.id} -> {current_domain}')
                self.domain = Domain(current_domain)
        # TODO LAST URL
        # FIXME
        else:
            last_url = f'http://{self.domain.id}'

        if 'html' in entries and entries['html']:
            item_id = crawlers.create_item_id(self.items_dir, self.domain.id)
            print(item_id)
            gzip64encoded = crawlers.get_gzipped_b64_item(item_id, entries['html'])
            # send item to Global
            relay_message = f'{item_id} {gzip64encoded}'
            self.send_message_to_queue(relay_message, 'Mixer')

            # increase the number of items submitted by the crawler feeder
            self.r_log_submit.hincrby('mixer_cache:list_feeder', 'crawler', 1)

            # Tag
            msg = f'infoleak:submission="crawler";{item_id}'
            self.send_message_to_queue(msg, 'Tags')

            crawlers.create_item_metadata(item_id, last_url, parent_id)
            if self.root_item is None:
                self.root_item = item_id
            parent_id = item_id

            # SCREENSHOT
            if self.screenshot:
                if 'png' in entries and entries['png']:
                    screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
                    if screenshot:
                        # Create Correlations
                        screenshot.add_correlation('item', '', item_id)
                        screenshot.add_correlation('domain', '', self.domain.id)

            # HAR
            if self.har:
                if 'har' in entries and entries['har']:
                    crawlers.save_har(self.har_dir, item_id, entries['har'])

        # Next Children
        entries_children = entries.get('children')
        if entries_children:
            for child_entries in entries_children:
                self.save_capture_response(parent_id, child_entries)


if __name__ == '__main__':
    module = Crawler()
    module.debug = True
    # module.compute(('ooooo', 0))
    module.run()
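
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the module): drive a
# single capture against the configured Lacus instance by hand, using the same
# calls this module relies on. The URL, depth and proxy values below are
# placeholders, not defaults taken from this module or from AIL's config.
#
#   lacus = crawlers.get_lacus()
#   capture_uuid = lacus.enqueue(url='http://example.onion',
#                                depth=1,
#                                user_agent=crawlers.get_default_user_agent(),
#                                proxy='force_tor',
#                                cookies=[],
#                                force=False,
#                                general_timeout_in_sec=90)
#   while lacus.get_capture_status(capture_uuid) != crawlers.CaptureStatus.DONE:
#       time.sleep(1)
#   entries = lacus.get_capture(capture_uuid)  # dict: 'status', 'html', 'png', 'har', ...
# ---------------------------------------------------------------------------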