#!/usr/bin/env python3 # -*-coding:UTF-8 -* import os import logging.config import sys import time from requests.exceptions import ConnectionError sys.path.append(os.environ['AIL_BIN']) ################################## # Import Project packages ################################## from modules.abstract_module import AbstractModule from lib import ail_logger from lib import crawlers from lib.ConfigLoader import ConfigLoader from lib.objects import CookiesNames from lib.objects import Etags from lib.objects.Domains import Domain from lib.objects.Items import Item from lib.objects import Screenshots from lib.objects import Titles logging.config.dictConfig(ail_logger.get_config(name='crawlers')) class Crawler(AbstractModule): def __init__(self): super(Crawler, self, ).__init__() self.logger = logging.getLogger(f'{self.__class__.__name__}') # Waiting time in seconds between to message processed self.pending_seconds = 1 config_loader = ConfigLoader() self.default_har = config_loader.get_config_boolean('Crawler', 'default_har') self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot') self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit') # TODO: LIMIT MAX NUMBERS OF CRAWLED PAGES # update hardcoded blacklist crawlers.load_blacklist() # update captures cache crawlers.reload_crawler_captures() self.crawler_scheduler = crawlers.CrawlerScheduler() # LACUS self.lacus = crawlers.get_lacus() self.is_lacus_up = crawlers.is_lacus_connected(delta_check=0) # Capture self.har = None self.screenshot = None self.root_item = None self.date = None self.items_dir = None self.domain = None # TODO Replace with warning list ??? self.placeholder_screenshots = {'07244254f73e822bd4a95d916d8b27f2246b02c428adc29082d09550c6ed6e1a' # blank '27e14ace10b0f96acd2bd919aaa98a964597532c35b6409dff6cc8eec8214748', # not found '3e66bf4cc250a68c10f8a30643d73e50e68bf1d4a38d4adc5bfc4659ca2974c0'} # 404 # Send module state to logs self.logger.info('Crawler initialized') def refresh_lacus_status(self): try: lacus_up = self.is_lacus_up self.is_lacus_up = crawlers.get_lacus().is_up # refresh lacus if not lacus_up and self.is_lacus_up: self.lacus = crawlers.get_lacus() except: self.is_lacus_up = False if not self.is_lacus_up: print("Can't reach lacus server", int(time.time())) time.sleep(30) def print_crawler_start_info(self, url, domain_url): print() print() print('\033[92m------------------START CRAWLER------------------\033[0m') print(f'crawler type: {self.domain}') print('\033[92m-------------------------------------------------\033[0m') print(f'url: {url}') print(f'domain: {self.domain}') print(f'domain_url: {domain_url}') print() def get_message(self): # Crawler Scheduler self.crawler_scheduler.update_queue() self.crawler_scheduler.process_queue() self.refresh_lacus_status() # TODO LOG ERROR if not self.is_lacus_up: return None # Check if a new Capture can be Launched if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures(): task_row = crawlers.add_task_to_lacus_queue() if task_row: task_uuid, priority = task_row try: self.enqueue_capture(task_uuid, priority) except ConnectionError: print(task_row) task = crawlers.CrawlerTask(task_uuid) task.add_to_db_crawler_queue(priority) self.refresh_lacus_status() return None # Get CrawlerCapture Object capture = crawlers.get_crawler_capture() if capture: try: status = self.lacus.get_capture_status(capture.uuid) if status == crawlers.CaptureStatus.DONE: return capture elif status == crawlers.CaptureStatus.UNKNOWN: capture_start = capture.get_start_time(r_str=False) if int(time.time()) - capture_start > 600: # TODO ADD in new crawler config task = capture.get_task() task.reset() capture.delete() else: capture.update(status) else: capture.update(status) print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time())) except ConnectionError: print(capture.uuid) capture.update(self, -1) self.refresh_lacus_status() time.sleep(self.pending_seconds) def enqueue_capture(self, task_uuid, priority): task = crawlers.CrawlerTask(task_uuid) # print(task) # task = { # 'uuid': task_uuid, # 'url': 'https://foo.be', # 'domain': 'foo.be', # 'depth': 1, # 'har': True, # 'screenshot': True, # 'user_agent': crawlers.get_default_user_agent(), # 'cookiejar': [], # 'header': '', # 'proxy': 'force_tor', # 'parent': 'manual', # } url = task.get_url() force = priority != 0 # TODO timeout # TODO HEADER # capture_uuid = self.lacus.enqueue(url='https://cpg.circl.lu:7000', # force=force, # general_timeout_in_sec=120) capture_uuid = self.lacus.enqueue(url=url, depth=task.get_depth(), user_agent=task.get_user_agent(), proxy=task.get_proxy(), cookies=task.get_cookies(), force=force, general_timeout_in_sec=90) # TODO increase timeout if onion ???? crawlers.create_capture(capture_uuid, task_uuid) print(task.uuid, capture_uuid, 'launched') return capture_uuid # CRAWL DOMAIN def compute(self, capture): print('saving capture', capture.uuid) task = capture.get_task() domain = task.get_domain() print(domain) self.domain = Domain(domain) epoch = int(time.time()) parent_id = task.get_parent() entries = self.lacus.get_capture(capture.uuid) print(entries.get('status')) self.har = task.get_har() self.screenshot = task.get_screenshot() # DEBUG # self.har = True # self.screenshot = True self.date = crawlers.get_current_date(separator=True) self.items_dir = crawlers.get_date_crawled_items_source(self.date) self.root_item = None # Save Capture self.save_capture_response(parent_id, entries) self.domain.update_daterange(self.date.replace('/', '')) # Origin + History + tags if self.root_item: self.domain.set_last_origin(parent_id) self.domain.add_history(epoch, root_item=self.root_item) # Tags for tag in task.get_tags(): self.domain.add_tag(tag) elif self.domain.was_up(): self.domain.add_history(epoch, root_item=epoch) crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch) print('capture:', capture.uuid, 'completed') print('task: ', task.uuid, 'completed') print() task.remove() def save_capture_response(self, parent_id, entries): print(entries.keys()) if 'error' in entries: # TODO IMPROVE ERROR MESSAGE self.logger.warning(str(entries['error'])) print(entries.get('error')) if entries.get('html'): print('retrieved content') # print(entries.get('html')) if 'last_redirected_url' in entries and entries.get('last_redirected_url'): last_url = entries['last_redirected_url'] unpacked_last_url = crawlers.unpack_url(last_url) current_domain = unpacked_last_url['domain'] # REDIRECTION TODO CHECK IF TYPE CHANGE if current_domain != self.domain.id and not self.root_item: self.logger.warning(f'External redirection {self.domain.id} -> {current_domain}') print(f'External redirection {self.domain.id} -> {current_domain}') if not self.root_item: self.domain = Domain(current_domain) # TODO LAST URL # FIXME else: last_url = f'http://{self.domain.id}' if 'html' in entries and entries.get('html'): item_id = crawlers.create_item_id(self.items_dir, self.domain.id) print(item_id) gzip64encoded = crawlers.get_gzipped_b64_item(item_id, entries['html']) # send item to Global relay_message = f'crawler {item_id} {gzip64encoded}' self.add_message_to_queue(relay_message, 'Importers') # Tag msg = f'infoleak:submission="crawler";{item_id}' self.add_message_to_queue(msg, 'Tags') crawlers.create_item_metadata(item_id, last_url, parent_id) if self.root_item is None: self.root_item = item_id parent_id = item_id item = Item(item_id) title_content = crawlers.extract_title_from_html(entries['html']) if title_content: title = Titles.create_title(title_content) title.add(item.get_date(), item_id) # SCREENSHOT if self.screenshot: if 'png' in entries and entries.get('png'): screenshot = Screenshots.create_screenshot(entries['png'], b64=False) if screenshot: if not screenshot.is_tags_safe(): unsafe_tag = 'dark-web:topic="pornography-child-exploitation"' self.domain.add_tag(unsafe_tag) item.add_tag(unsafe_tag) # Remove Placeholder pages # TODO Replace with warning list ??? if screenshot.id not in self.placeholder_screenshots: # Create Correlations screenshot.add_correlation('item', '', item_id) screenshot.add_correlation('domain', '', self.domain.id) # HAR if self.har: if 'har' in entries and entries.get('har'): har_id = crawlers.create_har_id(self.date, item_id) crawlers.save_har(har_id, entries['har']) for cookie_name in crawlers.extract_cookies_names_from_har(entries['har']): print(cookie_name) cookie = CookiesNames.create(cookie_name) cookie.add(self.date.replace('/', ''), self.domain.id) for etag_content in crawlers.extract_etag_from_har(entries['har']): print(etag_content) etag = Etags.create(etag_content) etag.add(self.date.replace('/', ''), self.domain.id) crawlers.extract_hhhash(entries['har'], self.domain.id, self.date.replace('/', '')) # Next Children entries_children = entries.get('children') if entries_children: for children in entries_children: self.save_capture_response(parent_id, children) if __name__ == '__main__': module = Crawler() module.debug = True module.run()