2022-10-25 16:25:19 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*-coding:UTF-8 -*
|
|
|
|
|
|
|
|
import os
|
2023-05-12 15:29:53 +02:00
|
|
|
import logging.config
|
2022-10-25 16:25:19 +02:00
|
|
|
import sys
|
|
|
|
import time
|
|
|
|
|
2023-03-14 17:36:42 +01:00
|
|
|
from requests.exceptions import ConnectionError
|
|
|
|
|
2022-10-25 16:25:19 +02:00
|
|
|
sys.path.append(os.environ['AIL_BIN'])
|
|
|
|
##################################
|
|
|
|
# Import Project packages
|
|
|
|
##################################
|
|
|
|
from modules.abstract_module import AbstractModule
|
2023-05-12 15:29:53 +02:00
|
|
|
from lib import ail_logger
|
2022-10-25 16:25:19 +02:00
|
|
|
from lib import crawlers
|
|
|
|
from lib.ConfigLoader import ConfigLoader
|
2023-06-16 15:39:13 +02:00
|
|
|
from lib.objects import CookiesNames
|
2022-10-25 16:25:19 +02:00
|
|
|
from lib.objects.Domains import Domain
|
2023-05-10 16:28:19 +02:00
|
|
|
from lib.objects.Items import Item
|
2022-10-25 16:25:19 +02:00
|
|
|
from lib.objects import Screenshots
|
2023-05-25 14:33:12 +02:00
|
|
|
from lib.objects import Titles
|
2022-10-25 16:25:19 +02:00
|
|
|
|
2023-05-12 15:29:53 +02:00
|
|
|
logging.config.dictConfig(ail_logger.get_config(name='crawlers'))
|
2023-03-14 17:36:42 +01:00
|
|
|
|
2022-10-25 16:25:19 +02:00
|
|
|
class Crawler(AbstractModule):
    """AIL Crawler module: schedules, launches and saves Lacus web captures."""

    def __init__(self):
        super().__init__()

        self.logger = logging.getLogger(f'{self.__class__.__name__}')

        # Waiting time in seconds between two processed messages
        self.pending_seconds = 1

        config_loader = ConfigLoader()
        # Capture defaults (HAR dump, screenshot, crawl depth) from the config file
        self.default_har = config_loader.get_config_boolean('Crawler', 'default_har')
        self.default_screenshot = config_loader.get_config_boolean('Crawler', 'default_screenshot')
        self.default_depth_limit = config_loader.get_config_int('Crawler', 'default_depth_limit')

        # TODO: LIMIT MAX NUMBERS OF CRAWLED PAGES

        # update hardcoded blacklist
        crawlers.load_blacklist()
        # update captures cache
        crawlers.reload_crawler_captures()

        self.crawler_scheduler = crawlers.CrawlerScheduler()

        # LACUS
        self.lacus = crawlers.get_lacus()
        self.is_lacus_up = crawlers.is_lacus_connected(delta_check=0)

        # Capture state, (re)initialized per capture in compute()
        self.har = None
        self.screenshot = None
        self.root_item = None
        self.date = None
        self.items_dir = None
        self.domain = None

        # TODO Replace with warning list ???
        # BUGFIX: a comma was missing after the first hash, so Python's
        # implicit string-literal concatenation merged the "blank" and
        # "not found" hashes into one 128-character entry — the set held
        # 2 elements instead of 3 and those screenshots were never skipped.
        self.placeholder_screenshots = {
            '07244254f73e822bd4a95d916d8b27f2246b02c428adc29082d09550c6ed6e1a',  # blank
            '27e14ace10b0f96acd2bd919aaa98a964597532c35b6409dff6cc8eec8214748',  # not found
            '3e66bf4cc250a68c10f8a30643d73e50e68bf1d4a38d4adc5bfc4659ca2974c0',  # 404
        }

        # Send module state to logs
        self.logger.info('Crawler initialized')
2023-03-14 17:36:42 +01:00
|
|
|
def refresh_lacus_status(self):
    """Probe the Lacus server and refresh the cached connection state.

    Updates ``self.is_lacus_up``; on a down -> up transition a fresh Lacus
    client is fetched. When the server is unreachable, sleeps 30 seconds so
    the module does not busy-loop on a dead endpoint.
    """
    try:
        lacus_up = self.is_lacus_up
        self.is_lacus_up = crawlers.get_lacus().is_up
        # refresh lacus client after a reconnection
        if not lacus_up and self.is_lacus_up:
            self.lacus = crawlers.get_lacus()
    except Exception:  # BUGFIX: was a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt; any connectivity error marks Lacus down.
        self.is_lacus_up = False
    if not self.is_lacus_up:
        print("Can't reach lacus server", int(time.time()))
        time.sleep(30)
|
|
|
|
def print_crawler_start_info(self, url, domain_url):
    """Print a console banner describing the capture that is starting."""
    green_open = '\033[92m'
    reset = '\033[0m'
    for _ in range(2):
        print()
    print(f'{green_open}------------------START CRAWLER------------------{reset}')
    print(f'crawler type: {self.domain}')
    print(f'{green_open}-------------------------------------------------{reset}')
    print(f'url: {url}')
    print(f'domain: {self.domain}')
    print(f'domain_url: {domain_url}')
    print()
|
|
|
|
def get_message(self):
    """Poll for crawler work; return a finished capture or None.

    One polling round:
      1. run the scheduler (update + process its queue),
      2. bail out early if Lacus is unreachable,
      3. launch a queued task when below the max-captures limit,
      4. check the oldest running capture and return it once DONE.

    Returns a ``CrawlerCapture`` ready to be processed by :meth:`compute`,
    or ``None`` when there is nothing to do yet.
    """
    # Crawler Scheduler
    self.crawler_scheduler.update_queue()
    self.crawler_scheduler.process_queue()

    self.refresh_lacus_status()  # TODO LOG ERROR
    if not self.is_lacus_up:
        return None

    # Check if a new Capture can be Launched
    if crawlers.get_nb_crawler_captures() < crawlers.get_crawler_max_captures():
        task_row = crawlers.add_task_to_lacus_queue()
        if task_row:
            task_uuid, priority = task_row
            try:
                self.enqueue_capture(task_uuid, priority)
            except ConnectionError:
                # Lacus became unreachable mid-launch: put the task back
                # in the DB queue and re-check connectivity.
                print(task_row)
                task = crawlers.CrawlerTask(task_uuid)
                task.add_to_db_crawler_queue(priority)
                self.refresh_lacus_status()
                return None

    # Get CrawlerCapture Object
    capture = crawlers.get_crawler_capture()
    if capture:
        try:
            status = self.lacus.get_capture_status(capture.uuid)
            if status != crawlers.CaptureStatus.DONE:  # TODO ADD GLOBAL TIMEOUT-> Save start time ### print start time
                capture.update(status)
                print(capture.uuid, crawlers.CaptureStatus(status).name, int(time.time()))
            else:
                return capture
        except ConnectionError:
            print(capture.uuid)
            # BUGFIX: was `capture.update(self, -1)` — `self` was passed as
            # the status argument (compare `capture.update(status)` above).
            # Mark the capture as failed with status -1.
            capture.update(-1)
            self.refresh_lacus_status()

    time.sleep(self.pending_seconds)
|
|
|
|
def enqueue_capture(self, task_uuid, priority):
    """Submit one crawler task to Lacus and register the resulting capture.

    :param task_uuid: uuid of the ``CrawlerTask`` to launch
    :param priority: queue priority; any non-zero value forces Lacus to
                     perform a fresh capture instead of reusing its cache
    :return: the capture uuid assigned by Lacus
    """
    task = crawlers.CrawlerTask(task_uuid)
    target_url = task.get_url()
    force = priority != 0
    # TODO timeout
    # TODO HEADER
    capture_uuid = self.lacus.enqueue(
        url=target_url,
        depth=task.get_depth(),
        user_agent=task.get_user_agent(),
        proxy=task.get_proxy(),
        cookies=task.get_cookies(),
        force=force,
        general_timeout_in_sec=90,  # TODO increase timeout if onion ????
    )

    # Map the Lacus capture uuid to the AIL task so get_message() can track it
    crawlers.create_capture(capture_uuid, task_uuid)
    print(task.uuid, capture_uuid, 'launched')
    return capture_uuid
|
|
|
|
# CRAWL DOMAIN
|
2023-02-21 12:22:49 +01:00
|
|
|
def compute(self, capture):
    # Process one completed Lacus capture: persist its content, then update
    # the Domain's daterange / origin / history / tags and the
    # "last crawled" cache, and finally remove the finished task.
    print('saving capture', capture.uuid)

    task = capture.get_task()
    domain = task.get_domain()
    print(domain)

    self.domain = Domain(domain)

    epoch = int(time.time())
    parent_id = task.get_parent()

    # Fetch the capture payload (html/har/png/children/...) from Lacus
    entries = self.lacus.get_capture(capture.uuid)
    print(entries.get('status'))
    self.har = task.get_har()
    self.screenshot = task.get_screenshot()
    # DEBUG
    # self.har = True
    # self.screenshot = True
    self.date = crawlers.get_current_date(separator=True)
    self.items_dir = crawlers.get_date_crawled_items_source(self.date)
    # root_item is set by save_capture_response() when an item is created
    self.root_item = None

    # Save Capture
    # NOTE: may replace self.domain when the capture was externally redirected
    self.save_capture_response(parent_id, entries)

    # self.date is 'YYYY/MM/DD'; daterange expects 'YYYYMMDD'
    self.domain.update_daterange(self.date.replace('/', ''))
    # Origin + History + tags
    if self.root_item:
        self.domain.set_last_origin(parent_id)
        self.domain.add_history(epoch, root_item=self.root_item)
        # Tags
        for tag in task.get_tags():
            self.domain.add_tag(tag)
    elif self.domain.was_up():
        # no item saved this round but the domain was up before:
        # record the crawl epoch itself as the history root
        self.domain.add_history(epoch, root_item=epoch)

    crawlers.update_last_crawled_domain(self.domain.get_domain_type(), self.domain.id, epoch)
    print('capture:', capture.uuid, 'completed')
    print('task: ', task.uuid, 'completed')
    print()
    task.remove()
|
|
|
|
|
def save_capture_response(self, parent_id, entries):
    # Persist one capture entry and, recursively, its children:
    # creates the Item from the HTML, extracts the page Title, saves the
    # screenshot and HAR (+ cookie names), and wires up correlations/tags.
    # `parent_id` is rebound to the newly created item before recursing,
    # so children are chained to their actual parent page.
    print(entries.keys())
    if 'error' in entries:
        # TODO IMPROVE ERROR MESSAGE
        self.logger.warning(str(entries['error']))
        print(entries.get('error'))

    if entries.get('html'):
        print('retrieved content')
        # print(entries.get('html'))

    if 'last_redirected_url' in entries and entries.get('last_redirected_url'):
        last_url = entries['last_redirected_url']
        unpacked_last_url = crawlers.unpack_url(last_url)
        current_domain = unpacked_last_url['domain']
        # REDIRECTION TODO CHECK IF TYPE CHANGE
        # Only the first page (no root item yet) may switch the crawled domain
        if current_domain != self.domain.id and not self.root_item:
            self.logger.warning(f'External redirection {self.domain.id} -> {current_domain}')
            print(f'External redirection {self.domain.id} -> {current_domain}')
            if not self.root_item:
                self.domain = Domain(current_domain)
            # TODO LAST URL
            # FIXME
    else:
        # no redirect info: fall back to a synthetic URL for the domain
        last_url = f'http://{self.domain.id}'

    if 'html' in entries and entries.get('html'):
        item_id = crawlers.create_item_id(self.items_dir, self.domain.id)
        print(item_id)
        gzip64encoded = crawlers.get_gzipped_b64_item(item_id, entries['html'])
        # send item to Global
        relay_message = f'crawler {item_id} {gzip64encoded}'
        self.add_message_to_queue(relay_message, 'Importers')

        # Tag
        msg = f'infoleak:submission="crawler";{item_id}'
        self.add_message_to_queue(msg, 'Tags')

        crawlers.create_item_metadata(item_id, last_url, parent_id)
        # first item saved for this capture becomes the history root
        if self.root_item is None:
            self.root_item = item_id
        parent_id = item_id

        item = Item(item_id)

        title_content = crawlers.extract_title_from_html(entries['html'])
        if title_content:
            title = Titles.create_title(title_content)
            title.add(item.get_date(), item_id)

        # SCREENSHOT
        if self.screenshot:
            if 'png' in entries and entries.get('png'):
                screenshot = Screenshots.create_screenshot(entries['png'], b64=False)
                if screenshot:
                    # propagate unsafe-content tag to both domain and item
                    if not screenshot.is_tags_safe():
                        unsafe_tag = 'dark-web:topic="pornography-child-exploitation"'
                        self.domain.add_tag(unsafe_tag)
                        item.add_tag(unsafe_tag)
                    # Remove Placeholder pages # TODO Replace with warning list ???
                    if screenshot.id not in self.placeholder_screenshots:
                        # Create Correlations
                        screenshot.add_correlation('item', '', item_id)
                        screenshot.add_correlation('domain', '', self.domain.id)
        # HAR
        if self.har:
            if 'har' in entries and entries.get('har'):
                har_id = crawlers.create_har_id(self.date, item_id)
                crawlers.save_har(har_id, entries['har'])
                for cookie_name in crawlers.extract_cookies_names_from_har(entries['har']):
                    print(cookie_name)
                    cookie = CookiesNames.create(cookie_name)
                    # self.date is 'YYYY/MM/DD'; CookiesNames expects 'YYYYMMDD'
                    cookie.add(self.date.replace('/', ''), self.domain.id)

    # Next Children
    entries_children = entries.get('children')
    if entries_children:
        for children in entries_children:
            self.save_capture_response(parent_id, children)
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Run the crawler module as a standalone process (debug mode on).
    crawler_module = Crawler()
    crawler_module.debug = True
    crawler_module.run()