#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''Third-party service integrations (MISP, uwhois, SaneJS, Phishing Initiative,
VirusTotal, urlscan.io). Each module reads its own section of the config and
exposes `available` so callers can skip disabled/unreachable services.'''
import hashlib
import json
import logging
import re
import socket
import time
from collections import defaultdict
from datetime import date
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Set, Union

import requests
import vt  # type: ignore
from har2tree import CrawledTree, Har2TreeError, HostNode, URLNode
from pyeupi import PyEUPI
from pymisp import MISPAttribute, MISPEvent, PyMISP
from pysanejs import SaneJS
from vt.error import APIError  # type: ignore

from .exceptions import ConfigError
from .helpers import (get_config, get_homedir, get_public_suffix_list,
                      get_useragent_for_requests)


class MISP():
    '''Lookup attributes on, and push events to, a MISP instance.'''

    def __init__(self, config: Dict[str, Any]):
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        if not config.get('apikey'):
            self.available = False
            self.logger.info('Module not enabled.')
            return
        self.available = True
        self.enable_lookup = False
        self.enable_push = False
        self.allow_auto_trigger = False
        try:
            self.client = PyMISP(url=config['url'], key=config['apikey'],
                                 ssl=config['verify_tls_cert'], timeout=config['timeout'])
        except Exception as e:
            self.available = False
            self.logger.warning(f'Unable to connect to MISP: {e}')
            return
        if config.get('enable_lookup'):
            self.enable_lookup = True
        if config.get('enable_push'):
            self.enable_push = True
        if config.get('allow_auto_trigger'):
            self.allow_auto_trigger = True
        # FIX: default to [] — a missing key returned None and made
        # _prepare_push crash when iterating the tags.
        self.default_tags: List[str] = config.get('default_tags', [])
        self.auto_publish = config.get('auto_publish')
        self.storage_dir_misp = get_homedir() / 'misp'
        self.storage_dir_misp.mkdir(parents=True, exist_ok=True)
        self.psl = get_public_suffix_list()

    def get_fav_tags(self):
        '''Return the favourite tags of the connected MISP user.'''
        return self.client.tags(pythonify=True, favouritesOnly=1)

    def _prepare_push(self, to_push: Union[List[MISPEvent], MISPEvent], allow_duplicates: bool=False, auto_publish: Optional[bool]=False) -> Union[List[MISPEvent], Dict]:
        '''Adds the pre-configured information as required by the instance.
        If duplicates aren't allowed, they will be automatically skipped and
        the extends_uuid key in the next element in the list updated.'''
        if isinstance(to_push, MISPEvent):
            events = [to_push]
        else:
            events = to_push
        events_to_push = []
        existing_uuid_to_extend = None
        for event in events:
            if not allow_duplicates:
                # The first attribute's value (the permaurl) identifies the capture.
                existing_event = self.get_existing_event(event.attributes[0].value)
                if existing_event:
                    existing_uuid_to_extend = existing_event.uuid
                    continue
            if existing_uuid_to_extend:
                # Chain this event onto the duplicate we just skipped.
                event.extends_uuid = existing_uuid_to_extend
                existing_uuid_to_extend = None
            for tag in self.default_tags:
                event.add_tag(tag)
            if auto_publish:
                event.publish()
            events_to_push.append(event)
        return events_to_push

    def push(self, to_push: Union[List[MISPEvent], MISPEvent], allow_duplicates: bool=False, auto_publish: Optional[bool]=None) -> Union[List[MISPEvent], Dict]:
        '''Push one or more events to the MISP instance.

        Returns the created events (pythonified) or a dict with an 'error' key.'''
        if auto_publish is None:
            auto_publish = self.auto_publish
        if self.available and self.enable_push:
            events = self._prepare_push(to_push, allow_duplicates, auto_publish)
            if not events:
                return {'error': 'All the events are already on the MISP instance.'}
            # FIX: isinstance against typing.Dict is deprecated; use the builtin.
            if isinstance(events, dict):
                return {'error': events}
            to_return = []
            for event in events:
                new_event = self.client.add_event(event, pythonify=True)
                if isinstance(new_event, MISPEvent):
                    to_return.append(new_event)
                else:
                    return {'error': new_event}
            return to_return
        else:
            return {'error': 'Module not available or push not enabled.'}

    def get_existing_event_url(self, permaurl: str) -> Optional[str]:
        '''Return the URL of the event containing this permaurl, if any.'''
        attributes = self.client.search('attributes', value=permaurl, limit=1, page=1, pythonify=True)
        if not attributes or not isinstance(attributes[0], MISPAttribute):
            return None
        url = f'{self.client.root_url}/events/{attributes[0].event_id}'
        return url

    def get_existing_event(self, permaurl: str) -> Optional[MISPEvent]:
        '''Return the event containing this permaurl, if any.'''
        attributes = self.client.search('attributes', value=permaurl, limit=1, page=1, pythonify=True)
        if not attributes or not isinstance(attributes[0], MISPAttribute):
            return None
        event = self.client.get_event(attributes[0].event_id, pythonify=True)
        if isinstance(event, MISPEvent):
            return event
        return None

    def lookup(self, node: URLNode, hostnode: HostNode) -> Union[Dict[str, Set[str]], Dict[str, Any]]:
        '''Search the MISP instance for indicators related to a URL node:
        URL, hostname, registered domain, resolved IPs, CNAMEs and body hash.

        Returns {event_id: {matched values}}, an info/error dict otherwise.'''
        if self.available and self.enable_lookup:
            tld = self.psl.get_tld(hostnode.name)
            # FIX: the dot and the TLD must be escaped — an unescaped '.' is a
            # regex wildcard and could strip the wrong suffix.
            domain = re.sub(rf'\.{re.escape(tld)}$', '', hostnode.name).split('.')[-1]
            to_lookup = [node.name, hostnode.name, f'{domain}.{tld}'] + hostnode.resolved_ips
            if hasattr(hostnode, 'cnames'):
                to_lookup += hostnode.cnames
            if not node.empty_response:
                to_lookup.append(node.body_hash)
            if attributes := self.client.search(controller='attributes', value=to_lookup,
                                                enforce_warninglist=True, pythonify=True):
                if isinstance(attributes, list):
                    to_return: Dict[str, Set[str]] = defaultdict(set)
                    # NOTE: We have MISPAttribute in that list
                    for a in attributes:
                        to_return[a.event_id].add(a.value)  # type: ignore
                    return to_return
                else:
                    # The request returned an error
                    return attributes  # type: ignore
            return {'info': 'No hits.'}
        else:
            return {'error': 'Module not available or lookup not enabled.'}


class UniversalWhois():
    '''Query a uwhois proxy for IPs, CNAMEs and hostnames of a capture.'''

    def __init__(self, config: Dict[str, Any]):
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        if not config.get('enabled'):
            self.available = False
            self.logger.info('Module not enabled.')
            return
        self.server = config.get('ipaddress')
        self.port = config.get('port')
        self.allow_auto_trigger = False
        if config.get('allow_auto_trigger'):
            self.allow_auto_trigger = True
        try:
            # Probe the service once so `available` reflects reachability.
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                sock.connect((self.server, self.port))
        except Exception as e:
            self.available = False
            self.logger.warning(f'Unable to connect to uwhois ({self.server}:{self.port}): {e}')
            return
        self.available = True

    def query_whois_hostnode(self, hostnode: HostNode) -> None:
        '''Issue whois queries for every IP, CNAME and the hostname of a node.'''
        if hasattr(hostnode, 'resolved_ips'):
            for ip in hostnode.resolved_ips:
                self.whois(ip)
        if hasattr(hostnode, 'cnames'):
            for cname in hostnode.cnames:
                self.whois(cname)
        self.whois(hostnode.name)

    def capture_default_trigger(self, crawled_tree: CrawledTree, /, *, force: bool=False, auto_trigger: bool=False) -> None:
        '''Run the module on all the nodes up to the final redirect'''
        if not self.available:
            return None
        if auto_trigger and not self.allow_auto_trigger:
            return None
        try:
            hostnode = crawled_tree.root_hartree.get_host_node_by_uuid(crawled_tree.root_hartree.rendered_node.hostnode_uuid)
        except Har2TreeError as e:
            self.logger.warning(e)
        else:
            self.query_whois_hostnode(hostnode)
            for n in hostnode.get_ancestors():
                self.query_whois_hostnode(n)

    def whois(self, query: str) -> str:
        '''Send a single whois query to the proxy and return the raw response.'''
        if not self.available:
            return ''
        bytes_whois = b''
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.connect((self.server, self.port))
            sock.sendall('{}\n'.format(query).encode())
            while True:
                data = sock.recv(2048)
                if not data:
                    break
                bytes_whois += data
        to_return = bytes_whois.decode()
        return to_return


class SaneJavaScript():
    '''Lookup javascript file hashes on a SaneJS instance, with a daily
    on-disk cache (one file per known hash, plus an 'unknown' list).'''

    def __init__(self, config: Dict[str, Any]):
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        if not config.get('enabled'):
            self.available = False
            self.logger.info('Module not enabled.')
            return
        self.client = SaneJS()
        if not self.client.is_up:
            self.available = False
            return
        self.available = True
        self.allow_auto_trigger = False
        if config.get('allow_auto_trigger'):
            self.allow_auto_trigger = True
        self.storage_dir = get_homedir() / 'sanejs'
        self.storage_dir.mkdir(parents=True, exist_ok=True)

    def hashes_lookup(self, sha512: Union[Iterable[str], str], force: bool=False) -> Dict[str, List[str]]:
        '''Lookup one or more sha512 hashes, returning {hash: library info}.

        force re-queries hashes even when they are cached or known-unknown
        for today. Unknown hashes are remembered per-day to avoid re-asking.'''
        if isinstance(sha512, str):
            hashes: List[str] = [sha512]
        else:
            # FIX: materialize — the input is iterated twice below, which
            # silently dropped everything for generator inputs.
            hashes = list(sha512)

        today_dir = self.storage_dir / date.today().isoformat()
        today_dir.mkdir(parents=True, exist_ok=True)
        sanejs_unknowns = today_dir / 'unknown'
        unknown_hashes = set()
        if sanejs_unknowns.exists():
            with sanejs_unknowns.open() as f:
                unknown_hashes = set(line.strip() for line in f.readlines())

        to_return: Dict[str, List[str]] = {}

        if force:
            to_lookup = hashes
        else:
            to_lookup = [h for h in hashes if (h not in unknown_hashes
                                               and not (today_dir / h).exists())]
        has_new_unknown = False
        for h in to_lookup:
            try:
                response = self.client.sha512(h)
            except Exception as e:
                self.logger.warning(f'Something went wrong. Query: {h} - {e}')
                continue

            if 'error' in response:
                # Server not ready
                break
            if 'response' in response and response['response']:
                cached_path = today_dir / h
                with cached_path.open('w') as f:
                    json.dump(response['response'], f)
                to_return[h] = response['response']
            else:
                has_new_unknown = True
                unknown_hashes.add(h)

        for h in hashes:
            cached_path = today_dir / h
            if h in unknown_hashes or h in to_return:
                continue
            elif cached_path.exists():
                with cached_path.open() as f:
                    to_return[h] = json.load(f)

        if has_new_unknown:
            with sanejs_unknowns.open('w') as f:
                f.writelines(f'{h}\n' for h in unknown_hashes)

        return to_return


class PhishingInitiative():
    '''Lookup (and optionally submit) URLs on Phishing Initiative, with a
    per-URL daily on-disk cache.'''

    def __init__(self, config: Dict[str, Any]):
        if not config.get('apikey'):
            self.available = False
            return
        self.available = True
        self.autosubmit = False
        self.allow_auto_trigger = False
        self.client = PyEUPI(config['apikey'])
        if config.get('allow_auto_trigger'):
            self.allow_auto_trigger = True
        if config.get('autosubmit'):
            self.autosubmit = True
        self.storage_dir_eupi = get_homedir() / 'eupi'
        self.storage_dir_eupi.mkdir(parents=True, exist_ok=True)

    def __get_cache_directory(self, url: str) -> Path:
        # md5 used as a cache key only, not for security.
        m = hashlib.md5()
        m.update(url.encode())
        return self.storage_dir_eupi / m.hexdigest()

    def get_url_lookup(self, url: str) -> Optional[Dict[str, Any]]:
        '''Return the most recent cached lookup for this URL, if any.'''
        url_storage_dir = self.__get_cache_directory(url)
        if not url_storage_dir.exists():
            return None
        cached_entries = sorted(url_storage_dir.glob('*'), reverse=True)
        if not cached_entries:
            return None

        with cached_entries[0].open() as f:
            return json.load(f)

    def capture_default_trigger(self, crawled_tree: CrawledTree, /, *, force: bool=False, auto_trigger: bool=False) -> Dict:
        '''Run the module on all the nodes up to the final redirect'''
        if not self.available:
            return {'error': 'Module not available'}
        if auto_trigger and not self.allow_auto_trigger:
            return {'error': 'Auto trigger not allowed on module'}
        if crawled_tree.redirects:
            for redirect in crawled_tree.redirects:
                self.url_lookup(redirect, force)
        else:
            self.url_lookup(crawled_tree.root_hartree.har.root_url, force)
        return {'success': 'Module triggered'}

    def url_lookup(self, url: str, force: bool=False) -> None:
        '''Lookup an URL on Phishing Initiative

        Note: force means 2 things:
            * (re)scan of the URL
            * re fetch the object from Phishing Initiative even if we already did it today

        Note: the URL will only be sent for scan if autosubmit is set to true in the config
        '''
        if not self.available:
            raise ConfigError('PhishingInitiative not available, probably no API key')

        url_storage_dir = self.__get_cache_directory(url)
        url_storage_dir.mkdir(parents=True, exist_ok=True)
        pi_file = url_storage_dir / date.today().isoformat()

        scan_requested = False
        if self.autosubmit and force:
            self.client.post_submission(url, comment='Received on Lookyloo')
            scan_requested = True

        if not force and pi_file.exists():
            return

        # Poll a few times: a freshly submitted URL may not have a verdict yet.
        for _ in range(3):
            url_information = self.client.lookup(url)
            if not url_information['results']:
                # No results, that should not happen (?)
                break
            if url_information['results'][0]['tag'] == -1:
                # Not submitted
                if not self.autosubmit:
                    break
                if not scan_requested:
                    self.client.post_submission(url, comment='Received on Lookyloo')
                    scan_requested = True
                time.sleep(1)
            else:
                with pi_file.open('w') as _f:
                    json.dump(url_information, _f)
                break


class VirusTotal():
    '''Lookup (and optionally submit) URLs on VirusTotal, with a per-URL
    daily on-disk cache.'''

    def __init__(self, config: Dict[str, Any]):
        if not config.get('apikey'):
            self.available = False
            return
        self.available = True
        self.autosubmit = False
        self.allow_auto_trigger = False
        self.client = vt.Client(config['apikey'])
        if config.get('allow_auto_trigger'):
            self.allow_auto_trigger = True
        if config.get('autosubmit'):
            self.autosubmit = True
        self.storage_dir_vt = get_homedir() / 'vt_url'
        self.storage_dir_vt.mkdir(parents=True, exist_ok=True)

    def __get_cache_directory(self, url: str) -> Path:
        # md5 of the VT url_id used as a cache key only, not for security.
        url_id = vt.url_id(url)
        m = hashlib.md5()
        m.update(url_id.encode())
        return self.storage_dir_vt / m.hexdigest()

    def get_url_lookup(self, url: str) -> Optional[Dict[str, Any]]:
        '''Return the most recent cached lookup for this URL, if any.'''
        url_storage_dir = self.__get_cache_directory(url)
        if not url_storage_dir.exists():
            return None
        cached_entries = sorted(url_storage_dir.glob('*'), reverse=True)
        if not cached_entries:
            return None

        with cached_entries[0].open() as f:
            return json.load(f)

    def capture_default_trigger(self, crawled_tree: CrawledTree, /, *, force: bool=False, auto_trigger: bool=False) -> Dict:
        '''Run the module on all the nodes up to the final redirect'''
        if not self.available:
            return {'error': 'Module not available'}
        if auto_trigger and not self.allow_auto_trigger:
            return {'error': 'Auto trigger not allowed on module'}
        if crawled_tree.redirects:
            for redirect in crawled_tree.redirects:
                self.url_lookup(redirect, force)
        else:
            self.url_lookup(crawled_tree.root_hartree.har.root_url, force)
        return {'success': 'Module triggered'}

    def url_lookup(self, url: str, force: bool=False) -> None:
        '''Lookup an URL on VT

        Note: force means 2 things:
            * (re)scan of the URL
            * re fetch the object from VT even if we already did it today

        Note: the URL will only be sent for scan if autosubmit is set to true in the config
        '''
        if not self.available:
            raise ConfigError('VirusTotal not available, probably no API key')

        url_storage_dir = self.__get_cache_directory(url)
        url_storage_dir.mkdir(parents=True, exist_ok=True)
        vt_file = url_storage_dir / date.today().isoformat()

        scan_requested = False
        if self.autosubmit and force:
            self.client.scan_url(url)
            scan_requested = True

        if not force and vt_file.exists():
            return

        url_id = vt.url_id(url)
        # Poll a few times: a freshly submitted URL is not immediately available.
        for _ in range(3):
            try:
                url_information = self.client.get_object(f"/urls/{url_id}")
                with vt_file.open('w') as _f:
                    json.dump(url_information.to_dict(), _f)
                break
            except APIError as e:
                if not self.autosubmit:
                    break
                if not scan_requested and e.code == 'NotFoundError':
                    self.client.scan_url(url)
                    scan_requested = True
                time.sleep(5)


class UrlScan():
    '''Submit URLs to urlscan.io and fetch the results, with a per-capture
    (url + user agent + referer) daily on-disk cache.'''

    def __init__(self, config: Dict[str, Any]):
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(get_config('generic', 'loglevel'))
        if not config.get('apikey'):
            self.available = False
            return
        self.available = True
        self.autosubmit = False
        self.allow_auto_trigger = False
        self.client = requests.session()
        self.client.headers['User-Agent'] = get_useragent_for_requests()
        self.client.headers['API-Key'] = config['apikey']
        self.client.headers['Content-Type'] = 'application/json'
        if config.get('allow_auto_trigger'):
            self.allow_auto_trigger = True
        if config.get('autosubmit'):
            self.autosubmit = True
        if config.get('force_visibility'):
            # Cases:
            # 1. False: unlisted for hidden captures / public for others
            # 2. "key": default visibility defined on urlscan.io
            # 3. "public", "unlisted", "private": is set for all submissions
            self.force_visibility = config['force_visibility']
        else:
            self.force_visibility = False
        if self.force_visibility not in [False, 'key', 'public', 'unlisted', 'private']:
            self.logger.warning("Invalid value for force_visibility, default to False (unlisted for hidden captures / public for others).")
            self.force_visibility = False
        self.storage_dir_urlscan = get_homedir() / 'urlscan'
        self.storage_dir_urlscan.mkdir(parents=True, exist_ok=True)

    def __get_cache_directory(self, url: str, useragent: str, referer: str) -> Path:
        # md5 used as a cache key only, not for security.
        m = hashlib.md5()
        to_hash = f'{url}{useragent}{referer}'
        m.update(to_hash.encode())
        return self.storage_dir_urlscan / m.hexdigest()

    def get_url_submission(self, capture_info: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        '''Return the most recent cached submission for this capture, if any.'''
        url_storage_dir = self.__get_cache_directory(
            capture_info['url'],
            capture_info['user_agent'],
            capture_info['referer']) / 'submit'
        if not url_storage_dir.exists():
            return None
        cached_entries = sorted(url_storage_dir.glob('*'), reverse=True)
        if not cached_entries:
            return None

        with cached_entries[0].open() as f:
            return json.load(f)

    def capture_default_trigger(self, capture_info: Dict[str, Any], /, visibility: str, *, force: bool=False, auto_trigger: bool=False) -> Dict:
        '''Run the module on the initial URL'''
        if not self.available:
            return {'error': 'Module not available'}
        if auto_trigger and not self.allow_auto_trigger:
            # NOTE: if auto_trigger is true, it means the request comes from the
            # auto trigger feature (disabled by default)
            # Each module can disable auto-trigger to avoid depleating the
            # API limits.
            return {'error': 'Auto trigger not allowed on module'}

        self.url_submit(capture_info, visibility, force)
        return {'success': 'Module triggered'}

    def __submit_url(self, url: str, useragent: str, referer: str, visibility: str) -> Dict:
        # POST /scan; visibility resolution follows force_visibility (see __init__).
        data = {'customagent': useragent, 'referer': referer}

        if not url.startswith('http'):
            url = f'http://{url}'
        data['url'] = url

        if self.force_visibility is False:
            data["visibility"] = visibility
        elif self.force_visibility in ["public", "unlisted", "private"]:
            data["visibility"] = self.force_visibility
        else:
            # default to key config on urlscan.io website
            pass

        response = self.client.post('https://urlscan.io/api/v1/scan/', json=data)
        response.raise_for_status()
        return response.json()

    def __url_result(self, uuid: str) -> Dict:
        # GET the result for a submission uuid; raises on HTTP error.
        response = self.client.get(f'https://urlscan.io/api/v1/result/{uuid}')
        response.raise_for_status()
        return response.json()

    def url_submit(self, capture_info: Dict[str, Any], visibility: str, force: bool=False) -> Dict:
        '''Lookup an URL on urlscan.io

        Note: force means 2 things:
            * (re)scan of the URL
            * re-fetch the object from urlscan.io even if we already did it today

        Note: the URL will only be submitted if autosubmit is set to true in the config
        '''
        if not self.available:
            raise ConfigError('UrlScan not available, probably no API key')

        url_storage_dir = self.__get_cache_directory(
            capture_info['url'],
            capture_info['user_agent'],
            capture_info['referer']) / 'submit'
        url_storage_dir.mkdir(parents=True, exist_ok=True)
        urlscan_file_submit = url_storage_dir / date.today().isoformat()

        if urlscan_file_submit.exists():
            if not force:
                with urlscan_file_submit.open('r') as _f:
                    return json.load(_f)
            # NOTE(review): when the cache exists AND force is set, we fall
            # through to the error return below even if autosubmit is on —
            # confirm this is intended.
        elif self.autosubmit:
            # submit is allowed and we either force it, or it's just allowed
            try:
                response = self.__submit_url(capture_info['url'],
                                             capture_info['user_agent'],
                                             capture_info['referer'],
                                             visibility)
            except requests.exceptions.HTTPError as e:
                return {'error': e}
            with urlscan_file_submit.open('w') as _f:
                json.dump(response, _f)
            return response
        return {'error': 'Submitting is not allowed by the configuration'}

    def url_result(self, capture_info: Dict[str, Any]):
        '''Get the result from a submission.'''
        submission = self.get_url_submission(capture_info)
        if submission and 'uuid' in submission:
            uuid = submission['uuid']
            if (self.storage_dir_urlscan / f'{uuid}.json').exists():
                with (self.storage_dir_urlscan / f'{uuid}.json').open() as _f:
                    return json.load(_f)
            try:
                result = self.__url_result(uuid)
            except requests.exceptions.HTTPError as e:
                return {'error': e}
            with (self.storage_dir_urlscan / f'{uuid}.json').open('w') as _f:
                json.dump(result, _f)
            return result
        return {'error': 'Submission incomplete or unavailable.'}