#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import logging
from logging import Logger
import json
import asyncio

from typing import Tuple, Dict, List, Optional, TypeVar, Any
from datetime import datetime, date
from pathlib import Path

import aiohttp
from bs4 import BeautifulSoup  # type: ignore
from dateutil.parser import parse

from bgpranking.default import AbstractManager, get_homedir, safe_create_dir
from bgpranking.helpers import get_data_dir, get_modules_dir

logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
                    level=logging.INFO)

Dates = TypeVar('Dates', datetime, date, str)


class ShadowServerFetcher():
    """Fetch the daily report files listed on the Shadowserver index page.

    The fetcher logs in on the index page, builds a mapping of the reports
    available for each day, creates (or validates) one module config file per
    report type, and stores newly downloaded reports on disk.
    """

    # Impact written in the module config for each main list type. Any known
    # type that is not listed here gets an impact of 1.
    _impact_by_type: Dict[str, int] = {
        'blacklist': 5,
        'botnet': 2,
        'cc': 5,
        'cisco': 3,
        'cwsandbox': 5,
        'drone': 2,
        'microsoft': 3,
        'scan': 1,
        'sinkhole6': 2,
        'sinkhole': 2,
    }

    def __init__(self, user, password, logger: Logger) -> None:
        """Store the credentials and prepare the (still empty) index.

        :param user: login sent to the index page
        :param password: password sent to the index page
        :param logger: logger used for every message emitted by the fetcher
        """
        self.logger = logger
        self.storage_directory = get_data_dir()
        self.config_path_modules = get_modules_dir()
        self.user = user
        self.password = password
        self.index_page = 'https://dl.shadowserver.org/reports/index.php'
        self.vendor = 'shadowserver'
        self.known_list_types = ('blacklist', 'botnet', 'cc', 'cisco', 'cwsandbox', 'drone',
                                 'microsoft', 'scan', 'sinkhole6', 'sinkhole', 'outdated',
                                 'compromised', 'hp', 'darknet', 'ddos')
        # Both attributes must actually exist (a bare annotation does not
        # create them): they are tested for truthiness before being set.
        self.first_available_day: Optional[date] = None
        self.last_available_day: Optional[date] = None
        # ISO formatted day -> list of (url, link text) tuples.
        self.available_entries: Dict[str, List[Tuple[str, str]]] = {}

    async def __get_index(self):
        """POST the credentials to the index page and return the response body as text."""
        auth_details = {'user': self.user, 'password': self.password, 'login': 'Login'}
        async with aiohttp.ClientSession() as s:
            self.logger.debug('Fetching the index.')
            async with s.post(self.index_page, data=auth_details) as r:
                return await r.text()

    async def __build_daily_dict(self):
        """Parse the index page and (re)populate ``self.available_entries``.

        The page is expected to contain an element with id ``treemenu1``
        holding nested ``li`` elements for years, months and days; every link
        found under a day is recorded as a ``(href, link text)`` tuple.

        :raises Exception: if the ``treemenu1`` element is not in the page
        """
        html_index = await self.__get_index()
        soup = BeautifulSoup(html_index, 'html.parser')
        treeview = soup.find(id='treemenu1')
        if treeview is None:
            raise Exception('Unable to find the reports tree (id="treemenu1") in the index page.')
        for y in treeview.select(':scope > li'):
            year = y.contents[0]
            for m in y.contents[1].select(':scope > li'):
                month = m.contents[0]
                for d in m.contents[1].select(':scope > li'):
                    day = d.contents[0]
                    # Not named "date", that would shadow the imported class.
                    entry_date = parse(f'{year} {month} {day}').date()
                    iso_day = entry_date.isoformat()
                    self.available_entries[iso_day] = []
                    for a in d.contents[1].find_all('a', href=True):
                        if not self.first_available_day:
                            self.first_available_day = entry_date
                        self.last_available_day = entry_date
                        self.available_entries[iso_day].append((a['href'], a.string))
        self.logger.debug('Dictionary created.')

    def __normalize_day(self, day: Optional[Dates]=None) -> str:
        """Return *day* as an ISO formatted date string.

        :param day: a string understood by ``dateutil``, a ``datetime`` or a
                    ``date``. When empty, the last available day is used.
        :raises Exception: if no day is given and the index was not built yet
        :raises TypeError: if *day* is of an unsupported type
        """
        if not day:
            if not self.last_available_day:
                raise Exception('Unable to figure out the last available day. You need to run build_daily_dict first')
            return self.last_available_day.isoformat()
        if isinstance(day, str):
            return parse(day).date().isoformat()
        # datetime is a subclass of date, so it has to be tested first.
        if isinstance(day, datetime):
            return day.date().isoformat()
        if isinstance(day, date):
            return day.isoformat()
        raise TypeError(f'Unsupported type for day: {type(day).__name__}')

    def __split_name(self, name):
        """Split a report name of the form ``<content>-<country>-<list type>``.

        ``<content>`` is itself split on ``_`` into at most three parts.

        :return: ``(list_type, country, details)`` where ``details`` is the
                 bare content string when it contains no ``_``, and a tuple of
                 two or three strings otherwise.
        :raises ValueError: if *name* does not contain exactly two ``-``
        """
        type_content, country, list_type = name.split('-')
        if '_' in type_content:
            type_content, details_type = type_content.split('_', maxsplit=1)
            if '_' in details_type:
                details_type, sub = details_type.split('_', maxsplit=1)
                return list_type, country, (type_content, details_type, sub)
            return list_type, country, (type_content, details_type)
        # A plain string (not a 1-tuple): __check_config relies on that.
        return list_type, country, type_content

    def __check_config(self, filename: str) -> Optional[Path]:
        """Make sure a module config and the storage directories exist for *filename*.

        The config file is created when missing; when it already exists, a
        warning is logged if its content differs from the generated one.

        :param filename: report name, in the format expected by ``__split_name``
        :return: the storage directory for this report type, or ``None`` when
                 the main type is not in ``self.known_list_types``
        """
        self.logger.debug(f'Working on config for {filename}.')
        config: Dict[str, Any] = {'vendor': 'shadowserver', 'parser': '.parsers.shadowserver'}
        list_type, _, type_details = self.__split_name(filename)
        prefix = list_type.split('.')[0]

        if isinstance(type_details, str):
            main_type = type_details
            config['name'] = '{}-{}'.format(prefix, type_details)
        else:
            main_type = type_details[0]
            config['name'] = '{}-{}'.format(prefix, '_'.join(type_details))

        if main_type not in self.known_list_types:
            self.logger.warning(f'Unknown type: {main_type}. Please update the config creator script.')
            return None

        config['impact'] = self._impact_by_type.get(main_type, 1)

        config_file = self.config_path_modules / f"{config['vendor']}_{config['name']}.json"
        if not config_file.exists():
            self.logger.debug(f'Creating config file for {filename}.')
            with open(config_file, 'w') as f:
                json.dump(config, f, indent=2)
        else:
            with open(config_file, 'r') as f:
                # Validate new config file with old
                config_current = json.load(f)
                if config_current != config:
                    self.logger.warning('The config file created by this script is different from the one on disk: \n{}\n{}'.format(json.dumps(config), json.dumps(config_current)))

        # Init list directory
        directory = self.storage_directory / config['vendor'] / config['name']
        safe_create_dir(directory)
        meta = directory / 'meta'
        safe_create_dir(meta)
        archive_dir = directory / 'archive'
        safe_create_dir(archive_dir)
        self.logger.debug(f'Done with config for {filename}.')
        return directory

    async def download_daily_entries(self, day: Optional[Dates]=None):
        """Download every report listed for *day* that was not fetched yet.

        The index is rebuilt on every call. For each report, the last segment
        of its URL is compared with the content of ``meta/last_download`` in
        the storage directory; the report is skipped when they are equal.

        :param day: day to fetch, defaults to the last available day
        :raises KeyError: if the index has no entry for the requested day
        """
        await self.__build_daily_dict()
        for url, filename in self.available_entries[self.__normalize_day(day)]:
            storage_dir = self.__check_config(filename)
            if not storage_dir:
                continue
            # Check if the file we're trying to download has already been downloaded. Skip if True.
            uuid = url.split('/')[-1]
            last_download = storage_dir / 'meta' / 'last_download'
            if last_download.exists():
                with open(last_download) as _fr:
                    last_download_uuid = _fr.read()
                if last_download_uuid == uuid:
                    self.logger.debug(f'Already downloaded: {url}.')
                    continue
            async with aiohttp.ClientSession() as s:
                async with s.get(url) as r:
                    self.logger.info(f'Downloading {url}.')
                    content = await r.content.read()
                    with (storage_dir / f'{datetime.now().isoformat()}.txt').open('wb') as _fw:
                        _fw.write(content)
                    with last_download.open('w') as _fwt:
                        _fwt.write(uuid)


class ShadowServerManager(AbstractManager):
    """Manager wiring a ShadowServerFetcher to the credentials stored on disk.

    ``self.config`` is False when ``config/shadowserver.json`` is missing in
    the home directory; in that case no fetcher is created.
    """

    def __init__(self, loglevel: int=logging.INFO):
        super().__init__(loglevel)
        self.script_name = 'shadowserver_fetcher'
        shadow_server_config_file = get_homedir() / 'config' / 'shadowserver.json'
        self.config = True
        if not shadow_server_config_file.exists():
            self.config = False
            self.logger.warning(f'No config file available {shadow_server_config_file}, the shadow server module will not be launched.')
            return
        with shadow_server_config_file.open() as f:
            ss_config = json.load(f)
        self.fetcher = ShadowServerFetcher(ss_config['user'], ss_config['password'], self.logger)

    async def _to_run_forever_async(self):
        """Download the entries of the last available day."""
        # NOTE(review): presumably invoked periodically by
        # AbstractManager.run_async -- confirm against bgpranking.default.
        await self.fetcher.download_daily_entries()


def main():
    """Start the manager, unless the Shadowserver config file is missing."""
    modules_manager = ShadowServerManager()
    if modules_manager.config:
        asyncio.run(modules_manager.run_async(sleep_in_sec=3600))


if __name__ == '__main__':
    main()