BGP-Ranking/bin/ssfetcher.py

219 lines
9.1 KiB
Python
Raw Normal View History

2018-07-13 14:51:00 +02:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import logging
2021-12-06 14:30:08 +01:00
from logging import Logger
import json
2018-07-13 14:51:00 +02:00
import asyncio
2021-12-06 14:30:08 +01:00
from typing import Tuple, Dict, List, Optional, TypeVar, Any
from datetime import datetime, date
2018-07-13 14:51:00 +02:00
from pathlib import Path
2021-12-06 14:30:08 +01:00
2018-07-13 14:51:00 +02:00
import aiohttp
2021-12-06 14:30:08 +01:00
from bs4 import BeautifulSoup # type: ignore
from dateutil.parser import parse
from bgpranking.default import AbstractManager, get_homedir, safe_create_dir
from bgpranking.helpers import get_data_dir, get_modules_dir
2018-07-13 14:51:00 +02:00
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
2021-12-17 10:54:05 +01:00
level=logging.INFO)
2018-07-13 14:51:00 +02:00
2021-12-06 14:30:08 +01:00
Dates = TypeVar('Dates', datetime, date, str)
class ShadowServerFetcher():
def __init__(self, user, password, logger: Logger) -> None:
self.logger = logger
self.storage_directory = get_data_dir()
self.config_path_modules = get_modules_dir()
self.user = user
self.password = password
self.index_page = 'https://dl.shadowserver.org/reports/index.php'
self.vendor = 'shadowserver'
self.known_list_types = ('blacklist', 'blocklist', 'botnet', 'cc', 'cisco', 'cwsandbox',
2022-01-04 14:15:59 +01:00
'device', 'drone', 'event4', 'malware', 'scan6', 'event6', 'netis',
2021-12-06 14:30:08 +01:00
'microsoft', 'scan', 'sinkhole6', 'sinkhole', 'outdated',
'compromised', 'hp', 'darknet', 'ddos')
self.first_available_day: Optional[date] = None
2021-12-06 14:30:08 +01:00
self.last_available_day: date
self.available_entries: Dict[str, List[Tuple[str, str]]] = {}
async def __get_index(self):
auth_details = {'user': self.user, 'password': self.password, 'login': 'Login'}
async with aiohttp.ClientSession() as s:
self.logger.debug('Fetching the index.')
async with s.post(self.index_page, data=auth_details) as r:
return await r.text()
async def __build_daily_dict(self):
html_index = await self.__get_index()
soup = BeautifulSoup(html_index, 'html.parser')
treeview = soup.find(id='treemenu1')
for y in treeview.select(':scope > li'):
year = y.contents[0]
for m in y.contents[1].select(':scope > li'):
month = m.contents[0]
for d in m.contents[1].select(':scope > li'):
day = d.contents[0]
date = parse(f'{year} {month} {day}').date()
self.available_entries[date.isoformat()] = []
for a in d.contents[1].find_all('a', href=True):
if not self.first_available_day:
self.first_available_day = date
self.last_available_day = date
self.available_entries[date.isoformat()].append((a['href'], a.string))
self.logger.debug('Dictionary created.')
def __normalize_day(self, day: Optional[Dates]=None) -> str:
if not day:
if not self.last_available_day:
raise Exception('Unable to figure out the last available day. You need to run build_daily_dict first')
to_return = self.last_available_day
else:
if isinstance(day, str):
to_return = parse(day).date()
elif isinstance(day, datetime):
to_return = day.date()
return to_return.isoformat()
def __split_name(self, name):
type_content, country, list_type = name.split('-')
if '_' in type_content:
type_content, details_type = type_content.split('_', maxsplit=1)
if '_' in details_type:
details_type, sub = details_type.split('_', maxsplit=1)
return list_type, country, (type_content, details_type, sub)
return list_type, country, (type_content, details_type)
return list_type, country, (type_content)
def __check_config(self, filename: str) -> Optional[Path]:
self.logger.debug(f'Working on config for {filename}.')
config: Dict[str, Any] = {'vendor': 'shadowserver', 'parser': '.parsers.shadowserver'}
type_content, _, type_details = self.__split_name(filename)
prefix = type_content.split('.')[0]
if isinstance(type_details, str):
main_type = type_details
config['name'] = '{}-{}'.format(prefix, type_details)
else:
main_type = type_details[0]
config['name'] = '{}-{}'.format(prefix, '_'.join(type_details))
if main_type not in self.known_list_types:
self.logger.warning(f'Unknown type: {main_type}. Please update the config creator script.')
return None
if main_type == 'blacklist':
config['impact'] = 5
elif main_type == 'blocklist':
config['impact'] = 5
2021-12-06 14:30:08 +01:00
elif main_type == 'botnet':
config['impact'] = 2
elif main_type == 'malware':
config['impact'] = 2
2021-12-06 14:30:08 +01:00
elif main_type == 'cc':
config['impact'] = 5
elif main_type == 'cisco':
config['impact'] = 3
elif main_type == 'cwsandbox':
config['impact'] = 5
elif main_type == 'drone':
config['impact'] = 2
elif main_type == 'microsoft':
config['impact'] = 3
elif main_type == 'scan':
config['impact'] = 1
elif main_type == 'scan6':
config['impact'] = 1
2021-12-06 14:30:08 +01:00
elif main_type == 'sinkhole6':
config['impact'] = 2
elif main_type == 'sinkhole':
config['impact'] = 2
elif main_type == 'device':
config['impact'] = 1
elif main_type == 'event4':
config['impact'] = 2
2022-01-04 14:15:59 +01:00
elif main_type == 'event6':
config['impact'] = 2
elif main_type == 'netis':
config['impact'] = 2
2021-12-06 14:30:08 +01:00
else:
config['impact'] = 1
if not (self.config_path_modules / f"{config['vendor']}_{config['name']}.json").exists():
self.logger.debug(f'Creating config file for {filename}.')
with open(self.config_path_modules / f"{config['vendor']}_{config['name']}.json", 'w') as f:
json.dump(config, f, indent=2)
else:
with open(self.config_path_modules / f"{config['vendor']}_{config['name']}.json", 'r') as f:
# Validate new config file with old
config_current = json.load(f)
if config_current != config:
self.logger.warning('The config file created by this script is different from the one on disk: \n{}\n{}'.format(json.dumps(config), json.dumps(config_current)))
# Init list directory
directory = self.storage_directory / config['vendor'] / config['name']
safe_create_dir(directory)
meta = directory / 'meta'
safe_create_dir(meta)
archive_dir = directory / 'archive'
safe_create_dir(archive_dir)
self.logger.debug(f'Done with config for {filename}.')
return directory
async def download_daily_entries(self, day: Optional[Dates]=None):
await self.__build_daily_dict()
for url, filename in self.available_entries[self.__normalize_day(day)]:
storage_dir = self.__check_config(filename)
if not storage_dir:
continue
# Check if the file we're trying to download has already been downloaded. Skip if True.
uuid = url.split('/')[-1]
if (storage_dir / 'meta' / 'last_download').exists():
with open(storage_dir / 'meta' / 'last_download') as _fr:
last_download_uuid = _fr.read()
if last_download_uuid == uuid:
self.logger.debug(f'Already downloaded: {url}.')
continue
async with aiohttp.ClientSession() as s:
async with s.get(url) as r:
self.logger.info(f'Downloading {url}.')
content = await r.content.read()
with (storage_dir / f'{datetime.now().isoformat()}.txt').open('wb') as _fw:
_fw.write(content)
with (storage_dir / 'meta' / 'last_download').open('w') as _fwt:
_fwt.write(uuid)
2018-07-13 14:51:00 +02:00
class ShadowServerManager(AbstractManager):
2021-12-06 14:30:08 +01:00
def __init__(self, loglevel: int=logging.INFO):
2018-07-13 14:51:00 +02:00
super().__init__(loglevel)
2021-12-06 14:30:08 +01:00
self.script_name = 'shadowserver_fetcher'
shadow_server_config_file = get_homedir() / 'config' / 'shadowserver.json'
2018-07-13 14:51:00 +02:00
self.config = True
2021-12-06 14:30:08 +01:00
if not shadow_server_config_file.exists():
2018-07-13 14:51:00 +02:00
self.config = False
2021-12-06 14:30:08 +01:00
self.logger.warning(f'No config file available {shadow_server_config_file}, the shadow server module will not be launched.')
2018-07-13 14:51:00 +02:00
return
2021-12-06 14:30:08 +01:00
with shadow_server_config_file.open() as f:
2018-07-13 14:51:00 +02:00
ss_config = json.load(f)
2021-12-06 14:30:08 +01:00
self.fetcher = ShadowServerFetcher(ss_config['user'], ss_config['password'], self.logger)
2018-07-13 14:51:00 +02:00
2021-12-06 14:30:08 +01:00
async def _to_run_forever_async(self):
await self.fetcher.download_daily_entries()
2018-07-13 14:51:00 +02:00
2021-12-06 14:30:08 +01:00
def main():
2018-07-13 14:51:00 +02:00
modules_manager = ShadowServerManager()
if modules_manager.config:
2021-12-06 14:30:08 +01:00
asyncio.run(modules_manager.run_async(sleep_in_sec=3600))
if __name__ == '__main__':
main()