BGP-Ranking/bgpranking/shadowserverfetcher.py

172 lines
7.5 KiB
Python

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import aiohttp
import logging
from bs4 import BeautifulSoup
from dateutil.parser import parse
from typing import Tuple
from datetime import datetime, date
from pathlib import Path
from .libs.helpers import safe_create_dir, set_running, unset_running
try:
import simplejson as json
except ImportError:
import json
class ShadowServerFetcher():
def __init__(self, user, password, config_path_modules: Path, storage_directory: Path,
loglevel: int=logging.DEBUG) -> None:
self.__init_logger(loglevel)
self.storage_directory = storage_directory
self.config_path_modules = config_path_modules
self.user = user
self.password = password
self.index_page = 'https://dl.shadowserver.org/reports/index.php'
self.vendor = 'shadowserver'
self.known_list_types = ('blacklist', 'botnet', 'cc', 'cisco', 'cwsandbox', 'drone',
'microsoft', 'scan', 'sinkhole6', 'sinkhole', 'outdated',
'compromised', 'hp', 'darknet', 'ddos')
self.first_available_day = None
self.last_available_day = None
self.available_entries = {}
def __init_logger(self, loglevel):
self.logger = logging.getLogger(f'{self.__class__.__name__}')
self.logger.setLevel(loglevel)
async def __get_index(self):
auth_details = {'user': self.user, 'password': self.password, 'login': 'Login'}
async with aiohttp.ClientSession() as s:
self.logger.debug('Fetching the index.')
async with s.post(self.index_page, data=auth_details) as r:
return await r.text()
async def __build_daily_dict(self):
html_index = await self.__get_index()
soup = BeautifulSoup(html_index, 'html.parser')
treeview = soup.find(id='treemenu1')
for y in treeview.select(':scope > li'):
year = y.contents[0]
for m in y.contents[1].select(':scope > li'):
month = m.contents[0]
for d in m.contents[1].select(':scope > li'):
day = d.contents[0]
date = parse(f'{year} {month} {day}').date()
self.available_entries[date.isoformat()] = []
for a in d.contents[1].find_all('a', href=True):
if not self.first_available_day:
self.first_available_day = date
self.last_available_day = date
self.available_entries[date.isoformat()].append((a['href'], a.string))
self.logger.debug('Dictionary created.')
def __normalize_day(self, day: Tuple[str, date, datetime]=None) -> str:
if not day:
if not self.last_available_day:
raise Exception('Unable to figure out the last available day. You need to run build_daily_dict first')
day = self.last_available_day
else:
if isinstance(day, str):
day = parse(day).date()
elif isinstance(day, datetime):
day = day.date()
return day.isoformat()
def __split_name(self, name):
type_content, country, list_type = name.split('-')
if '_' in type_content:
type_content, details_type = type_content.split('_', maxsplit=1)
if '_' in details_type:
details_type, sub = details_type.split('_', maxsplit=1)
return list_type, country, (type_content, details_type, sub)
return list_type, country, (type_content, details_type)
return list_type, country, (type_content)
def __check_config(self, filename: str) -> Path:
self.logger.debug(f'Working on config for {filename}.')
config = {'vendor': 'shadowserver', 'parser': '.parsers.shadowserver'}
type_content, _, type_details = self.__split_name(filename)
prefix = type_content.split('.')[0]
if isinstance(type_details, str):
main_type = type_details
config['name'] = '{}-{}'.format(prefix, type_details)
else:
main_type = type_details[0]
config['name'] = '{}-{}'.format(prefix, '_'.join(type_details))
if main_type not in self.known_list_types:
self.logger.warning(f'Unknown type: {main_type}. Please update the config creator script.')
return None
if main_type == 'blacklist':
config['impact'] = 5
elif main_type == 'botnet':
config['impact'] = 2
elif main_type == 'cc':
config['impact'] = 5
elif main_type == 'cisco':
config['impact'] = 3
elif main_type == 'cwsandbox':
config['impact'] = 5
elif main_type == 'drone':
config['impact'] = 2
elif main_type == 'microsoft':
config['impact'] = 3
elif main_type == 'scan':
config['impact'] = 1
elif main_type == 'sinkhole6':
config['impact'] = 2
elif main_type == 'sinkhole':
config['impact'] = 2
else:
config['impact'] = 1
if not (self.config_path_modules / f"{config['vendor']}_{config['name']}.json").exists():
self.logger.debug(f'Creating config file for {filename}.')
with open(self.config_path_modules / f"{config['vendor']}_{config['name']}.json", 'w') as f:
json.dump(config, f, indent=2)
else:
with open(self.config_path_modules / f"{config['vendor']}_{config['name']}.json", 'r') as f:
# Validate new config file with old
config_current = json.load(f)
if config_current != config:
self.logger.warning('The config file created by this script is different from the one on disk: \n{}\n{}'.format(json.dumps(config), json.dumps(config_current)))
# Init list directory
directory = self.storage_directory / config['vendor'] / config['name']
safe_create_dir(directory)
meta = directory / 'meta'
safe_create_dir(meta)
archive_dir = directory / 'archive'
safe_create_dir(archive_dir)
self.logger.debug(f'Done with config for {filename}.')
return directory
async def download_daily_entries(self, day: Tuple[str, date, datetime]=None):
set_running(f'{self.__class__.__name__}')
await self.__build_daily_dict()
for url, filename in self.available_entries[self.__normalize_day(day)]:
storage_dir = self.__check_config(filename)
if not storage_dir:
continue
# Check if the file we're trying to download has already been downloaded. Skip if True.
uuid = url.split('/')[-1]
if (storage_dir / 'meta' / 'last_download').exists():
with open(storage_dir / 'meta' / 'last_download') as f:
last_download_uuid = f.read()
if last_download_uuid == uuid:
self.logger.debug(f'Already downloaded: {url}.')
continue
async with aiohttp.ClientSession() as s:
async with s.get(url) as r:
self.logger.info(f'Downloading {url}.')
content = await r.content.read()
with (storage_dir / '{}.txt'.format(datetime.now().isoformat())).open('wb') as f:
f.write(content)
with open(storage_dir / 'meta' / 'last_download', 'w') as f:
f.write(uuid)
unset_running(f'{self.__class__.__name__}')