2018-07-13 14:51:00 +02:00
|
|
|
#!/usr/bin/env python3
|
2018-03-12 15:29:18 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2018-03-20 21:44:46 +01:00
|
|
|
import aiohttp
|
2018-03-12 15:29:18 +01:00
|
|
|
from dateutil import parser
|
|
|
|
from datetime import datetime, date
|
|
|
|
from hashlib import sha512 # Faster than sha256 on 64b machines.
|
|
|
|
from pathlib import Path
|
|
|
|
import logging
|
|
|
|
from pid import PidFile, PidFileError
|
|
|
|
import json
|
|
|
|
|
2018-03-29 22:37:28 +02:00
|
|
|
from .libs.helpers import safe_create_dir, set_running, unset_running
|
2018-03-12 15:29:18 +01:00
|
|
|
|
|
|
|
|
|
|
|
class Fetcher():

    def __init__(self, config_file: Path, storage_directory: Path,
                 loglevel: int=logging.DEBUG):
        '''Load `config_file`, and store the fetched data into `storage_directory`

        Note: if the `config_file` does not provide a URL (the file is
        gathered by some other mean), the fetcher is automatically stopped.'''
        with open(config_file, 'r') as f:
            module_parameters = json.load(f)
        self.vendor = module_parameters['vendor']
        self.listname = module_parameters['name']
        self.__init_logger(loglevel)
        # False means there is nothing to fetch for this module.
        self.fetcher = True
        if 'url' not in module_parameters:
            self.logger.info('No URL to fetch, breaking.')
            self.fetcher = False
            return
        self.url = module_parameters['url']
        self.logger.debug(f'Starting fetcher on {self.url}')
        # Directory layout: <storage>/<vendor>/<listname>/{meta,archive,<new files>}
        self.directory = storage_directory / self.vendor / self.listname
        safe_create_dir(self.directory)
        self.meta = self.directory / 'meta'
        safe_create_dir(self.meta)
        self.archive_dir = self.directory / 'archive'
        safe_create_dir(self.archive_dir)
        self.first_fetch = True

    def __init_logger(self, loglevel):
        '''Create a logger dedicated to this vendor/list pair.'''
        self.logger = logging.getLogger(f'{self.__class__.__name__}-{self.vendor}-{self.listname}')
        self.logger.setLevel(loglevel)

    async def __get_last_modified(self):
        '''HEAD `self.url` and return its `Last-Modified` header parsed as a
        datetime, or None if the server does not send that header.'''
        async with aiohttp.ClientSession() as session:
            async with session.head(self.url) as r:
                headers = r.headers
                if 'Last-Modified' in headers:
                    return parser.parse(headers['Last-Modified'])
                return None

    async def __newer(self):
        '''Check if the file available for download is newer than the one
        already downloaded by checking the `Last-Modified` header.

        Note: return False if the file containing the last header content
        is not existing, or the header doesn't have this key.
        '''
        last_modified_path = self.meta / 'lastmodified'
        if not last_modified_path.exists():
            # The file doesn't exist
            if not self.first_fetch:
                # The URL has no Last-Modified header, we cannot use it.
                self.logger.debug('No Last-Modified header available')
                return True
            self.first_fetch = False
            last_modified = await self.__get_last_modified()
            if last_modified:
                self.logger.debug('Last-Modified header available')
                with last_modified_path.open('w') as f:
                    f.write(last_modified.isoformat())
            else:
                self.logger.debug('No Last-Modified header available')
            # First fetch ever: always download the file once.
            return True
        with last_modified_path.open() as f:
            file_content = f.read()
            last_modified_file = parser.parse(file_content)
        last_modified = await self.__get_last_modified()
        if not last_modified:
            # No more Last-Modified header Oo
            self.logger.warning(f'{self.listname}: Last-Modified header was present, isn\'t anymore!')
            last_modified_path.unlink()
            return True
        if last_modified > last_modified_file:
            self.logger.info('Got a new file.')
            with last_modified_path.open('w') as f:
                f.write(last_modified.isoformat())
            return True
        return False

    def __same_as_last(self, downloaded):
        '''Figure out the last downloaded file, check if it is the same as the
        newly downloaded one. Returns true if both files have been downloaded the
        same day.

        Note: we check the new and the archive directory because we may have backlog
        and the newest file is always the first one we process
        '''
        to_check = []
        to_check_new = sorted([f for f in self.directory.iterdir() if f.is_file()])
        if to_check_new:
            # we have files waiting to be processed
            self.logger.debug(f'{len(to_check_new)} file(s) are waiting to be processed')
            to_check.append(to_check_new[-1])
        to_check_archive = sorted([f for f in self.archive_dir.iterdir() if f.is_file()])
        if to_check_archive:
            # we have files already processed, in the archive
            self.logger.debug(f'{len(to_check_archive)} file(s) have been processed')
            to_check.append(to_check_archive[-1])
        if not to_check:
            self.logger.debug('New list, no historical files')
            # nothing has been downloaded ever, moving on
            return False
        dl_hash = sha512(downloaded)
        for last_file in to_check:
            with last_file.open('rb') as f:
                last_hash = sha512(f.read())
            # Filenames are ISO timestamps (see fetch_list), so the previous
            # download date can be recovered from the file name.
            if (dl_hash.digest() == last_hash.digest() and
                    parser.parse(last_file.name.split('.')[0]).date() == date.today()):
                self.logger.debug('Same file already downloaded today.')
                return True
        return False

    async def fetch_list(self):
        '''Fetch & store the list'''
        if not self.fetcher:
            return
        set_running(f'{self.__class__.__name__}-{self.vendor}-{self.listname}')
        try:
            # The pid file prevents two concurrent fetches of the same list.
            with PidFile(f'{self.listname}.pid', piddir=self.meta):
                if not await self.__newer():
                    return
                async with aiohttp.ClientSession() as session:
                    async with session.get(self.url) as r:
                        content = await r.content.read()
                if self.__same_as_last(content):
                    return
                # NOTE: '\\o' was previously written as the invalid escape
                # '\o'; the logged text is unchanged.
                self.logger.info('Got a new file \\o/')
                with (self.directory / f'{datetime.now().isoformat()}.txt').open('wb') as f:
                    f.write(content)
        except PidFileError:
            self.logger.info('Fetcher already running')
        finally:
            # Single cleanup point: the inline unset_running calls that used to
            # sit on the early-return paths ran *in addition* to this finally
            # clause, unsetting the marker twice. The finally covers every exit.
            unset_running(f'{self.__class__.__name__}-{self.vendor}-{self.listname}')
|