2018-07-13 14:51:00 +02:00
|
|
|
#!/usr/bin/env python3
|
2018-03-12 15:29:18 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2018-03-20 21:44:46 +01:00
|
|
|
import aiohttp
|
2018-03-12 15:29:18 +01:00
|
|
|
from dateutil import parser
|
|
|
|
from datetime import datetime, date
|
|
|
|
from hashlib import sha512 # Faster than sha256 on 64b machines.
|
|
|
|
from pathlib import Path
|
|
|
|
import logging
|
|
|
|
from pid import PidFile, PidFileError
|
|
|
|
import json
|
|
|
|
|
2018-03-29 22:37:28 +02:00
|
|
|
from .libs.helpers import safe_create_dir, set_running, unset_running
|
2018-03-12 15:29:18 +01:00
|
|
|
|
|
|
|
|
|
|
|
class Fetcher():

    def __init__(self, config_file: Path, storage_directory: Path,
                 loglevel: int=logging.DEBUG):
        '''Load `config_file`, and store the fetched data into `storage_directory`

        Note: if the `config_file` does not provide a URL (the file is
        gathered by some other mean), the fetcher is automatically stopped.'''
        with open(config_file, 'r') as f:
            module_parameters = json.load(f)
        self.vendor = module_parameters['vendor']
        self.listname = module_parameters['name']
        self.__init_logger(loglevel)
        # False means there is nothing to fetch for this module.
        self.fetcher = True
        if 'url' not in module_parameters:
            self.logger.info('No URL to fetch, breaking.')
            self.fetcher = False
            return
        self.url = module_parameters['url']
        self.logger.debug(f'Starting fetcher on {self.url}')
        # Directory layout: <storage>/<vendor>/<listname>/{meta,archive,<new files>}
        self.directory = storage_directory / self.vendor / self.listname
        safe_create_dir(self.directory)
        self.meta = self.directory / 'meta'
        safe_create_dir(self.meta)
        self.archive_dir = self.directory / 'archive'
        safe_create_dir(self.archive_dir)
        self.first_fetch = True

    def __init_logger(self, loglevel):
        '''Create a logger dedicated to this vendor/list pair.'''
        self.logger = logging.getLogger(f'{self.__class__.__name__}-{self.vendor}-{self.listname}')
        self.logger.setLevel(loglevel)

    async def __get_last_modified(self):
        '''HEAD `self.url` and return its `Last-Modified` header parsed as a
        datetime, or None if the server does not send that header.'''
        async with aiohttp.ClientSession() as session:
            async with session.head(self.url) as r:
                headers = r.headers
                if 'Last-Modified' in headers:
                    return parser.parse(headers['Last-Modified'])
                return None

    async def __newer(self):
        '''Check if the file available for download is newer than the one
        already downloaded by checking the `Last-Modified` header.

        Note: return False if the file containing the last header content
        is not existing, or the header doesn't have this key.
        '''
        last_modified_path = self.meta / 'lastmodified'
        if not last_modified_path.exists():
            # The file doesn't exist
            if not self.first_fetch:
                # The URL has no Last-Modified header, we cannot use it.
                self.logger.debug('No Last-Modified header available')
                return True
            self.first_fetch = False
            last_modified = await self.__get_last_modified()
            if last_modified:
                self.logger.debug('Last-Modified header available')
                with last_modified_path.open('w') as f:
                    f.write(last_modified.isoformat())
            else:
                self.logger.debug('No Last-Modified header available')
            # First fetch ever: always download the file once.
            return True
        with last_modified_path.open() as f:
            file_content = f.read()
            last_modified_file = parser.parse(file_content)
        last_modified = await self.__get_last_modified()
        if not last_modified:
            # No more Last-Modified header Oo
            self.logger.warning(f'{self.listname}: Last-Modified header was present, isn\'t anymore!')
            last_modified_path.unlink()
            return True
        if last_modified > last_modified_file:
            self.logger.info('Got a new file.')
            with last_modified_path.open('w') as f:
                f.write(last_modified.isoformat())
            return True
        return False

    def __same_as_last(self, downloaded):
        '''Figure out the last downloaded file, check if it is the same as the
        newly downloaded one. Returns true if both files have been downloaded the
        same day.

        Note: we check the new and the archive directory because we may have backlog
        and the newest file is always the first one we process
        '''
        to_check = []
        to_check_new = sorted([f for f in self.directory.iterdir() if f.is_file()])
        if to_check_new:
            # we have files waiting to be processed
            self.logger.debug(f'{len(to_check_new)} file(s) are waiting to be processed')
            to_check.append(to_check_new[-1])
        to_check_archive = sorted([f for f in self.archive_dir.iterdir() if f.is_file()])
        if to_check_archive:
            # we have files already processed, in the archive
            self.logger.debug(f'{len(to_check_archive)} file(s) have been processed')
            to_check.append(to_check_archive[-1])
        if not to_check:
            self.logger.debug('New list, no historical files')
            # nothing has been downloaded ever, moving on
            return False
        dl_hash = sha512(downloaded)
        for last_file in to_check:
            with last_file.open('rb') as f:
                last_hash = sha512(f.read())
            # Filenames are ISO timestamps (see fetch_list), so the previous
            # download date can be recovered from the file name.
            if (dl_hash.digest() == last_hash.digest() and
                    parser.parse(last_file.name.split('.')[0]).date() == date.today()):
                self.logger.debug('Same file already downloaded today.')
                return True
        return False

    async def fetch_list(self):
        '''Fetch & store the list'''
        if not self.fetcher:
            return
        set_running(f'{self.__class__.__name__}-{self.vendor}-{self.listname}')
        try:
            # The pid file prevents two concurrent fetches of the same list.
            with PidFile(f'{self.listname}.pid', piddir=self.meta):
                if not await self.__newer():
                    return
                async with aiohttp.ClientSession() as session:
                    async with session.get(self.url) as r:
                        content = await r.content.read()
                if self.__same_as_last(content):
                    return
                # NOTE: '\\o' was previously written as the invalid escape
                # '\o'; the logged text is unchanged.
                self.logger.info('Got a new file \\o/')
                with (self.directory / f'{datetime.now().isoformat()}.txt').open('wb') as f:
                    f.write(content)
        except PidFileError:
            self.logger.info('Fetcher already running')
        finally:
            # Single cleanup point: the inline unset_running calls that used to
            # sit on the early-return paths ran *in addition* to this finally
            # clause, unsetting the marker twice. The finally covers every exit.
            unset_running(f'{self.__class__.__name__}-{self.vendor}-{self.listname}')
|