2018-03-12 15:29:18 +01:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
from datetime import datetime
|
|
|
|
from pathlib import Path
|
|
|
|
import logging
|
|
|
|
import json
|
|
|
|
import re
|
2018-04-05 14:36:01 +02:00
|
|
|
from redis import StrictRedis
|
2018-03-12 15:29:18 +01:00
|
|
|
from uuid import uuid4
|
|
|
|
from io import BytesIO
|
|
|
|
import importlib
|
|
|
|
|
2018-07-13 14:51:00 +02:00
|
|
|
from typing import List, Union, Tuple
|
2018-03-12 15:29:18 +01:00
|
|
|
import types
|
|
|
|
|
2018-04-05 14:36:01 +02:00
|
|
|
from .libs.helpers import safe_create_dir, set_running, unset_running, get_socket_path
|
2018-03-12 15:29:18 +01:00
|
|
|
|
|
|
|
|
|
|
|
class RawFilesParser():
    '''Parse the raw files stored for one vendor/list pair and queue every
    entry found in them into the Redis intake database.

    Files are read from ``<storage_directory>/<vendor>/<name>``. A file that
    was processed is moved to the ``archive`` sub-directory; a file whose
    processing raised is moved to the ``unparsable`` sub-directory.
    '''

    def __init__(self, config_file: Path, storage_directory: Path,
                 loglevel: int=logging.DEBUG) -> None:
        '''Load the module configuration and prepare directories, logger and
        the Redis connection.

        :param config_file: JSON file with at least the keys ``vendor`` and
            ``name``, and optionally ``parser`` (module path, resolved
            relative to the ``bgpranking`` package) exposing a
            ``parse_raw_file`` function.
        :param storage_directory: Root directory under which the
            ``<vendor>/<name>`` directory lives.
        :param loglevel: Level set on the instance logger.
        '''
        with open(config_file, 'r') as f:
            module_parameters = json.load(f)
        self.vendor = module_parameters['vendor']
        self.listname = module_parameters['name']
        if 'parser' in module_parameters:
            # A list-specific parser replaces the generic parse_raw_file. It is
            # bound to this instance so it receives `self` like a real method.
            self.parse_raw_file = types.MethodType(importlib.import_module(module_parameters['parser'], 'bgpranking').parse_raw_file, self)
        self.source = f'{self.vendor}-{self.listname}'
        self.directory = storage_directory / self.vendor / self.listname
        safe_create_dir(self.directory)
        self.unparsable_dir = self.directory / 'unparsable'
        safe_create_dir(self.unparsable_dir)
        self.__init_logger(loglevel)
        self.redis_intake = StrictRedis(unix_socket_path=get_socket_path('intake'), db=0)
        self.logger.debug(f'Starting intake on {self.source}')

    def __init_logger(self, loglevel) -> None:
        '''Create the logger named ``<class>-<vendor>-<listname>`` and set its level.'''
        self.logger = logging.getLogger(f'{self.__class__.__name__}-{self.vendor}-{self.listname}')
        self.logger.setLevel(loglevel)

    @property
    def files_to_parse(self) -> List[Path]:
        '''Regular files directly inside the list directory, sorted in
        descending path order. Sub-directories are ignored.'''
        return sorted([f for f in self.directory.iterdir() if f.is_file()], reverse=True)

    def extract_ipv4(self, bytestream: bytes) -> List[bytes]:
        '''Return every dotted-quad looking token found in bytestream.

        The pattern only checks the shape (four dot-separated digit groups),
        it does not validate that each group is in the 0-255 range.
        '''
        return re.findall(rb'[0-9]+(?:\.[0-9]+){3}', bytestream)

    def strip_leading_zeros(self, ips: List[bytes]) -> List[bytes]:
        '''Helper to get rid of leading 0s in an IP list.
        Only run it when needed, it is nasty and slow'''
        return ['.'.join(str(int(part)) for part in ip.split(b'.')).encode() for ip in ips]

    def parse_raw_file(self, f: BytesIO) -> List[bytes]:
        '''Default parser: extract all the IPv4 looking tokens of the file.

        Side effect: sets ``self.datetime``, which parse_raw_files uses as the
        timestamp of every entry that does not carry its own.
        '''
        # If the list doesn't provide a time, fallback to current day, midnight
        self.datetime = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        return self.extract_ipv4(f.getvalue())

    def parse_raw_files(self) -> None:
        '''Parse every pending file and push its entries to the intake queue.

        For each entry a hash ``{ip, source, datetime}`` is stored under a
        fresh UUID key, and that key is added to the ``intake`` set. All the
        entries of one file are sent in a single pipeline, then the file is
        archived.

        The first exception stops the run: it is logged and the file being
        processed (if any) is moved to the unparsable directory. Remaining
        files stay in place.
        '''
        running_key = f'{self.__class__.__name__}-{self.source}'
        set_running(running_key)
        # Stays None until a file is picked, so the error handler never
        # references an unbound name when the failure happens before the loop.
        filepath = None
        try:
            nb_unparsable_files = sum(1 for f in self.unparsable_dir.iterdir() if f.is_file())
            if nb_unparsable_files:
                self.logger.warning(f'Was unable to parse {nb_unparsable_files} files.')
            # List the directory once: each processed file is moved away, so
            # the number of files left is derived from the position.
            pending = self.files_to_parse
            for position, filepath in enumerate(pending):
                self.logger.debug('Parsing {}, {} to go.'.format(filepath, len(pending) - position - 1))
                with open(filepath, 'rb') as f:
                    to_parse = BytesIO(f.read())
                p = self.redis_intake.pipeline()
                for entry in self.parse_raw_file(to_parse):
                    # A parser may return (ip, datetime) tuples when the list
                    # provides a per-entry timestamp.
                    if isinstance(entry, tuple):
                        ip, entry_datetime = entry
                    else:
                        ip, entry_datetime = entry, self.datetime
                    # Explicit string: recent redis clients refuse UUID objects.
                    uuid = str(uuid4())
                    p.hmset(uuid, {'ip': ip, 'source': self.source,
                                   'datetime': entry_datetime.isoformat()})
                    p.sadd('intake', uuid)
                p.execute()
                self._archive(filepath)
        except Exception:
            self.logger.exception("That didn't go well")
            if filepath is not None:
                self._unparsable(filepath)
        finally:
            unset_running(running_key)

    def _archive(self, filepath: Path) -> None:
        '''After processing, move file to the archive directory'''
        archive_dir = self.directory / 'archive'
        # Nothing in this class creates the directory, make sure it is there
        # so a processed file is not reported as unparsable.
        archive_dir.mkdir(parents=True, exist_ok=True)
        filepath.rename(archive_dir / filepath.name)

    def _unparsable(self, filepath: Path) -> None:
        '''After a processing failure, move file to the unparsable directory'''
        filepath.rename(self.unparsable_dir / filepath.name)