BGP-Ranking/bin/sanitizer.py

155 lines
5.6 KiB
Python
Raw Normal View History

2018-07-13 14:51:00 +02:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
2021-12-06 14:30:08 +01:00
import ipaddress
import logging
2021-12-06 14:30:08 +01:00
import time
2018-03-30 14:33:33 +02:00
2021-12-06 14:30:08 +01:00
from datetime import timezone
from typing import Optional, List, Dict
2021-12-06 14:30:08 +01:00
from dateutil import parser
from redis import Redis
import requests
from bgpranking.default import AbstractManager, get_socket_path
from bgpranking.helpers import get_ipasn, sanity_check_ipasn
logging.basicConfig(format='%(asctime)s %(name)s %(levelname)s:%(message)s',
2021-12-17 10:54:05 +01:00
level=logging.INFO)
2021-12-06 14:30:08 +01:00
class Sanitizer(AbstractManager):
2021-12-06 14:30:08 +01:00
def __init__(self, loglevel: int=logging.INFO):
2018-03-30 14:33:33 +02:00
super().__init__(loglevel)
2021-12-06 14:30:08 +01:00
self.script_name = 'sanitizer'
self.redis_intake = Redis(unix_socket_path=get_socket_path('intake'), db=0, decode_responses=True)
self.redis_sanitized = Redis(unix_socket_path=get_socket_path('prepare'), db=0, decode_responses=True)
self.ipasn = get_ipasn()
self.logger.debug('Starting import')
def _sanitize_ip(self, pipeline: Redis, uuid: str, data: Dict) -> Optional[Dict]:
try:
ip = ipaddress.ip_address(data['ip'])
if isinstance(ip, ipaddress.IPv6Address):
address_family = 'v6'
else:
address_family = 'v4'
except ValueError:
self.logger.info(f"Invalid IP address: {data['ip']}")
return None
except KeyError:
self.logger.info(f"Invalid entry {data}")
return None
if not ip.is_global:
self.logger.info(f"The IP address {data['ip']} is not global")
return None
datetime = parser.parse(data['datetime'])
if datetime.tzinfo:
# Make sure the datetime isn't TZ aware, and UTC.
datetime = datetime.astimezone(timezone.utc).replace(tzinfo=None)
# Add to temporay DB for further processing
pipeline.hmset(uuid, {'ip': str(ip), 'source': data['source'], 'address_family': address_family,
'date': datetime.date().isoformat(), 'datetime': datetime.isoformat()})
pipeline.sadd('to_insert', uuid)
return {'ip': str(ip), 'address_family': address_family,
'date': datetime.isoformat(), 'precision_delta': {'days': 3}}
def _sanitize_network(self, pipeline: Redis, uuid: str, data: Dict) -> List[Dict]:
try:
network = ipaddress.ip_network(data['ip'])
if isinstance(network, ipaddress.IPv6Network):
address_family = 'v6'
else:
address_family = 'v4'
except ValueError:
self.logger.info(f"Invalid IP network: {data['ip']}")
return []
except KeyError:
self.logger.info(f"Invalid entry {data}")
return []
datetime = parser.parse(data['datetime'])
if datetime.tzinfo:
# Make sure the datetime isn't TZ aware, and UTC.
datetime = datetime.astimezone(timezone.utc).replace(tzinfo=None)
for_cache = []
for ip in network.hosts():
if not ip.is_global:
self.logger.info(f"The IP address {ip} is not global")
continue
# Add to temporay DB for further processing
pipeline.hmset(uuid, {'ip': str(ip), 'source': data['source'], 'address_family': address_family,
'date': datetime.date().isoformat(), 'datetime': datetime.isoformat()})
pipeline.sadd('to_insert', uuid)
for_cache.append({'ip': str(ip), 'address_family': address_family,
'date': datetime.isoformat(), 'precision_delta': {'days': 3}})
return for_cache
2021-12-06 14:30:08 +01:00
def sanitize(self):
ready, message = sanity_check_ipasn(self.ipasn)
if not ready:
# Try again later.
self.logger.warning(message)
return
self.logger.debug(message)
while True:
try:
if self.shutdown_requested() or not self.ipasn.is_up:
break
except requests.exceptions.ConnectionError:
# Temporary issue with ipasnhistory
self.logger.info('Temporary issue with ipasnhistory, trying again later.')
time.sleep(10)
continue
uuids: Optional[List[str]] = self.redis_intake.spop('intake', 100) # type: ignore
if not uuids:
break
for_cache = []
pipeline = self.redis_sanitized.pipeline(transaction=False)
for uuid in uuids:
data = self.redis_intake.hgetall(uuid)
if not data:
continue
if '/' in data['ip']:
entries_for_cache = self._sanitize_network(pipeline, uuid, data)
if entries_for_cache:
for_cache += entries_for_cache
else:
entry_for_cache = self._sanitize_ip(pipeline, uuid, data)
if entry_for_cache:
for_cache.append(entry_for_cache)
2021-12-06 14:30:08 +01:00
pipeline.execute()
self.redis_intake.delete(*uuids)
try:
# Just cache everything so the lookup scripts can do their thing.
self.ipasn.mass_cache(for_cache)
except Exception:
self.logger.info('Mass cache in IPASN History failed, trying again later.')
# Rollback the spop
self.redis_intake.sadd('intake', *uuids)
break
2018-03-30 14:33:33 +02:00
def _to_run_forever(self):
2021-12-06 14:30:08 +01:00
self.sanitize()
2021-12-06 14:30:08 +01:00
def main():
sanitizer = Sanitizer()
2018-03-30 14:33:33 +02:00
sanitizer.run(sleep_in_sec=120)
2021-12-06 14:30:08 +01:00
if __name__ == '__main__':
main()