chg: Make DNS resolution async

pull/946/head
Raphaël Vinot 2024-09-27 16:42:25 +02:00
parent 7a37751447
commit 6c8c183485
1 changed file with 33 additions and 9 deletions

@@ -2,6 +2,7 @@
 from __future__ import annotations

+import asyncio
 import contextlib
 import gzip
 import json
@@ -22,7 +23,8 @@ from pathlib import Path
 from typing import Any, MutableMapping, Iterator

 import dns.rdatatype
-import dns.resolver
+from dns.resolver import Cache
+from dns.asyncresolver import Resolver
 from har2tree import CrawledTree, Har2TreeError, HarFile
 from pyipasnhistory import IPASNHistory  # type: ignore[attr-defined]
 from redis import Redis
@@ -123,6 +125,12 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
         self.__cache: dict[str, CaptureCache] = OrderedDict()
         self._quick_init()
         self.timeout = get_config('generic', 'max_tree_create_time')
+        self.dnsresolver: Resolver = Resolver()
+        self.dnsresolver.cache = Cache(900)
+        self.dnsresolver.timeout = 4
+        self.dnsresolver.lifetime = 6
         try:
             self.ipasnhistory: IPASNHistory | None = IPASNHistory()
             if not self.ipasnhistory.is_up:
@@ -163,7 +171,7 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
                      or (cc.capture_dir / 'tree.pickle').exists())):
                 self.__cache[uuid] = cc
                 return self.__cache[uuid]
-        self.__cache[uuid] = self._set_capture_cache(capture_dir)
+        self.__cache[uuid] = asyncio.run(self._set_capture_cache(capture_dir))
         return self.__cache[uuid]

     def __iter__(self) -> Iterator[dict[str, CaptureCache]]:
@@ -256,7 +264,7 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
             raise MissingCaptureDirectory(f'UUID ({uuid}) linked to a missing directory ({capture_dir}).')
         raise MissingUUID(f'Unable to find UUID {uuid}.')

-    def _create_pickle(self, capture_dir: Path, logger: LookylooCacheLogAdapter) -> CrawledTree:
+    async def _create_pickle(self, capture_dir: Path, logger: LookylooCacheLogAdapter) -> CrawledTree:
         with (capture_dir / 'uuid').open() as f:
             uuid = f.read().strip()
@@ -281,7 +289,7 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
             default_recursion_limit = sys.getrecursionlimit()
             with self._timeout_context():
                 tree = CrawledTree(har_files, uuid)
-                self.__resolve_dns(tree, logger)
+                await self.__resolve_dns(tree, logger)
                 if self.contextualizer:
                     self.contextualizer.contextualize_tree(tree)
         except Har2TreeError as e:
@@ -338,7 +346,7 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
         else:
             yield

-    def _set_capture_cache(self, capture_dir_str: str) -> CaptureCache:
+    async def _set_capture_cache(self, capture_dir_str: str) -> CaptureCache:
         '''Populate the redis cache for a capture. Mostly used on the index page.
         NOTE: Doesn't require the pickle.'''
         capture_dir = Path(capture_dir_str)
@@ -360,7 +368,7 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
         except TreeNeedsRebuild:
             try:
                 logger.debug('The tree needs to be rebuilt.')
-                tree = self._create_pickle(capture_dir, logger)
+                tree = await self._create_pickle(capture_dir, logger)
                 # Force the reindexing in the public and full index (if enabled)
                 get_indexing().force_reindex(uuid)
                 if get_config('generic', 'index_everything'):
@@ -448,7 +456,7 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
             p.execute()
         return CaptureCache(cache)

-    def __resolve_dns(self, ct: CrawledTree, logger: LookylooCacheLogAdapter) -> CrawledTree:
+    async def __resolve_dns(self, ct: CrawledTree, logger: LookylooCacheLogAdapter) -> None:
         '''Resolves all domains of the tree, keeps A (IPv4), AAAA (IPv6), and CNAME entries
         and store them in ips.json and cnames.json, in the capture directory.
         Updates the nodes of the tree accordingly so the information is available.
@@ -468,6 +476,15 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
                 to_search = known_cnames[to_search]
             return cnames

+        async def _dns_query(hostname: str) -> None:
+            query_types = [dns.rdatatype.RdataType.A, dns.rdatatype.RdataType.AAAA]
+            # dns.rdatatype.RdataType.SOA, dns.rdatatype.RdataType.NS]
+            for qt in query_types:
+                try:
+                    await self.dnsresolver.resolve(hostname, qt, search=True, raise_on_no_answer=False)
+                except Exception as e:
+                    logger.warning(f'Unable to resolve DNS {hostname} - {qt}: {e}')
+
         cnames_path = ct.root_hartree.har.path.parent / 'cnames.json'
         ips_path = ct.root_hartree.har.path.parent / 'ips.json'
         ipasn_path = ct.root_hartree.har.path.parent / 'ipasn.json'
@@ -500,6 +517,14 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
             ipasn = {}

         _all_ips = set()
+        _all_hostnames = {node.name for node in ct.root_hartree.hostname_tree.traverse()
+                          if not getattr(node, 'hostname_is_ip', False)}
+        self.dnsresolver.cache.flush()
+        logger.info(f'Resolving DNS: {len(_all_hostnames)} hostnames.')
+        all_requests = [_dns_query(hostname) for hostname in _all_hostnames]
+        # Run all the requests, cache the answers, and let the rest of the code deal with them.
+        await asyncio.gather(*all_requests)
+        logger.info('Done resolving DNS.')
         for node in ct.root_hartree.hostname_tree.traverse():
             if 'hostname_is_ip' in node.features and node.hostname_is_ip:
                 continue
@@ -509,7 +534,7 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
             # Resolve and cache
             for query_type in [dns.rdatatype.RdataType.A, dns.rdatatype.RdataType.AAAA]:
                 try:
-                    response = dns.resolver.resolve(node.name, query_type, search=True, raise_on_no_answer=False)
+                    response = await self.dnsresolver.resolve(node.name, query_type, search=True, raise_on_no_answer=False)
                 except Exception as e:
                     logger.warning(f'Unable to resolve DNS: {e}')
                     continue
@@ -601,4 +626,3 @@ class CapturesIndex(Mapping):  # type: ignore[type-arg]
             json.dump(host_ips, f, default=serialize_sets)
         with ipasn_path.open('w') as f:
             json.dump(ipasn, f)
-        return ct
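
For context, the pattern this commit introduces — concurrent DNS fan-out with dnspython's async resolver, bridged into synchronous callers with asyncio.run() — reduces to the minimal standalone sketch below. The resolver settings (900-second cache, 4s timeout, 6s lifetime) mirror the values in the diff; the hostnames and the resolve_all() helper name are illustrative, not part of the commit.

import asyncio

import dns.rdatatype
from dns.asyncresolver import Resolver
from dns.resolver import Cache


async def resolve_all(hostnames: set[str]) -> None:
    # One resolver shared by every task; the attached Cache keeps the answers,
    # so later lookups for the same names are served from memory.
    resolver = Resolver()
    resolver.cache = Cache(900)  # cache cleaning interval, in seconds
    resolver.timeout = 4         # per-nameserver timeout, in seconds
    resolver.lifetime = 6        # total time budget per query, in seconds

    async def query(hostname: str) -> None:
        # Same query types as the commit: A (IPv4) and AAAA (IPv6).
        for rdtype in (dns.rdatatype.RdataType.A, dns.rdatatype.RdataType.AAAA):
            try:
                await resolver.resolve(hostname, rdtype, search=True,
                                       raise_on_no_answer=False)
            except Exception as e:
                print(f'Unable to resolve {hostname} - {rdtype}: {e}')

    # Fan out all queries concurrently instead of resolving one by one.
    await asyncio.gather(*(query(h) for h in hostnames))


# Synchronous code (like __getitem__ in the diff) can bridge in with
# asyncio.run(), which creates an event loop for the duration of the call.
asyncio.run(resolve_all({'example.com', 'example.org'}))  # hypothetical hostnames

The trade-off visible in the diff: __getitem__ stays synchronous and pays for a fresh event loop on each cache miss, but the per-capture DNS queries, previously issued sequentially through dns.resolver, now run concurrently and populate the shared cache for the per-node resolution loop that follows.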