# -*- coding: utf-8 -*-
# Copyright 2019 The Matrix.org Foundation C.I.C.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import random
import time

import attr

from twisted.internet import defer
from twisted.web.client import RedirectAgent, readBody
from twisted.web.http import stringToDatetime
from twisted.web.http_headers import Headers

from synapse.logging.context import make_deferred_yieldable
from synapse.util import Clock
from synapse.util.caches.ttlcache import TTLCache
from synapse.util.metrics import Measure

# period to cache .well-known results for by default
WELL_KNOWN_DEFAULT_CACHE_PERIOD = 24 * 3600

# jitter factor to add to the .well-known default cache ttls
WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER = 0.1

# period to cache failure to fetch .well-known for
WELL_KNOWN_INVALID_CACHE_PERIOD = 1 * 3600

# period to cache failure to fetch .well-known if there has recently been a
# valid well-known for that domain.
WELL_KNOWN_DOWN_CACHE_PERIOD = 2 * 60

# period to remember there was a valid well-known after valid record expires
WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID = 2 * 3600

# cap for .well-known cache period
WELL_KNOWN_MAX_CACHE_PERIOD = 48 * 3600

# lower bound for .well-known cache period
WELL_KNOWN_MIN_CACHE_PERIOD = 5 * 60

# Attempt to refetch a cached well-known N% of the TTL before it expires.
# e.g. if set to 0.2 and we have a cached entry with a TTL of 5mins, then
# we'll start trying to refetch 1 minute before it expires.
WELL_KNOWN_GRACE_PERIOD_FACTOR = 0.2

# Number of times we retry fetching a well-known for a domain we know recently
# had a valid entry.
WELL_KNOWN_RETRY_ATTEMPTS = 3


logger = logging.getLogger(__name__)


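# These module-level caches are shared between all WellKnownResolver
# instances, unless per-instance caches are passed to the constructor.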
_well_known_cache = TTLCache("well-known")
_had_valid_well_known_cache = TTLCache("had-valid-well-known")


@attr.s(slots=True, frozen=True)
class WellKnownLookupResult(object):
    delegated_server = attr.ib()


class WellKnownResolver(object):
    """Handles well-known lookups for Matrix servers."""
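    # A minimal usage sketch (illustrative only, assuming a plain
    # twisted.web.client.Agent; Synapse's federation agent passes in its own
    # agent and caches):
    #
    #   resolver = WellKnownResolver(reactor, Agent(reactor), b"Synapse")
    #   result = yield resolver.get_well_known(b"example.com")
    #   # result.delegated_server is the delegated server name as bytes,
    #   # or None if no valid delegation was found.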

    def __init__(
        self,
        reactor,
        agent,
        user_agent,
        well_known_cache=None,
        had_well_known_cache=None,
    ):
        self._reactor = reactor
        self._clock = Clock(reactor)

        if well_known_cache is None:
            well_known_cache = _well_known_cache

        if had_well_known_cache is None:
            had_well_known_cache = _had_valid_well_known_cache

        self._well_known_cache = well_known_cache
        self._had_valid_well_known_cache = had_well_known_cache
        self._well_known_agent = RedirectAgent(agent)
        self.user_agent = user_agent

    @defer.inlineCallbacks
    def get_well_known(self, server_name):
        """Attempt to fetch and parse a .well-known file for the given server

        Args:
            server_name (bytes): name of the server, from the requested URL

        Returns:
            Deferred[WellKnownLookupResult]: The result of the lookup
        """
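        # Check the cache first. We start refetching once a cached entry is
        # within the grace period of its expiry, so that a temporary failure
        # to refetch can fall back on the still-valid previous result below.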
        try:
            prev_result, expiry, ttl = self._well_known_cache.get_with_expiry(
                server_name
            )

            now = self._clock.time()
            if now < expiry - WELL_KNOWN_GRACE_PERIOD_FACTOR * ttl:
                return WellKnownLookupResult(delegated_server=prev_result)
        except KeyError:
            prev_result = None

        # TODO: should we linearise so that we don't end up doing two .well-known
        # requests for the same server in parallel?
        try:
            with Measure(self._clock, "get_well_known"):
                result, cache_period = yield self._fetch_well_known(server_name)

        except _FetchWellKnownFailure as e:
            if prev_result and e.temporary:
                # This is a temporary failure and we have a still-valid cached
                # result, so let's return that. Hopefully the next time we ask
                # the remote will be back up again.
                return WellKnownLookupResult(delegated_server=prev_result)

            result = None

            if self._had_valid_well_known_cache.get(server_name, False):
                # We have recently seen a valid well-known record for this
                # server, so we cache the lack of well-known for a shorter time.
                cache_period = WELL_KNOWN_DOWN_CACHE_PERIOD
            else:
                cache_period = WELL_KNOWN_INVALID_CACHE_PERIOD

            # add some randomness to the TTL to avoid a stampeding herd
            cache_period *= random.uniform(
                1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
                1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
            )

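        # A cache period of zero (e.g. the result of a "no-store"
        # Cache-Control header) means the result should not be cached at all.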
        if cache_period > 0:
            self._well_known_cache.set(server_name, result, cache_period)

        return WellKnownLookupResult(delegated_server=result)

    @defer.inlineCallbacks
    def _fetch_well_known(self, server_name):
        """Actually fetch and parse a .well-known, without checking the cache

        Args:
            server_name (bytes): name of the server, from the requested URL

        Raises:
            _FetchWellKnownFailure if we fail to look up a result

        Returns:
            Deferred[Tuple[bytes, int]]: The lookup result and cache period.
        """

        had_valid_well_known = self._had_valid_well_known_cache.get(server_name, False)

        # We do this in two steps to differentiate between possibly transient
        # errors (e.g. can't connect to host, 503 response) and more permanent
        # errors (such as getting a 404 response).
        response, body = yield self._make_well_known_request(
            server_name, retry=had_valid_well_known
        )

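        # Anything wrong with the response itself (a non-200 status, invalid
        # JSON, or a missing/non-ASCII "m.server" value) is treated as a
        # non-temporary failure.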
        try:
            if response.code != 200:
                raise Exception("Non-200 response %s" % (response.code,))

            parsed_body = json.loads(body.decode("utf-8"))
            logger.info("Response from .well-known: %s", parsed_body)

            result = parsed_body["m.server"].encode("ascii")
        except defer.CancelledError:
            # Bail if we've been cancelled
            raise
        except Exception as e:
            logger.info("Error parsing well-known for %s: %s", server_name, e)
            raise _FetchWellKnownFailure(temporary=False)

        cache_period = _cache_period_from_headers(
            response.headers, time_now=self._reactor.seconds
        )
        if cache_period is None:
            cache_period = WELL_KNOWN_DEFAULT_CACHE_PERIOD
            # add some randomness to the TTL to avoid a stampeding herd every 24 hours
            # after startup
            cache_period *= random.uniform(
                1 - WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
                1 + WELL_KNOWN_DEFAULT_CACHE_PERIOD_JITTER,
            )
        else:
            cache_period = min(cache_period, WELL_KNOWN_MAX_CACHE_PERIOD)
            cache_period = max(cache_period, WELL_KNOWN_MIN_CACHE_PERIOD)

        # We got a success, mark as such in the cache
        self._had_valid_well_known_cache.set(
            server_name,
            bool(result),
            cache_period + WELL_KNOWN_REMEMBER_DOMAIN_HAD_VALID,
        )

        return result, cache_period

    @defer.inlineCallbacks
    def _make_well_known_request(self, server_name, retry):
        """Make the .well-known request.

        This will retry the request if retrying is requested and it fails
        (e.g. we are unable to connect, or receive a 5xx error).

        Args:
            server_name (bytes)
            retry (bool): Whether to retry the request if it fails.

        Returns:
            Deferred[tuple[IResponse, bytes]]: The response object and body.
            The response may be a non-200 response.
        """
        uri = b"https://%s/.well-known/matrix/server" % (server_name,)
        uri_str = uri.decode("ascii")

        headers = {
            b"User-Agent": [self.user_agent],
        }

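        # Make up to WELL_KNOWN_RETRY_ATTEMPTS attempts if retrying is enabled,
        # otherwise give up after the first failure.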
        i = 0
        while True:
            i += 1

            logger.info("Fetching %s", uri_str)
            try:
                response = yield make_deferred_yieldable(
                    self._well_known_agent.request(
                        b"GET", uri, headers=Headers(headers)
                    )
                )
                body = yield make_deferred_yieldable(readBody(response))

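                # Treat 5xx responses like connection failures, so that the
                # retry logic below can handle them.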
                if 500 <= response.code < 600:
                    raise Exception("Non-200 response %s" % (response.code,))

                return response, body
            except defer.CancelledError:
                # Bail if we've been cancelled
                raise
            except Exception as e:
                if not retry or i >= WELL_KNOWN_RETRY_ATTEMPTS:
                    logger.info("Error fetching %s: %s", uri_str, e)
                    raise _FetchWellKnownFailure(temporary=True)

                logger.info("Error fetching %s: %s. Retrying", uri_str, e)

            # Sleep briefly in the hopes that they come back up
            yield self._clock.sleep(0.5)


def _cache_period_from_headers(headers, time_now=time.time):
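    # Order of precedence: a "no-store" Cache-Control directive disables
    # caching entirely, "max-age" takes priority over an Expires header, and
    # None means the response carried no usable caching information.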
    cache_controls = _parse_cache_control(headers)

    if b"no-store" in cache_controls:
        return 0

    if b"max-age" in cache_controls:
        try:
            max_age = int(cache_controls[b"max-age"])
            return max_age
        except ValueError:
            pass

    expires = headers.getRawHeaders(b"expires")
    if expires is not None:
        try:
            expires_date = stringToDatetime(expires[-1])
            return expires_date - time_now()
        except ValueError:
            # RFC7234 says 'A cache recipient MUST interpret invalid date formats,
            # especially the value "0", as representing a time in the past (i.e.,
            # "already expired").'
            return 0

    return None


def _parse_cache_control(headers):
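    # e.g. a header of b"max-age=3600, no-store" is parsed into
    # {b"max-age": b"3600", b"no-store": None}.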
    cache_controls = {}
    for hdr in headers.getRawHeaders(b"cache-control", []):
        for directive in hdr.split(b","):
            splits = [x.strip() for x in directive.split(b"=", 1)]
            k = splits[0].lower()
            v = splits[1] if len(splits) > 1 else None
            cache_controls[k] = v
    return cache_controls


@attr.s()
class _FetchWellKnownFailure(Exception):
    # True if we failed to get a definitive (non-5xx) HTTP response, i.e. this
    # may or may not be a temporary failure.
    temporary = attr.ib()