Allow for the configuration of max request retries and min/max retry delays in the matrix federation client (#12504)

Co-authored-by: Mathieu Velten <mathieuv@matrix.org> Co-authored-by: Erik Johnston <erik@matrix.org>
2023-06-09 00:00:46 -07:00 · 2023-06-09 00:00:46 -07:00 · d84e66144d
parent f6321e386c
commit d84e66144d
5 changed files with 68 additions and 10 deletions
--- a/changelog.d/12504.misc
+++ b/changelog.d/12504.misc
@ -0,0 +1 @@
 Allow for the configuration of max request retries and min/max retry delays in the matrix federation client.
--- a/docs/usage/configuration/config_documentation.md
+++ b/docs/usage/configuration/config_documentation.md
@ -1196,6 +1196,32 @@ Example configuration:
 allow_device_name_lookup_over_federation: true
 ```
 ---
 ### `federation`
 The federation section defines some sub-options related to federation.
 The following options are related to configuring timeout and retry logic for one request,
 independently of the others.
 Short retry algorithm is used when something or someone will wait for the request to have an
 answer, while long retry is used for requests that happen in the background,
 like sending a federation transaction.
 * `client_timeout`: timeout for the federation requests in seconds. Default to 60s.
 * `max_short_retry_delay`: maximum delay to be used for the short retry algo in seconds. Default to 2s.
 * `max_long_retry_delay`: maximum delay to be used for the short retry algo in seconds. Default to 60s.
 * `max_short_retries`: maximum number of retries for the short retry algo. Default to 3 attempts.
 * `max_long_retries`: maximum number of retries for the long retry algo. Default to 10 attempts.
 Example configuration:
 ```yaml
 federation:
  client_timeout: 180
  max_short_retry_delay: 7
  max_long_retry_delay: 100
  max_short_retries: 5
  max_long_retries: 20
 ```
 ---
 ## Caching
 Options related to caching.
--- a/synapse/config/federation.py
+++ b/synapse/config/federation.py
@ -22,6 +22,8 @@ class FederationConfig(Config):
    section = "federation"
    def read_config(self, config: JsonDict, **kwargs: Any) -> None:
        federation_config = config.setdefault("federation", {})
        # FIXME: federation_domain_whitelist needs sytests
        self.federation_domain_whitelist: Optional[dict] = None
        federation_domain_whitelist = config.get("federation_domain_whitelist", None)
@ -49,5 +51,13 @@ class FederationConfig(Config):
            "allow_device_name_lookup_over_federation", False
        )
        # Allow for the configuration of timeout, max request retries
        # and min/max retry delays in the matrix federation client.
        self.client_timeout = federation_config.get("client_timeout", 60)
        self.max_long_retry_delay = federation_config.get("max_long_retry_delay", 60)
        self.max_short_retry_delay = federation_config.get("max_short_retry_delay", 2)
        self.max_long_retries = federation_config.get("max_long_retries", 10)
        self.max_short_retries = federation_config.get("max_short_retries", 3)
 _METRICS_FOR_DOMAINS_SCHEMA = {"type": "array", "items": {"type": "string"}}
--- a/synapse/http/matrixfederationclient.py
+++ b/synapse/http/matrixfederationclient.py
@ -95,8 +95,6 @@ incoming_responses_counter = Counter(
 )
 MAX_LONG_RETRIES = 10
 MAX_SHORT_RETRIES = 3
 MAXINT = sys.maxsize
@ -406,7 +404,12 @@ class MatrixFederationHttpClient:
        self.clock = hs.get_clock()
        self._store = hs.get_datastores().main
        self.version_string_bytes = hs.version_string.encode("ascii")
-        self.default_timeout = 60
+        self.default_timeout = hs.config.federation.client_timeout
        self.max_long_retry_delay = hs.config.federation.max_long_retry_delay
        self.max_short_retry_delay = hs.config.federation.max_short_retry_delay
        self.max_long_retries = hs.config.federation.max_long_retries
        self.max_short_retries = hs.config.federation.max_short_retries
        self._cooperator = Cooperator(scheduler=_make_scheduler(self.reactor))
@ -583,9 +586,9 @@ class MatrixFederationHttpClient:
            # XXX: Would be much nicer to retry only at the transaction-layer
            # (once we have reliable transactions in place)
            if long_retries:
-                retries_left = MAX_LONG_RETRIES
+                retries_left = self.max_long_retries
            else:
-                retries_left = MAX_SHORT_RETRIES
+                retries_left = self.max_short_retries
            url_bytes = request.uri
            url_str = url_bytes.decode("ascii")
@ -730,12 +733,12 @@ class MatrixFederationHttpClient:
                    if retries_left and not timeout:
                        if long_retries:
-                            delay = 4 ** (MAX_LONG_RETRIES + 1 - retries_left)
+                            delay = 4 ** (self.max_long_retries + 1 - retries_left)
-                            delay = min(delay, 60)
+                            delay = min(delay, self.max_long_retry_delay)
                            delay *= random.uniform(0.8, 1.4)
                        else:
-                            delay = 0.5 * 2 ** (MAX_SHORT_RETRIES - retries_left)
+                            delay = 0.5 * 2 ** (self.max_short_retries - retries_left)
-                            delay = min(delay, 2)
+                            delay = min(delay, self.max_short_retry_delay)
                            delay *= random.uniform(0.8, 1.4)
                        logger.debug(
--- a/tests/http/test_matrixfederationclient.py
+++ b/tests/http/test_matrixfederationclient.py
@ -40,7 +40,7 @@ from synapse.server import HomeServer
 from synapse.util import Clock
 from tests.server import FakeTransport
-from tests.unittest import HomeserverTestCase
+from tests.unittest import HomeserverTestCase, override_config
 def check_logcontext(context: LoggingContextOrSentinel) -> None:
@ -640,3 +640,21 @@ class FederationClientTests(HomeserverTestCase):
            self.cl.build_auth_headers(
                b"", b"GET", b"https://example.com", destination_is=b""
            )
    @override_config(
        {
            "federation": {
                "client_timeout": 180,
                "max_long_retry_delay": 100,
                "max_short_retry_delay": 7,
                "max_long_retries": 20,
                "max_short_retries": 5,
            }
        }
    )
    def test_configurable_retry_and_delay_values(self) -> None:
        self.assertEqual(self.cl.default_timeout, 180)
        self.assertEqual(self.cl.max_long_retry_delay, 100)
        self.assertEqual(self.cl.max_short_retry_delay, 7)
        self.assertEqual(self.cl.max_long_retries, 20)
        self.assertEqual(self.cl.max_short_retries, 5)
		`@ -0,0 +1 @@`
							`Allow for the configuration of max request retries and min/max retry delays in the matrix federation client.`