Don't reset retry timers on "valid" error codes (#16221)
parent
748c38921c
commit
f84baecb6f
|
@ -0,0 +1 @@
|
|||
Fix long-standing bug where we did not correctly back off from servers that had "gone" if they returned 4xx series error codes.
|
|
@ -249,8 +249,10 @@ class TransportLayerClient:
|
|||
data=json_data,
|
||||
json_data_callback=json_data_callback,
|
||||
long_retries=True,
|
||||
backoff_on_404=True, # If we get a 404 the other side has gone
|
||||
try_trailing_slash_on_400=True,
|
||||
# Sending a transaction should always succeed, if it doesn't
|
||||
# then something is wrong and we should backoff.
|
||||
backoff_on_all_error_codes=True,
|
||||
)
|
||||
|
||||
async def make_query(
|
||||
|
|
|
@ -512,6 +512,7 @@ class MatrixFederationHttpClient:
|
|||
long_retries: bool = False,
|
||||
ignore_backoff: bool = False,
|
||||
backoff_on_404: bool = False,
|
||||
backoff_on_all_error_codes: bool = False,
|
||||
) -> IResponse:
|
||||
"""
|
||||
Sends a request to the given server.
|
||||
|
@ -552,6 +553,7 @@ class MatrixFederationHttpClient:
|
|||
and try the request anyway.
|
||||
|
||||
backoff_on_404: Back off if we get a 404
|
||||
backoff_on_all_error_codes: Back off if we get any error response
|
||||
|
||||
Returns:
|
||||
Resolves with the HTTP response object on success.
|
||||
|
@ -594,6 +596,7 @@ class MatrixFederationHttpClient:
|
|||
ignore_backoff=ignore_backoff,
|
||||
notifier=self.hs.get_notifier(),
|
||||
replication_client=self.hs.get_replication_command_handler(),
|
||||
backoff_on_all_error_codes=backoff_on_all_error_codes,
|
||||
)
|
||||
|
||||
method_bytes = request.method.encode("ascii")
|
||||
|
@ -889,6 +892,7 @@ class MatrixFederationHttpClient:
|
|||
backoff_on_404: bool = False,
|
||||
try_trailing_slash_on_400: bool = False,
|
||||
parser: Literal[None] = None,
|
||||
backoff_on_all_error_codes: bool = False,
|
||||
) -> JsonDict:
|
||||
...
|
||||
|
||||
|
@ -906,6 +910,7 @@ class MatrixFederationHttpClient:
|
|||
backoff_on_404: bool = False,
|
||||
try_trailing_slash_on_400: bool = False,
|
||||
parser: Optional[ByteParser[T]] = None,
|
||||
backoff_on_all_error_codes: bool = False,
|
||||
) -> T:
|
||||
...
|
||||
|
||||
|
@ -922,6 +927,7 @@ class MatrixFederationHttpClient:
|
|||
backoff_on_404: bool = False,
|
||||
try_trailing_slash_on_400: bool = False,
|
||||
parser: Optional[ByteParser[T]] = None,
|
||||
backoff_on_all_error_codes: bool = False,
|
||||
) -> Union[JsonDict, T]:
|
||||
"""Sends the specified json data using PUT
|
||||
|
||||
|
@ -957,6 +963,7 @@ class MatrixFederationHttpClient:
|
|||
enabled.
|
||||
parser: The parser to use to decode the response. Defaults to
|
||||
parsing as JSON.
|
||||
backoff_on_all_error_codes: Back off if we get any error response
|
||||
|
||||
Returns:
|
||||
Succeeds when we get a 2xx HTTP response. The
|
||||
|
@ -990,6 +997,7 @@ class MatrixFederationHttpClient:
|
|||
ignore_backoff=ignore_backoff,
|
||||
long_retries=long_retries,
|
||||
timeout=timeout,
|
||||
backoff_on_all_error_codes=backoff_on_all_error_codes,
|
||||
)
|
||||
|
||||
if timeout is not None:
|
||||
|
|
|
@ -128,6 +128,7 @@ class RetryDestinationLimiter:
|
|||
backoff_on_failure: bool = True,
|
||||
notifier: Optional["Notifier"] = None,
|
||||
replication_client: Optional["ReplicationCommandHandler"] = None,
|
||||
backoff_on_all_error_codes: bool = False,
|
||||
):
|
||||
"""Marks the destination as "down" if an exception is thrown in the
|
||||
context, except for CodeMessageException with code < 500.
|
||||
|
@ -147,6 +148,9 @@ class RetryDestinationLimiter:
|
|||
|
||||
backoff_on_failure: set to False if we should not increase the
|
||||
retry interval on a failure.
|
||||
|
||||
backoff_on_all_error_codes: Whether we should back off on any
|
||||
error code.
|
||||
"""
|
||||
self.clock = clock
|
||||
self.store = store
|
||||
|
@ -156,6 +160,7 @@ class RetryDestinationLimiter:
|
|||
self.retry_interval = retry_interval
|
||||
self.backoff_on_404 = backoff_on_404
|
||||
self.backoff_on_failure = backoff_on_failure
|
||||
self.backoff_on_all_error_codes = backoff_on_all_error_codes
|
||||
|
||||
self.notifier = notifier
|
||||
self.replication_client = replication_client
|
||||
|
@ -179,6 +184,7 @@ class RetryDestinationLimiter:
|
|||
exc_val: Optional[BaseException],
|
||||
exc_tb: Optional[TracebackType],
|
||||
) -> None:
|
||||
success = exc_type is None
|
||||
valid_err_code = False
|
||||
if exc_type is None:
|
||||
valid_err_code = True
|
||||
|
@ -195,7 +201,9 @@ class RetryDestinationLimiter:
|
|||
# won't accept our requests for at least a while.
|
||||
# 429 is us being aggressively rate limited, so lets rate limit
|
||||
# ourselves.
|
||||
if exc_val.code == 404 and self.backoff_on_404:
|
||||
if self.backoff_on_all_error_codes:
|
||||
valid_err_code = False
|
||||
elif exc_val.code == 404 and self.backoff_on_404:
|
||||
valid_err_code = False
|
||||
elif exc_val.code in (401, 429):
|
||||
valid_err_code = False
|
||||
|
@ -204,7 +212,7 @@ class RetryDestinationLimiter:
|
|||
else:
|
||||
valid_err_code = False
|
||||
|
||||
if valid_err_code:
|
||||
if success:
|
||||
# We connected successfully.
|
||||
if not self.retry_interval:
|
||||
return
|
||||
|
@ -215,6 +223,12 @@ class RetryDestinationLimiter:
|
|||
self.failure_ts = None
|
||||
retry_last_ts = 0
|
||||
self.retry_interval = 0
|
||||
elif valid_err_code:
|
||||
# We got a potentially valid error code back. We don't reset the
|
||||
# timers though, as the other side might actually be down anyway
|
||||
# (e.g. some deprovisioned servers will always return a 404 or 403,
|
||||
# and we don't want to keep resetting the retry timers for them).
|
||||
return
|
||||
elif not self.backoff_on_failure:
|
||||
return
|
||||
else:
|
||||
|
|
|
@ -251,8 +251,8 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
|
|||
),
|
||||
json_data_callback=ANY,
|
||||
long_retries=True,
|
||||
backoff_on_404=True,
|
||||
try_trailing_slash_on_400=True,
|
||||
backoff_on_all_error_codes=True,
|
||||
)
|
||||
|
||||
def test_started_typing_remote_recv(self) -> None:
|
||||
|
@ -366,7 +366,7 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
|
|||
),
|
||||
json_data_callback=ANY,
|
||||
long_retries=True,
|
||||
backoff_on_404=True,
|
||||
backoff_on_all_error_codes=True,
|
||||
try_trailing_slash_on_400=True,
|
||||
)
|
||||
|
||||
|
|
Loading…
Reference in New Issue