Don't reset retry timers on "valid" error codes (#16221)
parent
748c38921c
commit
f84baecb6f
|
@ -0,0 +1 @@
|
||||||
|
Fix a long-standing bug where we did not correctly back off from servers that had "gone" if they returned 4xx-series error codes.
|
|
@ -249,8 +249,10 @@ class TransportLayerClient:
|
||||||
data=json_data,
|
data=json_data,
|
||||||
json_data_callback=json_data_callback,
|
json_data_callback=json_data_callback,
|
||||||
long_retries=True,
|
long_retries=True,
|
||||||
backoff_on_404=True, # If we get a 404 the other side has gone
|
|
||||||
try_trailing_slash_on_400=True,
|
try_trailing_slash_on_400=True,
|
||||||
|
# Sending a transaction should always succeed; if it doesn't,
|
||||||
|
# then something is wrong and we should back off.
|
||||||
|
backoff_on_all_error_codes=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
async def make_query(
|
async def make_query(
|
||||||
|
|
|
@ -512,6 +512,7 @@ class MatrixFederationHttpClient:
|
||||||
long_retries: bool = False,
|
long_retries: bool = False,
|
||||||
ignore_backoff: bool = False,
|
ignore_backoff: bool = False,
|
||||||
backoff_on_404: bool = False,
|
backoff_on_404: bool = False,
|
||||||
|
backoff_on_all_error_codes: bool = False,
|
||||||
) -> IResponse:
|
) -> IResponse:
|
||||||
"""
|
"""
|
||||||
Sends a request to the given server.
|
Sends a request to the given server.
|
||||||
|
@ -552,6 +553,7 @@ class MatrixFederationHttpClient:
|
||||||
and try the request anyway.
|
and try the request anyway.
|
||||||
|
|
||||||
backoff_on_404: Back off if we get a 404
|
backoff_on_404: Back off if we get a 404
|
||||||
|
backoff_on_all_error_codes: Back off if we get any error response
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Resolves with the HTTP response object on success.
|
Resolves with the HTTP response object on success.
|
||||||
|
@ -594,6 +596,7 @@ class MatrixFederationHttpClient:
|
||||||
ignore_backoff=ignore_backoff,
|
ignore_backoff=ignore_backoff,
|
||||||
notifier=self.hs.get_notifier(),
|
notifier=self.hs.get_notifier(),
|
||||||
replication_client=self.hs.get_replication_command_handler(),
|
replication_client=self.hs.get_replication_command_handler(),
|
||||||
|
backoff_on_all_error_codes=backoff_on_all_error_codes,
|
||||||
)
|
)
|
||||||
|
|
||||||
method_bytes = request.method.encode("ascii")
|
method_bytes = request.method.encode("ascii")
|
||||||
|
@ -889,6 +892,7 @@ class MatrixFederationHttpClient:
|
||||||
backoff_on_404: bool = False,
|
backoff_on_404: bool = False,
|
||||||
try_trailing_slash_on_400: bool = False,
|
try_trailing_slash_on_400: bool = False,
|
||||||
parser: Literal[None] = None,
|
parser: Literal[None] = None,
|
||||||
|
backoff_on_all_error_codes: bool = False,
|
||||||
) -> JsonDict:
|
) -> JsonDict:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
@ -906,6 +910,7 @@ class MatrixFederationHttpClient:
|
||||||
backoff_on_404: bool = False,
|
backoff_on_404: bool = False,
|
||||||
try_trailing_slash_on_400: bool = False,
|
try_trailing_slash_on_400: bool = False,
|
||||||
parser: Optional[ByteParser[T]] = None,
|
parser: Optional[ByteParser[T]] = None,
|
||||||
|
backoff_on_all_error_codes: bool = False,
|
||||||
) -> T:
|
) -> T:
|
||||||
...
|
...
|
||||||
|
|
||||||
|
@ -922,6 +927,7 @@ class MatrixFederationHttpClient:
|
||||||
backoff_on_404: bool = False,
|
backoff_on_404: bool = False,
|
||||||
try_trailing_slash_on_400: bool = False,
|
try_trailing_slash_on_400: bool = False,
|
||||||
parser: Optional[ByteParser[T]] = None,
|
parser: Optional[ByteParser[T]] = None,
|
||||||
|
backoff_on_all_error_codes: bool = False,
|
||||||
) -> Union[JsonDict, T]:
|
) -> Union[JsonDict, T]:
|
||||||
"""Sends the specified json data using PUT
|
"""Sends the specified json data using PUT
|
||||||
|
|
||||||
|
@ -957,6 +963,7 @@ class MatrixFederationHttpClient:
|
||||||
enabled.
|
enabled.
|
||||||
parser: The parser to use to decode the response. Defaults to
|
parser: The parser to use to decode the response. Defaults to
|
||||||
parsing as JSON.
|
parsing as JSON.
|
||||||
|
backoff_on_all_error_codes: Back off if we get any error response
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Succeeds when we get a 2xx HTTP response. The
|
Succeeds when we get a 2xx HTTP response. The
|
||||||
|
@ -990,6 +997,7 @@ class MatrixFederationHttpClient:
|
||||||
ignore_backoff=ignore_backoff,
|
ignore_backoff=ignore_backoff,
|
||||||
long_retries=long_retries,
|
long_retries=long_retries,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
|
backoff_on_all_error_codes=backoff_on_all_error_codes,
|
||||||
)
|
)
|
||||||
|
|
||||||
if timeout is not None:
|
if timeout is not None:
|
||||||
|
|
|
@ -128,6 +128,7 @@ class RetryDestinationLimiter:
|
||||||
backoff_on_failure: bool = True,
|
backoff_on_failure: bool = True,
|
||||||
notifier: Optional["Notifier"] = None,
|
notifier: Optional["Notifier"] = None,
|
||||||
replication_client: Optional["ReplicationCommandHandler"] = None,
|
replication_client: Optional["ReplicationCommandHandler"] = None,
|
||||||
|
backoff_on_all_error_codes: bool = False,
|
||||||
):
|
):
|
||||||
"""Marks the destination as "down" if an exception is thrown in the
|
"""Marks the destination as "down" if an exception is thrown in the
|
||||||
context, except for CodeMessageException with code < 500.
|
context, except for CodeMessageException with code < 500.
|
||||||
|
@ -147,6 +148,9 @@ class RetryDestinationLimiter:
|
||||||
|
|
||||||
backoff_on_failure: set to False if we should not increase the
|
backoff_on_failure: set to False if we should not increase the
|
||||||
retry interval on a failure.
|
retry interval on a failure.
|
||||||
|
|
||||||
|
backoff_on_all_error_codes: Whether we should back off on any
|
||||||
|
error code.
|
||||||
"""
|
"""
|
||||||
self.clock = clock
|
self.clock = clock
|
||||||
self.store = store
|
self.store = store
|
||||||
|
@ -156,6 +160,7 @@ class RetryDestinationLimiter:
|
||||||
self.retry_interval = retry_interval
|
self.retry_interval = retry_interval
|
||||||
self.backoff_on_404 = backoff_on_404
|
self.backoff_on_404 = backoff_on_404
|
||||||
self.backoff_on_failure = backoff_on_failure
|
self.backoff_on_failure = backoff_on_failure
|
||||||
|
self.backoff_on_all_error_codes = backoff_on_all_error_codes
|
||||||
|
|
||||||
self.notifier = notifier
|
self.notifier = notifier
|
||||||
self.replication_client = replication_client
|
self.replication_client = replication_client
|
||||||
|
@ -179,6 +184,7 @@ class RetryDestinationLimiter:
|
||||||
exc_val: Optional[BaseException],
|
exc_val: Optional[BaseException],
|
||||||
exc_tb: Optional[TracebackType],
|
exc_tb: Optional[TracebackType],
|
||||||
) -> None:
|
) -> None:
|
||||||
|
success = exc_type is None
|
||||||
valid_err_code = False
|
valid_err_code = False
|
||||||
if exc_type is None:
|
if exc_type is None:
|
||||||
valid_err_code = True
|
valid_err_code = True
|
||||||
|
@ -195,7 +201,9 @@ class RetryDestinationLimiter:
|
||||||
# won't accept our requests for at least a while.
|
# won't accept our requests for at least a while.
|
||||||
# 429 is us being aggressively rate limited, so let's rate limit
|
# 429 is us being aggressively rate limited, so let's rate limit
|
||||||
# ourselves.
|
# ourselves.
|
||||||
if exc_val.code == 404 and self.backoff_on_404:
|
if self.backoff_on_all_error_codes:
|
||||||
|
valid_err_code = False
|
||||||
|
elif exc_val.code == 404 and self.backoff_on_404:
|
||||||
valid_err_code = False
|
valid_err_code = False
|
||||||
elif exc_val.code in (401, 429):
|
elif exc_val.code in (401, 429):
|
||||||
valid_err_code = False
|
valid_err_code = False
|
||||||
|
@ -204,7 +212,7 @@ class RetryDestinationLimiter:
|
||||||
else:
|
else:
|
||||||
valid_err_code = False
|
valid_err_code = False
|
||||||
|
|
||||||
if valid_err_code:
|
if success:
|
||||||
# We connected successfully.
|
# We connected successfully.
|
||||||
if not self.retry_interval:
|
if not self.retry_interval:
|
||||||
return
|
return
|
||||||
|
@ -215,6 +223,12 @@ class RetryDestinationLimiter:
|
||||||
self.failure_ts = None
|
self.failure_ts = None
|
||||||
retry_last_ts = 0
|
retry_last_ts = 0
|
||||||
self.retry_interval = 0
|
self.retry_interval = 0
|
||||||
|
elif valid_err_code:
|
||||||
|
# We got a potentially valid error code back. We don't reset the
|
||||||
|
# timers though, as the other side might actually be down anyway
|
||||||
|
# (e.g. some deprovisioned servers will always return a 404 or 403,
|
||||||
|
# and we don't want to keep resetting the retry timers for them).
|
||||||
|
return
|
||||||
elif not self.backoff_on_failure:
|
elif not self.backoff_on_failure:
|
||||||
return
|
return
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -251,8 +251,8 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
|
||||||
),
|
),
|
||||||
json_data_callback=ANY,
|
json_data_callback=ANY,
|
||||||
long_retries=True,
|
long_retries=True,
|
||||||
backoff_on_404=True,
|
|
||||||
try_trailing_slash_on_400=True,
|
try_trailing_slash_on_400=True,
|
||||||
|
backoff_on_all_error_codes=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_started_typing_remote_recv(self) -> None:
|
def test_started_typing_remote_recv(self) -> None:
|
||||||
|
@ -366,7 +366,7 @@ class TypingNotificationsTestCase(unittest.HomeserverTestCase):
|
||||||
),
|
),
|
||||||
json_data_callback=ANY,
|
json_data_callback=ANY,
|
||||||
long_retries=True,
|
long_retries=True,
|
||||||
backoff_on_404=True,
|
backoff_on_all_error_codes=True,
|
||||||
try_trailing_slash_on_400=True,
|
try_trailing_slash_on_400=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue