Fix a bug where servers could be marked as up when they were failing (#16506)
After this change, a server will only be reported as back online if requests to it were previously failing.
pull/16510/head
parent
7291c68eea
commit
77dfc1f939
|
@ -0,0 +1 @@
|
|||
Fix a bug introduced in Synapse 1.59.0 where servers would be incorrectly marked as available when a request resulted in an error.
|
|
@ -170,10 +170,10 @@ class RetryDestinationLimiter:
|
|||
database in milliseconds, or zero if the last request was
|
||||
successful.
|
||||
backoff_on_404: Back off if we get a 404
|
||||
|
||||
backoff_on_failure: set to False if we should not increase the
|
||||
retry interval on a failure.
|
||||
|
||||
notifier: A notifier used to mark servers as up.
|
||||
replication_client: A replication client used to mark servers as up.
|
||||
backoff_on_all_error_codes: Whether we should back off on any
|
||||
error code.
|
||||
"""
|
||||
|
@ -237,6 +237,9 @@ class RetryDestinationLimiter:
|
|||
else:
|
||||
valid_err_code = False
|
||||
|
||||
# Whether previous requests to the destination had been failing.
|
||||
previously_failing = bool(self.failure_ts)
|
||||
|
||||
if success:
|
||||
# We connected successfully.
|
||||
if not self.retry_interval:
|
||||
|
@ -282,6 +285,9 @@ class RetryDestinationLimiter:
|
|||
if self.failure_ts is None:
|
||||
self.failure_ts = retry_last_ts
|
||||
|
||||
# Whether the destination is still considered failing after the current request.
|
||||
currently_failing = bool(self.failure_ts)
|
||||
|
||||
async def store_retry_timings() -> None:
|
||||
try:
|
||||
await self.store.set_destination_retry_timings(
|
||||
|
@ -291,16 +297,14 @@ class RetryDestinationLimiter:
|
|||
self.retry_interval,
|
||||
)
|
||||
|
||||
# If the server was previously failing, but is no longer.
|
||||
if previously_failing and not currently_failing:
|
||||
if self.notifier:
|
||||
# Inform the relevant places that the remote server is back up.
|
||||
self.notifier.notify_remote_server_up(self.destination)
|
||||
|
||||
if self.replication_client:
|
||||
# If we're on a worker we try and inform master about this. The
|
||||
# replication client doesn't hook into the notifier to avoid
|
||||
# infinite loops where we send a `REMOTE_SERVER_UP` command to
|
||||
# master, which then echoes it back to us which in turn pokes
|
||||
# the notifier.
|
||||
# Inform other workers that the remote server is up.
|
||||
self.replication_client.send_remote_server_up(self.destination)
|
||||
|
||||
except Exception:
|
||||
|
|
|
@ -11,6 +11,10 @@
|
|||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from unittest import mock
|
||||
|
||||
from synapse.notifier import Notifier
|
||||
from synapse.replication.tcp.handler import ReplicationCommandHandler
|
||||
from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
|
||||
|
||||
from tests.unittest import HomeserverTestCase
|
||||
|
@ -109,6 +113,77 @@ class RetryLimiterTestCase(HomeserverTestCase):
|
|||
new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
|
||||
self.assertIsNone(new_timings)
|
||||
|
||||
def test_notifier_replication(self) -> None:
|
||||
"""Ensure the notifier/replication client is called only when expected."""
|
||||
store = self.hs.get_datastores().main
|
||||
|
||||
notifier = mock.Mock(spec=Notifier)
|
||||
replication_client = mock.Mock(spec=ReplicationCommandHandler)
|
||||
|
||||
limiter = self.get_success(
|
||||
get_retry_limiter(
|
||||
"test_dest",
|
||||
self.clock,
|
||||
store,
|
||||
notifier=notifier,
|
||||
replication_client=replication_client,
|
||||
)
|
||||
)
|
||||
|
||||
# The server is already up, nothing should occur.
|
||||
self.pump(1)
|
||||
with limiter:
|
||||
pass
|
||||
self.pump()
|
||||
|
||||
new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
|
||||
self.assertIsNone(new_timings)
|
||||
notifier.notify_remote_server_up.assert_not_called()
|
||||
replication_client.send_remote_server_up.assert_not_called()
|
||||
|
||||
# Attempt again, but return an error. This will cause new retry timings, but
|
||||
# should not trigger server up notifications.
|
||||
self.pump(1)
|
||||
try:
|
||||
with limiter:
|
||||
raise AssertionError("argh")
|
||||
except AssertionError:
|
||||
pass
|
||||
self.pump()
|
||||
|
||||
new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
|
||||
# The exact retry timings are tested separately.
|
||||
self.assertIsNotNone(new_timings)
|
||||
notifier.notify_remote_server_up.assert_not_called()
|
||||
replication_client.send_remote_server_up.assert_not_called()
|
||||
|
||||
# A second failing request should be treated as the above.
|
||||
self.pump(1)
|
||||
try:
|
||||
with limiter:
|
||||
raise AssertionError("argh")
|
||||
except AssertionError:
|
||||
pass
|
||||
self.pump()
|
||||
|
||||
new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
|
||||
# The exact retry timings are tested separately.
|
||||
self.assertIsNotNone(new_timings)
|
||||
notifier.notify_remote_server_up.assert_not_called()
|
||||
replication_client.send_remote_server_up.assert_not_called()
|
||||
|
||||
# A final successful attempt should generate a server up notification.
|
||||
self.pump(1)
|
||||
with limiter:
|
||||
pass
|
||||
self.pump()
|
||||
|
||||
new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
|
||||
# After a successful request there should be no stored retry timings.
|
||||
self.assertIsNone(new_timings)
|
||||
notifier.notify_remote_server_up.assert_called_once_with("test_dest")
|
||||
replication_client.send_remote_server_up.assert_called_once_with("test_dest")
|
||||
|
||||
def test_max_retry_interval(self) -> None:
|
||||
"""Test that `destination_max_retry_interval` setting works as expected"""
|
||||
store = self.hs.get_datastores().main
|
||||
|
|
Loading…
Reference in New Issue