Fix a bug where servers could be marked as up when they were failing (#16506)
After this change a server will only be reported as back online if it previously had failing requests.

pull/16510/head
parent 7291c68eea
commit 77dfc1f939

@@ -0,0 +1 @@
+Fix a bug introduced in Synapse 1.59.0 where servers would be incorrectly marked as available when a request resulted in an error.

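The change described above boils down to a transition check: a destination should only be announced as back online when it goes from failing to succeeding, not on every successful write of its retry timings. The following minimal sketch illustrates that idea in isolation; the BackoffTracker class, record_result method, and notify_up callback are names invented for this illustration and are not Synapse's actual API (the real change is in RetryDestinationLimiter.__exit__, shown in the hunks below).

from typing import Callable, Optional


class BackoffTracker:
    """Illustrative sketch only: remember when a destination started failing and
    report it as back up only on a failing -> succeeding transition."""

    def __init__(self, notify_up: Callable[[str], None]) -> None:
        # Timestamp (ms) of the first failure, or None while the destination is healthy.
        self.failure_ts: Optional[int] = None
        self._notify_up = notify_up

    def record_result(self, destination: str, success: bool, now_ms: int) -> None:
        # Whether previous requests to the destination had been failing.
        previously_failing = self.failure_ts is not None

        if success:
            self.failure_ts = None
        elif self.failure_ts is None:
            self.failure_ts = now_ms

        # Whether the current request to the destination had been failing.
        currently_failing = self.failure_ts is not None

        # Only announce the server as up on the transition back to healthy,
        # mirroring the `previously_failing and not currently_failing` guard
        # introduced by this commit.
        if previously_failing and not currently_failing:
            self._notify_up(destination)


tracker = BackoffTracker(notify_up=lambda dest: print(f"{dest} is back up"))
tracker.record_result("remote.example", success=True, now_ms=0)      # healthy: no output
tracker.record_result("remote.example", success=False, now_ms=1000)  # starts failing: no output
tracker.record_result("remote.example", success=True, now_ms=2000)   # prints "remote.example is back up"

A success against a destination that was never marked as failing therefore produces no notification, which is what the new test added at the bottom of this diff asserts.
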
@@ -170,10 +170,10 @@ class RetryDestinationLimiter:
                 database in milliseconds, or zero if the last request was
                 successful.
             backoff_on_404: Back off if we get a 404
-
             backoff_on_failure: set to False if we should not increase the
                 retry interval on a failure.
-
+            notifier: A notifier used to mark servers as up.
+            replication_client A replication client used to mark servers as up.
             backoff_on_all_error_codes: Whether we should back off on any
                 error code.
         """

@@ -237,6 +237,9 @@ class RetryDestinationLimiter:
         else:
             valid_err_code = False
 
+        # Whether previous requests to the destination had been failing.
+        previously_failing = bool(self.failure_ts)
+
         if success:
             # We connected successfully.
             if not self.retry_interval:

@@ -282,6 +285,9 @@ class RetryDestinationLimiter:
             if self.failure_ts is None:
                 self.failure_ts = retry_last_ts
 
+        # Whether the current request to the destination had been failing.
+        currently_failing = bool(self.failure_ts)
+
         async def store_retry_timings() -> None:
             try:
                 await self.store.set_destination_retry_timings(

@@ -291,17 +297,15 @@ class RetryDestinationLimiter:
                     self.retry_interval,
                 )
 
-                if self.notifier:
-                    # Inform the relevant places that the remote server is back up.
-                    self.notifier.notify_remote_server_up(self.destination)
+                # If the server was previously failing, but is no longer.
+                if previously_failing and not currently_failing:
+                    if self.notifier:
+                        # Inform the relevant places that the remote server is back up.
+                        self.notifier.notify_remote_server_up(self.destination)
 
-                if self.replication_client:
-                    # If we're on a worker we try and inform master about this. The
-                    # replication client doesn't hook into the notifier to avoid
-                    # infinite loops where we send a `REMOTE_SERVER_UP` command to
-                    # master, which then echoes it back to us which in turn pokes
-                    # the notifier.
-                    self.replication_client.send_remote_server_up(self.destination)
+                    if self.replication_client:
+                        # Inform other workers that the remote server is up.
+                        self.replication_client.send_remote_server_up(self.destination)
 
             except Exception:
                 logger.exception("Failed to store destination_retry_timings")

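For orientation, the sketch below shows roughly how callers drive the limiter whose __exit__ method is patched above. send_with_backoff and make_request are hypothetical names used only for this sketch; get_retry_limiter, NotRetryingDestination and the `with limiter:` pattern come from the code and the test in this diff, and NotRetryingDestination is assumed to be the exception raised while a destination is still backing off.

from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter


async def send_with_backoff(destination, clock, store, make_request):
    """Hypothetical caller: skip destinations that are still backing off and let
    the limiter's __exit__ record the outcome of the request."""
    try:
        limiter = await get_retry_limiter(destination, clock, store)
    except NotRetryingDestination:
        # Still inside the backoff window for this destination; don't try it.
        return None

    with limiter:
        # If this raises, the limiter records the failure and extends the retry
        # interval; if it succeeds, the limiter resets the timings and, after
        # this change, only sends a "remote server up" notification when the
        # destination had previously been failing.
        return await make_request(destination)
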
@@ -11,6 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from unittest import mock
+
+from synapse.notifier import Notifier
+from synapse.replication.tcp.handler import ReplicationCommandHandler
 from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter
 
 from tests.unittest import HomeserverTestCase

@@ -109,6 +113,77 @@ class RetryLimiterTestCase(HomeserverTestCase):
         new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
         self.assertIsNone(new_timings)
 
+    def test_notifier_replication(self) -> None:
+        """Ensure the notifier/replication client is called only when expected."""
+        store = self.hs.get_datastores().main
+
+        notifier = mock.Mock(spec=Notifier)
+        replication_client = mock.Mock(spec=ReplicationCommandHandler)
+
+        limiter = self.get_success(
+            get_retry_limiter(
+                "test_dest",
+                self.clock,
+                store,
+                notifier=notifier,
+                replication_client=replication_client,
+            )
+        )
+
+        # The server is already up, nothing should occur.
+        self.pump(1)
+        with limiter:
+            pass
+        self.pump()
+
+        new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
+        self.assertIsNone(new_timings)
+        notifier.notify_remote_server_up.assert_not_called()
+        replication_client.send_remote_server_up.assert_not_called()
+
+        # Attempt again, but return an error. This will cause new retry timings, but
+        # should not trigger server up notifications.
+        self.pump(1)
+        try:
+            with limiter:
+                raise AssertionError("argh")
+        except AssertionError:
+            pass
+        self.pump()
+
+        new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
+        # The exact retry timings are tested separately.
+        self.assertIsNotNone(new_timings)
+        notifier.notify_remote_server_up.assert_not_called()
+        replication_client.send_remote_server_up.assert_not_called()
+
+        # A second failing request should be treated as the above.
+        self.pump(1)
+        try:
+            with limiter:
+                raise AssertionError("argh")
+        except AssertionError:
+            pass
+        self.pump()
+
+        new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
+        # The exact retry timings are tested separately.
+        self.assertIsNotNone(new_timings)
+        notifier.notify_remote_server_up.assert_not_called()
+        replication_client.send_remote_server_up.assert_not_called()
+
+        # A final successful attempt should generate a server up notification.
+        self.pump(1)
+        with limiter:
+            pass
+        self.pump()
+
+        new_timings = self.get_success(store.get_destination_retry_timings("test_dest"))
+        # The exact retry timings are tested separately.
+        self.assertIsNone(new_timings)
+        notifier.notify_remote_server_up.assert_called_once_with("test_dest")
+        replication_client.send_remote_server_up.assert_called_once_with("test_dest")
+
     def test_max_retry_interval(self) -> None:
         """Test that `destination_max_retry_interval` setting works as expected"""
         store = self.hs.get_datastores().main