2016-01-07 05:26:29 +01:00
# Copyright 2015, 2016 OpenMarket Ltd
2015-02-17 18:20:56 +01:00
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
2015-11-02 17:49:05 +01:00
import random
2021-09-10 18:03:18 +02:00
from types import TracebackType
2022-05-10 11:39:54 +02:00
from typing import TYPE_CHECKING , Any , Optional , Type
2015-02-17 18:20:56 +01:00
2018-07-09 08:09:20 +02:00
from synapse . api . errors import CodeMessageException
2022-05-19 21:17:10 +02:00
from synapse . metrics . background_process_metrics import run_as_background_process
2021-09-10 18:03:18 +02:00
from synapse . storage import DataStore
from synapse . util import Clock
2015-02-17 18:20:56 +01:00
2022-05-10 11:39:54 +02:00
if TYPE_CHECKING :
from synapse . notifier import Notifier
from synapse . replication . tcp . handler import ReplicationCommandHandler
2015-02-17 18:20:56 +01:00
logger = logging . getLogger ( __name__ )
2020-07-09 15:52:58 +02:00
# the initial backoff, after the first transaction fails
2019-09-11 16:14:56 +02:00
MIN_RETRY_INTERVAL = 10 * 60 * 1000
# how much we multiply the backoff by after each subsequent fail
RETRY_MULTIPLIER = 5
2019-09-12 14:00:13 +02:00
# a cap on the backoff. (Essentially none)
2022-03-29 12:41:19 +02:00
MAX_RETRY_INTERVAL = 2 * * 62
2019-09-11 16:14:56 +02:00
2015-02-17 18:20:56 +01:00
class NotRetryingDestination ( Exception ) :
2021-09-10 18:03:18 +02:00
def __init__ ( self , retry_last_ts : int , retry_interval : int , destination : str ) :
2018-01-22 19:11:18 +01:00
""" Raised by the limiter (and federation client) to indicate that we are
are deliberately not attempting to contact a given server .
Args :
2021-09-10 18:03:18 +02:00
retry_last_ts : the unix ts in milliseconds of our last attempt
2018-01-22 19:11:18 +01:00
to contact the server . 0 indicates that the last attempt was
successful or that we ' ve never actually attempted to connect.
2021-09-10 18:03:18 +02:00
retry_interval : the time in milliseconds to wait until the next
2018-01-22 19:11:18 +01:00
attempt .
2021-09-10 18:03:18 +02:00
destination : the domain in question
2018-01-22 19:11:18 +01:00
"""
2022-10-26 23:10:55 +02:00
msg = f " Not retrying server { destination } because we tried it recently retry_last_ts= { retry_last_ts } and we won ' t check for another retry_interval= { retry_interval } ms. "
2020-09-18 15:56:44 +02:00
super ( ) . __init__ ( msg )
2015-02-17 18:20:56 +01:00
self . retry_last_ts = retry_last_ts
self . retry_interval = retry_interval
self . destination = destination
2021-09-10 18:03:18 +02:00
async def get_retry_limiter (
destination : str ,
clock : Clock ,
store : DataStore ,
ignore_backoff : bool = False ,
* * kwargs : Any ,
) - > " RetryDestinationLimiter " :
2015-02-18 11:09:54 +01:00
""" For a given destination check if we have previously failed to
send a request there and are waiting before retrying the destination .
If we are not ready to retry the destination , this will raise a
NotRetryingDestination exception . Otherwise , will return a Context Manager
that will mark the destination as down if an exception is thrown ( excluding
CodeMessageException with code < 500 )
2017-03-23 12:10:36 +01:00
Args :
2021-09-10 18:03:18 +02:00
destination : name of homeserver
clock : timing source
store : datastore
ignore_backoff : true to ignore the historical backoff data and
2019-06-04 00:50:43 +02:00
try the request anyway . We will still reset the retry_interval on success .
2017-03-23 12:10:36 +01:00
2015-02-18 11:09:54 +01:00
Example usage :
try :
2020-08-06 14:39:35 +02:00
limiter = await get_retry_limiter ( destination , clock , store )
2015-02-18 11:09:54 +01:00
with limiter :
2020-08-06 14:39:35 +02:00
response = await do_request ( )
2015-02-18 11:09:54 +01:00
except NotRetryingDestination :
# We aren't ready to retry that destination.
raise
"""
2019-09-17 12:41:54 +02:00
failure_ts = None
2015-02-17 18:20:56 +01:00
retry_last_ts , retry_interval = ( 0 , 0 )
2020-08-06 14:39:35 +02:00
retry_timings = await store . get_destination_retry_timings ( destination )
2015-02-17 18:20:56 +01:00
if retry_timings :
2021-05-21 18:57:08 +02:00
failure_ts = retry_timings . failure_ts
retry_last_ts = retry_timings . retry_last_ts
retry_interval = retry_timings . retry_interval
2015-02-17 18:20:56 +01:00
now = int ( clock . time_msec ( ) )
2017-03-23 12:10:36 +01:00
if not ignore_backoff and retry_last_ts + retry_interval > now :
2015-02-17 18:20:56 +01:00
raise NotRetryingDestination (
retry_last_ts = retry_last_ts ,
retry_interval = retry_interval ,
destination = destination ,
)
2019-06-04 00:50:43 +02:00
# if we are ignoring the backoff data, we should also not increment the backoff
# when we get another failure - otherwise a server can very quickly reach the
# maximum backoff even though it might only have been down briefly
backoff_on_failure = not ignore_backoff
2019-07-23 15:00:55 +02:00
return RetryDestinationLimiter (
destination ,
clock ,
store ,
2019-09-17 12:41:54 +02:00
failure_ts ,
2019-07-23 15:00:55 +02:00
retry_interval ,
backoff_on_failure = backoff_on_failure ,
2020-10-28 00:26:36 +01:00
* * kwargs ,
2015-02-17 18:20:56 +01:00
)
2020-09-04 12:54:56 +02:00
class RetryDestinationLimiter :
2019-06-04 00:50:43 +02:00
def __init__ (
self ,
2021-09-10 18:03:18 +02:00
destination : str ,
clock : Clock ,
store : DataStore ,
failure_ts : Optional [ int ] ,
retry_interval : int ,
backoff_on_404 : bool = False ,
backoff_on_failure : bool = True ,
2022-05-10 11:39:54 +02:00
notifier : Optional [ " Notifier " ] = None ,
replication_client : Optional [ " ReplicationCommandHandler " ] = None ,
2019-06-04 00:50:43 +02:00
) :
2015-02-18 11:11:24 +01:00
""" Marks the destination as " down " if an exception is thrown in the
context , except for CodeMessageException with code < 500.
If no exception is raised , marks the destination as " up " .
2015-02-18 11:09:54 +01:00
Args :
2021-09-10 18:03:18 +02:00
destination
clock
store
failure_ts : when this destination started failing ( in ms since
2019-09-17 12:41:54 +02:00
the epoch ) , or zero if the last request was successful
2021-09-10 18:03:18 +02:00
retry_interval : The next retry interval taken from the
2015-02-18 11:09:54 +01:00
database in milliseconds , or zero if the last request was
successful .
2021-09-10 18:03:18 +02:00
backoff_on_404 : Back off if we get a 404
2019-06-04 00:50:43 +02:00
2021-09-10 18:03:18 +02:00
backoff_on_failure : set to False if we should not increase the
2019-06-04 00:50:43 +02:00
retry interval on a failure .
2015-02-18 11:09:54 +01:00
"""
2015-02-17 18:20:56 +01:00
self . clock = clock
self . store = store
self . destination = destination
2019-09-17 12:41:54 +02:00
self . failure_ts = failure_ts
2015-02-17 18:20:56 +01:00
self . retry_interval = retry_interval
2017-01-31 14:40:09 +01:00
self . backoff_on_404 = backoff_on_404
2019-06-04 00:50:43 +02:00
self . backoff_on_failure = backoff_on_failure
2015-02-17 18:20:56 +01:00
2022-05-10 11:39:54 +02:00
self . notifier = notifier
self . replication_client = replication_client
2021-09-10 18:03:18 +02:00
def __enter__ ( self ) - > None :
2015-02-17 18:20:56 +01:00
pass
2021-09-10 18:03:18 +02:00
def __exit__ (
self ,
exc_type : Optional [ Type [ BaseException ] ] ,
exc_val : Optional [ BaseException ] ,
exc_tb : Optional [ TracebackType ] ,
) - > None :
2015-02-17 19:13:34 +01:00
valid_err_code = False
2017-03-23 01:12:21 +01:00
if exc_type is None :
valid_err_code = True
elif not issubclass ( exc_type , Exception ) :
# avoid treating exceptions which don't derive from Exception as
# failures; this is mostly so as not to catch defer._DefGen.
valid_err_code = True
2021-09-10 18:03:18 +02:00
elif isinstance ( exc_val , CodeMessageException ) :
2017-01-31 14:46:38 +01:00
# Some error codes are perfectly fine for some APIs, whereas other
# APIs may expect to never received e.g. a 404. It's important to
# handle 404 as some remote servers will return a 404 when the HS
# has been decommissioned.
2017-02-01 16:42:19 +01:00
# If we get a 401, then we should probably back off since they
# won't accept our requests for at least a while.
2020-07-09 15:52:58 +02:00
# 429 is us being aggressively rate limited, so lets rate limit
2017-02-01 16:42:19 +01:00
# ourselves.
2017-01-31 16:21:32 +01:00
if exc_val . code == 404 and self . backoff_on_404 :
2017-01-31 14:40:09 +01:00
valid_err_code = False
2017-02-01 16:42:19 +01:00
elif exc_val . code in ( 401 , 429 ) :
2017-01-31 14:40:09 +01:00
valid_err_code = False
elif exc_val . code < 500 :
valid_err_code = True
else :
valid_err_code = False
2015-02-17 19:13:34 +01:00
2017-03-23 01:12:21 +01:00
if valid_err_code :
2015-02-17 18:20:56 +01:00
# We connected successfully.
2015-02-17 18:38:38 +01:00
if not self . retry_interval :
return
2019-06-04 00:50:43 +02:00
logger . debug (
" Connection to %s was successful; clearing backoff " , self . destination
)
2019-09-17 12:41:54 +02:00
self . failure_ts = None
2015-02-17 18:20:56 +01:00
retry_last_ts = 0
self . retry_interval = 0
2019-06-04 00:50:43 +02:00
elif not self . backoff_on_failure :
return
2015-02-17 18:20:56 +01:00
else :
# We couldn't connect.
if self . retry_interval :
2019-09-12 13:59:43 +02:00
self . retry_interval = int (
self . retry_interval * RETRY_MULTIPLIER * random . uniform ( 0.8 , 1.4 )
)
2015-02-17 18:20:56 +01:00
2019-09-11 16:14:56 +02:00
if self . retry_interval > = MAX_RETRY_INTERVAL :
self . retry_interval = MAX_RETRY_INTERVAL
2015-02-17 18:20:56 +01:00
else :
2019-09-11 16:14:56 +02:00
self . retry_interval = MIN_RETRY_INTERVAL
2015-02-17 18:20:56 +01:00
2018-09-19 18:28:18 +02:00
logger . info (
2017-03-23 01:12:21 +01:00
" Connection to %s was unsuccessful ( %s ( %s )); backoff now %i " ,
2019-06-04 00:50:43 +02:00
self . destination ,
exc_type ,
exc_val ,
self . retry_interval ,
2017-03-23 01:12:21 +01:00
)
2015-02-18 12:25:20 +01:00
retry_last_ts = int ( self . clock . time_msec ( ) )
2015-02-17 18:20:56 +01:00
2019-09-17 12:41:54 +02:00
if self . failure_ts is None :
self . failure_ts = retry_last_ts
2021-09-10 18:03:18 +02:00
async def store_retry_timings ( ) - > None :
2016-11-22 18:45:44 +01:00
try :
2020-08-06 14:39:35 +02:00
await self . store . set_destination_retry_timings (
2019-09-17 12:41:54 +02:00
self . destination ,
self . failure_ts ,
retry_last_ts ,
self . retry_interval ,
2016-11-22 18:45:44 +01:00
)
2022-05-10 11:39:54 +02:00
if self . notifier :
# Inform the relevant places that the remote server is back up.
self . notifier . notify_remote_server_up ( self . destination )
if self . replication_client :
# If we're on a worker we try and inform master about this. The
# replication client doesn't hook into the notifier to avoid
# infinite loops where we send a `REMOTE_SERVER_UP` command to
# master, which then echoes it back to us which in turn pokes
# the notifier.
self . replication_client . send_remote_server_up ( self . destination )
2017-10-23 16:52:32 +02:00
except Exception :
2019-06-04 00:50:43 +02:00
logger . exception ( " Failed to store destination_retry_timings " )
2016-11-22 18:45:44 +01:00
2017-03-23 01:17:46 +01:00
# we deliberately do this in the background.
2022-05-19 21:17:10 +02:00
run_as_background_process ( " store_retry_timings " , store_retry_timings )