2017-03-27 17:10:55 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
# Copyright 2017 Vector Creations Ltd
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
"""A replication client for use by synapse workers.
|
|
|
|
"""
|
2018-07-09 08:09:20 +02:00
|
|
|
import logging
|
2020-05-22 15:21:54 +02:00
|
|
|
from typing import TYPE_CHECKING, Dict, List, Tuple
|
2018-07-09 08:09:20 +02:00
|
|
|
|
2020-05-22 15:21:54 +02:00
|
|
|
from twisted.internet.defer import Deferred
|
2017-03-27 17:10:55 +02:00
|
|
|
from twisted.internet.protocol import ReconnectingClientFactory
|
|
|
|
|
2020-05-14 15:01:39 +02:00
|
|
|
from synapse.api.constants import EventTypes
|
2020-05-22 15:21:54 +02:00
|
|
|
from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable
|
2020-04-06 10:58:42 +02:00
|
|
|
from synapse.replication.tcp.protocol import ClientReplicationStreamProtocol
|
2020-07-27 15:10:53 +02:00
|
|
|
from synapse.replication.tcp.streams import TypingStream
|
2020-05-14 15:01:39 +02:00
|
|
|
from synapse.replication.tcp.streams.events import (
|
|
|
|
EventsStream,
|
|
|
|
EventsStreamEventRow,
|
|
|
|
EventsStreamRow,
|
|
|
|
)
|
2020-09-29 22:48:33 +02:00
|
|
|
from synapse.types import PersistedEventPosition, UserID
|
2020-05-22 15:21:54 +02:00
|
|
|
from synapse.util.async_helpers import timeout_deferred
|
|
|
|
from synapse.util.metrics import Measure
|
2020-04-06 10:58:42 +02:00
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
from synapse.replication.tcp.handler import ReplicationCommandHandler
|
2020-07-05 17:32:02 +02:00
|
|
|
from synapse.server import HomeServer
|
2017-03-27 17:10:55 +02:00
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
2020-05-22 15:21:54 +02:00
|
|
|
# How long we allow callers to wait for replication updates before timing out.
|
|
|
|
_WAIT_FOR_REPLICATION_TIMEOUT_SECONDS = 30
|
|
|
|
|
|
|
|
|
2020-04-22 14:07:41 +02:00
|
|
|
class DirectTcpReplicationClientFactory(ReconnectingClientFactory):
|
2017-03-27 17:10:55 +02:00
|
|
|
"""Factory for building connections to the master. Will reconnect if the
|
|
|
|
connection is lost.
|
|
|
|
|
2020-04-06 10:58:42 +02:00
|
|
|
Accepts a handler that is passed to `ClientReplicationStreamProtocol`.
|
2017-03-27 17:10:55 +02:00
|
|
|
"""
|
2019-06-20 11:32:02 +02:00
|
|
|
|
2020-01-03 15:19:09 +01:00
|
|
|
initialDelay = 0.1
|
|
|
|
maxDelay = 1 # Try at least once every N seconds
|
2017-03-27 17:10:55 +02:00
|
|
|
|
2020-04-06 10:58:42 +02:00
|
|
|
def __init__(
|
|
|
|
self,
|
|
|
|
hs: "HomeServer",
|
|
|
|
client_name: str,
|
|
|
|
command_handler: "ReplicationCommandHandler",
|
|
|
|
):
|
2017-03-27 17:10:55 +02:00
|
|
|
self.client_name = client_name
|
2020-04-06 10:58:42 +02:00
|
|
|
self.command_handler = command_handler
|
2017-03-27 17:10:55 +02:00
|
|
|
self.server_name = hs.config.server_name
|
2020-03-25 15:54:01 +01:00
|
|
|
self.hs = hs
|
2017-03-27 17:10:55 +02:00
|
|
|
self._clock = hs.get_clock() # As self.clock is defined in super class
|
|
|
|
|
2018-06-25 15:08:28 +02:00
|
|
|
hs.get_reactor().addSystemEventTrigger("before", "shutdown", self.stopTrying)
|
2017-03-27 17:10:55 +02:00
|
|
|
|
|
|
|
def startedConnecting(self, connector):
|
|
|
|
logger.info("Connecting to replication: %r", connector.getDestination())
|
|
|
|
|
|
|
|
def buildProtocol(self, addr):
|
|
|
|
logger.info("Connected to replication: %r", addr)
|
|
|
|
return ClientReplicationStreamProtocol(
|
2020-04-06 10:58:42 +02:00
|
|
|
self.hs,
|
|
|
|
self.client_name,
|
|
|
|
self.server_name,
|
|
|
|
self._clock,
|
|
|
|
self.command_handler,
|
2017-03-27 17:10:55 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
def clientConnectionLost(self, connector, reason):
|
|
|
|
logger.error("Lost replication conn: %r", reason)
|
|
|
|
ReconnectingClientFactory.clientConnectionLost(self, connector, reason)
|
|
|
|
|
|
|
|
def clientConnectionFailed(self, connector, reason):
|
|
|
|
logger.error("Failed to connect to replication: %r", reason)
|
2019-06-20 11:32:02 +02:00
|
|
|
ReconnectingClientFactory.clientConnectionFailed(self, connector, reason)
|
2017-03-27 17:10:55 +02:00
|
|
|
|
|
|
|
|
2020-04-06 10:58:42 +02:00
|
|
|
class ReplicationDataHandler:
|
|
|
|
"""Handles incoming stream updates from replication.
|
2017-03-27 17:10:55 +02:00
|
|
|
|
2020-04-06 10:58:42 +02:00
|
|
|
This instance notifies the slave data store about updates. Can be subclassed
|
|
|
|
to handle updates in additional ways.
|
2017-03-27 17:10:55 +02:00
|
|
|
"""
|
2019-06-20 11:32:02 +02:00
|
|
|
|
2020-05-14 15:01:39 +02:00
|
|
|
def __init__(self, hs: "HomeServer"):
|
|
|
|
self.store = hs.get_datastore()
|
|
|
|
self.notifier = hs.get_notifier()
|
2020-05-22 15:21:54 +02:00
|
|
|
self._reactor = hs.get_reactor()
|
|
|
|
self._clock = hs.get_clock()
|
|
|
|
self._streams = hs.get_replication_streams()
|
|
|
|
self._instance_name = hs.get_instance_name()
|
2020-07-27 15:10:53 +02:00
|
|
|
self._typing_handler = hs.get_typing_handler()
|
2020-05-22 15:21:54 +02:00
|
|
|
|
|
|
|
# Map from stream to list of deferreds waiting for the stream to
|
|
|
|
# arrive at a particular position. The lists are sorted by stream position.
|
2021-03-03 21:47:38 +01:00
|
|
|
self._streams_to_waiters = {} # type: Dict[str, List[Tuple[int, Deferred]]]
|
2017-03-27 17:10:55 +02:00
|
|
|
|
2020-05-01 18:19:56 +02:00
|
|
|
async def on_rdata(
|
|
|
|
self, stream_name: str, instance_name: str, token: int, rows: list
|
|
|
|
):
|
2019-03-27 22:12:36 +01:00
|
|
|
"""Called to handle a batch of replication data with a given stream token.
|
2017-03-27 17:10:55 +02:00
|
|
|
|
2019-03-27 08:40:32 +01:00
|
|
|
By default this just pokes the slave store. Can be overridden in subclasses to
|
2019-03-27 22:12:36 +01:00
|
|
|
handle more.
|
|
|
|
|
|
|
|
Args:
|
2020-05-01 18:19:56 +02:00
|
|
|
stream_name: name of the replication stream for this batch of rows
|
|
|
|
instance_name: the instance that wrote the rows.
|
|
|
|
token: stream token for this batch of rows
|
|
|
|
rows: a list of Stream.ROW_TYPE objects as returned by Stream.parse_row.
|
2017-03-27 17:10:55 +02:00
|
|
|
"""
|
2020-05-07 14:51:08 +02:00
|
|
|
self.store.process_replication_rows(stream_name, instance_name, token, rows)
|
2017-03-27 17:10:55 +02:00
|
|
|
|
2020-07-27 15:10:53 +02:00
|
|
|
if stream_name == TypingStream.NAME:
|
|
|
|
self._typing_handler.process_replication_rows(token, rows)
|
|
|
|
self.notifier.on_new_event(
|
|
|
|
"typing_key", token, rooms=[row.room_id for row in rows]
|
|
|
|
)
|
|
|
|
|
2020-05-14 15:01:39 +02:00
|
|
|
if stream_name == EventsStream.NAME:
|
|
|
|
# We shouldn't get multiple rows per token for events stream, so
|
|
|
|
# we don't need to optimise this for multiple rows.
|
|
|
|
for row in rows:
|
|
|
|
if row.type != EventsStreamEventRow.TypeId:
|
|
|
|
continue
|
|
|
|
assert isinstance(row, EventsStreamRow)
|
2020-10-28 13:11:45 +01:00
|
|
|
assert isinstance(row.data, EventsStreamEventRow)
|
2020-05-14 15:01:39 +02:00
|
|
|
|
2020-10-28 13:11:45 +01:00
|
|
|
if row.data.rejected:
|
2020-05-14 15:01:39 +02:00
|
|
|
continue
|
|
|
|
|
2020-09-10 14:24:43 +02:00
|
|
|
extra_users = () # type: Tuple[UserID, ...]
|
2020-10-28 13:11:45 +01:00
|
|
|
if row.data.type == EventTypes.Member and row.data.state_key:
|
|
|
|
extra_users = (UserID.from_string(row.data.state_key),)
|
2020-09-24 14:24:17 +02:00
|
|
|
|
2020-09-29 22:48:33 +02:00
|
|
|
max_token = self.store.get_room_max_token()
|
2020-09-24 14:24:17 +02:00
|
|
|
event_pos = PersistedEventPosition(instance_name, token)
|
2020-10-28 13:11:45 +01:00
|
|
|
self.notifier.on_new_room_event_args(
|
|
|
|
event_pos=event_pos,
|
|
|
|
max_room_stream_token=max_token,
|
|
|
|
extra_users=extra_users,
|
|
|
|
room_id=row.data.room_id,
|
|
|
|
event_type=row.data.type,
|
|
|
|
state_key=row.data.state_key,
|
|
|
|
membership=row.data.membership,
|
2020-09-24 14:24:17 +02:00
|
|
|
)
|
2020-05-14 15:01:39 +02:00
|
|
|
|
2020-05-22 15:21:54 +02:00
|
|
|
# Notify any waiting deferreds. The list is ordered by position so we
|
|
|
|
# just iterate through the list until we reach a position that is
|
|
|
|
# greater than the received row position.
|
|
|
|
waiting_list = self._streams_to_waiters.get(stream_name, [])
|
|
|
|
|
|
|
|
# Index of first item with a position after the current token, i.e we
|
|
|
|
# have called all deferreds before this index. If not overwritten by
|
|
|
|
# loop below means either a) no items in list so no-op or b) all items
|
|
|
|
# in list were called and so the list should be cleared. Setting it to
|
|
|
|
# `len(list)` works for both cases.
|
|
|
|
index_of_first_deferred_not_called = len(waiting_list)
|
|
|
|
|
|
|
|
for idx, (position, deferred) in enumerate(waiting_list):
|
|
|
|
if position <= token:
|
|
|
|
try:
|
|
|
|
with PreserveLoggingContext():
|
|
|
|
deferred.callback(None)
|
|
|
|
except Exception:
|
|
|
|
# The deferred has been cancelled or timed out.
|
|
|
|
pass
|
|
|
|
else:
|
|
|
|
# The list is sorted by position so we don't need to continue
|
2020-06-05 14:43:21 +02:00
|
|
|
# checking any further entries in the list.
|
2020-05-22 15:21:54 +02:00
|
|
|
index_of_first_deferred_not_called = idx
|
|
|
|
break
|
|
|
|
|
|
|
|
# Drop all entries in the waiting list that were called in the above
|
|
|
|
# loop. (This maintains the order so no need to resort)
|
|
|
|
waiting_list[:] = waiting_list[index_of_first_deferred_not_called:]
|
|
|
|
|
2020-05-07 14:51:08 +02:00
|
|
|
async def on_position(self, stream_name: str, instance_name: str, token: int):
|
|
|
|
self.store.process_replication_rows(stream_name, instance_name, token, [])
|
2019-02-26 16:04:34 +01:00
|
|
|
|
2020-10-12 16:51:41 +02:00
|
|
|
# We poke the generic "replication" notifier to wake anything up that
|
|
|
|
# may be streaming.
|
|
|
|
self.notifier.notify_replication()
|
|
|
|
|
2020-04-06 10:58:42 +02:00
|
|
|
def on_remote_server_up(self, server: str):
|
|
|
|
"""Called when get a new REMOTE_SERVER_UP command."""
|
2020-05-22 15:21:54 +02:00
|
|
|
|
|
|
|
async def wait_for_stream_position(
|
|
|
|
self, instance_name: str, stream_name: str, position: int
|
|
|
|
):
|
|
|
|
"""Wait until this instance has received updates up to and including
|
|
|
|
the given stream position.
|
|
|
|
"""
|
|
|
|
|
|
|
|
if instance_name == self._instance_name:
|
|
|
|
# We don't get told about updates written by this process, and
|
|
|
|
# anyway in that case we don't need to wait.
|
|
|
|
return
|
|
|
|
|
|
|
|
current_position = self._streams[stream_name].current_token(self._instance_name)
|
|
|
|
if position <= current_position:
|
|
|
|
# We're already past the position
|
|
|
|
return
|
|
|
|
|
|
|
|
# Create a new deferred that times out after N seconds, as we don't want
|
|
|
|
# to wedge here forever.
|
|
|
|
deferred = Deferred()
|
|
|
|
deferred = timeout_deferred(
|
|
|
|
deferred, _WAIT_FOR_REPLICATION_TIMEOUT_SECONDS, self._reactor
|
|
|
|
)
|
|
|
|
|
|
|
|
waiting_list = self._streams_to_waiters.setdefault(stream_name, [])
|
|
|
|
|
2020-08-28 18:12:45 +02:00
|
|
|
waiting_list.append((position, deferred))
|
|
|
|
waiting_list.sort(key=lambda t: t[0])
|
2020-05-22 15:21:54 +02:00
|
|
|
|
|
|
|
# We measure here to get in flight counts and average waiting time.
|
|
|
|
with Measure(self._clock, "repl.wait_for_stream_position"):
|
|
|
|
logger.info("Waiting for repl stream %r to reach %s", stream_name, position)
|
|
|
|
await make_deferred_yieldable(deferred)
|
|
|
|
logger.info(
|
|
|
|
"Finished waiting for repl stream %r to reach %s", stream_name, position
|
|
|
|
)
|