# Copyright 2017 Vector Creations Ltd
# Copyright 2019 New Vector Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import heapq
from collections import defaultdict
from typing import TYPE_CHECKING, Iterable, Optional, Tuple, Type, TypeVar, cast

import attr

from synapse.replication.tcp.streams._base import (
    StreamRow,
    StreamUpdateResult,
    Token,
    _StreamFromIdGen,
)

if TYPE_CHECKING:
    from synapse.server import HomeServer

"""Handling of the 'events' replication stream

This stream contains rows of various types. Each row therefore contains a 'type'
identifier before the real data. For example::

    RDATA events batch ["state", ["!room:id", "m.type", "", "$event:id"]]
    RDATA events 12345 ["ev", ["$event:id", "!room:id", "m.type", null, null]]

An "ev" row is sent for each new event. The fields in the data part are:

 * The new event id
 * The room id for the event
 * The type of the new event
 * The state key of the event, for state events
 * The event id of an event which is redacted by this event.

A "state" row is sent whenever the "current state" in a room changes. The fields in the
data part are:

 * The room id for the state change
 * The event type of the state which has changed
 * The state_key of the state which has changed
 * The event id of the new state

A "state-all" row is sent whenever the "current state" in a room changes, but there are
too many state updates for a particular room in the same update. This replaces any
"state" rows on a per-room basis. The fields in the data part are:

* The room id for the state changes

"""
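
# As an illustration (a minimal sketch, not part of the protocol definition), the
# "state" example above would be parsed by EventsStream.parse_row below into roughly:
#
#     EventsStreamRow(
#         type="state",
#         data=EventsStreamCurrentStateRow(
#             room_id="!room:id", type="m.type", state_key="", event_id="$event:id"
#         ),
#     )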

# Any room with at least _MAX_STATE_UPDATES_PER_ROOM state updates in a single batch
# will send an EventsStreamAllStateRow instead of the individual
# EventsStreamCurrentStateRow rows. This is predominantly useful when purging large
# rooms.
_MAX_STATE_UPDATES_PER_ROOM = 150


@attr.s(slots=True, frozen=True, auto_attribs=True)
class EventsStreamRow:
    """A parsed row from the events replication stream"""

    type: str  # the TypeId of one of the *EventsStreamRows
    data: "BaseEventsStreamRow"


T = TypeVar("T", bound="BaseEventsStreamRow")


class BaseEventsStreamRow:
    """Base class for rows to be sent in the events stream.

    Specifies how to identify, serialize and deserialize the different types.
    """

    # Unique string that identifies the type. Must be overridden in subclasses.
    TypeId: str

    @classmethod
    def from_data(cls: Type[T], data: Iterable[Optional[str]]) -> T:
        """Parse the data from the replication stream into a row.

        By default we just call the constructor with the data list as arguments.

        Args:
            data: The value of the data object from the replication stream
        """
        return cls(*data)


@attr.s(slots=True, frozen=True, auto_attribs=True)
class EventsStreamEventRow(BaseEventsStreamRow):
    TypeId = "ev"

    event_id: str
    room_id: str
    type: str
    state_key: Optional[str]
    redacts: Optional[str]
    relates_to: Optional[str]
    membership: Optional[str]
    rejected: bool
    outlier: bool


@attr.s(slots=True, frozen=True, auto_attribs=True)
class EventsStreamCurrentStateRow(BaseEventsStreamRow):
    TypeId = "state"

    room_id: str
    type: str
    state_key: str
    event_id: Optional[str]


@attr.s(slots=True, frozen=True, auto_attribs=True)
class EventsStreamAllStateRow(BaseEventsStreamRow):
    TypeId = "state-all"

    room_id: str


_EventRows: Tuple[Type[BaseEventsStreamRow], ...] = (
    EventsStreamEventRow,
    EventsStreamCurrentStateRow,
    EventsStreamAllStateRow,
)

TypeToRow = {Row.TypeId: Row for Row in _EventRows}
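# i.e. {"ev": EventsStreamEventRow, "state": EventsStreamCurrentStateRow,
# "state-all": EventsStreamAllStateRow}; EventsStream.parse_row below uses this to
# pick the right row class for each incoming row.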


class EventsStream(_StreamFromIdGen):
    """We received a new event, or an event went from being an outlier to not"""

    NAME = "events"

    def __init__(self, hs: "HomeServer"):
        self._store = hs.get_datastores().main
        super().__init__(
            hs.get_instance_name(), self._update_function, self._store._stream_id_gen
        )

    async def _update_function(
        self,
        instance_name: str,
        from_token: Token,
        current_token: Token,
        target_row_count: int,
    ) -> StreamUpdateResult:
        # The events stream cannot be "reset", so it's safe to return early if
        # the from token is at or ahead of the current token (the DB query will
        # trivially return 0 rows anyway).
        if from_token >= current_token:
            return [], current_token, False

        # the events stream merges together three separate sources:
        #  * new events
        #  * current_state changes
        #  * events which were previously outliers, but have now been de-outliered.
        #
        # The merge operation is complicated by the fact that we only have a single
        # "stream token" which is supposed to indicate how far we have got through
        # all three streams. It's therefore no good to return rows 1-1000 from the
        # "new events" table if the state_deltas are limited to rows 1-100 by the
        # target_row_count.
        #
        # In other words: we must pick a new upper limit, and must return *all* rows
        # up to that point for each of the three sources.
        #
        # Start by trying to split the target_row_count up. We expect to have a
        # negligible number of ex-outliers, and a rough approximation based on recent
        # traffic on sw1v.org shows that there are approximately the same number of
        # event rows between a given pair of stream ids as there are state
        # updates, so let's split our target_row_count among those two types. The target
        # is only an approximation - it doesn't matter if we end up going a bit over it.

        target_row_count //= 2
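        # e.g. with an original target of 100 rows, each of the two limited queries
        # below is asked for at most 50 rows (the ex-outliers query is not limited).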

        # now we fetch up to that many rows from the events table

        event_rows = await self._store.get_all_new_forward_event_rows(
            instance_name, from_token, current_token, target_row_count
        )

        # we rely on get_all_new_forward_event_rows strictly honouring the limit, so
        # that we know it is safe to just take upper_limit = event_rows[-1][0].
        assert (
            len(event_rows) <= target_row_count
        ), "get_all_new_forward_event_rows did not honour row limit"

        # if we hit the limit on event_updates, there's no point in going beyond the
        # last stream_id in the batch for the other sources.

        if len(event_rows) == target_row_count:
            limited = True
            upper_limit: int = event_rows[-1][0]
        else:
            limited = False
            upper_limit = current_token

        # next up is the state delta table.
        (
            state_rows,
            upper_limit,
            state_rows_limited,
        ) = await self._store.get_all_updated_current_state_deltas(
            instance_name, from_token, upper_limit, target_row_count
        )

        limited = limited or state_rows_limited

        # finally, fetch the ex-outliers rows. We assume there are few enough of these
        # not to bother with the limit.

        ex_outliers_rows = await self._store.get_ex_outlier_stream_rows(
            instance_name, from_token, upper_limit
        )

        # we now need to turn the raw database rows returned into tuples suitable
        # for the replication protocol (basically, we add an identifier to
        # distinguish the row type). At the same time, we can limit the event_rows
        # to the max stream_id from state_rows.

        event_updates: Iterable[Tuple[int, Tuple]] = (
            (stream_id, (EventsStreamEventRow.TypeId, rest))
            for (stream_id, *rest) in event_rows
            if stream_id <= upper_limit
        )

        # Separate out rooms that have many state updates; listeners should clear
        # all state for those rooms.
        state_updates_by_room = defaultdict(list)
        for stream_id, room_id, _type, _state_key, _event_id in state_rows:
            state_updates_by_room[room_id].append(stream_id)

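        # For each room with at least _MAX_STATE_UPDATES_PER_ROOM state rows in this
        # batch we emit a single "state-all" row, keyed on the last stream_id seen
        # for that room.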
        state_all_rows = [
            (stream_ids[-1], room_id)
            for room_id, stream_ids in state_updates_by_room.items()
            if len(stream_ids) >= _MAX_STATE_UPDATES_PER_ROOM
        ]
        state_all_updates: Iterable[Tuple[int, Tuple]] = (
            (max_stream_id, (EventsStreamAllStateRow.TypeId, (room_id,)))
            for (max_stream_id, room_id) in state_all_rows
        )

        # Any remaining state updates are sent individually.
        state_all_rooms = {room_id for _, room_id in state_all_rows}
        state_updates: Iterable[Tuple[int, Tuple]] = (
            (stream_id, (EventsStreamCurrentStateRow.TypeId, rest))
            for (stream_id, *rest) in state_rows
            if rest[0] not in state_all_rooms
        )

        ex_outliers_updates: Iterable[Tuple[int, Tuple]] = (
            (stream_id, (EventsStreamEventRow.TypeId, rest))
            for (stream_id, *rest) in ex_outliers_rows
        )

        # we need to return a sorted list, so merge them together.
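        # (heapq.merge merges already-sorted inputs into a single sorted output,
        # comparing the tuples by their first element, the stream_id, first.)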
        updates = list(
            heapq.merge(
                event_updates, state_all_updates, state_updates, ex_outliers_updates
            )
        )
        return updates, upper_limit, limited

    @classmethod
    def parse_row(cls, row: StreamRow) -> "EventsStreamRow":
        (typ, data) = cast(Tuple[str, Iterable[Optional[str]]], row)
        event_stream_row_data = TypeToRow[typ].from_data(data)
        return EventsStreamRow(typ, event_stream_row_data)
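
# A minimal usage sketch (assuming the (type, data) pair has already been extracted
# from an RDATA command by the replication client machinery elsewhere):
#
#     row = EventsStream.parse_row(
#         ("ev", ["$event:id", "!room:id", "m.type", None, None, None, None, False, False])
#     )
#     assert row.type == "ev"
#     assert isinstance(row.data, EventsStreamEventRow)
#     assert row.data.event_id == "$event:id"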