From c73774467edb04c372caecb9e843542654f7610b Mon Sep 17 00:00:00 2001 From: Sean Quah <8349537+squahtx@users.noreply.github.com> Date: Wed, 14 Sep 2022 10:42:57 +0100 Subject: [PATCH] Fix bug in device list caching when remote users leave rooms (#13749) When a remote user leaves the last room shared with the homeserver, we have to mark their device list as unsubscribed, otherwise we would hold on to a stale device list in our cache. Crucially, the device list would remain cached even after the remote user rejoined the room, which could lead to E2EE failures until the next change to the remote user's device list. Fixes #13651. Signed-off-by: Sean Quah --- changelog.d/13749.bugfix | 1 + synapse/handlers/device.py | 11 -------- synapse/handlers/e2e_keys.py | 26 +++++++++++++++++++ synapse/storage/controllers/persist_events.py | 20 +++++++++++--- tests/handlers/test_e2e_keys.py | 8 +++++- 5 files changed, 51 insertions(+), 15 deletions(-) create mode 100644 changelog.d/13749.bugfix diff --git a/changelog.d/13749.bugfix b/changelog.d/13749.bugfix new file mode 100644 index 0000000000..8ffafec07b --- /dev/null +++ b/changelog.d/13749.bugfix @@ -0,0 +1 @@ +Fix a long standing bug where device lists would remain cached when remote users left and rejoined the last room shared with the local homeserver. diff --git a/synapse/handlers/device.py b/synapse/handlers/device.py index c5ac169644..901e2310b7 100644 --- a/synapse/handlers/device.py +++ b/synapse/handlers/device.py @@ -45,7 +45,6 @@ from synapse.types import ( JsonDict, StreamKeyType, StreamToken, - UserID, get_domain_from_id, get_verify_key_from_cross_signing_key, ) @@ -324,8 +323,6 @@ class DeviceHandler(DeviceWorkerHandler): self.device_list_updater.incoming_device_list_update, ) - hs.get_distributor().observe("user_left_room", self.user_left_room) - # Whether `_handle_new_device_update_async` is currently processing. self._handle_new_device_update_is_processing = False @@ -569,14 +566,6 @@ class DeviceHandler(DeviceWorkerHandler): StreamKeyType.DEVICE_LIST, position, users=[from_user_id] ) - async def user_left_room(self, user: UserID, room_id: str) -> None: - user_id = user.to_string() - room_ids = await self.store.get_rooms_for_user(user_id) - if not room_ids: - # We no longer share rooms with this user, so we'll no longer - # receive device updates. Mark this in DB. - await self.store.mark_remote_user_device_list_as_unsubscribed(user_id) - async def store_dehydrated_device( self, user_id: str, diff --git a/synapse/handlers/e2e_keys.py b/synapse/handlers/e2e_keys.py index ec81639c78..8eed63ccf3 100644 --- a/synapse/handlers/e2e_keys.py +++ b/synapse/handlers/e2e_keys.py @@ -175,6 +175,32 @@ class E2eKeysHandler: user_ids_not_in_cache, remote_results, ) = await self.store.get_user_devices_from_cache(query_list) + + # Check that the homeserver still shares a room with all cached users. + # Note that this check may be slightly racy when a remote user leaves a + # room after we have fetched their cached device list. In the worst case + # we will do extra federation queries for devices that we had cached. + cached_users = set(remote_results.keys()) + valid_cached_users = ( + await self.store.get_users_server_still_shares_room_with( + remote_results.keys() + ) + ) + invalid_cached_users = cached_users - valid_cached_users + if invalid_cached_users: + # Fix up results. If we get here, there is either a bug in device + # list tracking, or we hit the race mentioned above. + user_ids_not_in_cache.update(invalid_cached_users) + for invalid_user_id in invalid_cached_users: + remote_results.pop(invalid_user_id) + # This log message may be removed if it turns out it's almost + # entirely triggered by races. + logger.error( + "Devices for %s were cached, but the server no longer shares " + "any rooms with them. The cached device lists are stale.", + invalid_cached_users, + ) + for user_id, devices in remote_results.items(): user_devices = results.setdefault(user_id, {}) for device_id, device in devices.items(): diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index dad3731b9b..501dbbc990 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -598,9 +598,9 @@ class EventsPersistenceStorageController: # room state_delta_for_room: Dict[str, DeltaState] = {} - # Set of remote users which were in rooms the server has left. We - # should check if we still share any rooms and if not we mark their - # device lists as stale. + # Set of remote users which were in rooms the server has left or who may + # have left rooms the server is in. We should check if we still share any + # rooms and if not we mark their device lists as stale. potentially_left_users: Set[str] = set() if not backfilled: @@ -725,6 +725,20 @@ class EventsPersistenceStorageController: current_state = {} delta.no_longer_in_room = True + # Add all remote users that might have left rooms. + potentially_left_users.update( + user_id + for event_type, user_id in delta.to_delete + if event_type == EventTypes.Member + and not self.is_mine_id(user_id) + ) + potentially_left_users.update( + user_id + for event_type, user_id in delta.to_insert.keys() + if event_type == EventTypes.Member + and not self.is_mine_id(user_id) + ) + state_delta_for_room[room_id] = delta await self.persist_events_store._persist_events_and_state_updates( diff --git a/tests/handlers/test_e2e_keys.py b/tests/handlers/test_e2e_keys.py index 1e6ad4b663..95698bc275 100644 --- a/tests/handlers/test_e2e_keys.py +++ b/tests/handlers/test_e2e_keys.py @@ -891,6 +891,12 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase): new_callable=mock.MagicMock, return_value=make_awaitable(["some_room_id"]), ) + mock_get_users = mock.patch.object( + self.store, + "get_users_server_still_shares_room_with", + new_callable=mock.MagicMock, + return_value=make_awaitable({remote_user_id}), + ) mock_request = mock.patch.object( self.hs.get_federation_client(), "query_user_devices", @@ -898,7 +904,7 @@ class E2eKeysHandlerTestCase(unittest.HomeserverTestCase): return_value=make_awaitable(response_body), ) - with mock_get_rooms, mock_request as mocked_federation_request: + with mock_get_rooms, mock_get_users, mock_request as mocked_federation_request: # Make the first query and sanity check it succeeds. response_1 = self.get_success( e2e_handler.query_devices(