Merge remote-tracking branch 'origin/develop' into matrix-org-hotfixes

matrix-org-hotfixes
Erik Johnston 2023-08-23 13:46:10 +01:00
commit b7d0c7d3fd
12 changed files with 203 additions and 6 deletions

changelog.d/16133.bugfix Normal file

@@ -0,0 +1 @@
+Fix a rare race that could block new events from being sent for up to two minutes. Introduced in v1.90.0.

(modified changelog entry)

@@ -1 +1 @@
-Clean-up calling `setup_background_tasks` in unit tests.
+Improve presence tests.

changelog.d/16151.misc Normal file

@@ -0,0 +1 @@
+Improve presence tests.

changelog.d/16160.misc Normal file

@@ -0,0 +1 @@
+Reduce DB contention on worker locks.

changelog.d/16164.bugfix Normal file

@@ -0,0 +1 @@
+Fix a bug introduced in 1.87 where synapse would send an excessive amount of federation requests to servers which have been offline for a long time. Contributed by Nico.

changelog.d/16165.misc Normal file

@@ -0,0 +1 @@
+Task scheduler: mark task as active if we are scheduling as soon as possible.

synapse/storage/databases/main/task_scheduler.py

@@ -92,7 +92,7 @@ class TaskSchedulerWorkerStore(SQLBaseStore):
         if clauses:
             sql = sql + " WHERE " + " AND ".join(clauses)
 
-        sql = sql + "ORDER BY timestamp"
+        sql = sql + " ORDER BY timestamp"
 
         txn.execute(sql, args)
         return self.db_pool.cursor_to_dict(txn)
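For context (not part of the commit): a minimal sketch of why the one-character fix above matters. The clause value and table name are assumptions for illustration; the concatenation pattern is the one shown in the hunk.

# Sketch only: the missing space produced invalid SQL whenever a WHERE clause was present.
clauses = ["resource_id = ?"]  # hypothetical clause
sql = "SELECT * FROM scheduled_tasks"  # table name assumed
if clauses:
    sql = sql + " WHERE " + " AND ".join(clauses)

broken = sql + "ORDER BY timestamp"   # "... WHERE resource_id = ?ORDER BY timestamp"
fixed = sql + " ORDER BY timestamp"   # "... WHERE resource_id = ? ORDER BY timestamp"
assert "?ORDER" in broken and "? ORDER" in fixed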

synapse/storage/databases/main/transactions.py

@@ -242,8 +242,6 @@ class TransactionWorkerStore(CacheInvalidationWorkerStore):
     ) -> None:
         # Upsert retry time interval if retry_interval is zero (i.e. we're
         # resetting it) or greater than the existing retry interval.
-        # We also upsert when the new retry interval is the same as the existing one,
-        # since it will be the case when `destination_max_retry_interval` is reached.
        #
         # WARNING: This is executed in autocommit, so we shouldn't add any more
         # SQL calls in here (without being very careful).
@@ -258,8 +256,10 @@ class TransactionWorkerStore(CacheInvalidationWorkerStore):
                     retry_interval = EXCLUDED.retry_interval
                 WHERE
                     EXCLUDED.retry_interval = 0
+                    OR EXCLUDED.retry_last_ts = 0
                     OR destinations.retry_interval IS NULL
-                    OR destinations.retry_interval <= EXCLUDED.retry_interval
+                    OR destinations.retry_interval < EXCLUDED.retry_interval
+                    OR destinations.retry_last_ts < EXCLUDED.retry_last_ts
         """
 
         txn.execute(sql, (destination, failure_ts, retry_last_ts, retry_interval))
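The new WHERE clause, restated as a Python predicate (a sketch, not Synapse code; `RetryRow` and `should_overwrite` are made-up names for illustration): the stored row is only replaced when the backoff is being reset, no backoff exists yet, the backoff genuinely grows, or the incoming attempt is strictly newer.

from dataclasses import dataclass
from typing import Optional

@dataclass
class RetryRow:
    retry_last_ts: int
    retry_interval: Optional[int]

def should_overwrite(stored: RetryRow, new: RetryRow) -> bool:
    # Mirrors the SQL: EXCLUDED is `new`, destinations is `stored`.
    return (
        new.retry_interval == 0                        # backoff being reset
        or new.retry_last_ts == 0                      # destination marked healthy again
        or stored.retry_interval is None               # no backoff recorded yet
        or stored.retry_interval < new.retry_interval  # backoff strictly grew
        or stored.retry_last_ts < new.retry_last_ts    # strictly newer attempt
    )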

(new SQL schema delta: delete trigger)

@@ -0,0 +1,37 @@
+/* Copyright 2023 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- To avoid the possibility of a deadlock, lock the
+-- `worker_read_write_locks_mode` table so that we serialize inserts/deletes
+-- for a specific lock name/key.
+
+CREATE OR REPLACE FUNCTION delete_read_write_lock_parent_before() RETURNS trigger AS $$
+BEGIN
+    -- `PERFORM` is a `SELECT` which discards the rows.
+    PERFORM * FROM worker_read_write_locks_mode
+        WHERE
+            lock_name = OLD.lock_name
+            AND lock_key = OLD.lock_key
+        FOR UPDATE;
+
+    RETURN OLD;
+END
+$$
+LANGUAGE plpgsql;
+
+DROP TRIGGER IF EXISTS delete_read_write_lock_parent_before_trigger ON worker_read_write_locks;
+CREATE TRIGGER delete_read_write_lock_parent_before_trigger BEFORE DELETE ON worker_read_write_locks
+    FOR EACH ROW
+    EXECUTE PROCEDURE delete_read_write_lock_parent_before();
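The locking idea encoded by this trigger, sketched in Python (the psycopg2 usage, DSN, and literal values are assumptions for illustration, not part of the commit): every transaction that touches child rows for a given lock name/key first takes the parent row lock, so concurrent inserts and deletes queue up on one row instead of deadlocking against each other.

import psycopg2

conn = psycopg2.connect("dbname=synapse")  # hypothetical DSN
with conn, conn.cursor() as cur:
    # Lock the parent row; a concurrent transaction doing the same
    # SELECT ... FOR UPDATE on this (lock_name, lock_key) blocks here.
    cur.execute(
        "SELECT * FROM worker_read_write_locks_mode"
        " WHERE lock_name = %s AND lock_key = %s FOR UPDATE",
        ("a_lock", "a_key"),
    )
    # Child-table writes for this key are now serialized with respect to
    # any other transaction taking the same lock-then-write path.
    cur.execute(
        "DELETE FROM worker_read_write_locks"
        " WHERE lock_name = %s AND lock_key = %s AND token = %s",
        ("a_lock", "a_key", "some-token"),
    )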

(new SQL schema delta: upsert trigger)

@@ -0,0 +1,37 @@
+/* Copyright 2023 The Matrix.org Foundation C.I.C
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+-- Fix up the triggers that were in `78/04_read_write_locks_triggers.sql`
+
+-- Reduce the number of writes we do on this table.
+--
+-- Note: that we still want to lock the row here (i.e. still do a `DO UPDATE
+-- SET`) so that we serialize updates.
+CREATE OR REPLACE FUNCTION upsert_read_write_lock_parent() RETURNS trigger AS $$
+BEGIN
+    INSERT INTO worker_read_write_locks_mode (lock_name, lock_key, write_lock, token)
+        VALUES (NEW.lock_name, NEW.lock_key, NEW.write_lock, NEW.token)
+        ON CONFLICT (lock_name, lock_key)
+        DO UPDATE SET write_lock = NEW.write_lock
+            WHERE OLD.write_lock != NEW.write_lock;
+    RETURN NEW;
+END
+$$
+LANGUAGE plpgsql;
+
+DROP TRIGGER IF EXISTS upsert_read_write_lock_parent_trigger ON worker_read_write_locks;
+CREATE TRIGGER upsert_read_write_lock_parent_trigger BEFORE INSERT ON worker_read_write_locks
+    FOR EACH ROW
+    EXECUTE PROCEDURE upsert_read_write_lock_parent();

synapse/util/task_scheduler.py

@@ -154,13 +154,15 @@ class TaskScheduler:
                 f"No function associated with action {action} of the scheduled task"
             )
 
+        status = TaskStatus.SCHEDULED
         if timestamp is None or timestamp < self._clock.time_msec():
             timestamp = self._clock.time_msec()
+            status = TaskStatus.ACTIVE
 
         task = ScheduledTask(
             random_string(16),
             action,
-            TaskStatus.SCHEDULED,
+            status,
             timestamp,
             resource_id,
             params,
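In effect (a paraphrase of the hunk above, not Synapse code; `resolve_schedule` and `now_ms` are hypothetical names): a task with no timestamp, or one in the past, now starts out ACTIVE at "now" rather than sitting in SCHEDULED until the scheduler's next poll.

from typing import Optional, Tuple

def resolve_schedule(timestamp: Optional[int], now_ms: int) -> Tuple[int, str]:
    """Return the (timestamp, status) a newly scheduled task gets."""
    status = "SCHEDULED"
    if timestamp is None or timestamp < now_ms:
        # Scheduling "as soon as possible": run at `now` and mark the task
        # ACTIVE immediately instead of waiting for the next wake-up.
        timestamp = now_ms
        status = "ACTIVE"
    return timestamp, status

assert resolve_schedule(None, 1_000) == (1_000, "ACTIVE")
assert resolve_schedule(5_000, 1_000) == (5_000, "SCHEDULED")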

tests/handlers/test_presence.py

@@ -38,6 +38,7 @@ from synapse.handlers.presence import (
 from synapse.rest import admin
 from synapse.rest.client import room
 from synapse.server import HomeServer
+from synapse.storage.database import LoggingDatabaseConnection
 from synapse.types import JsonDict, UserID, get_domain_from_id
 from synapse.util import Clock
 
@@ -513,6 +514,121 @@ class PresenceTimeoutTestCase(unittest.TestCase):
         self.assertEqual(state, new_state)
 
 
+class PresenceHandlerInitTestCase(unittest.HomeserverTestCase):
+    def default_config(self) -> JsonDict:
+        config = super().default_config()
+        # Disable background tasks on this worker so that the PresenceHandler isn't
+        # loaded until we request it.
+        config["run_background_tasks_on"] = "other"
+        return config
+
+    def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None:
+        self.user_id = f"@test:{self.hs.config.server.server_name}"
+
+        # Move the reactor to the initial time.
+        self.reactor.advance(1000)
+        now = self.clock.time_msec()
+
+        main_store = hs.get_datastores().main
+        self.get_success(
+            main_store.update_presence(
+                [
+                    UserPresenceState(
+                        user_id=self.user_id,
+                        state=PresenceState.ONLINE,
+                        last_active_ts=now,
+                        last_federation_update_ts=now,
+                        last_user_sync_ts=now,
+                        status_msg=None,
+                        currently_active=True,
+                    )
+                ]
+            )
+        )
+
+        # Regenerate the preloaded presence information on PresenceStore.
+        def refill_presence(db_conn: LoggingDatabaseConnection) -> None:
+            main_store._presence_on_startup = main_store._get_active_presence(db_conn)
+
+        self.get_success(main_store.db_pool.runWithConnection(refill_presence))
+
+    def test_restored_presence_idles(self) -> None:
+        """The presence state restored from the database should not persist forever."""
+        # Get the handler (which kicks off a bunch of timers).
+        presence_handler = self.hs.get_presence_handler()
+
+        # Assert the user is online.
+        state = self.get_success(
+            presence_handler.get_state(UserID.from_string(self.user_id))
+        )
+        self.assertEqual(state.state, PresenceState.ONLINE)
+
+        # Advance such that the user should timeout.
+        self.reactor.advance(SYNC_ONLINE_TIMEOUT / 1000)
+        self.reactor.pump([5])
+
+        # Check that the user is now offline.
+        state = self.get_success(
+            presence_handler.get_state(UserID.from_string(self.user_id))
+        )
+        self.assertEqual(state.state, PresenceState.OFFLINE)
+
+    @parameterized.expand(
+        [
+            (PresenceState.BUSY, PresenceState.BUSY),
+            (PresenceState.ONLINE, PresenceState.ONLINE),
+            (PresenceState.UNAVAILABLE, PresenceState.UNAVAILABLE),
+            # Offline syncs don't update the state.
+            (PresenceState.OFFLINE, PresenceState.ONLINE),
+        ]
+    )
+    @unittest.override_config({"experimental_features": {"msc3026_enabled": True}})
+    def test_restored_presence_online_after_sync(
+        self, sync_state: str, expected_state: str
+    ) -> None:
+        """
+        The presence state restored from the database should be overridden with sync after a timeout.
+
+        Args:
+            sync_state: The presence state of the new sync.
+            expected_state: The expected presence right after the sync.
+        """
+        # Get the handler (which kicks off a bunch of timers).
+        presence_handler = self.hs.get_presence_handler()
+
+        # Assert the user is online, as restored.
+        state = self.get_success(
+            presence_handler.get_state(UserID.from_string(self.user_id))
+        )
+        self.assertEqual(state.state, PresenceState.ONLINE)
+
+        # Advance slightly and sync.
+        self.reactor.advance(SYNC_ONLINE_TIMEOUT / 1000 / 2)
+        self.get_success(
+            presence_handler.user_syncing(
+                self.user_id, sync_state != PresenceState.OFFLINE, sync_state
+            )
+        )
+
+        # Assert the user is in the expected state.
+        state = self.get_success(
+            presence_handler.get_state(UserID.from_string(self.user_id))
+        )
+        self.assertEqual(state.state, expected_state)
+
+        # Advance such that the user's preloaded data times out, but not the new sync.
+        self.reactor.advance(SYNC_ONLINE_TIMEOUT / 1000 / 2)
+        self.reactor.pump([5])
+
+        # Check that the user is in the sync state (as the client is currently syncing still).
+        state = self.get_success(
+            presence_handler.get_state(UserID.from_string(self.user_id))
+        )
+        self.assertEqual(state.state, sync_state)
+
+
 class PresenceHandlerTestCase(BaseMultiWorkerStreamTestCase):
     user_id = "@test:server"
     user_id_obj = UserID.from_string(user_id)