From a27fb7d5cac3cacc55a1c778f02d074d4115eea6 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 1 Oct 2019 11:05:48 +0100 Subject: [PATCH 1/4] Don't repeatedly attempt to censor events we don't have. Currently we don't set `have_censored` column if we don't have the target event of a redaction, which means we repeatedly attempt to censor the same non-existant event. When we persist non-redacted events we unset the `have_censored` column for any redactions that target said event. --- synapse/storage/events.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index ddf7ab6479..5d56ceeab3 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -1389,6 +1389,22 @@ class EventsStore( ], ) + for event, _ in events_and_contexts: + if not event.internal_metadata.is_redacted(): + # If we're persisting an unredacted event we go and ensure + # that we mark any redactions that reference this event as + # requiring censoring. + self._simple_update_txn( + txn, + table="redactions", + keyvalues={ + "redacts": event.event_id, + }, + updatevalues={ + "have_censored": False, + } + ) + def _store_rejected_events_txn(self, txn, events_and_contexts): """Add rows to the 'rejections' table for received events which were rejected @@ -1589,7 +1605,7 @@ class EventsStore( sql = """ SELECT redact_event.event_id, redacts FROM redactions INNER JOIN events AS redact_event USING (event_id) - INNER JOIN events AS original_event ON ( + LEFT JOIN events AS original_event ON ( redact_event.room_id = original_event.room_id AND redacts = original_event.event_id ) From 898dde981b41a4dfb79b5830f17e6eb9871ef762 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 1 Oct 2019 13:23:34 +0100 Subject: [PATCH 2/4] Add received_ts column to redactions. This will allow us to efficiently search for uncensored redactions in the DB before a given time. --- synapse/storage/events.py | 20 +++--- synapse/storage/events_bg_updates.py | 61 +++++++++++++++++++ .../schema/delta/56/redaction_censor2.sql | 20 ++++++ 3 files changed, 92 insertions(+), 9 deletions(-) create mode 100644 synapse/storage/schema/delta/56/redaction_censor2.sql diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 5d56ceeab3..3104815f1a 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -1397,12 +1397,8 @@ class EventsStore( self._simple_update_txn( txn, table="redactions", - keyvalues={ - "redacts": event.event_id, - }, - updatevalues={ - "have_censored": False, - } + keyvalues={"redacts": event.event_id}, + updatevalues={"have_censored": False}, ) def _store_rejected_events_txn(self, txn, events_and_contexts): @@ -1568,9 +1564,15 @@ class EventsStore( def _store_redaction(self, txn, event): # invalidate the cache for the redacted event txn.call_after(self._invalidate_get_event_cache, event.redacts) - txn.execute( - "INSERT INTO redactions (event_id, redacts) VALUES (?,?)", - (event.event_id, event.redacts), + + self._simple_insert_txn( + txn, + table="redactions", + values={ + "event_id": event.event_id, + "redacts": event.redacts, + "received_ts": self._clock.time_msec(), + }, ) @defer.inlineCallbacks diff --git a/synapse/storage/events_bg_updates.py b/synapse/storage/events_bg_updates.py index 6587f31e2b..5717baf48c 100644 --- a/synapse/storage/events_bg_updates.py +++ b/synapse/storage/events_bg_updates.py @@ -67,6 +67,10 @@ class EventsBackgroundUpdatesStore(BackgroundUpdateStore): self.DELETE_SOFT_FAILED_EXTREMITIES, self._cleanup_extremities_bg_update ) + self.register_background_update_handler( + "redactions_received_ts", self._redactions_received_ts + ) + @defer.inlineCallbacks def _background_reindex_fields_sender(self, progress, batch_size): target_min_stream_id = progress["target_min_stream_id_inclusive"] @@ -397,3 +401,60 @@ class EventsBackgroundUpdatesStore(BackgroundUpdateStore): ) return num_handled + + @defer.inlineCallbacks + def _redactions_received_ts(self, progress, batch_size): + """Handles filling out the `received_ts` column in redactions. + """ + last_event_id = progress.get("last_event_id", "") + + def _redactions_received_ts_txn(txn): + # Fetch the set of event IDs that we want to update + sql = """ + SELECT event_id FROM redactions + WHERE event_id > ? + ORDER BY event_id ASC + LIMIT ? + """ + + txn.execute(sql, (last_event_id, batch_size)) + + rows = txn.fetchall() + if not rows: + return 0 + + upper_event_id, = rows[-1] + + # Update the redactions with the received_ts. + # + # Note: Not all events have an associated received_ts, so we + # fallback to using origin_server_ts. If we for some reason don't + # have an origin_server_ts, lets just use the current timestamp. + # + # We don't want to leave it null, as then we'll never try and + # censor those redactions. + sql = """ + UPDATE redactions + SET received_ts = ( + SELECT COALESCE(received_ts, origin_server_ts, ?) FROM events + WHERE events.event_id = redactions.event_id + ) + WHERE ? <= event_id AND event_id <= ? + """ + + txn.execute(sql, (self._clock.time_msec(), last_event_id, upper_event_id)) + + self._background_update_progress_txn( + txn, "redactions_received_ts", {"last_event_id": upper_event_id} + ) + + return len(rows) + + count = yield self.runInteraction( + "_redactions_received_ts", _redactions_received_ts_txn + ) + + if not count: + yield self._end_background_update("redactions_received_ts") + + return count diff --git a/synapse/storage/schema/delta/56/redaction_censor2.sql b/synapse/storage/schema/delta/56/redaction_censor2.sql new file mode 100644 index 0000000000..77a5eca499 --- /dev/null +++ b/synapse/storage/schema/delta/56/redaction_censor2.sql @@ -0,0 +1,20 @@ +/* Copyright 2019 The Matrix.org Foundation C.I.C. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +ALTER TABLE redactions ADD COLUMN received_ts BIGINT; +CREATE INDEX redactions_have_censored_ts ON redactions(received_ts) WHERE not have_censored; + +INSERT INTO background_updates (update_name, progress_json) VALUES + ('redactions_received_ts', '{}'); From 5e8387af9e771ae42c7c8c4dc186000d862d3787 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 1 Oct 2019 13:28:41 +0100 Subject: [PATCH 3/4] Use `received_ts` to find uncensored redacted events Joining against `events` and ordering by `stream_ordering` is inefficient as it forced scanning the entirety of the redactions table. This isn't the case if we use `redactions.received_ts` column as we can then use an index. --- synapse/storage/events.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 3104815f1a..2e485c8644 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -1589,36 +1589,29 @@ class EventsStore( if self.hs.config.redaction_retention_period is None: return - max_pos = yield self.find_first_stream_ordering_after_ts( - self._clock.time_msec() - self.hs.config.redaction_retention_period - ) + before_ts = self._clock.time_msec() - self.hs.config.redaction_retention_period # We fetch all redactions that: # 1. point to an event we have, - # 2. has a stream ordering from before the cut off, and + # 2. has a received_ts from before the cut off, and # 3. we haven't yet censored. # # This is limited to 100 events to ensure that we don't try and do too # much at once. We'll get called again so this should eventually catch # up. - # - # We use the range [-max_pos, max_pos] to handle backfilled events, - # which are given negative stream ordering. sql = """ - SELECT redact_event.event_id, redacts FROM redactions - INNER JOIN events AS redact_event USING (event_id) + SELECT redactions.event_id, redacts FROM redactions LEFT JOIN events AS original_event ON ( - redact_event.room_id = original_event.room_id - AND redacts = original_event.event_id + redacts = original_event.event_id ) WHERE NOT have_censored - AND ? <= redact_event.stream_ordering AND redact_event.stream_ordering <= ? - ORDER BY redact_event.stream_ordering ASC + AND redactions.received_ts <= ? + ORDER BY redactions.received_ts ASC LIMIT ? """ rows = yield self._execute( - "_censor_redactions_fetch", None, sql, -max_pos, max_pos, 100 + "_censor_redactions_fetch", None, sql, before_ts, 100 ) updates = [] From 2b8352e6387a71f8bea8b512f1a491f1fedf06fc Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Tue, 1 Oct 2019 13:36:29 +0100 Subject: [PATCH 4/4] Newsfile --- changelog.d/6141.bugfix | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/6141.bugfix diff --git a/changelog.d/6141.bugfix b/changelog.d/6141.bugfix new file mode 100644 index 0000000000..c93920b7b5 --- /dev/null +++ b/changelog.d/6141.bugfix @@ -0,0 +1 @@ +Fix bad performance of censoring redactions background task.