From 17d585753f1df2b2c2b13ddb8171e174cef97aac Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 4 Oct 2018 15:18:52 +0100 Subject: [PATCH 01/10] Delete unreferened state groups during purge --- synapse/storage/events.py | 33 +++++++++++++++++++++----- synapse/storage/state.py | 50 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 6 deletions(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index e7487311ce..0fb190530a 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -2025,6 +2025,7 @@ class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore logger.info("[purge] finding state groups which depend on redundant" " state groups") remaining_state_groups = [] + unreferenced_state_groups = 0 for i in range(0, len(state_rows), 100): chunk = [sg for sg, in state_rows[i:i + 100]] # look for state groups whose prev_state_group is one we are about @@ -2037,13 +2038,33 @@ class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore retcols=["state_group"], keyvalues={}, ) - remaining_state_groups.extend( - row["state_group"] for row in rows - # exclude state groups we are about to delete: no point in - # updating them - if row["state_group"] not in state_groups_to_delete - ) + for row in rows: + sg = row["state_group"] + + if sg in state_groups_to_delete: + # exclude state groups we are about to delete: no point in + # updating them + continue + + if not self._is_state_group_referenced(txn, sg): + # Let's also delete unreferenced state groups while we're + # here, since otherwise we'd need to de-delta them + state_groups_to_delete.add(sg) + unreferenced_state_groups += 1 + continue + + remaining_state_groups.append(sg) + + logger.info( + "[purge] found %i extra unreferenced state groups to delete", + unreferenced_state_groups, + ) + + logger.info( + "[purge] de-delta-ing %i remaining state groups", + len(remaining_state_groups), + ) # Now we turn the state groups that reference to-be-deleted state # groups to non delta versions. diff --git a/synapse/storage/state.py b/synapse/storage/state.py index 3f4cbd61c4..b88c7dc091 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -1041,6 +1041,56 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): return count + def _is_state_group_referenced(self, txn, state_group): + """Checks if a given state group is referenced, or is safe to delete. + + A state groups is referenced if it or any of its descendants are + pointed at by an event. (A descendant is a group which has the given + state_group as a prev group) + """ + + # We check this by doing a depth first search to look for any + # descendant referenced by `event_to_state_groups`. + + # State groups we need to check, contains state groups that are + # descendants of `state_group` + state_groups_to_search = [state_group] + + # Set of state groups we've already checked + state_groups_searched = set() + + while state_groups_to_search: + state_group = state_groups_to_search.pop() # Next state group to check + + is_referenced = self._simple_select_one_onecol_txn( + txn, + table="event_to_state_groups", + keyvalues={"state_group": state_group}, + retcol="event_id", + allow_none=True, + ) + if is_referenced: + # A descendant is referenced by event_to_state_groups, so + # original state group is referenced. + return True + + state_groups_searched.add(state_group) + + # Find all children of current state group and add to search + references = self._simple_select_onecol_txn( + txn, + table="state_group_edges", + keyvalues={"prev_state_group": state_group}, + retcol="state_group", + ) + state_groups_to_search.extend(references) + + # Lets be paranoid and check for cycles + if state_groups_searched.intersection(references): + raise Exception("State group %s has cyclic dependency", state_group) + + return False + class StateStore(StateGroupWorkerStore, BackgroundUpdateStore): """ Keeps track of the state at a given event. From 4917ff55234718c0e650c6dc2a1117304465b9be Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 4 Oct 2018 15:24:01 +0100 Subject: [PATCH 02/10] Add state_group index to event_to_state_groups This is needed to efficiently check for unreferenced state groups during purge. --- synapse/storage/prepare_database.py | 2 +- .../52/add_event_to_state_group_index.sql | 19 +++++++++++++++++++ synapse/storage/state.py | 7 +++++++ 3 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 synapse/storage/schema/delta/52/add_event_to_state_group_index.sql diff --git a/synapse/storage/prepare_database.py b/synapse/storage/prepare_database.py index b364719312..bd740e1e45 100644 --- a/synapse/storage/prepare_database.py +++ b/synapse/storage/prepare_database.py @@ -25,7 +25,7 @@ logger = logging.getLogger(__name__) # Remember to update this number every time a change is made to database # schema files, so the users will be informed on server restarts. -SCHEMA_VERSION = 51 +SCHEMA_VERSION = 52 dir_path = os.path.abspath(os.path.dirname(__file__)) diff --git a/synapse/storage/schema/delta/52/add_event_to_state_group_index.sql b/synapse/storage/schema/delta/52/add_event_to_state_group_index.sql new file mode 100644 index 0000000000..91e03d13e1 --- /dev/null +++ b/synapse/storage/schema/delta/52/add_event_to_state_group_index.sql @@ -0,0 +1,19 @@ +/* Copyright 2018 New Vector Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +-- This is needed to efficiently check for unreferenced state groups during +-- purge. Added events_to_state_group(state_group) index +INSERT into background_updates (update_name, progress_json) + VALUES ('event_to_state_groups_sg_index', '{}'); diff --git a/synapse/storage/state.py b/synapse/storage/state.py index b88c7dc091..3f08f447a3 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -1114,6 +1114,7 @@ class StateStore(StateGroupWorkerStore, BackgroundUpdateStore): STATE_GROUP_DEDUPLICATION_UPDATE_NAME = "state_group_state_deduplication" STATE_GROUP_INDEX_UPDATE_NAME = "state_group_state_type_index" CURRENT_STATE_INDEX_UPDATE_NAME = "current_state_members_idx" + EVENT_STATE_GROUP_INDEX_UPDATE_NAME = "event_to_state_groups_sg_index" def __init__(self, db_conn, hs): super(StateStore, self).__init__(db_conn, hs) @@ -1132,6 +1133,12 @@ class StateStore(StateGroupWorkerStore, BackgroundUpdateStore): columns=["state_key"], where_clause="type='m.room.member'", ) + self.register_background_index_update( + self.EVENT_STATE_GROUP_INDEX_UPDATE_NAME, + index_name="event_to_state_groups_sg_index", + table="event_to_state_groups", + columns=["state_group"], + ) def _store_event_state_mappings_txn(self, txn, events_and_contexts): state_groups = {} From d9f3db5081a4306520cb3c3208564d3c923fbf0a Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Thu, 4 Oct 2018 15:29:30 +0100 Subject: [PATCH 03/10] Newsfile --- changelog.d/4006.misc | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/4006.misc diff --git a/changelog.d/4006.misc b/changelog.d/4006.misc new file mode 100644 index 0000000000..35ffa1c2d2 --- /dev/null +++ b/changelog.d/4006.misc @@ -0,0 +1 @@ +Delete unreferenced state groups during history purge From 67a1e315cc7f8b270ec87e0ab6e58aa2adaee32d Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 12 Oct 2018 13:49:48 +0100 Subject: [PATCH 04/10] Fix up comments --- synapse/storage/state.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/synapse/storage/state.py b/synapse/storage/state.py index 3f08f447a3..f7cf5c86c9 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -1044,9 +1044,9 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): def _is_state_group_referenced(self, txn, state_group): """Checks if a given state group is referenced, or is safe to delete. - A state groups is referenced if it or any of its descendants are - pointed at by an event. (A descendant is a group which has the given - state_group as a prev group) + A state group is referenced if it or any of its descendants are + pointed at by an event. (A descendant is a state_group whose chain of + prev_groups includes the given state_group.) """ # We check this by doing a depth first search to look for any From 47a9da28caa6a4f27d2df31f043971a2c9c7b555 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 12 Oct 2018 20:43:18 +0100 Subject: [PATCH 05/10] Batch process handling state groups --- synapse/storage/events.py | 81 ++++++++---------------------- synapse/storage/state.py | 100 +++++++++++++++++++++++++------------- 2 files changed, 86 insertions(+), 95 deletions(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index 0fb190530a..e4d0f8b1a9 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -37,6 +37,7 @@ from synapse.metrics.background_process_metrics import run_as_background_process from synapse.storage.background_updates import BackgroundUpdateStore from synapse.storage.event_federation import EventFederationStore from synapse.storage.events_worker import EventsWorkerStore +from synapse.storage.state import StateGroupWorkerStore from synapse.types import RoomStreamToken, get_domain_from_id from synapse.util.async_helpers import ObservableDeferred from synapse.util.caches.descriptors import cached, cachedInlineCallbacks @@ -203,7 +204,8 @@ def _retry_on_integrity_error(func): # inherits from EventFederationStore so that we can call _update_backward_extremities # and _handle_mult_prev_events (though arguably those could both be moved in here) -class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore): +class EventsStore(StateGroupWorkerStore, EventFederationStore, EventsWorkerStore, + BackgroundUpdateStore): EVENT_ORIGIN_SERVER_TS_NAME = "event_origin_server_ts" EVENT_FIELDS_SENDER_URL_UPDATE_NAME = "event_fields_sender_url" @@ -1995,70 +1997,29 @@ class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore logger.info("[purge] finding redundant state groups") - # Get all state groups that are only referenced by events that are - # to be deleted. - # This works by first getting state groups that we may want to delete, - # joining against event_to_state_groups to get events that use that - # state group, then left joining against events_to_purge again. Any - # state group where the left join produce *no nulls* are referenced - # only by events that are going to be purged. + # Get all state groups that are referenced by events that are to be + # deleted. We then go and check if they are referenced by other events + # or state groups, and if not we delete them. txn.execute(""" - SELECT state_group FROM - ( - SELECT DISTINCT state_group FROM events_to_purge - INNER JOIN event_to_state_groups USING (event_id) - ) AS sp - INNER JOIN event_to_state_groups USING (state_group) - LEFT JOIN events_to_purge AS ep USING (event_id) - GROUP BY state_group - HAVING SUM(CASE WHEN ep.event_id IS NULL THEN 1 ELSE 0 END) = 0 + SELECT DISTINCT state_group FROM events_to_purge + INNER JOIN event_to_state_groups USING (event_id) """) - state_rows = txn.fetchall() - logger.info("[purge] found %i redundant state groups", len(state_rows)) + referenced_state_groups = set(sg for sg, in txn) + logger.info( + "[purge] found %i referenced state groups", + len(referenced_state_groups), + ) - # make a set of the redundant state groups, so that we can look them up - # efficiently - state_groups_to_delete = set([sg for sg, in state_rows]) + logger.info("[purge] finding state groups that can be deleted") - # Now we get all the state groups that rely on these state groups - logger.info("[purge] finding state groups which depend on redundant" - " state groups") - remaining_state_groups = [] - unreferenced_state_groups = 0 - for i in range(0, len(state_rows), 100): - chunk = [sg for sg, in state_rows[i:i + 100]] - # look for state groups whose prev_state_group is one we are about - # to delete - rows = self._simple_select_many_txn( - txn, - table="state_group_edges", - column="prev_state_group", - iterable=chunk, - retcols=["state_group"], - keyvalues={}, - ) - - for row in rows: - sg = row["state_group"] - - if sg in state_groups_to_delete: - # exclude state groups we are about to delete: no point in - # updating them - continue - - if not self._is_state_group_referenced(txn, sg): - # Let's also delete unreferenced state groups while we're - # here, since otherwise we'd need to de-delta them - state_groups_to_delete.add(sg) - unreferenced_state_groups += 1 - continue - - remaining_state_groups.append(sg) + state_groups_to_delete, remaining_state_groups = self._find_unreferenced_groups( + txn, referenced_state_groups, + ) logger.info( - "[purge] found %i extra unreferenced state groups to delete", - unreferenced_state_groups, + "[purge] found %i state groups to delete", + len(state_groups_to_delete), ) logger.info( @@ -2109,11 +2070,11 @@ class EventsStore(EventFederationStore, EventsWorkerStore, BackgroundUpdateStore logger.info("[purge] removing redundant state groups") txn.executemany( "DELETE FROM state_groups_state WHERE state_group = ?", - state_rows + ((sg,) for sg in state_groups_to_delete), ) txn.executemany( "DELETE FROM state_groups WHERE id = ?", - state_rows + ((sg,) for sg in state_groups_to_delete), ) logger.info("[purge] removing events from event_to_state_groups") diff --git a/synapse/storage/state.py b/synapse/storage/state.py index f7cf5c86c9..0f86311ed4 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -1041,55 +1041,85 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): return count - def _is_state_group_referenced(self, txn, state_group): - """Checks if a given state group is referenced, or is safe to delete. + def _find_unreferenced_groups(self, txn, state_groups): + """Used when purging history to figure out which state groups can be + deleted and which need to be de-delta'ed (due to one of its prev groups + being scheduled for deletion). - A state group is referenced if it or any of its descendants are - pointed at by an event. (A descendant is a state_group whose chain of - prev_groups includes the given state_group.) + Args: + txn + state_groups (set[int]): Set of state groups referenced by events + that are going to be deleted. + + Returns: + tuple[set[int], set[int]]: The set of state groups that can be + deleted and the set of state groups that need to be de-delta'ed """ + # Graph of state group -> previous group + graph = {} - # We check this by doing a depth first search to look for any - # descendant referenced by `event_to_state_groups`. + # Set of events that we have found to be referenced by events + referenced_groups = set() - # State groups we need to check, contains state groups that are - # descendants of `state_group` - state_groups_to_search = [state_group] + # Set of state groups we've already seen + state_groups_seen = set(state_groups) - # Set of state groups we've already checked - state_groups_searched = set() + # Set of state groups to handle next. + next_to_search = set(state_groups) + while next_to_search: + # We bound size of groups we're looking up at once, to stop the + # SQL query getting too big + if len(next_to_search) < 100: + current_search = next_to_search + next_to_search = set() + else: + lst = list(next_to_search) + current_search = set(lst[:100]) + next_to_search = set(lst[100:]) - while state_groups_to_search: - state_group = state_groups_to_search.pop() # Next state group to check + # Check if state groups are referenced + sql = """ + SELECT state_group, count(*) FROM event_to_state_groups + LEFT JOIN events_to_purge AS ep USING (event_id) + WHERE state_group IN (%s) AND ep.event_id IS NULL + GROUP BY state_group + """ % (",".join("?" for _ in current_search),) + txn.execute(sql, list(current_search)) - is_referenced = self._simple_select_one_onecol_txn( - txn, - table="event_to_state_groups", - keyvalues={"state_group": state_group}, - retcol="event_id", - allow_none=True, - ) - if is_referenced: - # A descendant is referenced by event_to_state_groups, so - # original state group is referenced. - return True + referenced = set(sg for sg, cnt in txn if cnt > 0) + referenced_groups |= referenced - state_groups_searched.add(state_group) + # We don't continue iterating up the state group graphs for state + # groups that are referenced. + current_search -= referenced - # Find all children of current state group and add to search - references = self._simple_select_onecol_txn( + rows = self._simple_select_many_txn( txn, table="state_group_edges", - keyvalues={"prev_state_group": state_group}, - retcol="state_group", + column="prev_state_group", + iterable=current_search, + keyvalues={}, + retcols=("prev_state_group", "state_group",), ) - state_groups_to_search.extend(references) - # Lets be paranoid and check for cycles - if state_groups_searched.intersection(references): - raise Exception("State group %s has cyclic dependency", state_group) + next_to_search.update(row["state_group"] for row in rows) + # We don't bother re-handling groups we've already seen + next_to_search -= state_groups_seen + state_groups_seen |= next_to_search - return False + for row in rows: + # Note: Each state group can have at most one prev group + graph[row["state_group"]] = row["prev_state_group"] + + to_delete = state_groups_seen - referenced_groups + + to_dedelta = set() + for sg in referenced_groups: + prev_sg = graph.get(sg) + if prev_sg and prev_sg in to_delete: + to_dedelta.add(sg) + + return to_delete, to_dedelta class StateStore(StateGroupWorkerStore, BackgroundUpdateStore): From 67f7b9cb50c246226b94027e3c1d4b958ff9f840 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Fri, 19 Oct 2018 16:06:59 +0100 Subject: [PATCH 06/10] pep8 --- synapse/storage/events.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index af822fb69d..379b5c514f 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -2041,7 +2041,7 @@ class EventsStore(StateGroupWorkerStore, EventFederationStore, EventsWorkerStore INNER JOIN event_to_state_groups USING (event_id) """) - referenced_state_groups = set(sg for sg, in txn) + referenced_state_groups = set(sg for sg, in txn) logger.info( "[purge] found %i referenced state groups", len(referenced_state_groups), From b2399f6281d7cd11e7762b683bdd5a4f0c24927e Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 29 Oct 2018 14:01:11 +0000 Subject: [PATCH 07/10] Make SQL a bit cleaner --- synapse/storage/state.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/synapse/storage/state.py b/synapse/storage/state.py index 59a50a5df9..45afd42b3f 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -1272,14 +1272,13 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): # Check if state groups are referenced sql = """ - SELECT state_group, count(*) FROM event_to_state_groups + SELECT DISTINCT state_group FROM event_to_state_groups LEFT JOIN events_to_purge AS ep USING (event_id) WHERE state_group IN (%s) AND ep.event_id IS NULL - GROUP BY state_group """ % (",".join("?" for _ in current_search),) txn.execute(sql, list(current_search)) - referenced = set(sg for sg, cnt in txn if cnt > 0) + referenced = set(sg for sg, in txn) referenced_groups |= referenced # We don't continue iterating up the state group graphs for state From f4f223aa4455bea3aa642c23ae957932b1168ba3 Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 29 Oct 2018 14:01:49 +0000 Subject: [PATCH 08/10] Don't make temporary list --- synapse/storage/state.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/synapse/storage/state.py b/synapse/storage/state.py index 45afd42b3f..947d3fc177 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -1266,9 +1266,8 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): current_search = next_to_search next_to_search = set() else: - lst = list(next_to_search) - current_search = set(lst[:100]) - next_to_search = set(lst[100:]) + current_search = set(islice(next_to_search, 100)) + next_to_search -= current_search # Check if state groups are referenced sql = """ From 664b192a3b4aa57597d9832361b025bc078ea87a Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 29 Oct 2018 14:21:43 +0000 Subject: [PATCH 09/10] Fix set operations thinko --- synapse/storage/state.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/synapse/storage/state.py b/synapse/storage/state.py index 947d3fc177..dfec57c045 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -1293,10 +1293,11 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): retcols=("prev_state_group", "state_group",), ) - next_to_search.update(row["state_group"] for row in rows) + prevs = set(row["state_group"] for row in rows) # We don't bother re-handling groups we've already seen - next_to_search -= state_groups_seen - state_groups_seen |= next_to_search + prevs -= state_groups_seen + next_to_search |= prevs + state_groups_seen |= prevs for row in rows: # Note: Each state group can have at most one prev group From ad88460e0d2e0a7c2cf39ec5539d5c4ff030bbbd Mon Sep 17 00:00:00 2001 From: Erik Johnston Date: Mon, 29 Oct 2018 14:23:34 +0000 Subject: [PATCH 10/10] Move _find_unreferenced_groups --- synapse/storage/events.py | 85 ++++++++++++++++++++++++++++++++++++++- synapse/storage/state.py | 79 ------------------------------------ 2 files changed, 83 insertions(+), 81 deletions(-) diff --git a/synapse/storage/events.py b/synapse/storage/events.py index e791922733..919e855f3b 100644 --- a/synapse/storage/events.py +++ b/synapse/storage/events.py @@ -2052,8 +2052,10 @@ class EventsStore(StateGroupWorkerStore, EventFederationStore, EventsWorkerStore logger.info("[purge] finding state groups that can be deleted") - state_groups_to_delete, remaining_state_groups = self._find_unreferenced_groups( - txn, referenced_state_groups, + state_groups_to_delete, remaining_state_groups = ( + self._find_unreferenced_groups_during_purge( + txn, referenced_state_groups, + ) ) logger.info( @@ -2209,6 +2211,85 @@ class EventsStore(StateGroupWorkerStore, EventFederationStore, EventsWorkerStore logger.info("[purge] done") + def _find_unreferenced_groups_during_purge(self, txn, state_groups): + """Used when purging history to figure out which state groups can be + deleted and which need to be de-delta'ed (due to one of its prev groups + being scheduled for deletion). + + Args: + txn + state_groups (set[int]): Set of state groups referenced by events + that are going to be deleted. + + Returns: + tuple[set[int], set[int]]: The set of state groups that can be + deleted and the set of state groups that need to be de-delta'ed + """ + # Graph of state group -> previous group + graph = {} + + # Set of events that we have found to be referenced by events + referenced_groups = set() + + # Set of state groups we've already seen + state_groups_seen = set(state_groups) + + # Set of state groups to handle next. + next_to_search = set(state_groups) + while next_to_search: + # We bound size of groups we're looking up at once, to stop the + # SQL query getting too big + if len(next_to_search) < 100: + current_search = next_to_search + next_to_search = set() + else: + current_search = set(itertools.islice(next_to_search, 100)) + next_to_search -= current_search + + # Check if state groups are referenced + sql = """ + SELECT DISTINCT state_group FROM event_to_state_groups + LEFT JOIN events_to_purge AS ep USING (event_id) + WHERE state_group IN (%s) AND ep.event_id IS NULL + """ % (",".join("?" for _ in current_search),) + txn.execute(sql, list(current_search)) + + referenced = set(sg for sg, in txn) + referenced_groups |= referenced + + # We don't continue iterating up the state group graphs for state + # groups that are referenced. + current_search -= referenced + + rows = self._simple_select_many_txn( + txn, + table="state_group_edges", + column="prev_state_group", + iterable=current_search, + keyvalues={}, + retcols=("prev_state_group", "state_group",), + ) + + prevs = set(row["state_group"] for row in rows) + # We don't bother re-handling groups we've already seen + prevs -= state_groups_seen + next_to_search |= prevs + state_groups_seen |= prevs + + for row in rows: + # Note: Each state group can have at most one prev group + graph[row["state_group"]] = row["prev_state_group"] + + to_delete = state_groups_seen - referenced_groups + + to_dedelta = set() + for sg in referenced_groups: + prev_sg = graph.get(sg) + if prev_sg and prev_sg in to_delete: + to_dedelta.add(sg) + + return to_delete, to_dedelta + @defer.inlineCallbacks def is_event_after(self, event_id1, event_id2): """Returns True if event_id1 is after event_id2 in the stream diff --git a/synapse/storage/state.py b/synapse/storage/state.py index dfec57c045..d737bd6778 100644 --- a/synapse/storage/state.py +++ b/synapse/storage/state.py @@ -1234,85 +1234,6 @@ class StateGroupWorkerStore(EventsWorkerStore, SQLBaseStore): return count - def _find_unreferenced_groups(self, txn, state_groups): - """Used when purging history to figure out which state groups can be - deleted and which need to be de-delta'ed (due to one of its prev groups - being scheduled for deletion). - - Args: - txn - state_groups (set[int]): Set of state groups referenced by events - that are going to be deleted. - - Returns: - tuple[set[int], set[int]]: The set of state groups that can be - deleted and the set of state groups that need to be de-delta'ed - """ - # Graph of state group -> previous group - graph = {} - - # Set of events that we have found to be referenced by events - referenced_groups = set() - - # Set of state groups we've already seen - state_groups_seen = set(state_groups) - - # Set of state groups to handle next. - next_to_search = set(state_groups) - while next_to_search: - # We bound size of groups we're looking up at once, to stop the - # SQL query getting too big - if len(next_to_search) < 100: - current_search = next_to_search - next_to_search = set() - else: - current_search = set(islice(next_to_search, 100)) - next_to_search -= current_search - - # Check if state groups are referenced - sql = """ - SELECT DISTINCT state_group FROM event_to_state_groups - LEFT JOIN events_to_purge AS ep USING (event_id) - WHERE state_group IN (%s) AND ep.event_id IS NULL - """ % (",".join("?" for _ in current_search),) - txn.execute(sql, list(current_search)) - - referenced = set(sg for sg, in txn) - referenced_groups |= referenced - - # We don't continue iterating up the state group graphs for state - # groups that are referenced. - current_search -= referenced - - rows = self._simple_select_many_txn( - txn, - table="state_group_edges", - column="prev_state_group", - iterable=current_search, - keyvalues={}, - retcols=("prev_state_group", "state_group",), - ) - - prevs = set(row["state_group"] for row in rows) - # We don't bother re-handling groups we've already seen - prevs -= state_groups_seen - next_to_search |= prevs - state_groups_seen |= prevs - - for row in rows: - # Note: Each state group can have at most one prev group - graph[row["state_group"]] = row["prev_state_group"] - - to_delete = state_groups_seen - referenced_groups - - to_dedelta = set() - for sg in referenced_groups: - prev_sg = graph.get(sg) - if prev_sg and prev_sg in to_delete: - to_dedelta.add(sg) - - return to_delete, to_dedelta - class StateStore(StateGroupWorkerStore, BackgroundUpdateStore): """ Keeps track of the state at a given event.