Merge pull request #4699 from matrix-org/erikj/stop_fed_not_in_room

Stop backpaginating when events not visible
hhs-9/hs/log-mods
Erik Johnston 2019-03-05 09:32:33 +00:00 committed by GitHub
commit b050a10871
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 127 additions and 29 deletions

1
changelog.d/4699.bugfix Normal file
View File

@ -0,0 +1 @@
Fix attempting to paginate in rooms where server cannot see any events, to avoid unnecessarily pulling in lots of redacted events.

View File

@ -858,6 +858,52 @@ class FederationHandler(BaseHandler):
logger.debug("Not backfilling as no extremeties found.")
return
# We only want to paginate if we can actually see the events we'll get,
# as otherwise we'll just spend a lot of resources to get redacted
# events.
#
# We do this by filtering all the backwards extremities and seeing if
# any remain. Given we don't have the extremity events themselves, we
# need to actually check the events that reference them.
#
# *Note*: the spec wants us to keep backfilling until we reach the start
# of the room in case we are allowed to see some of the history. However
# in practice that causes more issues than its worth, as a) its
# relatively rare for there to be any visible history and b) even when
# there is its often sufficiently long ago that clients would stop
# attempting to paginate before backfill reached the visible history.
#
# TODO: If we do do a backfill then we should filter the backwards
# extremities to only include those that point to visible portions of
# history.
#
# TODO: Correctly handle the case where we are allowed to see the
# forward event but not the backward extremity, e.g. in the case of
# initial join of the server where we are allowed to see the join
# event but not anything before it. This would require looking at the
# state *before* the event, ignoring the special casing certain event
# types have.
forward_events = yield self.store.get_successor_events(
list(extremities),
)
extremities_events = yield self.store.get_events(
forward_events,
check_redacted=False,
get_prev_content=False,
)
# We set `check_history_visibility_only` as we might otherwise get false
# positives from users having been erased.
filtered_extremities = yield filter_events_for_server(
self.store, self.server_name, list(extremities_events.values()),
redact=False, check_history_visibility_only=True,
)
if not filtered_extremities:
defer.returnValue(False)
# Check if we reached a point where we should start backfilling.
sorted_extremeties_tuple = sorted(
extremities.items(),

View File

@ -442,6 +442,28 @@ class EventFederationWorkerStore(EventsWorkerStore, SignatureWorkerStore,
event_results.reverse()
return event_results
@defer.inlineCallbacks
def get_successor_events(self, event_ids):
"""Fetch all events that have the given events as a prev event
Args:
event_ids (iterable[str])
Returns:
Deferred[list[str]]
"""
rows = yield self._simple_select_many_batch(
table="event_edges",
column="prev_event_id",
iterable=event_ids,
retcols=("event_id",),
desc="get_successor_events"
)
defer.returnValue([
row["event_id"] for row in rows
])
class EventFederationStore(EventFederationWorkerStore):
""" Responsible for storing and serving up the various graphs associated

View File

@ -216,28 +216,36 @@ def filter_events_for_client(store, user_id, events, is_peeking=False,
@defer.inlineCallbacks
def filter_events_for_server(store, server_name, events):
# Whatever else we do, we need to check for senders which have requested
# erasure of their data.
erased_senders = yield store.are_users_erased(
(e.sender for e in events),
)
def filter_events_for_server(store, server_name, events, redact=True,
check_history_visibility_only=False):
"""Filter a list of events based on whether given server is allowed to
see them.
def redact_disallowed(event, state):
# if the sender has been gdpr17ed, always return a redacted
# copy of the event.
if erased_senders[event.sender]:
Args:
store (DataStore)
server_name (str)
events (iterable[FrozenEvent])
redact (bool): Whether to return a redacted version of the event, or
to filter them out entirely.
check_history_visibility_only (bool): Whether to only check the
history visibility, rather than things like if the sender has been
erased. This is used e.g. during pagination to decide whether to
backfill or not.
Returns
Deferred[list[FrozenEvent]]
"""
def is_sender_erased(event, erased_senders):
if erased_senders and erased_senders[event.sender]:
logger.info(
"Sender of %s has been erased, redacting",
event.event_id,
)
return prune_event(event)
# state will be None if we decided we didn't need to filter by
# room membership.
if not state:
return event
return True
return False
def check_event_is_visible(event, state):
history = state.get((EventTypes.RoomHistoryVisibility, ''), None)
if history:
visibility = history.content.get("history_visibility", "shared")
@ -259,17 +267,17 @@ def filter_events_for_server(store, server_name, events):
memtype = ev.membership
if memtype == Membership.JOIN:
return event
return True
elif memtype == Membership.INVITE:
if visibility == "invited":
return event
return True
else:
# server has no users in the room: redact
return prune_event(event)
return False
return event
return True
# Next lets check to see if all the events have a history visibility
# Lets check to see if all the events have a history visibility
# of "shared" or "world_readable". If thats the case then we don't
# need to check membership (as we know the server is in the room).
event_to_state_ids = yield store.get_state_ids_for_events(
@ -296,16 +304,31 @@ def filter_events_for_server(store, server_name, events):
for e in itervalues(event_map)
)
if not check_history_visibility_only:
erased_senders = yield store.are_users_erased(
(e.sender for e in events),
)
else:
# We don't want to check whether users are erased, which is equivalent
# to no users having been erased.
erased_senders = {}
if all_open:
# all the history_visibility state affecting these events is open, so
# we don't need to filter by membership state. We *do* need to check
# for user erasure, though.
if erased_senders:
events = [
redact_disallowed(e, None)
for e in events
]
to_return = []
for e in events:
if not is_sender_erased(e, erased_senders):
to_return.append(e)
elif redact:
to_return.append(prune_event(e))
defer.returnValue(to_return)
# If there are no erased users then we can just return the given list
# of events without having to copy it.
defer.returnValue(events)
# Ok, so we're dealing with events that have non-trivial visibility
@ -361,7 +384,13 @@ def filter_events_for_server(store, server_name, events):
for e_id, key_to_eid in iteritems(event_to_state_ids)
}
defer.returnValue([
redact_disallowed(e, event_to_state[e.event_id])
for e in events
])
to_return = []
for e in events:
erased = is_sender_erased(e, erased_senders)
visible = check_event_is_visible(e, event_to_state[e.event_id])
if visible and not erased:
to_return.append(e)
elif redact:
to_return.append(prune_event(e))
defer.returnValue(to_return)