diff --git a/changelog.d/12674.misc b/changelog.d/12674.misc new file mode 100644 index 0000000000..c8a8f32f0a --- /dev/null +++ b/changelog.d/12674.misc @@ -0,0 +1 @@ +Add tests for cancellation of `GET /rooms/$room_id/members` and `GET /rooms/$room_id/state` requests. diff --git a/tests/http/server/_base.py b/tests/http/server/_base.py index b9f1a381aa..57b92beb87 100644 --- a/tests/http/server/_base.py +++ b/tests/http/server/_base.py @@ -12,21 +12,48 @@ # See the License for the specific language governing permissions and # limitations under the License. +import inspect +import itertools +import logging from http import HTTPStatus -from typing import Any, Callable, Optional, Union +from typing import ( + Any, + Callable, + ContextManager, + Dict, + List, + Optional, + Set, + Tuple, + TypeVar, + Union, +) from unittest import mock +from unittest.mock import Mock +from twisted.internet.defer import Deferred from twisted.internet.error import ConnectionDone +from twisted.python.failure import Failure +from twisted.test.proto_helpers import MemoryReactorClock +from twisted.web.server import Site from synapse.http.server import ( HTTP_STATUS_REQUEST_CANCELLED, respond_with_html_bytes, respond_with_json, ) +from synapse.http.site import SynapseRequest +from synapse.logging.context import LoggingContext, make_deferred_yieldable from synapse.types import JsonDict from tests import unittest -from tests.server import FakeChannel, ThreadedMemoryReactorClock +from tests.server import FakeChannel, ThreadedMemoryReactorClock, make_request +from tests.unittest import logcontext_clean + +logger = logging.getLogger(__name__) + + +T = TypeVar("T") class EndpointCancellationTestHelperMixin(unittest.TestCase): @@ -98,3 +125,424 @@ class EndpointCancellationTestHelperMixin(unittest.TestCase): self.assertEqual(code, expected_code) self.assertEqual(request.code, expected_code) self.assertEqual(body, expected_body) + + +@logcontext_clean +def make_request_with_cancellation_test( + test_name: str, + reactor: MemoryReactorClock, + site: Site, + method: str, + path: str, + content: Union[bytes, str, JsonDict] = b"", +) -> FakeChannel: + """Performs a request repeatedly, disconnecting at successive `await`s, until + one completes. + + Fails if: + * A logging context is lost during cancellation. + * A logging context get restarted after it is marked as finished, eg. if + a request's logging context is used by some processing started by the + request, but the request neglects to cancel that processing or wait for it + to complete. + + Note that "Re-starting finished log context" errors get raised within the + request handling code and may or may not get caught. These errors will + likely manifest as a different logging context error at a later point. When + debugging logging context failures, setting a breakpoint in + `logcontext_error` can prove useful. + * A request gets stuck, possibly due to a previous cancellation. + * The request does not return a 499 when the client disconnects. + This implies that a `CancelledError` was swallowed somewhere. + + It is up to the caller to verify that the request returns the correct data when + it finally runs to completion. + + Note that this function can only cover a single code path and does not guarantee + that an endpoint is compatible with cancellation on every code path. + To allow inspection of the code path that is being tested, this function will + log the stack trace at every `await` that gets cancelled. To view these log + lines, `trial` can be run with the `SYNAPSE_TEST_LOG_LEVEL=INFO` environment + variable, which will include the log lines in `_trial_temp/test.log`. + Alternatively, `_log_for_request` can be modified to write to `sys.stdout`. + + Args: + test_name: The name of the test, which will be logged. + reactor: The twisted reactor running the request handler. + site: The twisted `Site` to use to render the request. + method: The HTTP request method ("verb"). + path: The HTTP path, suitably URL encoded (e.g. escaped UTF-8 & spaces and + such). + content: The body of the request. + + Returns: + The `FakeChannel` object which stores the result of the final request that + runs to completion. + """ + # To process a request, a coroutine run is created for the async method handling + # the request. That method may then start other coroutine runs, wrapped in + # `Deferred`s. + # + # We would like to trigger a cancellation at the first `await`, re-run the + # request and cancel at the second `await`, and so on. By patching + # `Deferred.__next__`, we can intercept `await`s, track which ones we have or + # have not seen, and force them to block when they wouldn't have. + + # The set of previously seen `await`s. + # Each element is a stringified stack trace. + seen_awaits: Set[Tuple[str, ...]] = set() + + _log_for_request( + 0, f"Running make_request_with_cancellation_test for {test_name}..." + ) + + for request_number in itertools.count(1): + deferred_patch = Deferred__next__Patch(seen_awaits, request_number) + + try: + with mock.patch( + "synapse.http.server.respond_with_json", wraps=respond_with_json + ) as respond_mock: + with deferred_patch.patch(): + # Start the request. + channel = make_request( + reactor, site, method, path, content, await_result=False + ) + request = channel.request + + # Run the request until we see a new `await` which we have not + # yet cancelled at, or it completes. + while not respond_mock.called and not deferred_patch.new_await_seen: + previous_awaits_seen = deferred_patch.awaits_seen + + reactor.advance(0.0) + + if deferred_patch.awaits_seen == previous_awaits_seen: + # We didn't see any progress. Try advancing the clock. + reactor.advance(1.0) + + if deferred_patch.awaits_seen == previous_awaits_seen: + # We still didn't see any progress. The request might be + # stuck. + raise AssertionError( + "Request appears to be stuck, possibly due to a " + "previous cancelled request" + ) + + if respond_mock.called: + # The request ran to completion and we are done with testing it. + + # `respond_with_json` writes the response asynchronously, so we + # might have to give the reactor a kick before the channel gets + # the response. + deferred_patch.unblock_awaits() + channel.await_result() + + return channel + + # Disconnect the client and wait for the response. + request.connectionLost(reason=ConnectionDone()) + + _log_for_request(request_number, "--- disconnected ---") + + # Advance the reactor just enough to get a response. + # We don't want to advance the reactor too far, because we can only + # detect re-starts of finished logging contexts after we set the + # finished flag below. + for _ in range(2): + # We may need to pump the reactor to allow `delay_cancellation`s to + # finish. + if not respond_mock.called: + reactor.advance(0.0) + + # Try advancing the clock if that didn't work. + if not respond_mock.called: + reactor.advance(1.0) + + # `delay_cancellation`s may be waiting for processing that we've + # forced to block. Try unblocking them, followed by another round of + # pumping the reactor. + if not respond_mock.called: + deferred_patch.unblock_awaits() + + # Mark the request's logging context as finished. If it gets + # activated again, an `AssertionError` will be raised and bubble up + # through request handling code. This `AssertionError` may or may not be + # caught. Eventually some other code will deactivate the logging + # context which will raise a different `AssertionError` because + # resource usage won't have been correctly tracked. + if isinstance(request, SynapseRequest) and request.logcontext: + request.logcontext.finished = True + + # Check that the request finished with a 499, + # ie. the `CancelledError` wasn't swallowed. + respond_mock.assert_called_once() + + if request.code != HTTP_STATUS_REQUEST_CANCELLED: + raise AssertionError( + f"{request.code} != {HTTP_STATUS_REQUEST_CANCELLED} : " + "Cancelled request did not finish with the correct status code." + ) + finally: + # Unblock any processing that might be shared between requests, if we + # haven't already done so. + deferred_patch.unblock_awaits() + + assert False, "unreachable" # noqa: B011 + + +class Deferred__next__Patch: + """A `Deferred.__next__` patch that will intercept `await`s and force them + to block once it sees a new `await`. + + When done with the patch, `unblock_awaits()` must be called to clean up after any + `await`s that were forced to block, otherwise processing shared between multiple + requests, such as database queries started by `@cached`, will become permanently + stuck. + + Usage: + seen_awaits = set() + deferred_patch = Deferred__next__Patch(seen_awaits, 1) + try: + with deferred_patch.patch(): + # do things + ... + finally: + deferred_patch.unblock_awaits() + """ + + def __init__(self, seen_awaits: Set[Tuple[str, ...]], request_number: int): + """ + Args: + seen_awaits: The set of stack traces of `await`s that have been previously + seen. When the `Deferred.__next__` patch sees a new `await`, it will add + it to the set. + request_number: The request number to log against. + """ + self._request_number = request_number + self._seen_awaits = seen_awaits + + self._original_Deferred___next__ = Deferred.__next__ + + # The number of `await`s on `Deferred`s we have seen so far. + self.awaits_seen = 0 + + # Whether we have seen a new `await` not in `seen_awaits`. + self.new_await_seen = False + + # To force `await`s on resolved `Deferred`s to block, we make up a new + # unresolved `Deferred` and return it out of `Deferred.__next__` / + # `coroutine.send()`. We have to resolve it later, in case the `await`ing + # coroutine is part of some shared processing, such as `@cached`. + self._to_unblock: Dict[Deferred, Union[object, Failure]] = {} + + # The last stack we logged. + self._previous_stack: List[inspect.FrameInfo] = [] + + def patch(self) -> ContextManager[Mock]: + """Returns a context manager which patches `Deferred.__next__`.""" + + def Deferred___next__( + deferred: "Deferred[T]", value: object = None + ) -> "Deferred[T]": + """Intercepts `await`s on `Deferred`s and rigs them to block once we have + seen enough of them. + + `Deferred.__next__` will normally: + * return `self` if the `Deferred` is unresolved, in which case + `coroutine.send()` will return the `Deferred`, and + `_defer.inlineCallbacks` will stop running the coroutine until the + `Deferred` is resolved. + * raise a `StopIteration(result)`, containing the result of the `await`. + * raise another exception, which will come out of the `await`. + """ + self.awaits_seen += 1 + + stack = _get_stack(skip_frames=1) + stack_hash = _hash_stack(stack) + + if stack_hash not in self._seen_awaits: + # Block at the current `await` onwards. + self._seen_awaits.add(stack_hash) + self.new_await_seen = True + + if not self.new_await_seen: + # This `await` isn't interesting. Let it proceed normally. + + # Don't log the stack. It's been seen before in a previous run. + self._previous_stack = stack + + return self._original_Deferred___next__(deferred, value) + + # We want to block at the current `await`. + if deferred.called and not deferred.paused: + # This `Deferred` already has a result. + # We return a new, unresolved, `Deferred` for `_inlineCallbacks` to wait + # on. This blocks the coroutine that did this `await`. + # We queue it up for unblocking later. + new_deferred: "Deferred[T]" = Deferred() + self._to_unblock[new_deferred] = deferred.result + + _log_await_stack( + stack, + self._previous_stack, + self._request_number, + "force-blocked await", + ) + self._previous_stack = stack + + return make_deferred_yieldable(new_deferred) + + # This `Deferred` does not have a result yet. + # The `await` will block normally, so we don't have to do anything. + _log_await_stack( + stack, + self._previous_stack, + self._request_number, + "blocking await", + ) + self._previous_stack = stack + + return self._original_Deferred___next__(deferred, value) + + return mock.patch.object(Deferred, "__next__", new=Deferred___next__) + + def unblock_awaits(self) -> None: + """Unblocks any shared processing that we forced to block. + + Must be called when done, otherwise processing shared between multiple requests, + such as database queries started by `@cached`, will become permanently stuck. + """ + to_unblock = self._to_unblock + self._to_unblock = {} + for deferred, result in to_unblock.items(): + deferred.callback(result) + + +def _log_for_request(request_number: int, message: str) -> None: + """Logs a message for an iteration of `make_request_with_cancellation_test`.""" + # We want consistent alignment when logging stack traces, so ensure the logging + # context has a fixed width name. + with LoggingContext(name=f"request-{request_number:<2}"): + logger.info(message) + + +def _log_await_stack( + stack: List[inspect.FrameInfo], + previous_stack: List[inspect.FrameInfo], + request_number: int, + note: str, +) -> None: + """Logs the stack for an `await` in `make_request_with_cancellation_test`. + + Only logs the part of the stack that has changed since the previous call. + + Example output looks like: + ``` + delay_cancellation:750 (synapse/util/async_helpers.py:750) + DatabasePool._runInteraction:768 (synapse/storage/database.py:768) + > *blocked on await* at DatabasePool.runWithConnection:891 (synapse/storage/database.py:891) + ``` + + Args: + stack: The stack to log, as returned by `_get_stack()`. + previous_stack: The previous stack logged, with callers appearing before + callees. + request_number: The request number to log against. + note: A note to attach to the last stack frame, eg. "blocked on await". + """ + for i, frame_info in enumerate(stack[:-1]): + # Skip any frames in common with the previous logging. + if i < len(previous_stack) and frame_info == previous_stack[i]: + continue + + frame = _format_stack_frame(frame_info) + message = f"{' ' * i}{frame}" + _log_for_request(request_number, message) + + # Always print the final frame with the `await`. + # If the frame with the `await` started another coroutine run, we may have already + # printed a deeper stack which includes our final frame. We want to log where all + # `await`s happen, so we reprint the frame in this case. + i = len(stack) - 1 + frame_info = stack[i] + frame = _format_stack_frame(frame_info) + message = f"{' ' * i}> *{note}* at {frame}" + _log_for_request(request_number, message) + + +def _format_stack_frame(frame_info: inspect.FrameInfo) -> str: + """Returns a string representation of a stack frame. + + Used for debug logging. + + Returns: + A string, formatted like + "JsonResource._async_render:559 (synapse/http/server.py:559)". + """ + method_name = _get_stack_frame_method_name(frame_info) + + return ( + f"{method_name}:{frame_info.lineno} ({frame_info.filename}:{frame_info.lineno})" + ) + + +def _get_stack(skip_frames: int) -> List[inspect.FrameInfo]: + """Captures the stack for a request. + + Skips any twisted frames and stops at `JsonResource.wrapped_async_request_handler`. + + Used for debug logging. + + Returns: + A list of `inspect.FrameInfo`s, with callers appearing before callees. + """ + stack = [] + + skip_frames += 1 # Also skip `get_stack` itself. + + for frame_info in inspect.stack()[skip_frames:]: + # Skip any twisted `inlineCallbacks` gunk. + if "/twisted/" in frame_info.filename: + continue + + # Exclude the reactor frame, upwards. + method_name = _get_stack_frame_method_name(frame_info) + if method_name == "ThreadedMemoryReactorClock.advance": + break + + stack.append(frame_info) + + # Stop at `JsonResource`'s `wrapped_async_request_handler`, which is the entry + # point for request handling. + if frame_info.function == "wrapped_async_request_handler": + break + + return stack[::-1] + + +def _get_stack_frame_method_name(frame_info: inspect.FrameInfo) -> str: + """Returns the name of a stack frame's method. + + eg. "JsonResource._async_render". + """ + method_name = frame_info.function + + # Prefix the class name for instance methods. + frame_self = frame_info.frame.f_locals.get("self") + if frame_self: + method = getattr(frame_self, method_name, None) + if method: + method_name = method.__qualname__ + else: + # We couldn't find the method on `self`. + # Make something up. It's useful to know which class "contains" a + # function anyway. + method_name = f"{type(frame_self).__name__} {method_name}" + + return method_name + + +def _hash_stack(stack: List[inspect.FrameInfo]): + """Turns a stack into a hashable value that can be put into a set.""" + return tuple(_format_stack_frame(frame) for frame in stack) diff --git a/tests/rest/client/test_rooms.py b/tests/rest/client/test_rooms.py index f523d89b8f..4be83dfd6d 100644 --- a/tests/rest/client/test_rooms.py +++ b/tests/rest/client/test_rooms.py @@ -42,6 +42,7 @@ from synapse.util import Clock from synapse.util.stringutils import random_string from tests import unittest +from tests.http.server._base import make_request_with_cancellation_test from tests.test_utils import make_awaitable PATH_PREFIX = b"/_matrix/client/api/v1" @@ -471,6 +472,49 @@ class RoomPermissionsTestCase(RoomBase): ) +class RoomStateTestCase(RoomBase): + """Tests /rooms/$room_id/state.""" + + user_id = "@sid1:red" + + def test_get_state_cancellation(self) -> None: + """Test cancellation of a `/rooms/$room_id/state` request.""" + room_id = self.helper.create_room_as(self.user_id) + channel = make_request_with_cancellation_test( + "test_state_cancellation", + self.reactor, + self.site, + "GET", + "/rooms/%s/state" % room_id, + ) + + self.assertEqual(200, channel.code, msg=channel.result["body"]) + self.assertCountEqual( + [state_event["type"] for state_event in channel.json_body], + { + "m.room.create", + "m.room.power_levels", + "m.room.join_rules", + "m.room.member", + "m.room.history_visibility", + }, + ) + + def test_get_state_event_cancellation(self) -> None: + """Test cancellation of a `/rooms/$room_id/state/$event_type` request.""" + room_id = self.helper.create_room_as(self.user_id) + channel = make_request_with_cancellation_test( + "test_state_cancellation", + self.reactor, + self.site, + "GET", + "/rooms/%s/state/m.room.member/%s" % (room_id, self.user_id), + ) + + self.assertEqual(200, channel.code, msg=channel.result["body"]) + self.assertEqual(channel.json_body, {"membership": "join"}) + + class RoomsMemberListTestCase(RoomBase): """Tests /rooms/$room_id/members/list REST events.""" @@ -591,6 +635,62 @@ class RoomsMemberListTestCase(RoomBase): channel = self.make_request("GET", room_path) self.assertEqual(200, channel.code, msg=channel.result["body"]) + def test_get_member_list_cancellation(self) -> None: + """Test cancellation of a `/rooms/$room_id/members` request.""" + room_id = self.helper.create_room_as(self.user_id) + channel = make_request_with_cancellation_test( + "test_get_member_list_cancellation", + self.reactor, + self.site, + "GET", + "/rooms/%s/members" % room_id, + ) + + self.assertEqual(200, channel.code, msg=channel.result["body"]) + self.assertEqual(len(channel.json_body["chunk"]), 1) + self.assertLessEqual( + { + "content": {"membership": "join"}, + "room_id": room_id, + "sender": self.user_id, + "state_key": self.user_id, + "type": "m.room.member", + "user_id": self.user_id, + }.items(), + channel.json_body["chunk"][0].items(), + ) + + def test_get_member_list_with_at_token_cancellation(self) -> None: + """Test cancellation of a `/rooms/$room_id/members?at=` request.""" + room_id = self.helper.create_room_as(self.user_id) + + # first sync to get an at token + channel = self.make_request("GET", "/sync") + self.assertEqual(200, channel.code) + sync_token = channel.json_body["next_batch"] + + channel = make_request_with_cancellation_test( + "test_get_member_list_with_at_token_cancellation", + self.reactor, + self.site, + "GET", + "/rooms/%s/members?at=%s" % (room_id, sync_token), + ) + + self.assertEqual(200, channel.code, msg=channel.result["body"]) + self.assertEqual(len(channel.json_body["chunk"]), 1) + self.assertLessEqual( + { + "content": {"membership": "join"}, + "room_id": room_id, + "sender": self.user_id, + "state_key": self.user_id, + "type": "m.room.member", + "user_id": self.user_id, + }.items(), + channel.json_body["chunk"][0].items(), + ) + class RoomsCreateTestCase(RoomBase): """Tests /rooms and /rooms/$room_id REST events."""