diff --git a/changelog.d/15607.bugfix b/changelog.d/15607.bugfix new file mode 100644 index 0000000000..a2767adbe2 --- /dev/null +++ b/changelog.d/15607.bugfix @@ -0,0 +1 @@ +Fix a long-standing bug where filters with multiple backslashes were rejected. diff --git a/synapse/api/filtering.py b/synapse/api/filtering.py index de7c56bc0f..82aeef8d19 100644 --- a/synapse/api/filtering.py +++ b/synapse/api/filtering.py @@ -128,20 +128,7 @@ USER_FILTER_SCHEMA = { "account_data": {"$ref": "#/definitions/filter"}, "room": {"$ref": "#/definitions/room_filter"}, "event_format": {"type": "string", "enum": ["client", "federation"]}, - "event_fields": { - "type": "array", - "items": { - "type": "string", - # Don't allow '\\' in event field filters. This makes matching - # events a lot easier as we can then use a negative lookbehind - # assertion to split '\.' If we allowed \\ then it would - # incorrectly split '\\.' See synapse.events.utils.serialize_event - # - # Note that because this is a regular expression, we have to escape - # each backslash in the pattern. - "pattern": r"^((?!\\\\).)*$", - }, - }, + "event_fields": {"type": "array", "items": {"type": "string"}}, }, "additionalProperties": True, # Allow new fields for forward compatibility } diff --git a/synapse/events/utils.py b/synapse/events/utils.py index e6d040176b..e7b7b78b84 100644 --- a/synapse/events/utils.py +++ b/synapse/events/utils.py @@ -22,6 +22,7 @@ from typing import ( Iterable, List, Mapping, + Match, MutableMapping, Optional, Union, @@ -46,12 +47,10 @@ if TYPE_CHECKING: from synapse.handlers.relations import BundledAggregations -# Split strings on "." but not "\." This uses a negative lookbehind assertion for '\' -# (? None: sub_out_dict[key_to_move] = sub_dict[key_to_move] +def _escape_slash(m: Match[str]) -> str: + """ + Replacement function; replace a backslash-backslash or backslash-dot with the + second character. Leaves any other string alone. + """ + if m.group(1) in ("\\", "."): + return m.group(1) + return m.group(0) + + +def _split_field(field: str) -> List[str]: + """ + Splits strings on unescaped dots and removes escaping. + + Args: + field: A string representing a path to a field. + + Returns: + A list of nested fields to traverse. + """ + + # Convert the field and remove escaping: + # + # 1. "content.body.thing\.with\.dots" + # 2. ["content", "body", "thing\.with\.dots"] + # 3. ["content", "body", "thing.with.dots"] + + # Find all dots (and their preceding backslashes). If the dot is unescaped + # then emit a new field part. + result = [] + prev_start = 0 + for match in SPLIT_FIELD_REGEX.finditer(field): + # If the match is an *even* number of characters than the dot was escaped. + if len(match.group()) % 2 == 0: + continue + + # Add a new part (up to the dot, exclusive) after escaping. + result.append( + ESCAPE_SEQUENCE_PATTERN.sub( + _escape_slash, field[prev_start : match.end() - 1] + ) + ) + prev_start = match.end() + + # Add any part of the field after the last unescaped dot. (Note that if the + # character is a dot this correctly adds a blank string.) + result.append(re.sub(r"\\(.)", _escape_slash, field[prev_start:])) + + return result + + def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: """Return a new dict with only the fields in 'dictionary' which are present in 'fields'. @@ -260,7 +310,7 @@ def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: If there are no event fields specified then all fields are included. The entries may include '.' characters to indicate sub-fields. So ['content.body'] will include the 'body' field of the 'content' object. - A literal '.' character in a field name may be escaped using a '\'. + A literal '.' or '\' character in a field name may be escaped using a '\'. Args: dictionary: The dictionary to read from. @@ -275,13 +325,7 @@ def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: # for each field, convert it: # ["content.body.thing\.with\.dots"] => [["content", "body", "thing\.with\.dots"]] - split_fields = [SPLIT_FIELD_REGEX.split(f) for f in fields] - - # for each element of the output array of arrays: - # remove escaping so we can use the right key names. - split_fields[:] = [ - [f.replace(r"\.", r".") for f in field_array] for field_array in split_fields - ] + split_fields = [_split_field(f) for f in fields] output: JsonDict = {} for field_array in split_fields: diff --git a/tests/api/test_filtering.py b/tests/api/test_filtering.py index 222449baac..aa6af5ad7b 100644 --- a/tests/api/test_filtering.py +++ b/tests/api/test_filtering.py @@ -48,8 +48,6 @@ class FilteringTestCase(unittest.HomeserverTestCase): invalid_filters: List[JsonDict] = [ # `account_data` must be a dictionary {"account_data": "Hello World"}, - # `event_fields` entries must not contain backslashes - {"event_fields": [r"\\foo"]}, # `event_format` must be "client" or "federation" {"event_format": "other"}, # `not_rooms` must contain valid room IDs @@ -114,10 +112,6 @@ class FilteringTestCase(unittest.HomeserverTestCase): "event_format": "client", "event_fields": ["type", "content", "sender"], }, - # a single backslash should be permitted (though it is debatable whether - # it should be permitted before anything other than `.`, and what that - # actually means) - # # (note that event_fields is implemented in # synapse.events.utils.serialize_event, and so whether this actually works # is tested elsewhere. We just want to check that it is allowed through the diff --git a/tests/events/test_utils.py b/tests/events/test_utils.py index e40eac2eb0..c9a610db9a 100644 --- a/tests/events/test_utils.py +++ b/tests/events/test_utils.py @@ -16,6 +16,7 @@ import unittest as stdlib_unittest from typing import Any, List, Mapping, Optional import attr +from parameterized import parameterized from synapse.api.constants import EventContentFields from synapse.api.room_versions import RoomVersions @@ -23,6 +24,7 @@ from synapse.events import EventBase, make_event_from_dict from synapse.events.utils import ( PowerLevelsContent, SerializeEventConfig, + _split_field, copy_and_fixup_power_levels_contents, maybe_upsert_event_field, prune_event, @@ -794,3 +796,40 @@ class CopyPowerLevelsContentTestCase(stdlib_unittest.TestCase): def test_invalid_nesting_raises_type_error(self) -> None: with self.assertRaises(TypeError): copy_and_fixup_power_levels_contents({"a": {"b": {"c": 1}}}) # type: ignore[dict-item] + + +class SplitFieldTestCase(stdlib_unittest.TestCase): + @parameterized.expand( + [ + # A field with no dots. + ["m", ["m"]], + # Simple dotted fields. + ["m.foo", ["m", "foo"]], + ["m.foo.bar", ["m", "foo", "bar"]], + # Backslash is used as an escape character. + [r"m\.foo", ["m.foo"]], + [r"m\\.foo", ["m\\", "foo"]], + [r"m\\\.foo", [r"m\.foo"]], + [r"m\\\\.foo", ["m\\\\", "foo"]], + [r"m\foo", [r"m\foo"]], + [r"m\\foo", [r"m\foo"]], + [r"m\\\foo", [r"m\\foo"]], + [r"m\\\\foo", [r"m\\foo"]], + # Ensure that escapes at the end don't cause issues. + ["m.foo\\", ["m", "foo\\"]], + ["m.foo\\", ["m", "foo\\"]], + [r"m.foo\.", ["m", "foo."]], + [r"m.foo\\.", ["m", "foo\\", ""]], + [r"m.foo\\\.", ["m", r"foo\."]], + # Empty parts (corresponding to properties which are an empty string) are allowed. + [".m", ["", "m"]], + ["..m", ["", "", "m"]], + ["m.", ["m", ""]], + ["m..", ["m", "", ""]], + ["m..foo", ["m", "", "foo"]], + # Invalid escape sequences. + [r"\m", [r"\m"]], + ] + ) + def test_split_field(self, input: str, expected: str) -> None: + self.assertEqual(_split_field(input), expected)