Properly parse event_fields in filters (#15607)

The event_fields property in filters should use the proper
escape rules, namely backslashes can be escaped with
an additional backslash.

This adds tests (adapted from matrix-js-sdk) and implements
the logic to properly split the event_fields strings.
pull/15648/head
Patrick Cloke 2023-05-22 11:31:22 -04:00 committed by GitHub
parent 201597fc86
commit c5d1e6d414
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 99 additions and 34 deletions

1
changelog.d/15607.bugfix Normal file
View File

@ -0,0 +1 @@
Fix a long-standing bug where filters with multiple backslashes were rejected.

View File

@ -128,20 +128,7 @@ USER_FILTER_SCHEMA = {
"account_data": {"$ref": "#/definitions/filter"}, "account_data": {"$ref": "#/definitions/filter"},
"room": {"$ref": "#/definitions/room_filter"}, "room": {"$ref": "#/definitions/room_filter"},
"event_format": {"type": "string", "enum": ["client", "federation"]}, "event_format": {"type": "string", "enum": ["client", "federation"]},
"event_fields": { "event_fields": {"type": "array", "items": {"type": "string"}},
"type": "array",
"items": {
"type": "string",
# Don't allow '\\' in event field filters. This makes matching
# events a lot easier as we can then use a negative lookbehind
# assertion to split '\.' If we allowed \\ then it would
# incorrectly split '\\.' See synapse.events.utils.serialize_event
#
# Note that because this is a regular expression, we have to escape
# each backslash in the pattern.
"pattern": r"^((?!\\\\).)*$",
},
},
}, },
"additionalProperties": True, # Allow new fields for forward compatibility "additionalProperties": True, # Allow new fields for forward compatibility
} }

View File

@ -22,6 +22,7 @@ from typing import (
Iterable, Iterable,
List, List,
Mapping, Mapping,
Match,
MutableMapping, MutableMapping,
Optional, Optional,
Union, Union,
@ -46,12 +47,10 @@ if TYPE_CHECKING:
from synapse.handlers.relations import BundledAggregations from synapse.handlers.relations import BundledAggregations
# Split strings on "." but not "\." This uses a negative lookbehind assertion for '\' # Split strings on "." but not "\." (or "\\\.").
# (?<!stuff) matches if the current position in the string is not preceded SPLIT_FIELD_REGEX = re.compile(r"\\*\.")
# by a match for 'stuff'. # Find escaped characters, e.g. those with a \ in front of them.
# TODO: This is fast, but fails to handle "foo\\.bar" which should be treated as ESCAPE_SEQUENCE_PATTERN = re.compile(r"\\(.)")
# the literal fields "foo\" and "bar" but will instead be treated as "foo\\.bar"
SPLIT_FIELD_REGEX = re.compile(r"(?<!\\)\.")
CANONICALJSON_MAX_INT = (2**53) - 1 CANONICALJSON_MAX_INT = (2**53) - 1
CANONICALJSON_MIN_INT = -CANONICALJSON_MAX_INT CANONICALJSON_MIN_INT = -CANONICALJSON_MAX_INT
@ -253,6 +252,57 @@ def _copy_field(src: JsonDict, dst: JsonDict, field: List[str]) -> None:
sub_out_dict[key_to_move] = sub_dict[key_to_move] sub_out_dict[key_to_move] = sub_dict[key_to_move]
def _escape_slash(m: Match[str]) -> str:
"""
Replacement function; replace a backslash-backslash or backslash-dot with the
second character. Leaves any other string alone.
"""
if m.group(1) in ("\\", "."):
return m.group(1)
return m.group(0)
def _split_field(field: str) -> List[str]:
"""
Splits strings on unescaped dots and removes escaping.
Args:
field: A string representing a path to a field.
Returns:
A list of nested fields to traverse.
"""
# Convert the field and remove escaping:
#
# 1. "content.body.thing\.with\.dots"
# 2. ["content", "body", "thing\.with\.dots"]
# 3. ["content", "body", "thing.with.dots"]
# Find all dots (and their preceding backslashes). If the dot is unescaped
# then emit a new field part.
result = []
prev_start = 0
for match in SPLIT_FIELD_REGEX.finditer(field):
# If the match is an *even* number of characters than the dot was escaped.
if len(match.group()) % 2 == 0:
continue
# Add a new part (up to the dot, exclusive) after escaping.
result.append(
ESCAPE_SEQUENCE_PATTERN.sub(
_escape_slash, field[prev_start : match.end() - 1]
)
)
prev_start = match.end()
# Add any part of the field after the last unescaped dot. (Note that if the
# character is a dot this correctly adds a blank string.)
result.append(re.sub(r"\\(.)", _escape_slash, field[prev_start:]))
return result
def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict: def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict:
"""Return a new dict with only the fields in 'dictionary' which are present """Return a new dict with only the fields in 'dictionary' which are present
in 'fields'. in 'fields'.
@ -260,7 +310,7 @@ def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict:
If there are no event fields specified then all fields are included. If there are no event fields specified then all fields are included.
The entries may include '.' characters to indicate sub-fields. The entries may include '.' characters to indicate sub-fields.
So ['content.body'] will include the 'body' field of the 'content' object. So ['content.body'] will include the 'body' field of the 'content' object.
A literal '.' character in a field name may be escaped using a '\'. A literal '.' or '\' character in a field name may be escaped using a '\'.
Args: Args:
dictionary: The dictionary to read from. dictionary: The dictionary to read from.
@ -275,13 +325,7 @@ def only_fields(dictionary: JsonDict, fields: List[str]) -> JsonDict:
# for each field, convert it: # for each field, convert it:
# ["content.body.thing\.with\.dots"] => [["content", "body", "thing\.with\.dots"]] # ["content.body.thing\.with\.dots"] => [["content", "body", "thing\.with\.dots"]]
split_fields = [SPLIT_FIELD_REGEX.split(f) for f in fields] split_fields = [_split_field(f) for f in fields]
# for each element of the output array of arrays:
# remove escaping so we can use the right key names.
split_fields[:] = [
[f.replace(r"\.", r".") for f in field_array] for field_array in split_fields
]
output: JsonDict = {} output: JsonDict = {}
for field_array in split_fields: for field_array in split_fields:

View File

@ -48,8 +48,6 @@ class FilteringTestCase(unittest.HomeserverTestCase):
invalid_filters: List[JsonDict] = [ invalid_filters: List[JsonDict] = [
# `account_data` must be a dictionary # `account_data` must be a dictionary
{"account_data": "Hello World"}, {"account_data": "Hello World"},
# `event_fields` entries must not contain backslashes
{"event_fields": [r"\\foo"]},
# `event_format` must be "client" or "federation" # `event_format` must be "client" or "federation"
{"event_format": "other"}, {"event_format": "other"},
# `not_rooms` must contain valid room IDs # `not_rooms` must contain valid room IDs
@ -114,10 +112,6 @@ class FilteringTestCase(unittest.HomeserverTestCase):
"event_format": "client", "event_format": "client",
"event_fields": ["type", "content", "sender"], "event_fields": ["type", "content", "sender"],
}, },
# a single backslash should be permitted (though it is debatable whether
# it should be permitted before anything other than `.`, and what that
# actually means)
#
# (note that event_fields is implemented in # (note that event_fields is implemented in
# synapse.events.utils.serialize_event, and so whether this actually works # synapse.events.utils.serialize_event, and so whether this actually works
# is tested elsewhere. We just want to check that it is allowed through the # is tested elsewhere. We just want to check that it is allowed through the

View File

@ -16,6 +16,7 @@ import unittest as stdlib_unittest
from typing import Any, List, Mapping, Optional from typing import Any, List, Mapping, Optional
import attr import attr
from parameterized import parameterized
from synapse.api.constants import EventContentFields from synapse.api.constants import EventContentFields
from synapse.api.room_versions import RoomVersions from synapse.api.room_versions import RoomVersions
@ -23,6 +24,7 @@ from synapse.events import EventBase, make_event_from_dict
from synapse.events.utils import ( from synapse.events.utils import (
PowerLevelsContent, PowerLevelsContent,
SerializeEventConfig, SerializeEventConfig,
_split_field,
copy_and_fixup_power_levels_contents, copy_and_fixup_power_levels_contents,
maybe_upsert_event_field, maybe_upsert_event_field,
prune_event, prune_event,
@ -794,3 +796,40 @@ class CopyPowerLevelsContentTestCase(stdlib_unittest.TestCase):
def test_invalid_nesting_raises_type_error(self) -> None: def test_invalid_nesting_raises_type_error(self) -> None:
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
copy_and_fixup_power_levels_contents({"a": {"b": {"c": 1}}}) # type: ignore[dict-item] copy_and_fixup_power_levels_contents({"a": {"b": {"c": 1}}}) # type: ignore[dict-item]
class SplitFieldTestCase(stdlib_unittest.TestCase):
@parameterized.expand(
[
# A field with no dots.
["m", ["m"]],
# Simple dotted fields.
["m.foo", ["m", "foo"]],
["m.foo.bar", ["m", "foo", "bar"]],
# Backslash is used as an escape character.
[r"m\.foo", ["m.foo"]],
[r"m\\.foo", ["m\\", "foo"]],
[r"m\\\.foo", [r"m\.foo"]],
[r"m\\\\.foo", ["m\\\\", "foo"]],
[r"m\foo", [r"m\foo"]],
[r"m\\foo", [r"m\foo"]],
[r"m\\\foo", [r"m\\foo"]],
[r"m\\\\foo", [r"m\\foo"]],
# Ensure that escapes at the end don't cause issues.
["m.foo\\", ["m", "foo\\"]],
["m.foo\\", ["m", "foo\\"]],
[r"m.foo\.", ["m", "foo."]],
[r"m.foo\\.", ["m", "foo\\", ""]],
[r"m.foo\\\.", ["m", r"foo\."]],
# Empty parts (corresponding to properties which are an empty string) are allowed.
[".m", ["", "m"]],
["..m", ["", "", "m"]],
["m.", ["m", ""]],
["m..", ["m", "", ""]],
["m..foo", ["m", "", "foo"]],
# Invalid escape sequences.
[r"\m", [r"\m"]],
]
)
def test_split_field(self, input: str, expected: str) -> None:
self.assertEqual(_split_field(input), expected)