From babeeb4e7a6f5b5c643b837bf724d674805546f6 Mon Sep 17 00:00:00 2001 From: Jeyachandran Rathnam Date: Mon, 9 Jan 2023 09:22:02 -0500 Subject: [PATCH] Unescape HTML entities in oEmbed titles. (#14781) It doesn't seem valid that HTML entities should appear in the title field of oEmbed responses, but a popular WordPress plug-in seems to do it. There should not be harm in unescaping these. --- changelog.d/14781.misc | 1 + synapse/rest/media/v1/oembed.py | 15 +++++++++------ tests/rest/media/v1/test_oembed.py | 10 ++++++++++ 3 files changed, 20 insertions(+), 6 deletions(-) create mode 100644 changelog.d/14781.misc diff --git a/changelog.d/14781.misc b/changelog.d/14781.misc new file mode 100644 index 0000000000..04f565b410 --- /dev/null +++ b/changelog.d/14781.misc @@ -0,0 +1 @@ +Unescape HTML entities in URL preview titles making use of oEmbed responses. diff --git a/synapse/rest/media/v1/oembed.py b/synapse/rest/media/v1/oembed.py index 827afd868d..a3738a6250 100644 --- a/synapse/rest/media/v1/oembed.py +++ b/synapse/rest/media/v1/oembed.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import html import logging import urllib.parse from typing import TYPE_CHECKING, List, Optional @@ -161,7 +162,9 @@ class OEmbedProvider: title = oembed.get("title") if title and isinstance(title, str): - open_graph_response["og:title"] = title + # A common WordPress plug-in seems to incorrectly escape entities + # in the oEmbed response. + open_graph_response["og:title"] = html.unescape(title) author_name = oembed.get("author_name") if not isinstance(author_name, str): @@ -180,9 +183,9 @@ class OEmbedProvider: # Process each type separately. oembed_type = oembed.get("type") if oembed_type == "rich": - html = oembed.get("html") - if isinstance(html, str): - calc_description_and_urls(open_graph_response, html) + html_str = oembed.get("html") + if isinstance(html_str, str): + calc_description_and_urls(open_graph_response, html_str) elif oembed_type == "photo": # If this is a photo, use the full image, not the thumbnail. @@ -192,8 +195,8 @@ class OEmbedProvider: elif oembed_type == "video": open_graph_response["og:type"] = "video.other" - html = oembed.get("html") - if html and isinstance(html, str): + html_str = oembed.get("html") + if html_str and isinstance(html_str, str): calc_description_and_urls(open_graph_response, oembed["html"]) for size in ("width", "height"): val = oembed.get(size) diff --git a/tests/rest/media/v1/test_oembed.py b/tests/rest/media/v1/test_oembed.py index 319ae8b1cc..3f7f1dbab9 100644 --- a/tests/rest/media/v1/test_oembed.py +++ b/tests/rest/media/v1/test_oembed.py @@ -150,3 +150,13 @@ class OEmbedTests(HomeserverTestCase): result = self.parse_response({"type": "link"}) self.assertIn("og:type", result.open_graph_result) self.assertEqual(result.open_graph_result["og:type"], "website") + + def test_title_html_entities(self) -> None: + """Test HTML entities in title""" + result = self.parse_response( + {"title": "Why JSON isn’t a Good Configuration Language"} + ) + self.assertEqual( + result.open_graph_result["og:title"], + "Why JSON isn’t a Good Configuration Language", + )