From 8e1febc6a1e909eeb4334d5572956f669ee2d290 Mon Sep 17 00:00:00 2001 From: sri-vidyut Date: Wed, 28 Jul 2021 02:29:42 +0900 Subject: [PATCH] Support underscores (in addition to hyphens) for charset detection. (#10410) --- changelog.d/10410.bugfix | 1 + synapse/rest/media/v1/preview_url_resource.py | 6 ++++-- tests/test_preview.py | 13 +++++++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 changelog.d/10410.bugfix diff --git a/changelog.d/10410.bugfix b/changelog.d/10410.bugfix new file mode 100644 index 0000000000..65b418fd35 --- /dev/null +++ b/changelog.d/10410.bugfix @@ -0,0 +1 @@ +Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut. diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index 172212ee3a..0f051d4041 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -58,9 +58,11 @@ if TYPE_CHECKING: logger = logging.getLogger(__name__) -_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I) +_charset_match = re.compile( + br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I +) _xml_encoding_match = re.compile( - br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I + br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I ) _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I) diff --git a/tests/test_preview.py b/tests/test_preview.py index cac3d81ac1..48e792b55b 100644 --- a/tests/test_preview.py +++ b/tests/test_preview.py @@ -325,6 +325,19 @@ class MediaEncodingTestCase(unittest.TestCase): ) self.assertEqual(encoding, "ascii") + def test_meta_charset_underscores(self): + """A character encoding contains underscore.""" + encoding = get_html_media_encoding( + b""" + <html> + <head><meta charset="Shift_JIS"> + </head> + </html> + """, + "text/html", + ) + self.assertEqual(encoding, "Shift_JIS") + def test_xml_encoding(self): 
"""A character encoding is found via the meta tag.""" encoding = get_html_media_encoding(