Use <meta> tags to discover the per-page encoding of html previews (#4183)

2018-11-15 11:05:08 -06:00 · 2018-11-15 11:05:08 -06:00 · df758e155d
parent a51288e5d6
commit df758e155d
3 changed files with 100 additions and 9 deletions
--- a/changelog.d/4183.bugfix
+++ b/changelog.d/4183.bugfix
@@ -0,0 +1 @@
+URL previews now correctly decode non-UTF-8 text if the page contains a `<meta http-equiv="Content-Type" ...>` tag that declares the charset.
--- a/synapse/rest/media/v1/preview_url_resource.py
+++ b/synapse/rest/media/v1/preview_url_resource.py
@@ -53,6 +53,9 @@ from ._base import FileInfo

 logger = logging.getLogger(__name__)

+_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
+_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
+

 class PreviewUrlResource(Resource):
    isLeaf = True
@@ -223,13 +226,23 @@ class PreviewUrlResource(Resource):
            with open(media_info['filename'], 'rb') as file:
                body = file.read()

-            # clobber the encoding from the content-type, or default to utf-8
-            # XXX: this overrides any <meta/> or XML charset headers in the body
-            # which may pose problems, but so far seems to work okay.
-            match = re.match(
-                r'.*; *charset="?(.*?)"?(;|$)',
-                media_info['media_type'],
-                re.I
+            encoding = None
+
+            # Let's try and figure out if it has an encoding set in a meta tag.
+            # Limit it to the first 1kb, since it ought to be in the meta tags
+            # at the top.
+            match = _charset_match.search(body[:1000])
+
+            # If we find a match, it should take precedence over the
+            # Content-Type header, so set it here.
+            if match:
+                encoding = match.group(1).decode('ascii')
+
+            # If we don't find a match, we'll look at the HTTP Content-Type, and
+            # if that doesn't exist, we'll fall back to UTF-8.
+            if not encoding:
+                match = _content_type_match.match(
+                    media_info['media_type']
                )
                encoding = match.group(1) if match else "utf-8"

--- a/tests/rest/media/v1/test_url_preview.py
+++ b/tests/rest/media/v1/test_url_preview.py
@@ -162,3 +162,80 @@ class URLPreviewTests(unittest.HomeserverTestCase):
        self.assertEqual(
            channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
        )
+
+    def test_non_ascii_preview_httpequiv(self):
+
+        request, channel = self.make_request(
+            "GET", "url_preview?url=matrix.org", shorthand=False
+        )
+        request.render(self.preview_url)
+        self.pump()
+
+        # We've made one fetch
+        self.assertEqual(len(self.fetches), 1)
+
+        end_content = (
+            b'<html><head>'
+            b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
+            b'<meta property="og:title" content="\xe4\xea\xe0" />'
+            b'<meta property="og:description" content="hi" />'
+            b'</head></html>'
+        )
+
+        self.fetches[0][0].callback(
+            (
+                end_content,
+                (
+                    len(end_content),
+                    {
+                        b"Content-Length": [b"%d" % (len(end_content))],
+                        # This charset=utf-8 should be ignored, because the
+                        # document has a meta tag overriding it.
+                        b"Content-Type": [b'text/html; charset="utf8"'],
+                    },
+                    "https://example.com",
+                    200,
+                ),
+            )
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
+
+    def test_non_ascii_preview_content_type(self):
+
+        request, channel = self.make_request(
+            "GET", "url_preview?url=matrix.org", shorthand=False
+        )
+        request.render(self.preview_url)
+        self.pump()
+
+        # We've made one fetch
+        self.assertEqual(len(self.fetches), 1)
+
+        end_content = (
+            b'<html><head>'
+            b'<meta property="og:title" content="\xe4\xea\xe0" />'
+            b'<meta property="og:description" content="hi" />'
+            b'</head></html>'
+        )
+
+        self.fetches[0][0].callback(
+            (
+                end_content,
+                (
+                    len(end_content),
+                    {
+                        b"Content-Length": [b"%d" % (len(end_content))],
+                        b"Content-Type": [b'text/html; charset="windows-1251"'],
+                    },
+                    "https://example.com",
+                    200,
+                ),
+            )
+        )
+
+        self.pump()
+        self.assertEqual(channel.code, 200)
+        self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
				`@@ -0,0 +1 @@`
				URL previews now correctly decode non-UTF-8 text if the page contains a `<meta http-equiv="Content-Type" ...>` tag that declares the charset.