Use <meta> tags to discover the per-page encoding of html previews (#4183)
parent
a51288e5d6
commit
df758e155d
|
@ -0,0 +1 @@
|
||||||
|
URL previews now correctly decode non-UTF-8 text if the page's HTML `<head>` contains a `<meta http-equiv="Content-Type" ...>` tag that declares the charset.
|
|
@ -53,6 +53,9 @@ from ._base import FileInfo
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Finds a charset declaration inside a <meta> tag of a raw (bytes) HTML body.
# Group 1 is the encoding name, as bytes. Both common forms are handled:
#   <meta charset="utf-8">  /  <meta charset='utf-8'>  /  <meta charset=utf-8>
#   <meta http-equiv="Content-Type" content="text/html; charset=windows-1251">
# An optional opening quote is skipped before the name (the HTML5 short form
# is normally quoted), and "_" is allowed so names such as Shift_JIS are
# captured whole instead of being truncated at the underscore.
_charset_match = re.compile(
    br"""<\s*meta[^>]*charset\s*=\s*["']?\s*([a-z0-9_-]+)""", flags=re.I
)
|
||||||
|
# Extracts the charset parameter from a Content-Type style string such as
# 'text/html; charset="windows-1251"'. Group 1 is the charset value with any
# surrounding double quotes excluded; the value ends at the next ";" or at the
# end of the string. Matching is case-insensitive.
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
||||||
|
|
||||||
|
|
||||||
class PreviewUrlResource(Resource):
|
class PreviewUrlResource(Resource):
|
||||||
isLeaf = True
|
isLeaf = True
|
||||||
|
@ -223,15 +226,25 @@ class PreviewUrlResource(Resource):
|
||||||
with open(media_info['filename'], 'rb') as file:
|
with open(media_info['filename'], 'rb') as file:
|
||||||
body = file.read()
|
body = file.read()
|
||||||
|
|
||||||
# clobber the encoding from the content-type, or default to utf-8
|
encoding = None
|
||||||
# XXX: this overrides any <meta/> or XML charset headers in the body
|
|
||||||
# which may pose problems, but so far seems to work okay.
|
# Let's try and figure out if it has an encoding set in a meta tag.
|
||||||
match = re.match(
|
# Limit it to the first 1kb, since it ought to be in the meta tags
|
||||||
r'.*; *charset="?(.*?)"?(;|$)',
|
# at the top.
|
||||||
media_info['media_type'],
|
match = _charset_match.search(body[:1000])
|
||||||
re.I
|
|
||||||
)
|
# If we find a match, it should take precedence over the
|
||||||
encoding = match.group(1) if match else "utf-8"
|
# Content-Type header, so set it here.
|
||||||
|
if match:
|
||||||
|
encoding = match.group(1).decode('ascii')
|
||||||
|
|
||||||
|
# If we don't find a match, we'll look at the HTTP Content-Type, and
|
||||||
|
# if that doesn't exist, we'll fall back to UTF-8.
|
||||||
|
if not encoding:
|
||||||
|
match = _content_type_match.match(
|
||||||
|
media_info['media_type']
|
||||||
|
)
|
||||||
|
encoding = match.group(1) if match else "utf-8"
|
||||||
|
|
||||||
og = decode_and_calc_og(body, media_info['uri'], encoding)
|
og = decode_and_calc_og(body, media_info['uri'], encoding)
|
||||||
|
|
||||||
|
|
|
@ -162,3 +162,80 @@ class URLPreviewTests(unittest.HomeserverTestCase):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
|
channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_non_ascii_preview_httpequiv(self):
|
||||||
|
|
||||||
|
request, channel = self.make_request(
|
||||||
|
"GET", "url_preview?url=matrix.org", shorthand=False
|
||||||
|
)
|
||||||
|
request.render(self.preview_url)
|
||||||
|
self.pump()
|
||||||
|
|
||||||
|
# We've made one fetch
|
||||||
|
self.assertEqual(len(self.fetches), 1)
|
||||||
|
|
||||||
|
end_content = (
|
||||||
|
b'<html><head>'
|
||||||
|
b'<meta http-equiv="Content-Type" content="text/html; charset=windows-1251"/>'
|
||||||
|
b'<meta property="og:title" content="\xe4\xea\xe0" />'
|
||||||
|
b'<meta property="og:description" content="hi" />'
|
||||||
|
b'</head></html>'
|
||||||
|
)
|
||||||
|
|
||||||
|
self.fetches[0][0].callback(
|
||||||
|
(
|
||||||
|
end_content,
|
||||||
|
(
|
||||||
|
len(end_content),
|
||||||
|
{
|
||||||
|
b"Content-Length": [b"%d" % (len(end_content))],
|
||||||
|
# This charset=utf-8 should be ignored, because the
|
||||||
|
# document has a meta tag overriding it.
|
||||||
|
b"Content-Type": [b'text/html; charset="utf8"'],
|
||||||
|
},
|
||||||
|
"https://example.com",
|
||||||
|
200,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.pump()
|
||||||
|
self.assertEqual(channel.code, 200)
|
||||||
|
self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
|
||||||
|
|
||||||
|
def test_non_ascii_preview_content_type(self):
|
||||||
|
|
||||||
|
request, channel = self.make_request(
|
||||||
|
"GET", "url_preview?url=matrix.org", shorthand=False
|
||||||
|
)
|
||||||
|
request.render(self.preview_url)
|
||||||
|
self.pump()
|
||||||
|
|
||||||
|
# We've made one fetch
|
||||||
|
self.assertEqual(len(self.fetches), 1)
|
||||||
|
|
||||||
|
end_content = (
|
||||||
|
b'<html><head>'
|
||||||
|
b'<meta property="og:title" content="\xe4\xea\xe0" />'
|
||||||
|
b'<meta property="og:description" content="hi" />'
|
||||||
|
b'</head></html>'
|
||||||
|
)
|
||||||
|
|
||||||
|
self.fetches[0][0].callback(
|
||||||
|
(
|
||||||
|
end_content,
|
||||||
|
(
|
||||||
|
len(end_content),
|
||||||
|
{
|
||||||
|
b"Content-Length": [b"%d" % (len(end_content))],
|
||||||
|
b"Content-Type": [b'text/html; charset="windows-1251"'],
|
||||||
|
},
|
||||||
|
"https://example.com",
|
||||||
|
200,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
self.pump()
|
||||||
|
self.assertEqual(channel.code, 200)
|
||||||
|
self.assertEqual(channel.json_body["og:title"], u"\u0434\u043a\u0430")
|
||||||
|
|
Loading…
Reference in New Issue