Fix URL preview errors when previewing XML documents. (#11196)
parent
e0ef8fe58d
commit
b3e843be88
|
@ -0,0 +1 @@
|
||||||
|
Fix a bug introduced in v1.46.0rc1 where URL previews of some XML documents would fail.
|
|
@ -718,9 +718,12 @@ def decode_body(
|
||||||
if not body:
|
if not body:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# The idea here is that multiple encodings are tried until one works.
|
||||||
|
# Unfortunately the decoded result is discarded, and LXML will then decode the body
|
||||||
|
# again with the found encoding.
|
||||||
for encoding in get_html_media_encodings(body, content_type):
|
for encoding in get_html_media_encodings(body, content_type):
|
||||||
try:
|
try:
|
||||||
body_str = body.decode(encoding)
|
body.decode(encoding)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
@ -732,11 +735,11 @@ def decode_body(
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
# Create an HTML parser.
|
# Create an HTML parser.
|
||||||
parser = etree.HTMLParser(recover=True, encoding="utf-8")
|
parser = etree.HTMLParser(recover=True, encoding=encoding)
|
||||||
|
|
||||||
# Attempt to parse the body. Returns None if the body was successfully
|
# Attempt to parse the body. Returns None if the body was successfully
|
||||||
# parsed, but no tree was found.
|
# parsed, but no tree was found.
|
||||||
return etree.fromstring(body_str, parser)
|
return etree.fromstring(body, parser)
|
||||||
|
|
||||||
|
|
||||||
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
|
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
|
||||||
|
|
|
@ -277,6 +277,21 @@ class CalcOgTestCase(unittest.TestCase):
|
||||||
tree = decode_body(html, "http://example.com/test.html")
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
self.assertIsNone(tree)
|
self.assertIsNone(tree)
|
||||||
|
|
||||||
|
def test_xml(self):
|
||||||
|
"""Test decoding XML and ensure it works properly."""
|
||||||
|
# Note that the strip() call is important to ensure the XML declaration starts
|
||||||
|
# at the initial byte.
|
||||||
|
html = b"""
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||||
|
<head><title>Foo</title></head><body>Some text.</body></html>
|
||||||
|
""".strip()
|
||||||
|
tree = decode_body(html, "http://example.com/test.html")
|
||||||
|
og = _calc_og(tree, "http://example.com/test.html")
|
||||||
|
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
|
||||||
|
|
||||||
def test_invalid_encoding(self):
|
def test_invalid_encoding(self):
|
||||||
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
|
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
|
||||||
html = b"""
|
html = b"""
|
||||||
|
|
Loading…
Reference in New Issue