Attempt different character encodings when previewing a URL. (#11077)
This follows similar logic to BeautifulSoup where we attempt different character encodings until we find one which works.pull/11089/head
							parent
							
								
									1609ccf8fe
								
							
						
					
					
						commit
						e2f0b49b3f
					
				|  | @ -0,0 +1 @@ | |||
| Fix a long-standing bug when attempting to preview URLs which are in the `windows-1252` character encoding. | ||||
|  | @ -295,8 +295,7 @@ class PreviewUrlResource(DirectServeJsonResource): | |||
|             with open(media_info.filename, "rb") as file: | ||||
|                 body = file.read() | ||||
| 
 | ||||
|             encoding = get_html_media_encoding(body, media_info.media_type) | ||||
|             tree = decode_body(body, encoding) | ||||
|             tree = decode_body(body, media_info.uri, media_info.media_type) | ||||
|             if tree is not None: | ||||
|                 # Check if this HTML document points to oEmbed information and | ||||
|                 # defer to that. | ||||
|  | @ -632,16 +631,19 @@ class PreviewUrlResource(DirectServeJsonResource): | |||
|             logger.debug("No media removed from url cache") | ||||
| 
 | ||||
| 
 | ||||
| def get_html_media_encoding(body: bytes, content_type: str) -> str: | ||||
| def get_html_media_encodings(body: bytes, content_type: Optional[str]) -> Iterable[str]: | ||||
|     """ | ||||
|     Get the encoding of the body based on the (presumably) HTML body or media_type. | ||||
|     Get potential encoding of the body based on the (presumably) HTML body or the content-type header. | ||||
| 
 | ||||
|     The precedence used for finding a character encoding is: | ||||
| 
 | ||||
|     1. meta tag with a charset declared. | ||||
|     1. <meta> tag with a charset declared. | ||||
|     2. The XML document's character encoding attribute. | ||||
|     3. The Content-Type header. | ||||
|     4. Fallback to UTF-8. | ||||
|     4. Fallback to utf-8. | ||||
|     5. Fallback to windows-1252. | ||||
| 
 | ||||
|     This roughly follows the algorithm used by BeautifulSoup's bs4.dammit.EncodingDetector. | ||||
| 
 | ||||
|     Args: | ||||
|         body: The HTML document, as bytes. | ||||
|  | @ -653,36 +655,39 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str: | |||
|     # Limit searches to the first 1kb, since it ought to be at the top. | ||||
|     body_start = body[:1024] | ||||
| 
 | ||||
|     # Let's try and figure out if it has an encoding set in a meta tag. | ||||
|     # Check if it has an encoding set in a meta tag. | ||||
|     match = _charset_match.search(body_start) | ||||
|     if match: | ||||
|         return match.group(1).decode("ascii") | ||||
|         yield match.group(1).decode("ascii") | ||||
| 
 | ||||
|     # TODO Support <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> | ||||
| 
 | ||||
|     # If we didn't find a match, see if it an XML document with an encoding. | ||||
|     # Check if it has an XML document with an encoding. | ||||
|     match = _xml_encoding_match.match(body_start) | ||||
|     if match: | ||||
|         return match.group(1).decode("ascii") | ||||
|         yield match.group(1).decode("ascii") | ||||
| 
 | ||||
|     # If we don't find a match, we'll look at the HTTP Content-Type, and | ||||
|     # if that doesn't exist, we'll fall back to UTF-8. | ||||
|     content_match = _content_type_match.match(content_type) | ||||
|     if content_match: | ||||
|         return content_match.group(1) | ||||
|     # Check the HTTP Content-Type header for a character set. | ||||
|     if content_type: | ||||
|         content_match = _content_type_match.match(content_type) | ||||
|         if content_match: | ||||
|             yield content_match.group(1) | ||||
| 
 | ||||
|     return "utf-8" | ||||
|     # Finally, fallback to UTF-8, then windows-1252. | ||||
|     yield "utf-8" | ||||
|     yield "windows-1252" | ||||
| 
 | ||||
| 
 | ||||
| def decode_body( | ||||
|     body: bytes, request_encoding: Optional[str] = None | ||||
|     body: bytes, uri: str, content_type: Optional[str] = None | ||||
| ) -> Optional["etree.Element"]: | ||||
|     """ | ||||
|     This uses lxml to parse the HTML document. | ||||
| 
 | ||||
|     Args: | ||||
|         body: The HTML document, as bytes. | ||||
|         request_encoding: The character encoding of the body, as a string. | ||||
|         uri: The URI used to download the body. | ||||
|         content_type: The Content-Type header. | ||||
| 
 | ||||
|     Returns: | ||||
|         The parsed HTML body, or None if an error occurred during processed. | ||||
|  | @ -691,32 +696,25 @@ def decode_body( | |||
|     if not body: | ||||
|         return None | ||||
| 
 | ||||
|     from lxml import etree | ||||
| 
 | ||||
|     # Create an HTML parser. If this fails, log and return no metadata. | ||||
|     try: | ||||
|         parser = etree.HTMLParser(recover=True, encoding=request_encoding) | ||||
|     except LookupError: | ||||
|         # blindly consider the encoding as utf-8. | ||||
|         parser = etree.HTMLParser(recover=True, encoding="utf-8") | ||||
|     except Exception as e: | ||||
|         logger.warning("Unable to create HTML parser: %s" % (e,)) | ||||
|     for encoding in get_html_media_encodings(body, content_type): | ||||
|         try: | ||||
|             body_str = body.decode(encoding) | ||||
|         except Exception: | ||||
|             pass | ||||
|         else: | ||||
|             break | ||||
|     else: | ||||
|         logger.warning("Unable to decode HTML body for %s", uri) | ||||
|         return None | ||||
| 
 | ||||
|     def _attempt_decode_body( | ||||
|         body_attempt: Union[bytes, str] | ||||
|     ) -> Optional["etree.Element"]: | ||||
|         # Attempt to parse the body. Returns None if the body was successfully | ||||
|         # parsed, but no tree was found. | ||||
|         return etree.fromstring(body_attempt, parser) | ||||
|     from lxml import etree | ||||
| 
 | ||||
|     # Attempt to parse the body. If this fails, log and return no metadata. | ||||
|     try: | ||||
|         return _attempt_decode_body(body) | ||||
|     except UnicodeDecodeError: | ||||
|         # blindly try decoding the body as utf-8, which seems to fix | ||||
|         # the charset mismatches on https://google.com | ||||
|         return _attempt_decode_body(body.decode("utf-8", "ignore")) | ||||
|     # Create an HTML parser. | ||||
|     parser = etree.HTMLParser(recover=True, encoding="utf-8") | ||||
| 
 | ||||
|     # Attempt to parse the body. Returns None if the body was successfully | ||||
|     # parsed, but no tree was found. | ||||
|     return etree.fromstring(body_str, parser) | ||||
| 
 | ||||
| 
 | ||||
| def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]: | ||||
|  |  | |||
|  | @ -15,7 +15,7 @@ | |||
| from synapse.rest.media.v1.preview_url_resource import ( | ||||
|     _calc_og, | ||||
|     decode_body, | ||||
|     get_html_media_encoding, | ||||
|     get_html_media_encodings, | ||||
|     summarize_paragraphs, | ||||
| ) | ||||
| 
 | ||||
|  | @ -159,7 +159,7 @@ class CalcOgTestCase(unittest.TestCase): | |||
|         </html> | ||||
|         """ | ||||
| 
 | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
| 
 | ||||
|         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) | ||||
|  | @ -175,7 +175,7 @@ class CalcOgTestCase(unittest.TestCase): | |||
|         </html> | ||||
|         """ | ||||
| 
 | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
| 
 | ||||
|         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) | ||||
|  | @ -194,7 +194,7 @@ class CalcOgTestCase(unittest.TestCase): | |||
|         </html> | ||||
|         """ | ||||
| 
 | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
| 
 | ||||
|         self.assertEqual( | ||||
|  | @ -216,7 +216,7 @@ class CalcOgTestCase(unittest.TestCase): | |||
|         </html> | ||||
|         """ | ||||
| 
 | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
| 
 | ||||
|         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) | ||||
|  | @ -230,7 +230,7 @@ class CalcOgTestCase(unittest.TestCase): | |||
|         </html> | ||||
|         """ | ||||
| 
 | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
| 
 | ||||
|         self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) | ||||
|  | @ -245,7 +245,7 @@ class CalcOgTestCase(unittest.TestCase): | |||
|         </html> | ||||
|         """ | ||||
| 
 | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
| 
 | ||||
|         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."}) | ||||
|  | @ -260,7 +260,7 @@ class CalcOgTestCase(unittest.TestCase): | |||
|         </html> | ||||
|         """ | ||||
| 
 | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
| 
 | ||||
|         self.assertEqual(og, {"og:title": None, "og:description": "Some text."}) | ||||
|  | @ -268,13 +268,13 @@ class CalcOgTestCase(unittest.TestCase): | |||
|     def test_empty(self): | ||||
|         """Test a body with no data in it.""" | ||||
|         html = b"" | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         self.assertIsNone(tree) | ||||
| 
 | ||||
|     def test_no_tree(self): | ||||
|         """A valid body with no tree in it.""" | ||||
|         html = b"\x00" | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         self.assertIsNone(tree) | ||||
| 
 | ||||
|     def test_invalid_encoding(self): | ||||
|  | @ -287,7 +287,7 @@ class CalcOgTestCase(unittest.TestCase): | |||
|         </body> | ||||
|         </html> | ||||
|         """ | ||||
|         tree = decode_body(html, "invalid-encoding") | ||||
|         tree = decode_body(html, "http://example.com/test.html", "invalid-encoding") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
|         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."}) | ||||
| 
 | ||||
|  | @ -302,15 +302,29 @@ class CalcOgTestCase(unittest.TestCase): | |||
|         </body> | ||||
|         </html> | ||||
|         """ | ||||
|         tree = decode_body(html) | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
|         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."}) | ||||
| 
 | ||||
|     def test_windows_1252(self): | ||||
|         """A body which uses windows-1252, but doesn't declare that.""" | ||||
|         html = b""" | ||||
|         <html> | ||||
|         <head><title>\xf3</title></head> | ||||
|         <body> | ||||
|         Some text. | ||||
|         </body> | ||||
|         </html> | ||||
|         """ | ||||
|         tree = decode_body(html, "http://example.com/test.html") | ||||
|         og = _calc_og(tree, "http://example.com/test.html") | ||||
|         self.assertEqual(og, {"og:title": "ó", "og:description": "Some text."}) | ||||
| 
 | ||||
| 
 | ||||
| class MediaEncodingTestCase(unittest.TestCase): | ||||
|     def test_meta_charset(self): | ||||
|         """A character encoding is found via the meta tag.""" | ||||
|         encoding = get_html_media_encoding( | ||||
|         encodings = get_html_media_encodings( | ||||
|             b""" | ||||
|         <html> | ||||
|         <head><meta charset="ascii"> | ||||
|  | @ -319,10 +333,10 @@ class MediaEncodingTestCase(unittest.TestCase): | |||
|         """, | ||||
|             "text/html", | ||||
|         ) | ||||
|         self.assertEqual(encoding, "ascii") | ||||
|         self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"]) | ||||
| 
 | ||||
|         # A less well-formed version. | ||||
|         encoding = get_html_media_encoding( | ||||
|         encodings = get_html_media_encodings( | ||||
|             b""" | ||||
|         <html> | ||||
|         <head>< meta charset = ascii> | ||||
|  | @ -331,11 +345,11 @@ class MediaEncodingTestCase(unittest.TestCase): | |||
|         """, | ||||
|             "text/html", | ||||
|         ) | ||||
|         self.assertEqual(encoding, "ascii") | ||||
|         self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"]) | ||||
| 
 | ||||
|     def test_meta_charset_underscores(self): | ||||
|         """A character encoding contains underscore.""" | ||||
|         encoding = get_html_media_encoding( | ||||
|         encodings = get_html_media_encodings( | ||||
|             b""" | ||||
|         <html> | ||||
|         <head><meta charset="Shift_JIS"> | ||||
|  | @ -344,11 +358,11 @@ class MediaEncodingTestCase(unittest.TestCase): | |||
|         """, | ||||
|             "text/html", | ||||
|         ) | ||||
|         self.assertEqual(encoding, "Shift_JIS") | ||||
|         self.assertEqual(list(encodings), ["Shift_JIS", "utf-8", "windows-1252"]) | ||||
| 
 | ||||
|     def test_xml_encoding(self): | ||||
|         """A character encoding is found via the meta tag.""" | ||||
|         encoding = get_html_media_encoding( | ||||
|         encodings = get_html_media_encodings( | ||||
|             b""" | ||||
|         <?xml version="1.0" encoding="ascii"?> | ||||
|         <html> | ||||
|  | @ -356,11 +370,11 @@ class MediaEncodingTestCase(unittest.TestCase): | |||
|         """, | ||||
|             "text/html", | ||||
|         ) | ||||
|         self.assertEqual(encoding, "ascii") | ||||
|         self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"]) | ||||
| 
 | ||||
|     def test_meta_xml_encoding(self): | ||||
|         """Meta tags take precedence over XML encoding.""" | ||||
|         encoding = get_html_media_encoding( | ||||
|         encodings = get_html_media_encodings( | ||||
|             b""" | ||||
|         <?xml version="1.0" encoding="ascii"?> | ||||
|         <html> | ||||
|  | @ -370,7 +384,7 @@ class MediaEncodingTestCase(unittest.TestCase): | |||
|         """, | ||||
|             "text/html", | ||||
|         ) | ||||
|         self.assertEqual(encoding, "UTF-16") | ||||
|         self.assertEqual(list(encodings), ["UTF-16", "ascii", "utf-8", "windows-1252"]) | ||||
| 
 | ||||
|     def test_content_type(self): | ||||
|         """A character encoding is found via the Content-Type header.""" | ||||
|  | @ -384,10 +398,10 @@ class MediaEncodingTestCase(unittest.TestCase): | |||
|             'text/html; charset=ascii";', | ||||
|         ) | ||||
|         for header in headers: | ||||
|             encoding = get_html_media_encoding(b"", header) | ||||
|             self.assertEqual(encoding, "ascii") | ||||
|             encodings = get_html_media_encodings(b"", header) | ||||
|             self.assertEqual(list(encodings), ["ascii", "utf-8", "windows-1252"]) | ||||
| 
 | ||||
|     def test_fallback(self): | ||||
|         """A character encoding cannot be found in the body or header.""" | ||||
|         encoding = get_html_media_encoding(b"", "text/html") | ||||
|         self.assertEqual(encoding, "utf-8") | ||||
|         encodings = get_html_media_encodings(b"", "text/html") | ||||
|         self.assertEqual(list(encodings), ["utf-8", "windows-1252"]) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Patrick Cloke
						Patrick Cloke