Support underscores (in addition to hyphens) for charset detection. (#10410)
parent
5b22d5ee03
commit
8e1febc6a1
|
@ -0,0 +1 @@
|
|||
Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut.
|
|
@ -58,9 +58,11 @@ if TYPE_CHECKING:
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
|
||||
_charset_match = re.compile(
|
||||
br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
|
||||
)
|
||||
_xml_encoding_match = re.compile(
|
||||
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
|
||||
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
|
||||
)
|
||||
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
||||
|
||||
|
|
|
@ -325,6 +325,19 @@ class MediaEncodingTestCase(unittest.TestCase):
|
|||
)
|
||||
self.assertEqual(encoding, "ascii")
|
||||
|
||||
def test_meta_charset_underscores(self):
|
||||
"""A character encoding contains underscore."""
|
||||
encoding = get_html_media_encoding(
|
||||
b"""
|
||||
<html>
|
||||
<head><meta charset="Shift_JIS">
|
||||
</head>
|
||||
</html>
|
||||
""",
|
||||
"text/html",
|
||||
)
|
||||
self.assertEqual(encoding, "Shift_JIS")
|
||||
|
||||
def test_xml_encoding(self):
|
||||
"""A character encoding is found via the meta tag."""
|
||||
encoding = get_html_media_encoding(
|
||||
|
|
Loading…
Reference in New Issue