Support underscores (in addition to hyphens) for charset detection. (#10410)
parent
5b22d5ee03
commit
8e1febc6a1
|
@ -0,0 +1 @@
|
||||||
|
Improve character set detection in URL previews by supporting underscores (in addition to hyphens). Contributed by @srividyut.
|
|
@ -58,9 +58,11 @@ if TYPE_CHECKING:
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_charset_match = re.compile(br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9-]+)"?', flags=re.I)
|
_charset_match = re.compile(
|
||||||
|
br'<\s*meta[^>]*charset\s*=\s*"?([a-z0-9_-]+)"?', flags=re.I
|
||||||
|
)
|
||||||
_xml_encoding_match = re.compile(
|
_xml_encoding_match = re.compile(
|
||||||
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9-]+)"', flags=re.I
|
br'\s*<\s*\?\s*xml[^>]*encoding="([a-z0-9_-]+)"', flags=re.I
|
||||||
)
|
)
|
||||||
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
|
||||||
|
|
||||||
|
|
|
@ -325,6 +325,19 @@ class MediaEncodingTestCase(unittest.TestCase):
|
||||||
)
|
)
|
||||||
self.assertEqual(encoding, "ascii")
|
self.assertEqual(encoding, "ascii")
|
||||||
|
|
||||||
|
def test_meta_charset_underscores(self):
|
||||||
|
"""A character encoding contains underscore."""
|
||||||
|
encoding = get_html_media_encoding(
|
||||||
|
b"""
|
||||||
|
<html>
|
||||||
|
<head><meta charset="Shift_JIS">
|
||||||
|
</head>
|
||||||
|
</html>
|
||||||
|
""",
|
||||||
|
"text/html",
|
||||||
|
)
|
||||||
|
self.assertEqual(encoding, "Shift_JIS")
|
||||||
|
|
||||||
def test_xml_encoding(self):
|
def test_xml_encoding(self):
|
||||||
"""A character encoding is found via the meta tag."""
|
"""A character encoding is found via the meta tag."""
|
||||||
encoding = get_html_media_encoding(
|
encoding = get_html_media_encoding(
|
||||||
|
|
Loading…
Reference in New Issue