From 5e477c1debfd932ced56ec755204d6ead4ce8ec8 Mon Sep 17 00:00:00 2001 From: The Stranjer <791672+TheStranjer@users.noreply.github.com> Date: Tue, 17 Mar 2020 09:29:09 -0400 Subject: [PATCH] Set charset to utf-8 when adding headers for certain text content types (#7044) Fixes #7043 --- changelog.d/7044.bugfix | 1 + synapse/rest/media/v1/_base.py | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 changelog.d/7044.bugfix diff --git a/changelog.d/7044.bugfix b/changelog.d/7044.bugfix new file mode 100644 index 0000000000..790088ddb4 --- /dev/null +++ b/changelog.d/7044.bugfix @@ -0,0 +1 @@ +Fix a bug that renders UTF-8 text files incorrectly when loaded from media. Contributed by @TheStranjer. diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index ba28dd089d..503f2bed98 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -30,6 +30,22 @@ from synapse.util.stringutils import is_ascii logger = logging.getLogger(__name__) +# list all text content types that will have the charset default to UTF-8 when +# none is given +TEXT_CONTENT_TYPES = [ + "text/css", + "text/csv", + "text/html", + "text/calendar", + "text/plain", + "text/javascript", + "application/json", + "application/ld+json", + "application/rtf", + "image/svg+xml", + "text/xml", +] + def parse_media_id(request): try: @@ -96,7 +112,14 @@ def add_file_headers(request, media_type, file_size, upload_name): def _quote(x): return urllib.parse.quote(x.encode("utf-8")) - request.setHeader(b"Content-Type", media_type.encode("UTF-8")) + # Default to a UTF-8 charset for text content types. + # ex, uses UTF-8 for 'text/css' but not 'text/css; charset=UTF-16' + if media_type.lower() in TEXT_CONTENT_TYPES: + content_type = media_type + "; charset=UTF-8" + else: + content_type = media_type + + request.setHeader(b"Content-Type", content_type.encode("UTF-8")) if upload_name: # RFC6266 section 4.1 [1] defines both `filename` and `filename*`. #