From 68f47d6744ac2b4c6ac8b59b8c52a537a5072b4c Mon Sep 17 00:00:00 2001 From: Richard van der Hoff <1389908+richvdh@users.noreply.github.com> Date: Wed, 27 Feb 2019 22:29:10 +0000 Subject: [PATCH] Fix parsing of Content-Disposition headers (#4763) * Fix parsing of Content-Disposition headers TIL: filenames in content-dispostion headers can contain semicolons, and aren't %-encoded. * fix python2 incompatibility * Fix docstrings --- changelog.d/4763.bugfix | 1 + synapse/rest/media/v1/_base.py | 85 ++++++++++++++++++++++++-------- tests/rest/media/v1/test_base.py | 45 +++++++++++++++++ 3 files changed, 111 insertions(+), 20 deletions(-) create mode 100644 changelog.d/4763.bugfix create mode 100644 tests/rest/media/v1/test_base.py diff --git a/changelog.d/4763.bugfix b/changelog.d/4763.bugfix new file mode 100644 index 0000000000..213ea44b70 --- /dev/null +++ b/changelog.d/4763.bugfix @@ -0,0 +1 @@ +Fix parsing of Content-Disposition headers on remote media requests and URL previews. diff --git a/synapse/rest/media/v1/_base.py b/synapse/rest/media/v1/_base.py index d16a30acd8..fece1ef0b8 100644 --- a/synapse/rest/media/v1/_base.py +++ b/synapse/rest/media/v1/_base.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- # Copyright 2014-2016 OpenMarket Ltd +# Copyright 2019 New Vector Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -213,8 +214,7 @@ def get_filename_from_headers(headers): Content-Disposition HTTP header. Args: - headers (twisted.web.http_headers.Headers): The HTTP - request headers. + headers (dict[bytes, list[bytes]]): The HTTP request headers. Returns: A Unicode string of the filename, or None. @@ -225,23 +225,12 @@ def get_filename_from_headers(headers): if not content_disposition[0]: return - # dict of unicode: bytes, corresponding to the key value sections of the - # Content-Disposition header. - params = {} - parts = content_disposition[0].split(b";") - for i in parts: - # Split into key-value pairs, if able - # We don't care about things like `inline`, so throw it out - if b"=" not in i: - continue - - key, value = i.strip().split(b"=") - params[key.decode('ascii')] = value + _, params = _parse_header(content_disposition[0]) upload_name = None # First check if there is a valid UTF-8 filename - upload_name_utf8 = params.get("filename*", None) + upload_name_utf8 = params.get(b"filename*", None) if upload_name_utf8: if upload_name_utf8.lower().startswith(b"utf-8''"): upload_name_utf8 = upload_name_utf8[7:] @@ -267,12 +256,68 @@ def get_filename_from_headers(headers): # If there isn't check for an ascii name. if not upload_name: - upload_name_ascii = params.get("filename", None) + upload_name_ascii = params.get(b"filename", None) if upload_name_ascii and is_ascii(upload_name_ascii): - # Make sure there's no %-quoted bytes. If there is, reject it as - # non-valid ASCII. - if b"%" not in upload_name_ascii: - upload_name = upload_name_ascii.decode('ascii') + upload_name = upload_name_ascii.decode('ascii') # This may be None here, indicating we did not find a matching name. return upload_name + + +def _parse_header(line): + """Parse a Content-type like header. + + Cargo-culted from `cgi`, but works on bytes rather than strings. + + Args: + line (bytes): header to be parsed + + Returns: + Tuple[bytes, dict[bytes, bytes]]: + the main content-type, followed by the parameter dictionary + """ + parts = _parseparam(b';' + line) + key = next(parts) + pdict = {} + for p in parts: + i = p.find(b'=') + if i >= 0: + name = p[:i].strip().lower() + value = p[i + 1:].strip() + + # strip double-quotes + if len(value) >= 2 and value[0:1] == value[-1:] == b'"': + value = value[1:-1] + value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"') + pdict[name] = value + + return key, pdict + + +def _parseparam(s): + """Generator which splits the input on ;, respecting double-quoted sequences + + Cargo-culted from `cgi`, but works on bytes rather than strings. + + Args: + s (bytes): header to be parsed + + Returns: + Iterable[bytes]: the split input + """ + while s[:1] == b';': + s = s[1:] + + # look for the next ; + end = s.find(b';') + + # if there is an odd number of " marks between here and the next ;, skip to the + # next ; instead + while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2: + end = s.find(b';', end + 1) + + if end < 0: + end = len(s) + f = s[:end] + yield f.strip() + s = s[end:] diff --git a/tests/rest/media/v1/test_base.py b/tests/rest/media/v1/test_base.py new file mode 100644 index 0000000000..af8f74eb42 --- /dev/null +++ b/tests/rest/media/v1/test_base.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# Copyright 2019 New Vector Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from synapse.rest.media.v1._base import get_filename_from_headers + +from tests import unittest + + +class GetFileNameFromHeadersTests(unittest.TestCase): + # input -> expected result + TEST_CASES = { + b"inline; filename=abc.txt": u"abc.txt", + b'inline; filename="azerty"': u"azerty", + b'inline; filename="aze%20rty"': u"aze%20rty", + b'inline; filename="aze\"rty"': u'aze"rty', + b'inline; filename="azer;ty"': u"azer;ty", + + b"inline; filename*=utf-8''foo%C2%A3bar": u"foo£bar", + } + + def tests(self): + for hdr, expected in self.TEST_CASES.items(): + res = get_filename_from_headers( + { + b'Content-Disposition': [hdr], + }, + ) + self.assertEqual( + res, expected, + "expected output for %s to be %s but was %s" % ( + hdr, expected, res, + ) + )