Fix parsing of Content-Disposition headers (#4763)

* Fix parsing of Content-Disposition headers

TIL: filenames in content-dispostion headers can contain semicolons, and aren't
%-encoded.

* fix python2 incompatibility

* Fix docstrings
pull/4770/head
Richard van der Hoff 2019-02-27 22:29:10 +00:00 committed by Amber Brown
parent f2a753ea38
commit 68f47d6744
3 changed files with 111 additions and 20 deletions

1
changelog.d/4763.bugfix Normal file
View File

@ -0,0 +1 @@
Fix parsing of Content-Disposition headers on remote media requests and URL previews.

View File

@ -1,5 +1,6 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright 2014-2016 OpenMarket Ltd # Copyright 2014-2016 OpenMarket Ltd
# Copyright 2019 New Vector Ltd.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -213,8 +214,7 @@ def get_filename_from_headers(headers):
Content-Disposition HTTP header. Content-Disposition HTTP header.
Args: Args:
headers (twisted.web.http_headers.Headers): The HTTP headers (dict[bytes, list[bytes]]): The HTTP request headers.
request headers.
Returns: Returns:
A Unicode string of the filename, or None. A Unicode string of the filename, or None.
@ -225,23 +225,12 @@ def get_filename_from_headers(headers):
if not content_disposition[0]: if not content_disposition[0]:
return return
# dict of unicode: bytes, corresponding to the key value sections of the _, params = _parse_header(content_disposition[0])
# Content-Disposition header.
params = {}
parts = content_disposition[0].split(b";")
for i in parts:
# Split into key-value pairs, if able
# We don't care about things like `inline`, so throw it out
if b"=" not in i:
continue
key, value = i.strip().split(b"=")
params[key.decode('ascii')] = value
upload_name = None upload_name = None
# First check if there is a valid UTF-8 filename # First check if there is a valid UTF-8 filename
upload_name_utf8 = params.get("filename*", None) upload_name_utf8 = params.get(b"filename*", None)
if upload_name_utf8: if upload_name_utf8:
if upload_name_utf8.lower().startswith(b"utf-8''"): if upload_name_utf8.lower().startswith(b"utf-8''"):
upload_name_utf8 = upload_name_utf8[7:] upload_name_utf8 = upload_name_utf8[7:]
@ -267,12 +256,68 @@ def get_filename_from_headers(headers):
# If there isn't check for an ascii name. # If there isn't check for an ascii name.
if not upload_name: if not upload_name:
upload_name_ascii = params.get("filename", None) upload_name_ascii = params.get(b"filename", None)
if upload_name_ascii and is_ascii(upload_name_ascii): if upload_name_ascii and is_ascii(upload_name_ascii):
# Make sure there's no %-quoted bytes. If there is, reject it as upload_name = upload_name_ascii.decode('ascii')
# non-valid ASCII.
if b"%" not in upload_name_ascii:
upload_name = upload_name_ascii.decode('ascii')
# This may be None here, indicating we did not find a matching name. # This may be None here, indicating we did not find a matching name.
return upload_name return upload_name
def _parse_header(line):
"""Parse a Content-type like header.
Cargo-culted from `cgi`, but works on bytes rather than strings.
Args:
line (bytes): header to be parsed
Returns:
Tuple[bytes, dict[bytes, bytes]]:
the main content-type, followed by the parameter dictionary
"""
parts = _parseparam(b';' + line)
key = next(parts)
pdict = {}
for p in parts:
i = p.find(b'=')
if i >= 0:
name = p[:i].strip().lower()
value = p[i + 1:].strip()
# strip double-quotes
if len(value) >= 2 and value[0:1] == value[-1:] == b'"':
value = value[1:-1]
value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')
pdict[name] = value
return key, pdict
def _parseparam(s):
"""Generator which splits the input on ;, respecting double-quoted sequences
Cargo-culted from `cgi`, but works on bytes rather than strings.
Args:
s (bytes): header to be parsed
Returns:
Iterable[bytes]: the split input
"""
while s[:1] == b';':
s = s[1:]
# look for the next ;
end = s.find(b';')
# if there is an odd number of " marks between here and the next ;, skip to the
# next ; instead
while end > 0 and (s.count(b'"', 0, end) - s.count(b'\\"', 0, end)) % 2:
end = s.find(b';', end + 1)
if end < 0:
end = len(s)
f = s[:end]
yield f.strip()
s = s[end:]

View File

@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
# Copyright 2019 New Vector Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from synapse.rest.media.v1._base import get_filename_from_headers
from tests import unittest
class GetFileNameFromHeadersTests(unittest.TestCase):
# input -> expected result
TEST_CASES = {
b"inline; filename=abc.txt": u"abc.txt",
b'inline; filename="azerty"': u"azerty",
b'inline; filename="aze%20rty"': u"aze%20rty",
b'inline; filename="aze\"rty"': u'aze"rty',
b'inline; filename="azer;ty"': u"azer;ty",
b"inline; filename*=utf-8''foo%C2%A3bar": u"foo£bar",
}
def tests(self):
for hdr, expected in self.TEST_CASES.items():
res = get_filename_from_headers(
{
b'Content-Disposition': [hdr],
},
)
self.assertEqual(
res, expected,
"expected output for %s to be %s but was %s" % (
hdr, expected, res,
)
)