From: Ben Darnell
Date: Thu, 22 May 2025 14:59:48 +0000 (-0400)
Subject: httputil: Fix support for non-latin1 filenames in multipart uploads
X-Git-Tag: v6.5.1^2~1
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=170a58af2c1708c22529daf37536b6ba63403fd0;p=thirdparty%2Ftornado.git

httputil: Fix support for non-latin1 filenames in multipart uploads

The change to be stricter about characters allowed in HTTP headers
inadvertently broke support for non-latin1 filenames in multipart uploads
(this was missed in testing because our i18n test case only used characters
in latin1). This commit adds a hacky workaround without changing any APIs
to make it safe for a 6.5.1 patch release; a more robust solution will
follow for future releases.

Fixes #3502
---

diff --git a/tornado/httputil.py b/tornado/httputil.py
index ef460985..7f22da02 100644
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -70,6 +70,10 @@ else:
 # To be used with str.strip() and related methods.
 HTTP_WHITESPACE = " \t"
 
+# Roughly the inverse of RequestHandler._VALID_HEADER_CHARS, but permits
+# chars greater than \xFF (which may appear after decoding utf8).
+_FORBIDDEN_HEADER_CHARS_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
+
 
 class _ABNF:
     """Class that holds a subset of ABNF rules from RFC 9110 and friends.
@@ -196,14 +200,18 @@ class HTTPHeaders(StrMutableMapping):
 
     # new public methods
 
-    def add(self, name: str, value: str) -> None:
+    def add(self, name: str, value: str, *, _chars_are_bytes: bool = True) -> None:
         """Adds a new value for the given key."""
         if not _ABNF.field_name.fullmatch(name):
             raise HTTPInputError("Invalid header name %r" % name)
-        if not _ABNF.field_value.fullmatch(to_unicode(value)):
-            # TODO: the fact we still support bytes here (contrary to type annotations)
-            # and still test for it should probably be changed.
-            raise HTTPInputError("Invalid header value %r" % value)
+        if _chars_are_bytes:
+            if not _ABNF.field_value.fullmatch(to_unicode(value)):
+                # TODO: the fact we still support bytes here (contrary to type annotations)
+                # and still test for it should probably be changed.
+                raise HTTPInputError("Invalid header value %r" % value)
+        else:
+            if _FORBIDDEN_HEADER_CHARS_RE.search(value):
+                raise HTTPInputError("Invalid header value %r" % value)
         norm_name = _normalize_header(name)
         self._last_key = norm_name
         if norm_name in self:
@@ -229,7 +237,7 @@ class HTTPHeaders(StrMutableMapping):
         for value in values:
             yield (name, value)
 
-    def parse_line(self, line: str) -> None:
+    def parse_line(self, line: str, *, _chars_are_bytes: bool = True) -> None:
         r"""Updates the dictionary with a single header line.
 
         >>> h = HTTPHeaders()
@@ -263,8 +271,12 @@ class HTTPHeaders(StrMutableMapping):
             if self._last_key is None:
                 raise HTTPInputError("first header line cannot start with whitespace")
             new_part = " " + line.strip(HTTP_WHITESPACE)
-            if not _ABNF.field_value.fullmatch(new_part[1:]):
-                raise HTTPInputError("Invalid header continuation %r" % new_part)
+            if _chars_are_bytes:
+                if not _ABNF.field_value.fullmatch(new_part[1:]):
+                    raise HTTPInputError("Invalid header continuation %r" % new_part)
+            else:
+                if _FORBIDDEN_HEADER_CHARS_RE.search(new_part):
+                    raise HTTPInputError("Invalid header value %r" % new_part)
             self._as_list[self._last_key][-1] += new_part
             self._dict[self._last_key] += new_part
         else:
@@ -272,10 +284,12 @@ class HTTPHeaders(StrMutableMapping):
                 name, value = line.split(":", 1)
             except ValueError:
                 raise HTTPInputError("no colon in header line")
-            self.add(name, value.strip(HTTP_WHITESPACE))
+            self.add(
+                name, value.strip(HTTP_WHITESPACE), _chars_are_bytes=_chars_are_bytes
+            )
 
     @classmethod
-    def parse(cls, headers: str) -> "HTTPHeaders":
+    def parse(cls, headers: str, *, _chars_are_bytes: bool = True) -> "HTTPHeaders":
         """Returns a dictionary from HTTP header text.
 
         >>> h = HTTPHeaders.parse("Content-Type: text/html\\r\\nContent-Length: 42\\r\\n")
@@ -288,17 +302,31 @@ class HTTPHeaders(StrMutableMapping):
            mix of `KeyError`, and `ValueError`.
 
         """
+        # _chars_are_bytes is a hack. This method is used in two places, HTTP headers (in which
+        # non-ascii characters are to be interpreted as latin-1) and multipart/form-data (in which
+        # they are to be interpreted as utf-8). For historical reasons, this method handled this by
+        # expecting both callers to decode the headers to strings before parsing them. This wasn't a
+        # problem until we started doing stricter validation of the characters allowed in HTTP
+        # headers (using ABNF rules defined in terms of byte values), which inadvertently started
+        # disallowing non-latin1 characters in multipart/form-data filenames.
+        #
+        # This method should have accepted bytes and a desired encoding, but this change is being
+        # introduced in a patch release that shouldn't change the API. Instead, the _chars_are_bytes
+        # flag decides whether to use HTTP-style ABNF validation (treating the string as bytes
+        # smuggled through the latin1 encoding) or to accept any non-control unicode characters
+        # as required by multipart/form-data. This method will change to accept bytes in a future
+        # release.
         h = cls()
         start = 0
         while True:
             lf = headers.find("\n", start)
             if lf == -1:
-                h.parse_line(headers[start:])
+                h.parse_line(headers[start:], _chars_are_bytes=_chars_are_bytes)
                 break
             line = headers[start : lf + 1]
             start = lf + 1
-            h.parse_line(line)
+            h.parse_line(line, _chars_are_bytes=_chars_are_bytes)
         return h
 
     # MutableMapping abstract method implementations.
@@ -946,7 +974,7 @@ def parse_multipart_form_data(
         eoh = part.find(b"\r\n\r\n")
         if eoh == -1:
             raise HTTPInputError("multipart/form-data missing headers")
-        headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"))
+        headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"), _chars_are_bytes=False)
         disp_header = headers.get("Content-Disposition", "")
         disposition, disp_params = _parse_header(disp_header)
         if disposition != "form-data" or not part.endswith(b"\r\n"):
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py
index 7a09beaa..fdd5fcab 100644
--- a/tornado/test/httputil_test.py
+++ b/tornado/test/httputil_test.py
@@ -155,7 +155,7 @@ Foo
         self.assertEqual(file["filename"], filename)
         self.assertEqual(file["body"], b"Foo")
 
-    def test_non_ascii_filename(self):
+    def test_non_ascii_filename_rfc5987(self):
         data = b"""\
 --1234
 Content-Disposition: form-data; name="files"; filename="ab.txt"; filename*=UTF-8''%C3%A1b.txt
@@ -170,6 +170,23 @@ Foo
         self.assertEqual(file["filename"], "áb.txt")
         self.assertEqual(file["body"], b"Foo")
 
+    def test_non_ascii_filename_raw(self):
+        data = """\
+--1234
+Content-Disposition: form-data; name="files"; filename="测试.txt"
+
+Foo
+--1234--""".encode(
+            "utf-8"
+        ).replace(
+            b"\n", b"\r\n"
+        )
+        args, files = form_data_args()
+        parse_multipart_form_data(b"1234", data, args, files)
+        file = files["files"][0]
+        self.assertEqual(file["filename"], "测试.txt")
+        self.assertEqual(file["body"], b"Foo")
+
     def test_boundary_starts_and_ends_with_quotes(self):
         data = b"""\
 --1234
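
For readers of this patch, a minimal sketch (not part of the commit) of the
behavior the new test_non_ascii_filename_raw case exercises: with the fix
applied, a multipart/form-data part whose Content-Disposition carries a raw
UTF-8 filename parses instead of being rejected as an invalid header value.
Only public tornado.httputil functions are used; the variable names below are
illustrative.

    from tornado.httputil import parse_multipart_form_data

    # Multipart body with a raw (non-latin1) UTF-8 filename, mirroring the
    # new test case added above.
    body = (
        "--1234\r\n"
        'Content-Disposition: form-data; name="files"; filename="测试.txt"\r\n'
        "\r\n"
        "Foo\r\n"
        "--1234--\r\n"
    ).encode("utf-8")

    args = {}   # collects ordinary form fields
    files = {}  # collects uploaded files
    parse_multipart_form_data(b"1234", body, args, files)

    # Prints 测试.txt; with 6.5.0's stricter header validation this body was
    # rejected with HTTPInputError instead.
    print(files["files"][0]["filename"])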
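Similarly, a sketch of the distinction the _chars_are_bytes flag draws inside
HTTPHeaders.parse (illustration only; the underscore-prefixed keyword is
internal and not a supported API): the default, bytes-oriented ABNF validation
still rejects characters above \xFF, while the multipart path now accepts any
non-control unicode characters.

    from tornado.httputil import HTTPHeaders, HTTPInputError

    # Default (HTTP) mode: the string is treated as latin1-smuggled bytes,
    # so a character above \xFF fails the ABNF field-value check.
    try:
        HTTPHeaders.parse("X-Filename: 测试.txt")
    except HTTPInputError as e:
        print("rejected:", e)

    # Multipart mode: any non-control unicode characters are accepted.
    h = HTTPHeaders.parse("X-Filename: 测试.txt", _chars_are_bytes=False)
    print(h["X-Filename"])  # -> 测试.txt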