httputil: Fix support for non-latin1 filenames in multipart uploads

author Ben Darnell <ben@bendarnell.com>

Thu, 22 May 2025 14:59:48 +0000 (10:59 -0400)

committer Ben Darnell <ben@bendarnell.com>

Thu, 22 May 2025 14:59:48 +0000 (10:59 -0400)
author Ben Darnell <ben@bendarnell.com>
Thu, 22 May 2025 14:59:48 +0000 (10:59 -0400)
committer Ben Darnell <ben@bendarnell.com>
Thu, 22 May 2025 14:59:48 +0000 (10:59 -0400)
diff --git a/tornado/httputil.py b/tornado/httputil.py

index ef460985eedc8983d9c9352a2de9011b490635fd..7f22da026a8da265d015309e41c02f6de7062cc6 100644 (file)
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -70,6 +70,10 @@ else:
  # To be used with str.strip() and related methods.
  HTTP_WHITESPACE = " \t"
  
+# Roughly the inverse of RequestHandler._VALID_HEADER_CHARS, but permits
+# chars greater than \xFF (which may appear after decoding utf8).
+_FORBIDDEN_HEADER_CHARS_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
+
  
  class _ABNF:
      """Class that holds a subset of ABNF rules from RFC 9110 and friends.
@@ -196,14 +200,18 @@ class HTTPHeaders(StrMutableMapping):
  
      # new public methods
  
-    def add(self, name: str, value: str) -> None:
+    def add(self, name: str, value: str, *, _chars_are_bytes: bool = True) -> None:
          """Adds a new value for the given key."""
          if not _ABNF.field_name.fullmatch(name):
              raise HTTPInputError("Invalid header name %r" % name)
-        if not _ABNF.field_value.fullmatch(to_unicode(value)):
-            # TODO: the fact we still support bytes here (contrary to type annotations)
-            # and still test for it should probably be changed.
-            raise HTTPInputError("Invalid header value %r" % value)
+        if _chars_are_bytes:
+            if not _ABNF.field_value.fullmatch(to_unicode(value)):
+                # TODO: the fact we still support bytes here (contrary to type annotations)
+                # and still test for it should probably be changed.
+                raise HTTPInputError("Invalid header value %r" % value)
+        else:
+            if _FORBIDDEN_HEADER_CHARS_RE.search(value):
+                raise HTTPInputError("Invalid header value %r" % value)
          norm_name = _normalize_header(name)
          self._last_key = norm_name
          if norm_name in self:
@@ -229,7 +237,7 @@ class HTTPHeaders(StrMutableMapping):
              for value in values:
                  yield (name, value)
  
-    def parse_line(self, line: str) -> None:
+    def parse_line(self, line: str, *, _chars_are_bytes: bool = True) -> None:
          r"""Updates the dictionary with a single header line.
  
          >>> h = HTTPHeaders()
@@ -263,8 +271,12 @@ class HTTPHeaders(StrMutableMapping):
              if self._last_key is None:
                  raise HTTPInputError("first header line cannot start with whitespace")
              new_part = " " + line.strip(HTTP_WHITESPACE)
-            if not _ABNF.field_value.fullmatch(new_part[1:]):
-                raise HTTPInputError("Invalid header continuation %r" % new_part)
+            if _chars_are_bytes:
+                if not _ABNF.field_value.fullmatch(new_part[1:]):
+                    raise HTTPInputError("Invalid header continuation %r" % new_part)
+            else:
+                if _FORBIDDEN_HEADER_CHARS_RE.search(new_part):
+                    raise HTTPInputError("Invalid header value %r" % new_part)
              self._as_list[self._last_key][-1] += new_part
              self._dict[self._last_key] += new_part
          else:
@@ -272,10 +284,12 @@ class HTTPHeaders(StrMutableMapping):
                  name, value = line.split(":", 1)
              except ValueError:
                  raise HTTPInputError("no colon in header line")
-            self.add(name, value.strip(HTTP_WHITESPACE))
+            self.add(
+                name, value.strip(HTTP_WHITESPACE), _chars_are_bytes=_chars_are_bytes
+            )
  
      @classmethod
-    def parse(cls, headers: str) -> "HTTPHeaders":
+    def parse(cls, headers: str, *, _chars_are_bytes: bool = True) -> "HTTPHeaders":
          """Returns a dictionary from HTTP header text.
  
          >>> h = HTTPHeaders.parse("Content-Type: text/html\\r\\nContent-Length: 42\\r\\n")
@@ -288,17 +302,31 @@ class HTTPHeaders(StrMutableMapping):
             mix of `KeyError`, and `ValueError`.
  
          """
+        # _chars_are_bytes is a hack. This method is used in two places, HTTP headers (in which
+        # non-ascii characters are to be interpreted as latin-1) and multipart/form-data (in which
+        # they are to be interpreted as utf-8). For historical reasons, this method handled this by
+        # expecting both callers to decode the headers to strings before parsing them. This wasn't a
+        # problem until we started doing stricter validation of the characters allowed in HTTP
+        # headers (using ABNF rules defined in terms of byte values), which inadvertently started
+        # disallowing non-latin1 characters in multipart/form-data filenames.
+        #
+        # This method should have accepted bytes and a desired encoding, but this change is being
+        # introduced in a patch release that shouldn't change the API. Instead, the _chars_are_bytes
+        # flag decides whether to use HTTP-style ABNF validation (treating the string as bytes
+        # smuggled through the latin1 encoding) or to accept any non-control unicode characters
+        # as required by multipart/form-data. This method will change to accept bytes in a future
+        # release.
          h = cls()
  
          start = 0
          while True:
              lf = headers.find("\n", start)
              if lf == -1:
-                h.parse_line(headers[start:])
+                h.parse_line(headers[start:], _chars_are_bytes=_chars_are_bytes)
                  break
              line = headers[start : lf + 1]
              start = lf + 1
-            h.parse_line(line)
+            h.parse_line(line, _chars_are_bytes=_chars_are_bytes)
          return h
  
      # MutableMapping abstract method implementations.
@@ -946,7 +974,7 @@ def parse_multipart_form_data(
          eoh = part.find(b"\r\n\r\n")
          if eoh == -1:
              raise HTTPInputError("multipart/form-data missing headers")
-        headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"))
+        headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"), _chars_are_bytes=False)
          disp_header = headers.get("Content-Disposition", "")
          disposition, disp_params = _parse_header(disp_header)
          if disposition != "form-data" or not part.endswith(b"\r\n"):
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py

index 7a09beaa3f316fea64d4a9b24c3e5e8a9df0fee1..fdd5fcabf9efd9c99b558e965712d4a8d65ed15f 100644 (file)
--- a/tornado/test/httputil_test.py
+++ b/tornado/test/httputil_test.py
@@ -155,7 +155,7 @@ Foo
              self.assertEqual(file["filename"], filename)
              self.assertEqual(file["body"], b"Foo")
  
-    def test_non_ascii_filename(self):
+    def test_non_ascii_filename_rfc5987(self):
          data = b"""\
  --1234
  Content-Disposition: form-data; name="files"; filename="ab.txt"; filename*=UTF-8''%C3%A1b.txt
@@ -170,6 +170,23 @@ Foo
          self.assertEqual(file["filename"], "áb.txt")
          self.assertEqual(file["body"], b"Foo")
  
+    def test_non_ascii_filename_raw(self):
+        data = """\
+--1234
+Content-Disposition: form-data; name="files"; filename="测试.txt"
+
+Foo
+--1234--""".encode(
+            "utf-8"
+        ).replace(
+            b"\n", b"\r\n"
+        )
+        args, files = form_data_args()
+        parse_multipart_form_data(b"1234", data, args, files)
+        file = files["files"][0]
+        self.assertEqual(file["filename"], "测试.txt")
+        self.assertEqual(file["body"], b"Foo")
+
      def test_boundary_starts_and_ends_with_quotes(self):
          data = b"""\
  --1234
author	Ben Darnell <ben@bendarnell.com>
	Thu, 22 May 2025 14:59:48 +0000 (10:59 -0400)
committer	Ben Darnell <ben@bendarnell.com>
	Thu, 22 May 2025 14:59:48 +0000 (10:59 -0400)
tornado/httputil.py		patch \| blob \| blame \| history
tornado/test/httputil_test.py		patch \| blob \| blame \| history