]> git.ipfire.org Git - thirdparty/tornado.git/commitdiff
httputil: Only strip tabs and spaces from header values 3387/head
author Ben Darnell <ben@bendarnell.com>
Wed, 5 Jun 2024 20:50:37 +0000 (16:50 -0400)
committer Ben Darnell <ben@bendarnell.com>
Thu, 6 Jun 2024 17:34:27 +0000 (13:34 -0400)
The RFC specifies that only tabs and spaces should be stripped.
Removing additional whitespace characters can lead to framing
errors with certain proxies.

tornado/httputil.py
tornado/test/httputil_test.py

index b21d8046c429d254b05fefb6dc1134fc4db40def..9ce992d82b3ea9a2239a99ab4c4c8edc2b001ac1 100644 (file)
@@ -62,6 +62,9 @@ if typing.TYPE_CHECKING:
     from asyncio import Future  # noqa: F401
     import unittest  # noqa: F401
 
+# To be used with str.strip() and related methods.
+HTTP_WHITESPACE = " \t"
+
 
 @lru_cache(1000)
 def _normalize_header(name: str) -> str:
@@ -171,7 +174,7 @@ class HTTPHeaders(collections.abc.MutableMapping):
             # continuation of a multi-line header
             if self._last_key is None:
                 raise HTTPInputError("first header line cannot start with whitespace")
-            new_part = " " + line.lstrip()
+            new_part = " " + line.lstrip(HTTP_WHITESPACE)
             self._as_list[self._last_key][-1] += new_part
             self._dict[self._last_key] += new_part
         else:
@@ -179,7 +182,7 @@ class HTTPHeaders(collections.abc.MutableMapping):
                 name, value = line.split(":", 1)
             except ValueError:
                 raise HTTPInputError("no colon in header line")
-            self.add(name, value.strip())
+            self.add(name, value.strip(HTTP_WHITESPACE))
 
     @classmethod
     def parse(cls, headers: str) -> "HTTPHeaders":
index aa9b6ee25380e14d0b4a822d23888fcdaa0ae552..6d618839e07f820d41b07aafdf330707cd0c35f2 100644 (file)
@@ -334,6 +334,25 @@ Foo: even
                     gen_log.warning("failed while trying %r in %s", newline, encoding)
                     raise
 
+    def test_unicode_whitespace(self):
+        # Only tabs and spaces are to be stripped according to the HTTP standard.
+        # Other unicode whitespace is to be left as-is. In the context of headers,
+        # this specifically means the whitespace characters falling within the
+        # latin1 charset.
+        whitespace = [
+            (" ", True),  # SPACE
+            ("\t", True),  # TAB
+            ("\u00a0", False),  # NON-BREAKING SPACE
+            ("\u0085", False),  # NEXT LINE
+        ]
+        for c, stripped in whitespace:
+            headers = HTTPHeaders.parse("Transfer-Encoding: %schunked" % c)
+            if stripped:
+                expected = [("Transfer-Encoding", "chunked")]
+            else:
+                expected = [("Transfer-Encoding", "%schunked" % c)]
+            self.assertEqual(expected, list(headers.get_all()))
+
     def test_optional_cr(self):
         # Both CRLF and LF should be accepted as separators. CR should not be
         # part of the data when followed by LF, but it is a normal char