httputil: Forbid control chars and CR in header values

author Ben Darnell <ben@bendarnell.com>

Sat, 26 Apr 2025 18:31:25 +0000 (14:31 -0400)

committer Ben Darnell <ben@bendarnell.com>

Sat, 26 Apr 2025 18:31:25 +0000 (14:31 -0400)
author Ben Darnell <ben@bendarnell.com>
Sat, 26 Apr 2025 18:31:25 +0000 (14:31 -0400)
committer Ben Darnell <ben@bendarnell.com>
Sat, 26 Apr 2025 18:31:25 +0000 (14:31 -0400)
diff --git a/tornado/httputil.py b/tornado/httputil.py

index 5d05cacbd81792cc09e46cee4e029fa8152f51ad..7044aca02b437e7c5fe5ef8b3cabff8eca9cf6ff 100644 (file)
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -33,7 +33,7 @@ import time
  import unicodedata
  from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
  
-from tornado.escape import native_str, parse_qs_bytes, utf8
+from tornado.escape import native_str, parse_qs_bytes, utf8, to_unicode
  from tornado.log import gen_log
  from tornado.util import ObjectDict, unicode_type
  
@@ -100,6 +100,12 @@ class _ABNF:
      # RFC 9110 (HTTP Semantics)
      obs_text = re.compile(r"[\x80-\xFF]")
      field_vchar = re.compile(rf"(?:{VCHAR.pattern}|{obs_text.pattern})")
+    # Not exactly from the RFC to simplify and combine field-content and field-value.
+    field_value = re.compile(
+        rf"|"
+        rf"{field_vchar.pattern}|"
+        rf"{field_vchar.pattern}(?:{field_vchar.pattern}| |\t)*{field_vchar.pattern}"
+    )
      tchar = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]")
      token = re.compile(rf"{tchar.pattern}+")
      field_name = token
@@ -195,6 +201,10 @@ class HTTPHeaders(StrMutableMapping):
          """Adds a new value for the given key."""
          if not _ABNF.field_name.fullmatch(name):
              raise HTTPInputError("Invalid header name %r" % name)
+        if not _ABNF.field_value.fullmatch(to_unicode(value)):
+            # TODO: the fact we still support bytes here (contrary to type annotations)
+            # and still test for it should probably be changed.
+            raise HTTPInputError("Invalid header value %r" % value)
          norm_name = _normalize_header(name)
          self._last_key = norm_name
          if norm_name in self:
@@ -254,6 +264,8 @@ class HTTPHeaders(StrMutableMapping):
              if self._last_key is None:
                  raise HTTPInputError("first header line cannot start with whitespace")
              new_part = " " + line.strip(HTTP_WHITESPACE)
+            if not _ABNF.field_value.fullmatch(new_part[1:]):
+                raise HTTPInputError("Invalid header continuation %r" % new_part)
              self._as_list[self._last_key][-1] += new_part
              self._dict[self._last_key] += new_part
          else:
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py

index 221c1dcb46d4672cfd69b00e1809a955ba24f798..30fbec4d78e626c6bd56b87657baf76b05be84ee 100644 (file)
--- a/tornado/test/httputil_test.py
+++ b/tornado/test/httputil_test.py
@@ -303,6 +303,17 @@ Foo: even
          data = "Foo: bar\r\n\fasdf"
          self.assertRaises(HTTPInputError, HTTPHeaders.parse, data)
  
+    def test_forbidden_ascii_characters(self):
+        # Control characters and ASCII whitespace other than space, tab, and CRLF are not allowed in
+        # headers.
+        for c in range(0xFF):
+            data = f"Foo: bar{chr(c)}baz\r\n"
+            if c == 0x09 or (c >= 0x20 and c != 0x7F):
+                headers = HTTPHeaders.parse(data)
+                self.assertEqual(headers["Foo"], f"bar{chr(c)}baz")
+            else:
+                self.assertRaises(HTTPInputError, HTTPHeaders.parse, data)
+
      def test_unicode_newlines(self):
          # Ensure that only \r\n is recognized as a header separator, and not
          # the other newline-like unicode characters.
@@ -311,10 +322,13 @@ Foo: even
          # and cpython's unicodeobject.c (which defines the implementation
          # of unicode_type.splitlines(), and uses a different list than TR13).
          newlines = [
-            "\u001b",  # VERTICAL TAB
-            "\u001c",  # FILE SEPARATOR
-            "\u001d",  # GROUP SEPARATOR
-            "\u001e",  # RECORD SEPARATOR
+            # The following ascii characters are sometimes treated as newline-like,
+            # but they're disallowed in HTTP headers. This test covers unicode
+            # characters that are permitted in headers (under the obs-text rule).
+            # "\u001b",  # VERTICAL TAB
+            # "\u001c",  # FILE SEPARATOR
+            # "\u001d",  # GROUP SEPARATOR
+            # "\u001e",  # RECORD SEPARATOR
              "\u0085",  # NEXT LINE
              "\u2028",  # LINE SEPARATOR
              "\u2029",  # PARAGRAPH SEPARATOR
@@ -363,13 +377,16 @@ Foo: even
              self.assertEqual(expected, list(headers.get_all()))
  
      def test_optional_cr(self):
+        # Bare CR is  not a valid line separator
+        with self.assertRaises(HTTPInputError):
+            HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n")
+
          # Both CRLF and LF should be accepted as separators. CR should not be
-        # part of the data when followed by LF, but it is a normal char
-        # otherwise (or should bare CR be an error?)
-        headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n")
+        # part of the data when followed by LF.
+        headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nMore: more\r\n")
          self.assertEqual(
              sorted(headers.get_all()),
-            [("Cr", "cr\rMore: more"), ("Crlf", "crlf"), ("Lf", "lf")],
+            [("Crlf", "crlf"), ("Lf", "lf"), ("More", "more")],
          )
  
      def test_copy(self):
author	Ben Darnell <ben@bendarnell.com>
	Sat, 26 Apr 2025 18:31:25 +0000 (14:31 -0400)
committer	Ben Darnell <ben@bendarnell.com>
	Sat, 26 Apr 2025 18:31:25 +0000 (14:31 -0400)
tornado/httputil.py		patch \| blob \| blame \| history
tornado/test/httputil_test.py		patch \| blob \| blame \| history