From: Ben Darnell Date: Sat, 26 Apr 2025 18:31:25 +0000 (-0400) Subject: httputil: Forbid control chars and CR in header values X-Git-Tag: v6.5.0b1~5^2 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F3489%2Fhead;p=thirdparty%2Ftornado.git httputil: Forbid control chars and CR in header values NUL, CR, and other control characters are not allowed in HTTP header values. Fixes #3481 --- diff --git a/tornado/httputil.py b/tornado/httputil.py index 5d05cacb..7044aca0 100644 --- a/tornado/httputil.py +++ b/tornado/httputil.py @@ -33,7 +33,7 @@ import time import unicodedata from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl -from tornado.escape import native_str, parse_qs_bytes, utf8 +from tornado.escape import native_str, parse_qs_bytes, utf8, to_unicode from tornado.log import gen_log from tornado.util import ObjectDict, unicode_type @@ -100,6 +100,12 @@ class _ABNF: # RFC 9110 (HTTP Semantics) obs_text = re.compile(r"[\x80-\xFF]") field_vchar = re.compile(rf"(?:{VCHAR.pattern}|{obs_text.pattern})") + # Not exactly from the RFC to simplify and combine field-content and field-value. + field_value = re.compile( + rf"|" + rf"{field_vchar.pattern}|" + rf"{field_vchar.pattern}(?:{field_vchar.pattern}| |\t)*{field_vchar.pattern}" + ) tchar = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]") token = re.compile(rf"{tchar.pattern}+") field_name = token @@ -195,6 +201,10 @@ class HTTPHeaders(StrMutableMapping): """Adds a new value for the given key.""" if not _ABNF.field_name.fullmatch(name): raise HTTPInputError("Invalid header name %r" % name) + if not _ABNF.field_value.fullmatch(to_unicode(value)): + # TODO: the fact we still support bytes here (contrary to type annotations) + # and still test for it should probably be changed. + raise HTTPInputError("Invalid header value %r" % value) norm_name = _normalize_header(name) self._last_key = norm_name if norm_name in self: @@ -254,6 +264,8 @@ class HTTPHeaders(StrMutableMapping): if self._last_key is None: raise HTTPInputError("first header line cannot start with whitespace") new_part = " " + line.strip(HTTP_WHITESPACE) + if not _ABNF.field_value.fullmatch(new_part[1:]): + raise HTTPInputError("Invalid header continuation %r" % new_part) self._as_list[self._last_key][-1] += new_part self._dict[self._last_key] += new_part else: diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py index 221c1dcb..30fbec4d 100644 --- a/tornado/test/httputil_test.py +++ b/tornado/test/httputil_test.py @@ -303,6 +303,17 @@ Foo: even data = "Foo: bar\r\n\fasdf" self.assertRaises(HTTPInputError, HTTPHeaders.parse, data) + def test_forbidden_ascii_characters(self): + # Control characters and ASCII whitespace other than space, tab, and CRLF are not allowed in + # headers. + for c in range(0xFF): + data = f"Foo: bar{chr(c)}baz\r\n" + if c == 0x09 or (c >= 0x20 and c != 0x7F): + headers = HTTPHeaders.parse(data) + self.assertEqual(headers["Foo"], f"bar{chr(c)}baz") + else: + self.assertRaises(HTTPInputError, HTTPHeaders.parse, data) + def test_unicode_newlines(self): # Ensure that only \r\n is recognized as a header separator, and not # the other newline-like unicode characters. @@ -311,10 +322,13 @@ Foo: even # and cpython's unicodeobject.c (which defines the implementation # of unicode_type.splitlines(), and uses a different list than TR13). newlines = [ - "\u001b", # VERTICAL TAB - "\u001c", # FILE SEPARATOR - "\u001d", # GROUP SEPARATOR - "\u001e", # RECORD SEPARATOR + # The following ascii characters are sometimes treated as newline-like, + # but they're disallowed in HTTP headers. This test covers unicode + # characters that are permitted in headers (under the obs-text rule). + # "\u001b", # VERTICAL TAB + # "\u001c", # FILE SEPARATOR + # "\u001d", # GROUP SEPARATOR + # "\u001e", # RECORD SEPARATOR "\u0085", # NEXT LINE "\u2028", # LINE SEPARATOR "\u2029", # PARAGRAPH SEPARATOR @@ -363,13 +377,16 @@ Foo: even self.assertEqual(expected, list(headers.get_all())) def test_optional_cr(self): + # Bare CR is not a valid line separator + with self.assertRaises(HTTPInputError): + HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n") + # Both CRLF and LF should be accepted as separators. CR should not be - # part of the data when followed by LF, but it is a normal char - # otherwise (or should bare CR be an error?) - headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n") + # part of the data when followed by LF. + headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nMore: more\r\n") self.assertEqual( sorted(headers.get_all()), - [("Cr", "cr\rMore: more"), ("Crlf", "crlf"), ("Lf", "lf")], + [("Crlf", "crlf"), ("Lf", "lf"), ("More", "more")], ) def test_copy(self):