import unicodedata
from urllib.parse import urlencode, urlparse, urlunparse, parse_qsl
-from tornado.escape import native_str, parse_qs_bytes, utf8
+from tornado.escape import native_str, parse_qs_bytes, utf8, to_unicode
from tornado.log import gen_log
from tornado.util import ObjectDict, unicode_type
# RFC 9110 (HTTP Semantics)
obs_text = re.compile(r"[\x80-\xFF]")
field_vchar = re.compile(rf"(?:{VCHAR.pattern}|{obs_text.pattern})")
+ # Not exactly from the RFC to simplify and combine field-content and field-value.
+ field_value = re.compile(
+ rf"|"
+ rf"{field_vchar.pattern}|"
+ rf"{field_vchar.pattern}(?:{field_vchar.pattern}| |\t)*{field_vchar.pattern}"
+ )
tchar = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]")
token = re.compile(rf"{tchar.pattern}+")
field_name = token
"""Adds a new value for the given key."""
if not _ABNF.field_name.fullmatch(name):
raise HTTPInputError("Invalid header name %r" % name)
+ if not _ABNF.field_value.fullmatch(to_unicode(value)):
+ # TODO: the fact we still support bytes here (contrary to type annotations)
+ # and still test for it should probably be changed.
+ raise HTTPInputError("Invalid header value %r" % value)
norm_name = _normalize_header(name)
self._last_key = norm_name
if norm_name in self:
if self._last_key is None:
raise HTTPInputError("first header line cannot start with whitespace")
new_part = " " + line.strip(HTTP_WHITESPACE)
+ if not _ABNF.field_value.fullmatch(new_part[1:]):
+ raise HTTPInputError("Invalid header continuation %r" % new_part)
self._as_list[self._last_key][-1] += new_part
self._dict[self._last_key] += new_part
else:
data = "Foo: bar\r\n\fasdf"
self.assertRaises(HTTPInputError, HTTPHeaders.parse, data)
+ def test_forbidden_ascii_characters(self):
+ # Control characters and ASCII whitespace other than space, tab, and CRLF are not allowed in
+ # headers.
+ for c in range(0xFF):
+ data = f"Foo: bar{chr(c)}baz\r\n"
+ if c == 0x09 or (c >= 0x20 and c != 0x7F):
+ headers = HTTPHeaders.parse(data)
+ self.assertEqual(headers["Foo"], f"bar{chr(c)}baz")
+ else:
+ self.assertRaises(HTTPInputError, HTTPHeaders.parse, data)
+
def test_unicode_newlines(self):
# Ensure that only \r\n is recognized as a header separator, and not
# the other newline-like unicode characters.
# and cpython's unicodeobject.c (which defines the implementation
# of unicode_type.splitlines(), and uses a different list than TR13).
newlines = [
- "\u001b", # VERTICAL TAB
- "\u001c", # FILE SEPARATOR
- "\u001d", # GROUP SEPARATOR
- "\u001e", # RECORD SEPARATOR
+ # The following ascii characters are sometimes treated as newline-like,
+ # but they're disallowed in HTTP headers. This test covers unicode
+ # characters that are permitted in headers (under the obs-text rule).
+ # "\u001b", # VERTICAL TAB
+ # "\u001c", # FILE SEPARATOR
+ # "\u001d", # GROUP SEPARATOR
+ # "\u001e", # RECORD SEPARATOR
"\u0085", # NEXT LINE
"\u2028", # LINE SEPARATOR
"\u2029", # PARAGRAPH SEPARATOR
self.assertEqual(expected, list(headers.get_all()))
def test_optional_cr(self):
+ # Bare CR is not a valid line separator
+ with self.assertRaises(HTTPInputError):
+ HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n")
+
# Both CRLF and LF should be accepted as separators. CR should not be
- # part of the data when followed by LF, but it is a normal char
- # otherwise (or should bare CR be an error?)
- headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n")
+ # part of the data when followed by LF.
+ headers = HTTPHeaders.parse("CRLF: crlf\r\nLF: lf\nMore: more\r\n")
self.assertEqual(
sorted(headers.get_all()),
- [("Cr", "cr\rMore: more"), ("Crlf", "crlf"), ("Lf", "lf")],
+ [("Crlf", "crlf"), ("Lf", "lf"), ("More", "more")],
)
def test_copy(self):