# To be used with str.strip() and related methods.
HTTP_WHITESPACE = " \t"
+# Roughly the inverse of RequestHandler._VALID_HEADER_CHARS, but permits
+# chars greater than \xFF (which may appear after decoding utf8).
+_FORBIDDEN_HEADER_CHARS_RE = re.compile(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]")
+
class _ABNF:
"""Class that holds a subset of ABNF rules from RFC 9110 and friends.
# new public methods
- def add(self, name: str, value: str) -> None:
+ def add(self, name: str, value: str, *, _chars_are_bytes: bool = True) -> None:
"""Adds a new value for the given key."""
if not _ABNF.field_name.fullmatch(name):
raise HTTPInputError("Invalid header name %r" % name)
- if not _ABNF.field_value.fullmatch(to_unicode(value)):
- # TODO: the fact we still support bytes here (contrary to type annotations)
- # and still test for it should probably be changed.
- raise HTTPInputError("Invalid header value %r" % value)
+ if _chars_are_bytes:
+ if not _ABNF.field_value.fullmatch(to_unicode(value)):
+ # TODO: the fact we still support bytes here (contrary to type annotations)
+ # and still test for it should probably be changed.
+ raise HTTPInputError("Invalid header value %r" % value)
+ else:
+ if _FORBIDDEN_HEADER_CHARS_RE.search(value):
+ raise HTTPInputError("Invalid header value %r" % value)
norm_name = _normalize_header(name)
self._last_key = norm_name
if norm_name in self:
for value in values:
yield (name, value)
- def parse_line(self, line: str) -> None:
+ def parse_line(self, line: str, *, _chars_are_bytes: bool = True) -> None:
r"""Updates the dictionary with a single header line.
>>> h = HTTPHeaders()
if self._last_key is None:
raise HTTPInputError("first header line cannot start with whitespace")
new_part = " " + line.strip(HTTP_WHITESPACE)
- if not _ABNF.field_value.fullmatch(new_part[1:]):
- raise HTTPInputError("Invalid header continuation %r" % new_part)
+ if _chars_are_bytes:
+ if not _ABNF.field_value.fullmatch(new_part[1:]):
+ raise HTTPInputError("Invalid header continuation %r" % new_part)
+ else:
+ if _FORBIDDEN_HEADER_CHARS_RE.search(new_part):
+ raise HTTPInputError("Invalid header value %r" % new_part)
self._as_list[self._last_key][-1] += new_part
self._dict[self._last_key] += new_part
else:
name, value = line.split(":", 1)
except ValueError:
raise HTTPInputError("no colon in header line")
- self.add(name, value.strip(HTTP_WHITESPACE))
+ self.add(
+ name, value.strip(HTTP_WHITESPACE), _chars_are_bytes=_chars_are_bytes
+ )
@classmethod
- def parse(cls, headers: str) -> "HTTPHeaders":
+ def parse(cls, headers: str, *, _chars_are_bytes: bool = True) -> "HTTPHeaders":
"""Returns a dictionary from HTTP header text.
>>> h = HTTPHeaders.parse("Content-Type: text/html\\r\\nContent-Length: 42\\r\\n")
mix of `KeyError`, and `ValueError`.
"""
+ # _chars_are_bytes is a hack. This method is used in two places, HTTP headers (in which
+ # non-ascii characters are to be interpreted as latin-1) and multipart/form-data (in which
+ # they are to be interpreted as utf-8). For historical reasons, this method handled this by
+ # expecting both callers to decode the headers to strings before parsing them. This wasn't a
+ # problem until we started doing stricter validation of the characters allowed in HTTP
+ # headers (using ABNF rules defined in terms of byte values), which inadvertently started
+ # disallowing non-latin1 characters in multipart/form-data filenames.
+ #
+ # This method should have accepted bytes and a desired encoding, but this change is being
+ # introduced in a patch release that shouldn't change the API. Instead, the _chars_are_bytes
+ # flag decides whether to use HTTP-style ABNF validation (treating the string as bytes
+ # smuggled through the latin1 encoding) or to accept any non-control unicode characters
+ # as required by multipart/form-data. This method will change to accept bytes in a future
+ # release.
h = cls()
start = 0
while True:
lf = headers.find("\n", start)
if lf == -1:
- h.parse_line(headers[start:])
+ h.parse_line(headers[start:], _chars_are_bytes=_chars_are_bytes)
break
line = headers[start : lf + 1]
start = lf + 1
- h.parse_line(line)
+ h.parse_line(line, _chars_are_bytes=_chars_are_bytes)
return h
# MutableMapping abstract method implementations.
eoh = part.find(b"\r\n\r\n")
if eoh == -1:
raise HTTPInputError("multipart/form-data missing headers")
- headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"))
+ headers = HTTPHeaders.parse(part[:eoh].decode("utf-8"), _chars_are_bytes=False)
disp_header = headers.get("Content-Disposition", "")
disposition, disp_params = _parse_header(disp_header)
if disposition != "form-data" or not part.endswith(b"\r\n"):
self.assertEqual(file["filename"], filename)
self.assertEqual(file["body"], b"Foo")
- def test_non_ascii_filename(self):
+ def test_non_ascii_filename_rfc5987(self):
data = b"""\
--1234
Content-Disposition: form-data; name="files"; filename="ab.txt"; filename*=UTF-8''%C3%A1b.txt
self.assertEqual(file["filename"], "áb.txt")
self.assertEqual(file["body"], b"Foo")
+ def test_non_ascii_filename_raw(self):
+ data = """\
+--1234
+Content-Disposition: form-data; name="files"; filename="测试.txt"
+
+Foo
+--1234--""".encode(
+ "utf-8"
+ ).replace(
+ b"\n", b"\r\n"
+ )
+ args, files = form_data_args()
+ parse_multipart_form_data(b"1234", data, args, files)
+ file = files["files"][0]
+ self.assertEqual(file["filename"], "测试.txt")
+ self.assertEqual(file["body"], b"Foo")
+
def test_boundary_starts_and_ends_with_quotes(self):
data = b"""\
--1234