From 68e81b4a3385161877408a7a49c7ed12b45a614d Mon Sep 17 00:00:00 2001 From: Ben Darnell Date: Tue, 9 Dec 2025 13:27:27 -0500 Subject: [PATCH] httputil: Fix quadratic performance of repeated header lines Previouisly, when many header lines with the same name were found in an HTTP request or response, repeated string concatenation would result in quadratic performance. This change does the concatenation lazily (with a cache) so that repeated headers can be processed efficiently. Security: The previous behavior allowed a denial of service attack via a maliciously crafted HTTP message, but only if the max_header_size was increased from its default of 64kB. --- tornado/httputil.py | 36 ++++++++++++++++++++++++----------- tornado/test/httputil_test.py | 15 +++++++++++++++ 2 files changed, 40 insertions(+), 11 deletions(-) diff --git a/tornado/httputil.py b/tornado/httputil.py index a418b9fc..e13231e2 100644 --- a/tornado/httputil.py +++ b/tornado/httputil.py @@ -187,8 +187,14 @@ class HTTPHeaders(StrMutableMapping): pass def __init__(self, *args: typing.Any, **kwargs: str) -> None: # noqa: F811 - self._dict = {} # type: typing.Dict[str, str] - self._as_list = {} # type: typing.Dict[str, typing.List[str]] + # Formally, HTTP headers are a mapping from a field name to a "combined field value", + # which may be constructed from multiple field lines by joining them with commas. + # In practice, however, some headers (notably Set-Cookie) do not follow this convention, + # so we maintain a mapping from field name to a list of field lines in self._as_list. + # self._combined_cache is a cache of the combined field values derived from self._as_list + # on demand (and cleared whenever the list is modified). + self._as_list: dict[str, list[str]] = {} + self._combined_cache: dict[str, str] = {} self._last_key = None # type: Optional[str] if len(args) == 1 and len(kwargs) == 0 and isinstance(args[0], HTTPHeaders): # Copy constructor @@ -215,9 +221,7 @@ class HTTPHeaders(StrMutableMapping): norm_name = _normalize_header(name) self._last_key = norm_name if norm_name in self: - self._dict[norm_name] = ( - native_str(self[norm_name]) + "," + native_str(value) - ) + self._combined_cache.pop(norm_name, None) self._as_list[norm_name].append(value) else: self[norm_name] = value @@ -278,7 +282,7 @@ class HTTPHeaders(StrMutableMapping): if _FORBIDDEN_HEADER_CHARS_RE.search(new_part): raise HTTPInputError("Invalid header value %r" % new_part) self._as_list[self._last_key][-1] += new_part - self._dict[self._last_key] += new_part + self._combined_cache.pop(self._last_key, None) else: try: name, value = line.split(":", 1) @@ -333,22 +337,32 @@ class HTTPHeaders(StrMutableMapping): def __setitem__(self, name: str, value: str) -> None: norm_name = _normalize_header(name) - self._dict[norm_name] = value + self._combined_cache[norm_name] = value self._as_list[norm_name] = [value] + def __contains__(self, name: object) -> bool: + # This is an important optimization to avoid the expensive concatenation + # in __getitem__ when it's not needed. + if not isinstance(name, str): + return False + return name in self._as_list + def __getitem__(self, name: str) -> str: - return self._dict[_normalize_header(name)] + header = _normalize_header(name) + if header not in self._combined_cache: + self._combined_cache[header] = ",".join(self._as_list[header]) + return self._combined_cache[header] def __delitem__(self, name: str) -> None: norm_name = _normalize_header(name) - del self._dict[norm_name] + del self._combined_cache[norm_name] del self._as_list[norm_name] def __len__(self) -> int: - return len(self._dict) + return len(self._as_list) def __iter__(self) -> Iterator[typing.Any]: - return iter(self._dict) + return iter(self._as_list) def copy(self) -> "HTTPHeaders": # defined in dict but not in MutableMapping. diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py index 7a46b911..3da9024b 100644 --- a/tornado/test/httputil_test.py +++ b/tornado/test/httputil_test.py @@ -471,6 +471,21 @@ Foo: even with self.assertRaises(HTTPInputError): headers.add(name, "bar") + def test_linear_performance(self): + def f(n): + start = time.time() + headers = HTTPHeaders() + for i in range(n): + headers.add("X-Foo", "bar") + return time.time() - start + + # This runs under 50ms on my laptop as of 2025-12-09. + d1 = f(10_000) + d2 = f(100_000) + if d2 / d1 > 20: + # d2 should be about 10x d1 but allow a wide margin for variability. + self.fail(f"HTTPHeaders.add() does not scale linearly: {d1=} vs {d2=}") + class FormatTimestampTest(unittest.TestCase): # Make sure that all the input types are supported. -- 2.47.3