From: Ben Darnell Date: Tue, 9 Dec 2025 18:27:27 +0000 (-0500) Subject: httputil: Fix quadratic performance of repeated header lines X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=68e81b4a3385161877408a7a49c7ed12b45a614d;p=thirdparty%2Ftornado.git httputil: Fix quadratic performance of repeated header lines Previouisly, when many header lines with the same name were found in an HTTP request or response, repeated string concatenation would result in quadratic performance. This change does the concatenation lazily (with a cache) so that repeated headers can be processed efficiently. Security: The previous behavior allowed a denial of service attack via a maliciously crafted HTTP message, but only if the max_header_size was increased from its default of 64kB. --- diff --git a/tornado/httputil.py b/tornado/httputil.py index a418b9fc..e13231e2 100644 --- a/tornado/httputil.py +++ b/tornado/httputil.py @@ -187,8 +187,14 @@ class HTTPHeaders(StrMutableMapping): pass def __init__(self, *args: typing.Any, **kwargs: str) -> None: # noqa: F811 - self._dict = {} # type: typing.Dict[str, str] - self._as_list = {} # type: typing.Dict[str, typing.List[str]] + # Formally, HTTP headers are a mapping from a field name to a "combined field value", + # which may be constructed from multiple field lines by joining them with commas. + # In practice, however, some headers (notably Set-Cookie) do not follow this convention, + # so we maintain a mapping from field name to a list of field lines in self._as_list. + # self._combined_cache is a cache of the combined field values derived from self._as_list + # on demand (and cleared whenever the list is modified). + self._as_list: dict[str, list[str]] = {} + self._combined_cache: dict[str, str] = {} self._last_key = None # type: Optional[str] if len(args) == 1 and len(kwargs) == 0 and isinstance(args[0], HTTPHeaders): # Copy constructor @@ -215,9 +221,7 @@ class HTTPHeaders(StrMutableMapping): norm_name = _normalize_header(name) self._last_key = norm_name if norm_name in self: - self._dict[norm_name] = ( - native_str(self[norm_name]) + "," + native_str(value) - ) + self._combined_cache.pop(norm_name, None) self._as_list[norm_name].append(value) else: self[norm_name] = value @@ -278,7 +282,7 @@ class HTTPHeaders(StrMutableMapping): if _FORBIDDEN_HEADER_CHARS_RE.search(new_part): raise HTTPInputError("Invalid header value %r" % new_part) self._as_list[self._last_key][-1] += new_part - self._dict[self._last_key] += new_part + self._combined_cache.pop(self._last_key, None) else: try: name, value = line.split(":", 1) @@ -333,22 +337,32 @@ class HTTPHeaders(StrMutableMapping): def __setitem__(self, name: str, value: str) -> None: norm_name = _normalize_header(name) - self._dict[norm_name] = value + self._combined_cache[norm_name] = value self._as_list[norm_name] = [value] + def __contains__(self, name: object) -> bool: + # This is an important optimization to avoid the expensive concatenation + # in __getitem__ when it's not needed. + if not isinstance(name, str): + return False + return name in self._as_list + def __getitem__(self, name: str) -> str: - return self._dict[_normalize_header(name)] + header = _normalize_header(name) + if header not in self._combined_cache: + self._combined_cache[header] = ",".join(self._as_list[header]) + return self._combined_cache[header] def __delitem__(self, name: str) -> None: norm_name = _normalize_header(name) - del self._dict[norm_name] + del self._combined_cache[norm_name] del self._as_list[norm_name] def __len__(self) -> int: - return len(self._dict) + return len(self._as_list) def __iter__(self) -> Iterator[typing.Any]: - return iter(self._dict) + return iter(self._as_list) def copy(self) -> "HTTPHeaders": # defined in dict but not in MutableMapping. diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py index 7a46b911..3da9024b 100644 --- a/tornado/test/httputil_test.py +++ b/tornado/test/httputil_test.py @@ -471,6 +471,21 @@ Foo: even with self.assertRaises(HTTPInputError): headers.add(name, "bar") + def test_linear_performance(self): + def f(n): + start = time.time() + headers = HTTPHeaders() + for i in range(n): + headers.add("X-Foo", "bar") + return time.time() - start + + # This runs under 50ms on my laptop as of 2025-12-09. + d1 = f(10_000) + d2 = f(100_000) + if d2 / d1 > 20: + # d2 should be about 10x d1 but allow a wide margin for variability. + self.fail(f"HTTPHeaders.add() does not scale linearly: {d1=} vs {d2=}") + class FormatTimestampTest(unittest.TestCase): # Make sure that all the input types are supported.