]> git.ipfire.org Git - thirdparty/tornado.git/commitdiff
httputil: Fix quadratic performance of repeated header lines 3553/head
authorBen Darnell <ben@bendarnell.com>
Tue, 9 Dec 2025 18:27:27 +0000 (13:27 -0500)
committerBen Darnell <ben@bendarnell.com>
Tue, 9 Dec 2025 18:27:27 +0000 (13:27 -0500)
Previouisly, when many header lines with the same name were found
in an HTTP request or response, repeated string concatenation would
result in quadratic performance. This change does the concatenation
lazily (with a cache) so that repeated headers can be processed
efficiently.

Security: The previous behavior allowed a denial of service attack
via a maliciously crafted HTTP message, but only if the
max_header_size was increased from its default of 64kB.

tornado/httputil.py
tornado/test/httputil_test.py

index a418b9fce5bf2d11f8e3cc229bb4ee4ea420ba98..e13231e24644a1e45a43c48a777979e711bedfe1 100644 (file)
@@ -187,8 +187,14 @@ class HTTPHeaders(StrMutableMapping):
         pass
 
     def __init__(self, *args: typing.Any, **kwargs: str) -> None:  # noqa: F811
-        self._dict = {}  # type: typing.Dict[str, str]
-        self._as_list = {}  # type: typing.Dict[str, typing.List[str]]
+        # Formally, HTTP headers are a mapping from a field name to a "combined field value",
+        # which may be constructed from multiple field lines by joining them with commas.
+        # In practice, however, some headers (notably Set-Cookie) do not follow this convention,
+        # so we maintain a mapping from field name to a list of field lines in self._as_list.
+        # self._combined_cache is a cache of the combined field values derived from self._as_list
+        # on demand (and cleared whenever the list is modified).
+        self._as_list: dict[str, list[str]] = {}
+        self._combined_cache: dict[str, str] = {}
         self._last_key = None  # type: Optional[str]
         if len(args) == 1 and len(kwargs) == 0 and isinstance(args[0], HTTPHeaders):
             # Copy constructor
@@ -215,9 +221,7 @@ class HTTPHeaders(StrMutableMapping):
         norm_name = _normalize_header(name)
         self._last_key = norm_name
         if norm_name in self:
-            self._dict[norm_name] = (
-                native_str(self[norm_name]) + "," + native_str(value)
-            )
+            self._combined_cache.pop(norm_name, None)
             self._as_list[norm_name].append(value)
         else:
             self[norm_name] = value
@@ -278,7 +282,7 @@ class HTTPHeaders(StrMutableMapping):
                 if _FORBIDDEN_HEADER_CHARS_RE.search(new_part):
                     raise HTTPInputError("Invalid header value %r" % new_part)
             self._as_list[self._last_key][-1] += new_part
-            self._dict[self._last_key] += new_part
+            self._combined_cache.pop(self._last_key, None)
         else:
             try:
                 name, value = line.split(":", 1)
@@ -333,22 +337,32 @@ class HTTPHeaders(StrMutableMapping):
 
     def __setitem__(self, name: str, value: str) -> None:
         norm_name = _normalize_header(name)
-        self._dict[norm_name] = value
+        self._combined_cache[norm_name] = value
         self._as_list[norm_name] = [value]
 
+    def __contains__(self, name: object) -> bool:
+        # This is an important optimization to avoid the expensive concatenation
+        # in __getitem__ when it's not needed.
+        if not isinstance(name, str):
+            return False
+        return name in self._as_list
+
     def __getitem__(self, name: str) -> str:
-        return self._dict[_normalize_header(name)]
+        header = _normalize_header(name)
+        if header not in self._combined_cache:
+            self._combined_cache[header] = ",".join(self._as_list[header])
+        return self._combined_cache[header]
 
     def __delitem__(self, name: str) -> None:
         norm_name = _normalize_header(name)
-        del self._dict[norm_name]
+        del self._combined_cache[norm_name]
         del self._as_list[norm_name]
 
     def __len__(self) -> int:
-        return len(self._dict)
+        return len(self._as_list)
 
     def __iter__(self) -> Iterator[typing.Any]:
-        return iter(self._dict)
+        return iter(self._as_list)
 
     def copy(self) -> "HTTPHeaders":
         # defined in dict but not in MutableMapping.
index 7a46b911569853adc3fdf6917107e0a37858ee49..3da9024b50842ed17152e8c328585b13c20a78c8 100644 (file)
@@ -471,6 +471,21 @@ Foo: even
             with self.assertRaises(HTTPInputError):
                 headers.add(name, "bar")
 
+    def test_linear_performance(self):
+        def f(n):
+            start = time.time()
+            headers = HTTPHeaders()
+            for i in range(n):
+                headers.add("X-Foo", "bar")
+            return time.time() - start
+
+        # This runs under 50ms on my laptop as of 2025-12-09.
+        d1 = f(10_000)
+        d2 = f(100_000)
+        if d2 / d1 > 20:
+            # d2 should be about 10x d1 but allow a wide margin for variability.
+            self.fail(f"HTTPHeaders.add() does not scale linearly: {d1=} vs {d2=}")
+
 
 class FormatTimestampTest(unittest.TestCase):
     # Make sure that all the input types are supported.