From 03198b55f580c7c3e88bfac0a0c459357f9ac177 Mon Sep 17 00:00:00 2001
From: Ben Darnell <ben@bendarnell.com>
Date: Wed, 10 Dec 2025 10:55:02 -0500
Subject: [PATCH] httputil: Fix quadratic behavior in _parseparam

Prior to this change, _parseparam had O(n^2) behavior when parsing
certain inputs, which could be a DoS vector. This change adapts
logic from the equivalent function in the python standard library
in https://github.com/python/cpython/pull/136072/files
---
 tornado/httputil.py           | 29 ++++++++++++++++++++++-------
 tornado/test/httputil_test.py | 23 +++++++++++++++++++++++
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/tornado/httputil.py b/tornado/httputil.py
index a418b9fc..8ecc6dea 100644
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -1082,19 +1082,34 @@ def parse_response_start_line(line: str) -> ResponseStartLine:
 # It has also been modified to support valueless parameters as seen in
 # websocket extension negotiations, and to support non-ascii values in
 # RFC 2231/5987 format.
+#
+# _parseparam has been further modified with the logic from
+# https://github.com/python/cpython/pull/136072/files
+# to avoid quadratic behavior when parsing semicolons in quoted strings.
+#
+# TODO: See if we can switch to email.message.Message for this functionality.
+# This is the suggested replacement for the cgi.py module now that cgi has
+# been removed from recent versions of Python.  We need to verify that
+# the email module is consistent with our existing behavior (and all relevant
+# RFCs for multipart/form-data) before making this change.
 
 
 def _parseparam(s: str) -> Generator[str, None, None]:
-    while s[:1] == ";":
-        s = s[1:]
-        end = s.find(";")
-        while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2:
-            end = s.find(";", end + 1)
+    start = 0
+    while s.find(";", start) == start:
+        start += 1
+        end = s.find(";", start)
+        ind, diff = start, 0
+        while end > 0:
+            diff += s.count('"', ind, end) - s.count('\\"', ind, end)
+            if diff % 2 == 0:
+                break
+            end, ind = ind, s.find(";", end + 1)
         if end < 0:
             end = len(s)
-        f = s[:end]
+        f = s[start:end]
         yield f.strip()
-        s = s[end:]
+        start = end
 
 
 def _parse_header(line: str) -> Tuple[str, Dict[str, str]]:
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py
index 7a46b911..68c156b6 100644
--- a/tornado/test/httputil_test.py
+++ b/tornado/test/httputil_test.py
@@ -279,6 +279,29 @@ Foo
         self.assertEqual(file["filename"], "ab.txt")
         self.assertEqual(file["body"], b"Foo")
 
+    def test_disposition_param_linear_performance(self):
+        # This is a regression test for performance of parsing parameters
+        # to the content-disposition header, specifically for semicolons within
+        # quoted strings.
+        def f(n):
+            start = time.time()
+            message = (
+                b"--1234\r\nContent-Disposition: form-data; "
+                + b'x="'
+                + b";" * n
+                + b'"; '
+                + b'name="files"; filename="a.txt"\r\n\r\nFoo\r\n--1234--\r\n'
+            )
+            args: dict[str, list[bytes]] = {}
+            files: dict[str, list[HTTPFile]] = {}
+            parse_multipart_form_data(b"1234", message, args, files)
+            return time.time() - start
+
+        d1 = f(1_000)
+        d2 = f(10_000)
+        if d2 / d1 > 20:
+            self.fail(f"Disposition param parsing is not linear: {d1=} vs {d2=}")
+
 
 class HTTPHeadersTest(unittest.TestCase):
     def test_multi_line(self):
-- 
2.47.3