httputil: Fix quadratic behavior in _parseparam

author Ben Darnell <ben@bendarnell.com>

Wed, 10 Dec 2025 15:55:02 +0000 (10:55 -0500)

committer Ben Darnell <ben@bendarnell.com>

Wed, 10 Dec 2025 15:55:02 +0000 (10:55 -0500)
author Ben Darnell <ben@bendarnell.com>
Wed, 10 Dec 2025 15:55:02 +0000 (10:55 -0500)
committer Ben Darnell <ben@bendarnell.com>
Wed, 10 Dec 2025 15:55:02 +0000 (10:55 -0500)
diff --git a/tornado/httputil.py b/tornado/httputil.py

index a418b9fce5bf2d11f8e3cc229bb4ee4ea420ba98..8ecc6dea0bf23211c1473e1392ceb5f3b164e1e5 100644 (file)
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -1082,19 +1082,34 @@ def parse_response_start_line(line: str) -> ResponseStartLine:
  # It has also been modified to support valueless parameters as seen in
  # websocket extension negotiations, and to support non-ascii values in
  # RFC 2231/5987 format.
+#
+# _parseparam has been further modified with the logic from
+# https://github.com/python/cpython/pull/136072/files
+# to avoid quadratic behavior when parsing semicolons in quoted strings.
+#
+# TODO: See if we can switch to email.message.Message for this functionality.
+# This is the suggested replacement for the cgi.py module now that cgi has
+# been removed from recent versions of Python.  We need to verify that
+# the email module is consistent with our existing behavior (and all relevant
+# RFCs for multipart/form-data) before making this change.
  
  
  def _parseparam(s: str) -> Generator[str, None, None]:
-    while s[:1] == ";":
-        s = s[1:]
-        end = s.find(";")
-        while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2:
-            end = s.find(";", end + 1)
+    start = 0
+    while s.find(";", start) == start:
+        start += 1
+        end = s.find(";", start)
+        ind, diff = start, 0
+        while end > 0:
+            diff += s.count('"', ind, end) - s.count('\\"', ind, end)
+            if diff % 2 == 0:
+                break
+            end, ind = ind, s.find(";", end + 1)
          if end < 0:
              end = len(s)
-        f = s[:end]
+        f = s[start:end]
          yield f.strip()
-        s = s[end:]
+        start = end
  
  
  def _parse_header(line: str) -> Tuple[str, Dict[str, str]]:
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py

index 7a46b911569853adc3fdf6917107e0a37858ee49..68c156b6072c539c1c3ea7b15f63f7c562cdd4b0 100644 (file)
--- a/tornado/test/httputil_test.py
+++ b/tornado/test/httputil_test.py
@@ -279,6 +279,29 @@ Foo
          self.assertEqual(file["filename"], "ab.txt")
          self.assertEqual(file["body"], b"Foo")
  
+    def test_disposition_param_linear_performance(self):
+        # This is a regression test for performance of parsing parameters
+        # to the content-disposition header, specifically for semicolons within
+        # quoted strings.
+        def f(n):
+            start = time.time()
+            message = (
+                b"--1234\r\nContent-Disposition: form-data; "
+                + b'x="'
+                + b";" * n
+                + b'"; '
+                + b'name="files"; filename="a.txt"\r\n\r\nFoo\r\n--1234--\r\n'
+            )
+            args: dict[str, list[bytes]] = {}
+            files: dict[str, list[HTTPFile]] = {}
+            parse_multipart_form_data(b"1234", message, args, files)
+            return time.time() - start
+
+        d1 = f(1_000)
+        d2 = f(10_000)
+        if d2 / d1 > 20:
+            self.fail(f"Disposition param parsing is not linear: {d1=} vs {d2=}")
+
  
  class HTTPHeadersTest(unittest.TestCase):
      def test_multi_line(self):
author	Ben Darnell <ben@bendarnell.com>
	Wed, 10 Dec 2025 15:55:02 +0000 (10:55 -0500)
committer	Ben Darnell <ben@bendarnell.com>
	Wed, 10 Dec 2025 15:55:02 +0000 (10:55 -0500)
tornado/httputil.py		patch \| blob \| blame \| history
tornado/test/httputil_test.py		patch \| blob \| blame \| history