From 03198b55f580c7c3e88bfac0a0c459357f9ac177 Mon Sep 17 00:00:00 2001 From: Ben Darnell Date: Wed, 10 Dec 2025 10:55:02 -0500 Subject: [PATCH] httputil: Fix quadratic behavior in _parseparam Prior to this change, _parseparam had O(n^2) behavior when parsing certain inputs, which could be a DoS vector. This change adapts logic from the equivalent function in the python standard library in https://github.com/python/cpython/pull/136072/files --- tornado/httputil.py | 29 ++++++++++++++++++++++------- tornado/test/httputil_test.py | 23 +++++++++++++++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/tornado/httputil.py b/tornado/httputil.py index a418b9fc..8ecc6dea 100644 --- a/tornado/httputil.py +++ b/tornado/httputil.py @@ -1082,19 +1082,34 @@ def parse_response_start_line(line: str) -> ResponseStartLine: # It has also been modified to support valueless parameters as seen in # websocket extension negotiations, and to support non-ascii values in # RFC 2231/5987 format. +# +# _parseparam has been further modified with the logic from +# https://github.com/python/cpython/pull/136072/files +# to avoid quadratic behavior when parsing semicolons in quoted strings. +# +# TODO: See if we can switch to email.message.Message for this functionality. +# This is the suggested replacement for the cgi.py module now that cgi has +# been removed from recent versions of Python. We need to verify that +# the email module is consistent with our existing behavior (and all relevant +# RFCs for multipart/form-data) before making this change. def _parseparam(s: str) -> Generator[str, None, None]: - while s[:1] == ";": - s = s[1:] - end = s.find(";") - while end > 0 and (s.count('"', 0, end) - s.count('\\"', 0, end)) % 2: - end = s.find(";", end + 1) + start = 0 + while s.find(";", start) == start: + start += 1 + end = s.find(";", start) + ind, diff = start, 0 + while end > 0: + diff += s.count('"', ind, end) - s.count('\\"', ind, end) + if diff % 2 == 0: + break + end, ind = ind, s.find(";", end + 1) if end < 0: end = len(s) - f = s[:end] + f = s[start:end] yield f.strip() - s = s[end:] + start = end def _parse_header(line: str) -> Tuple[str, Dict[str, str]]: diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py index 7a46b911..68c156b6 100644 --- a/tornado/test/httputil_test.py +++ b/tornado/test/httputil_test.py @@ -279,6 +279,29 @@ Foo self.assertEqual(file["filename"], "ab.txt") self.assertEqual(file["body"], b"Foo") + def test_disposition_param_linear_performance(self): + # This is a regression test for performance of parsing parameters + # to the content-disposition header, specifically for semicolons within + # quoted strings. + def f(n): + start = time.time() + message = ( + b"--1234\r\nContent-Disposition: form-data; " + + b'x="' + + b";" * n + + b'"; ' + + b'name="files"; filename="a.txt"\r\n\r\nFoo\r\n--1234--\r\n' + ) + args: dict[str, list[bytes]] = {} + files: dict[str, list[HTTPFile]] = {} + parse_multipart_form_data(b"1234", message, args, files) + return time.time() - start + + d1 = f(1_000) + d2 = f(10_000) + if d2 / d1 > 20: + self.fail(f"Disposition param parsing is not linear: {d1=} vs {d2=}") + class HTTPHeadersTest(unittest.TestCase): def test_multi_line(self): -- 2.47.3