httputil: Fix quadratic performance of cookie parsing

author Ben Darnell <ben@bendarnell.com>

Thu, 21 Nov 2024 19:48:05 +0000 (14:48 -0500)

committer Ben Darnell <ben@bendarnell.com>

Fri, 22 Nov 2024 13:44:17 +0000 (08:44 -0500)
author Ben Darnell <ben@bendarnell.com>
Thu, 21 Nov 2024 19:48:05 +0000 (14:48 -0500)
committer Ben Darnell <ben@bendarnell.com>
Fri, 22 Nov 2024 13:44:17 +0000 (08:44 -0500)
diff --git a/tornado/httputil.py b/tornado/httputil.py

index 4ec7b68fedbf9469c8790432572cadfa9af9c330..899d76aeb5f9e1e85585caf2df9b2b80bdb48ae9 100644 (file)
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -1065,15 +1065,20 @@ def qs_to_qsl(qs: Dict[str, List[AnyStr]]) -> Iterable[Tuple[str, AnyStr]]:
              yield (k, v)
  
  
-_OctalPatt = re.compile(r"\\[0-3][0-7][0-7]")
-_QuotePatt = re.compile(r"[\\].")
-_nulljoin = "".join
+_unquote_sub = re.compile(r"\\(?:([0-3][0-7][0-7])|(.))").sub
+
+
+def _unquote_replace(m: re.Match) -> str:
+    if m[1]:
+        return chr(int(m[1], 8))
+    else:
+        return m[2]
  
  
  def _unquote_cookie(s: str) -> str:
      """Handle double quotes and escaping in cookie values.
  
-    This method is copied verbatim from the Python 3.5 standard
+    This method is copied verbatim from the Python 3.13 standard
      library (http.cookies._unquote) so we don't have to depend on
      non-public interfaces.
      """
@@ -1094,30 +1099,7 @@ def _unquote_cookie(s: str) -> str:
      #    \012 --> \n
      #    \"   --> "
      #
-    i = 0
-    n = len(s)
-    res = []
-    while 0 <= i < n:
-        o_match = _OctalPatt.search(s, i)
-        q_match = _QuotePatt.search(s, i)
-        if not o_match and not q_match:  # Neither matched
-            res.append(s[i:])
-            break
-        # else:
-        j = k = -1
-        if o_match:
-            j = o_match.start(0)
-        if q_match:
-            k = q_match.start(0)
-        if q_match and (not o_match or k < j):  # QuotePatt matched
-            res.append(s[i:k])
-            res.append(s[k + 1])
-            i = k + 2
-        else:  # OctalPatt matched
-            res.append(s[i:j])
-            res.append(chr(int(s[j + 1 : j + 4], 8)))
-            i = j + 4
-    return _nulljoin(res)
+    return _unquote_sub(_unquote_replace, s)
  
  
  def parse_cookie(cookie: str) -> Dict[str, str]:
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py

index 7d6759e488bdf5b5ee7a1ecb42de83578a4a4e32..46a28bc69cd6953c564c67709e3f34a82d75a885 100644 (file)
--- a/tornado/test/httputil_test.py
+++ b/tornado/test/httputil_test.py
@@ -560,3 +560,49 @@ class ParseCookieTest(unittest.TestCase):
          self.assertEqual(
              parse_cookie("  =  b  ;  ;  =  ;   c  =  ;  "), {"": "b", "c": ""}
          )
+
+    def test_unquote(self):
+        # Copied from
+        # https://github.com/python/cpython/blob/dc7a2b6522ec7af41282bc34f405bee9b306d611/Lib/test/test_http_cookies.py#L62
+        cases = [
+            (r'a="b=\""', 'b="'),
+            (r'a="b=\\"', "b=\\"),
+            (r'a="b=\="', "b=="),
+            (r'a="b=\n"', "b=n"),
+            (r'a="b=\042"', 'b="'),
+            (r'a="b=\134"', "b=\\"),
+            (r'a="b=\377"', "b=\xff"),
+            (r'a="b=\400"', "b=400"),
+            (r'a="b=\42"', "b=42"),
+            (r'a="b=\\042"', "b=\\042"),
+            (r'a="b=\\134"', "b=\\134"),
+            (r'a="b=\\\""', 'b=\\"'),
+            (r'a="b=\\\042"', 'b=\\"'),
+            (r'a="b=\134\""', 'b=\\"'),
+            (r'a="b=\134\042"', 'b=\\"'),
+        ]
+        for encoded, decoded in cases:
+            with self.subTest(encoded):
+                c = parse_cookie(encoded)
+                self.assertEqual(c["a"], decoded)
+
+    def test_unquote_large(self):
+        # Adapted from
+        # https://github.com/python/cpython/blob/dc7a2b6522ec7af41282bc34f405bee9b306d611/Lib/test/test_http_cookies.py#L87
+        # Modified from that test because we handle semicolons differently from the stdlib.
+        #
+        # This is a performance regression test: prior to improvements in Tornado 6.4.2, this test
+        # would take over a minute with n= 100k. Now it runs in tens of milliseconds.
+        n = 100000
+        for encoded in r"\\", r"\134":
+            with self.subTest(encoded):
+                start = time.time()
+                data = 'a="b=' + encoded * n + '"'
+                value = parse_cookie(data)["a"]
+                end = time.time()
+                self.assertEqual(value[:3], "b=\\")
+                self.assertEqual(value[-3:], "\\\\\\")
+                self.assertEqual(len(value), n + 2)
+
+                # Very loose performance check to avoid false positives
+                self.assertLess(end - start, 1, "Test took too long")
author	Ben Darnell <ben@bendarnell.com>
	Thu, 21 Nov 2024 19:48:05 +0000 (14:48 -0500)
committer	Ben Darnell <ben@bendarnell.com>
	Fri, 22 Nov 2024 13:44:17 +0000 (08:44 -0500)
tornado/httputil.py		patch \| blob \| blame \| history
tornado/test/httputil_test.py		patch \| blob \| blame \| history