gh-88500: Reduce memory use of `urllib.parse.unquote` (#96763)

author Gregory P. Smith <greg@krypto.org>

Sun, 11 Dec 2022 00:17:39 +0000 (16:17 -0800)

committer GitHub <noreply@github.com>

Sun, 11 Dec 2022 00:17:39 +0000 (16:17 -0800)
author Gregory P. Smith <greg@krypto.org>
Sun, 11 Dec 2022 00:17:39 +0000 (16:17 -0800)
committer GitHub <noreply@github.com>
Sun, 11 Dec 2022 00:17:39 +0000 (16:17 -0800)
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py

index f067560ca6caa1eadc4d2b9884d58c0b114f6a55..2df74f5e6f99b2c1dde1fd0ca654dfd26af49320 100644 (file)
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -1104,6 +1104,8 @@ class UnquotingTests(unittest.TestCase):
          self.assertEqual(result.count('%'), 1,
                           "using unquote(): not all characters escaped: "
                           "%s" % result)
+
+    def test_unquote_rejects_none_and_tuple(self):
          self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, None)
          self.assertRaises((TypeError, AttributeError), urllib.parse.unquote, ())
  
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py

index 4f6867accbc0eb14513fb4b2341ae143f350e223..5f95c5ff7f9c1c40ac62d4779a040c09ac985f0f 100644 (file)
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -600,6 +600,9 @@ _hextobyte = None
  
  def unquote_to_bytes(string):
      """unquote_to_bytes('abc%20def') -> b'abc def'."""
+    return bytes(_unquote_impl(string))
+
+def _unquote_impl(string: bytes | bytearray | str) -> bytes | bytearray:
      # Note: strings are encoded as UTF-8. This is only an issue if it contains
      # unescaped non-ASCII characters, which URIs should not.
      if not string:
@@ -611,8 +614,8 @@ def unquote_to_bytes(string):
      bits = string.split(b'%')
      if len(bits) == 1:
          return string
-    res = [bits[0]]
-    append = res.append
+    res = bytearray(bits[0])
+    append = res.extend
      # Delay the initialization of the table to not waste memory
      # if the function is never called
      global _hextobyte
@@ -626,10 +629,20 @@ def unquote_to_bytes(string):
          except KeyError:
              append(b'%')
              append(item)
-    return b''.join(res)
+    return res
  
  _asciire = re.compile('([\x00-\x7f]+)')
  
+def _generate_unquoted_parts(string, encoding, errors):
+    previous_match_end = 0
+    for ascii_match in _asciire.finditer(string):
+        start, end = ascii_match.span()
+        yield string[previous_match_end:start]  # Non-ASCII
+        # The ascii_match[1] group == string[start:end].
+        yield _unquote_impl(ascii_match[1]).decode(encoding, errors)
+        previous_match_end = end
+    yield string[previous_match_end:]  # Non-ASCII tail
+
  def unquote(string, encoding='utf-8', errors='replace'):
      """Replace %xx escapes by their single-character equivalent. The optional
      encoding and errors parameters specify how to decode percent-encoded
@@ -641,21 +654,16 @@ def unquote(string, encoding='utf-8', errors='replace'):
      unquote('abc%20def') -> 'abc def'.
      """
      if isinstance(string, bytes):
-        return unquote_to_bytes(string).decode(encoding, errors)
+        return _unquote_impl(string).decode(encoding, errors)
      if '%' not in string:
+        # Is it a string-like object?
          string.split
          return string
      if encoding is None:
          encoding = 'utf-8'
      if errors is None:
          errors = 'replace'
-    bits = _asciire.split(string)
-    res = [bits[0]]
-    append = res.append
-    for i in range(1, len(bits), 2):
-        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
-        append(bits[i + 1])
-    return ''.join(res)
+    return ''.join(_generate_unquoted_parts(string, encoding, errors))
  
  
  def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
diff --git a/Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst b/Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst

new file mode 100644 (file)

index 0000000..ad01f5e
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst
@@ -0,0 +1,2 @@
+Reduced the memory usage of :func:`urllib.parse.unquote` and
+:func:`urllib.parse.unquote_to_bytes` on large values.
author	Gregory P. Smith <greg@krypto.org>
	Sun, 11 Dec 2022 00:17:39 +0000 (16:17 -0800)
committer	GitHub <noreply@github.com>
	Sun, 11 Dec 2022 00:17:39 +0000 (16:17 -0800)
Lib/test/test_urllib.py		patch \| blob \| blame \| history
Lib/urllib/parse.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2022-09-16-08-21-46.gh-issue-88500.jQ0pCc.rst	[new file with mode: 0644]	patch \| blob