[3.12] gh-113028: Correctly memoize str in pickle when escapes added (GH-113436)...

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Sun, 24 Dec 2023 10:01:08 +0000 (11:01 +0100)

committer GitHub <noreply@github.com>

Sun, 24 Dec 2023 10:01:08 +0000 (10:01 +0000)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Sun, 24 Dec 2023 10:01:08 +0000 (11:01 +0100)
committer GitHub <noreply@github.com>
Sun, 24 Dec 2023 10:01:08 +0000 (10:01 +0000)
diff --git a/Lib/pickle.py b/Lib/pickle.py

index fe86f80f51d3b99b195f9971726250d4b472898f..f54a607cf298b4611fc84d2eaeeffabf73a0a19a 100644 (file)
--- a/Lib/pickle.py
+++ b/Lib/pickle.py
@@ -855,13 +855,13 @@ class _Pickler:
              else:
                  self.write(BINUNICODE + pack("<I", n) + encoded)
          else:
-            obj = obj.replace("\\", "\\u005c")
-            obj = obj.replace("\0", "\\u0000")
-            obj = obj.replace("\n", "\\u000a")
-            obj = obj.replace("\r", "\\u000d")
-            obj = obj.replace("\x1a", "\\u001a")  # EOF on DOS
-            self.write(UNICODE + obj.encode('raw-unicode-escape') +
-                       b'\n')
+            # Escape what raw-unicode-escape doesn't, but memoize the original.
+            tmp = obj.replace("\\", "\\u005c")
+            tmp = tmp.replace("\0", "\\u0000")
+            tmp = tmp.replace("\n", "\\u000a")
+            tmp = tmp.replace("\r", "\\u000d")
+            tmp = tmp.replace("\x1a", "\\u001a")  # EOF on DOS
+            self.write(UNICODE + tmp.encode('raw-unicode-escape') + b'\n')
          self.memoize(obj)
      dispatch[str] = save_str
  
diff --git a/Lib/test/pickletester.py b/Lib/test/pickletester.py

index 5d5df1658c4ee2ae2c3406dc1432ff0e76d561bf..5b9bceceba1ec8b685f747ff202d105b95c56d71 100644 (file)
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -1825,6 +1825,14 @@ class AbstractPickleTests:
              t2 = self.loads(p)
              self.assert_is_copy(t, t2)
  
+    def test_unicode_memoization(self):
+        # Repeated str is re-used (even when escapes added).
+        for proto in protocols:
+            for s in '', 'xyz', 'xyz\n', 'x\\yz', 'x\xa1yz\r':
+                p = self.dumps((s, s), proto)
+                s1, s2 = self.loads(p)
+                self.assertIs(s1, s2)
+
      def test_bytes(self):
          for proto in protocols:
              for s in b'', b'xyz', b'xyz'*100:
diff --git a/Misc/NEWS.d/next/Library/2023-12-23-16-51-17.gh-issue-113028.3Jmdoj.rst b/Misc/NEWS.d/next/Library/2023-12-23-16-51-17.gh-issue-113028.3Jmdoj.rst

new file mode 100644 (file)

index 0000000..5f66d6a
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-12-23-16-51-17.gh-issue-113028.3Jmdoj.rst
@@ -0,0 +1,6 @@
+When a second reference to a string appears in the input to :mod:`pickle`,
+and the Python implementation is in use,
+we are guaranteed that a single copy gets pickled
+and a single object is shared when reloaded.
+Previously, in protocol 0, a string containing certain characters
+(e.g. newline) was pickled twice and reloaded as duplicate objects.
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Sun, 24 Dec 2023 10:01:08 +0000 (11:01 +0100)
committer	GitHub <noreply@github.com>
	Sun, 24 Dec 2023 10:01:08 +0000 (10:01 +0000)
Lib/pickle.py		patch \| blob \| blame \| history
Lib/test/pickletester.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2023-12-23-16-51-17.gh-issue-113028.3Jmdoj.rst	[new file with mode: 0644]	patch \| blob