]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.13] gh-120380: fix Python implementation of `pickle.Pickler` for `bytes` and ...
authorMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Wed, 26 Jun 2024 10:11:28 +0000 (12:11 +0200)
committerGitHub <noreply@github.com>
Wed, 26 Jun 2024 10:11:28 +0000 (12:11 +0200)
gh-120380: fix Python implementation of `pickle.Pickler` for `bytes` and `bytearray` objects in protocol version 5. (GH-120422)
(cherry picked from commit 7595e6743ac78ac0dd19418176f66d251668fafc)

Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
Lib/pickle.py
Lib/test/pickletester.py
Misc/NEWS.d/next/Core and Builtins/2024-06-12-18-23-15.gh-issue-120380.edtqjq.rst [new file with mode: 0644]

index 33c97c8c5efb28e5a651bf233203c31463618191..d719ceb7a0b8e883250b1cd186ec8a01672a3522 100644 (file)
@@ -782,14 +782,10 @@ class _Pickler:
             self.write(FLOAT + repr(obj).encode("ascii") + b'\n')
     dispatch[float] = save_float
 
-    def save_bytes(self, obj):
-        if self.proto < 3:
-            if not obj: # bytes object is empty
-                self.save_reduce(bytes, (), obj=obj)
-            else:
-                self.save_reduce(codecs.encode,
-                                 (str(obj, 'latin1'), 'latin1'), obj=obj)
-            return
+    def _save_bytes_no_memo(self, obj):
+        # helper for writing bytes objects for protocol >= 3
+        # without memoizing them
+        assert self.proto >= 3
         n = len(obj)
         if n <= 0xff:
             self.write(SHORT_BINBYTES + pack("<B", n) + obj)
@@ -799,9 +795,29 @@ class _Pickler:
             self._write_large_bytes(BINBYTES + pack("<I", n), obj)
         else:
             self.write(BINBYTES + pack("<I", n) + obj)
+
+    def save_bytes(self, obj):
+        if self.proto < 3:
+            if not obj: # bytes object is empty
+                self.save_reduce(bytes, (), obj=obj)
+            else:
+                self.save_reduce(codecs.encode,
+                                 (str(obj, 'latin1'), 'latin1'), obj=obj)
+            return
+        self._save_bytes_no_memo(obj)
         self.memoize(obj)
     dispatch[bytes] = save_bytes
 
+    def _save_bytearray_no_memo(self, obj):
+        # helper for writing bytearray objects for protocol >= 5
+        # without memoizing them
+        assert self.proto >= 5
+        n = len(obj)
+        if n >= self.framer._FRAME_SIZE_TARGET:
+            self._write_large_bytes(BYTEARRAY8 + pack("<Q", n), obj)
+        else:
+            self.write(BYTEARRAY8 + pack("<Q", n) + obj)
+
     def save_bytearray(self, obj):
         if self.proto < 5:
             if not obj:  # bytearray is empty
@@ -809,11 +825,7 @@ class _Pickler:
             else:
                 self.save_reduce(bytearray, (bytes(obj),), obj=obj)
             return
-        n = len(obj)
-        if n >= self.framer._FRAME_SIZE_TARGET:
-            self._write_large_bytes(BYTEARRAY8 + pack("<Q", n), obj)
-        else:
-            self.write(BYTEARRAY8 + pack("<Q", n) + obj)
+        self._save_bytearray_no_memo(obj)
         self.memoize(obj)
     dispatch[bytearray] = save_bytearray
 
@@ -832,10 +844,18 @@ class _Pickler:
                 if in_band:
                     # Write data in-band
                     # XXX The C implementation avoids a copy here
+                    buf = m.tobytes()
+                    in_memo = id(buf) in self.memo
                     if m.readonly:
-                        self.save_bytes(m.tobytes())
+                        if in_memo:
+                            self._save_bytes_no_memo(buf)
+                        else:
+                            self.save_bytes(buf)
                     else:
-                        self.save_bytearray(m.tobytes())
+                        if in_memo:
+                            self._save_bytearray_no_memo(buf)
+                        else:
+                            self.save_bytearray(buf)
                 else:
                     # Write data out-of-band
                     self.write(NEXT_BUFFER)
index 93e7dbbd10393423717cc7cc2fe77133c6521972..9922591ce7114af53a995d06e350320b9ebb8fcb 100644 (file)
@@ -1845,6 +1845,25 @@ class AbstractPickleTests:
                 p = self.dumps(s, proto)
                 self.assert_is_copy(s, self.loads(p))
 
+    def test_bytes_memoization(self):
+        for proto in protocols:
+            for array_type in [bytes, ZeroCopyBytes]:
+                for s in b'', b'xyz', b'xyz'*100:
+                    with self.subTest(proto=proto, array_type=array_type, s=s, independent=False):
+                        b = array_type(s)
+                        p = self.dumps((b, b), proto)
+                        x, y = self.loads(p)
+                        self.assertIs(x, y)
+                        self.assert_is_copy((b, b), (x, y))
+
+                    with self.subTest(proto=proto, array_type=array_type, s=s, independent=True):
+                        b1, b2 = array_type(s), array_type(s)
+                        p = self.dumps((b1, b2), proto)
+                        # Note that (b1, b2) = self.loads(p) might have identical
+                        # components, i.e., b1 is b2, but this is not always the
+                        # case if the content is large (equality still holds).
+                        self.assert_is_copy((b1, b2), self.loads(p))
+
     def test_bytearray(self):
         for proto in protocols:
             for s in b'', b'xyz', b'xyz'*100:
@@ -1864,13 +1883,31 @@ class AbstractPickleTests:
                     self.assertNotIn(b'bytearray', p)
                     self.assertTrue(opcode_in_pickle(pickle.BYTEARRAY8, p))
 
-    def test_bytearray_memoization_bug(self):
+    def test_bytearray_memoization(self):
         for proto in protocols:
-            for s in b'', b'xyz', b'xyz'*100:
-                b = bytearray(s)
-                p = self.dumps((b, b), proto)
-                b1, b2 = self.loads(p)
-                self.assertIs(b1, b2)
+            for array_type in [bytearray, ZeroCopyBytearray]:
+                for s in b'', b'xyz', b'xyz'*100:
+                    with self.subTest(proto=proto, array_type=array_type, s=s, independent=False):
+                        b = array_type(s)
+                        p = self.dumps((b, b), proto)
+                        b1, b2 = self.loads(p)
+                        self.assertIs(b1, b2)
+
+                    with self.subTest(proto=proto, array_type=array_type, s=s, independent=True):
+                        b1a, b2a = array_type(s), array_type(s)
+                        # Unlike bytes, equal but independent bytearray objects are
+                        # never identical.
+                        self.assertIsNot(b1a, b2a)
+
+                        p = self.dumps((b1a, b2a), proto)
+                        b1b, b2b = self.loads(p)
+                        self.assertIsNot(b1b, b2b)
+
+                        self.assertIsNot(b1a, b1b)
+                        self.assert_is_copy(b1a, b1b)
+
+                        self.assertIsNot(b2a, b2b)
+                        self.assert_is_copy(b2a, b2b)
 
     def test_ints(self):
         for proto in protocols:
diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-06-12-18-23-15.gh-issue-120380.edtqjq.rst b/Misc/NEWS.d/next/Core and Builtins/2024-06-12-18-23-15.gh-issue-120380.edtqjq.rst
new file mode 100644 (file)
index 0000000..c682a0b
--- /dev/null
@@ -0,0 +1,3 @@
+Fix Python implementation of :class:`pickle.Pickler` for :class:`bytes` and
+:class:`bytearray` objects when using protocol version 5. Patch by Bénédikt
+Tran.