]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.13] GH-128131: Completely support random read access of uncompressed unencrypted...
authorMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Mon, 20 Jan 2025 18:28:52 +0000 (19:28 +0100)
committerGitHub <noreply@github.com>
Mon, 20 Jan 2025 18:28:52 +0000 (18:28 +0000)
GH-128131: Completely support random read access of uncompressed unencrypted files in ZipFile (GH-128143)
(cherry picked from commit dda02eb7be62bf0af850a7521c77c90ea997df6c)

Co-authored-by: 5ec1cff <56485584+5ec1cff@users.noreply.github.com>
Co-authored-by: blurb-it[bot] <43283697+blurb-it[bot]@users.noreply.github.com>
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
Lib/test/test_zipfile/test_core.py
Lib/zipfile/__init__.py
Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst [new file with mode: 0644]

index 4ff9f9c34237c63b0f4abf20d05b3c83eb4418ce..4b56f6a380f2190b6a201b7fc1a4da4c2bb13941 100644 (file)
@@ -1,3 +1,4 @@
+import _pyio
 import array
 import contextlib
 import importlib.util
@@ -3454,5 +3455,87 @@ class StripExtraTests(unittest.TestCase):
             b"zzz", zipfile._Extra.strip(b"zzz", (self.ZIP64_EXTRA,)))
 
 
+class StatIO(_pyio.BytesIO):
+    """Buffer which remembers the number of bytes that were read."""
+
+    def __init__(self):
+        super().__init__()
+        self.bytes_read = 0
+
+    def read(self, size=-1):
+        bs = super().read(size)
+        self.bytes_read += len(bs)
+        return bs
+
+
+class StoredZipExtFileRandomReadTest(unittest.TestCase):
+    """Tests whether an uncompressed, unencrypted zip entry can be randomly
+    seek and read without reading redundant bytes."""
+    def test_stored_seek_and_read(self):
+
+        sio = StatIO()
+        # 20000 bytes
+        txt = b'0123456789' * 2000
+
+        # The seek length must be greater than ZipExtFile.MIN_READ_SIZE
+        # as `ZipExtFile._read2()` reads in blocks of this size and we
+        # need to seek out of the buffered data
+        read_buffer_size = zipfile.ZipExtFile.MIN_READ_SIZE
+        self.assertGreaterEqual(10002, read_buffer_size)  # for forward seek test
+        self.assertGreaterEqual(5003, read_buffer_size)  # for backward seek test
+        # The read length must be less than MIN_READ_SIZE, since we assume that
+        # only 1 block is read in the test.
+        read_length = 100
+        self.assertGreaterEqual(read_buffer_size, read_length)  # for read() calls
+
+        with zipfile.ZipFile(sio, "w", compression=zipfile.ZIP_STORED) as zipf:
+            zipf.writestr("foo.txt", txt)
+
+        # check random seek and read on a file
+        with zipfile.ZipFile(sio, "r") as zipf:
+            with zipf.open("foo.txt", "r") as fp:
+                # Test this optimized read hasn't rewound and read from the
+                # start of the file (as in the case of the unoptimized path)
+
+                # forward seek
+                old_count = sio.bytes_read
+                forward_seek_len = 10002
+                current_pos = 0
+                fp.seek(forward_seek_len, os.SEEK_CUR)
+                current_pos += forward_seek_len
+                self.assertEqual(fp.tell(), current_pos)
+                self.assertEqual(fp._left, fp._compress_left)
+                arr = fp.read(read_length)
+                current_pos += read_length
+                self.assertEqual(fp.tell(), current_pos)
+                self.assertEqual(arr, txt[current_pos - read_length:current_pos])
+                self.assertEqual(fp._left, fp._compress_left)
+                read_count = sio.bytes_read - old_count
+                self.assertLessEqual(read_count, read_buffer_size)
+
+                # backward seek
+                old_count = sio.bytes_read
+                backward_seek_len = 5003
+                fp.seek(-backward_seek_len, os.SEEK_CUR)
+                current_pos -= backward_seek_len
+                self.assertEqual(fp.tell(), current_pos)
+                self.assertEqual(fp._left, fp._compress_left)
+                arr = fp.read(read_length)
+                current_pos += read_length
+                self.assertEqual(fp.tell(), current_pos)
+                self.assertEqual(arr, txt[current_pos - read_length:current_pos])
+                self.assertEqual(fp._left, fp._compress_left)
+                read_count = sio.bytes_read - old_count
+                self.assertLessEqual(read_count, read_buffer_size)
+
+                # eof flags test
+                fp.seek(0, os.SEEK_END)
+                fp.seek(12345, os.SEEK_SET)
+                current_pos = 12345
+                arr = fp.read(read_length)
+                current_pos += read_length
+                self.assertEqual(arr, txt[current_pos - read_length:current_pos])
+
+
 if __name__ == "__main__":
     unittest.main()
index 8b636094fad3ca4fe9c5020aae88faa32f87b987..82e307f78e8e3d85d162abfbe4c37a2d6f9b10cd 100644 (file)
@@ -1163,13 +1163,15 @@ class ZipExtFile(io.BufferedIOBase):
             self._offset = buff_offset
             read_offset = 0
         # Fast seek uncompressed unencrypted file
-        elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset > 0:
+        elif self._compress_type == ZIP_STORED and self._decrypter is None and read_offset != 0:
             # disable CRC checking after first seeking - it would be invalid
             self._expected_crc = None
             # seek actual file taking already buffered data into account
             read_offset -= len(self._readbuffer) - self._offset
             self._fileobj.seek(read_offset, os.SEEK_CUR)
             self._left -= read_offset
+            self._compress_left -= read_offset
+            self._eof = self._left <= 0
             read_offset = 0
             # flush read buffer
             self._readbuffer = b''
diff --git a/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst b/Misc/NEWS.d/next/Library/2024-12-21-03-20-12.gh-issue-128131.QpPmNt.rst
new file mode 100644 (file)
index 0000000..f4c4ebc
--- /dev/null
@@ -0,0 +1,2 @@
+Completely support random access of uncompressed unencrypted read-only
+zip files obtained by :meth:`ZipFile.open <zipfile.ZipFile.open>`.