gh-89739: gh-77140: Support zip64 in zipimport (GH-94146)

author Tim Hatch <tim@timhatch.com>

Thu, 28 Mar 2024 06:54:51 +0000 (23:54 -0700)

committer GitHub <noreply@github.com>

Thu, 28 Mar 2024 06:54:51 +0000 (06:54 +0000)
author Tim Hatch <tim@timhatch.com>
Thu, 28 Mar 2024 06:54:51 +0000 (23:54 -0700)
committer GitHub <noreply@github.com>
Thu, 28 Mar 2024 06:54:51 +0000 (06:54 +0000)
diff --git a/Doc/library/zipimport.rst b/Doc/library/zipimport.rst

index 47c81f0e63603dcbc4bd03ef3c8becda7b28516d..7a8c837307e60af019f9eb94ac8a234972dfd644 100644 (file)
--- a/Doc/library/zipimport.rst
+++ b/Doc/library/zipimport.rst
@@ -30,6 +30,9 @@ Any files may be present in the ZIP archive, but importers are only invoked for
  corresponding :file:`.pyc` file, meaning that if a ZIP archive
  doesn't contain :file:`.pyc` files, importing may be rather slow.
  
+.. versionchanged:: 3.13
+   ZIP64 is supported.
+
  .. versionchanged:: 3.8
     Previously, ZIP archives with an archive comment were not supported.
  
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst

index e6234bf974ea47328a108aeecde9ace704594a23..5a5c506d83d735bf9b98bc13189da13bce9d9780 100644 (file)
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -700,6 +700,12 @@ xml.etree.ElementTree
    :func:`~xml.etree.ElementTree.iterparse` for explicit cleaning up.
    (Contributed by Serhiy Storchaka in :gh:`69893`.)
  
+zipimport
+---------
+
+* Gains support for ZIP64 format files.  Everybody loves huge code, right?
+  (Contributed by Tim Hatch in :gh:`94146`.)
+
  
  Optimizations
  =============
diff --git a/Lib/importlib/_bootstrap_external.py b/Lib/importlib/_bootstrap_external.py

index 4749a627c50c421ccedd15b0587816962b05a2ad..0a11dc9efc252c60d5f4b71a03acb593962d9e35 100644 (file)
--- a/Lib/importlib/_bootstrap_external.py
+++ b/Lib/importlib/_bootstrap_external.py
@@ -81,6 +81,11 @@ def _pack_uint32(x):
      return (int(x) & 0xFFFFFFFF).to_bytes(4, 'little')
  
  
+def _unpack_uint64(data):
+    """Convert 8 bytes in little-endian to an integer."""
+    assert len(data) == 8
+    return int.from_bytes(data, 'little')
+
  def _unpack_uint32(data):
      """Convert 4 bytes in little-endian to an integer."""
      assert len(data) == 4
diff --git a/Lib/test/test_zipimport.py b/Lib/test/test_zipimport.py

index c12798d221e9b763bf9227740c6c08a5f232bdae..ae49700294330ce72f3fc2caa41b8a8ede087fbd 100644 (file)
--- a/Lib/test/test_zipimport.py
+++ b/Lib/test/test_zipimport.py
@@ -128,6 +128,10 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
                  f.write(stuff)
                  f.write(data)
  
+    def getZip64Files(self):
+        # This is the simplest way to make zipfile generate the zip64 EOCD block
+        return {f"f{n}.py": (NOW, test_src) for n in range(65537)}
+
      def doTest(self, expected_ext, files, *modules, **kw):
          self.makeZip(files, **kw)
  
@@ -798,6 +802,14 @@ class UncompressedZipImportTestCase(ImportHooksBaseTestCase):
          files = {TESTMOD + ".py": (NOW, test_src)}
          self.doTest(".py", files, TESTMOD, comment=b"c" * ((1 << 16) - 1))
  
+    def testZip64(self):
+        files = self.getZip64Files()
+        self.doTest(".py", files, "f6")
+
+    def testZip64CruftAndComment(self):
+        files = self.getZip64Files()
+        self.doTest(".py", files, "f65536", comment=b"c" * ((1 << 16) - 1))
+
  
  @support.requires_zlib()
  class CompressedZipImportTestCase(UncompressedZipImportTestCase):
diff --git a/Lib/zipimport.py b/Lib/zipimport.py

index 823a82ee830465f48c3b2f15f1069a42bdc444ce..21d2dca46f569b1b06adc1eaa2950f2244a80e5c 100644 (file)
--- a/Lib/zipimport.py
+++ b/Lib/zipimport.py
@@ -15,7 +15,7 @@ to Zip archives.
  #from importlib import _bootstrap_external
  #from importlib import _bootstrap  # for _verbose_message
  import _frozen_importlib_external as _bootstrap_external
-from _frozen_importlib_external import _unpack_uint16, _unpack_uint32
+from _frozen_importlib_external import _unpack_uint16, _unpack_uint32, _unpack_uint64
  import _frozen_importlib as _bootstrap  # for _verbose_message
  import _imp  # for check_hash_based_pycs
  import _io  # for open
@@ -40,8 +40,14 @@ _zip_directory_cache = {}
  _module_type = type(sys)
  
  END_CENTRAL_DIR_SIZE = 22
-STRING_END_ARCHIVE = b'PK\x05\x06'
+END_CENTRAL_DIR_SIZE_64 = 56
+END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20
+STRING_END_ARCHIVE = b'PK\x05\x06'  # standard EOCD signature
+STRING_END_LOCATOR_64 = b'PK\x06\x07'  # Zip64 EOCD Locator signature
+STRING_END_ZIP_64 = b'PK\x06\x06'  # Zip64 EOCD signature
  MAX_COMMENT_LEN = (1 << 16) - 1
+MAX_UINT32 = 0xffffffff
+ZIP64_EXTRA_TAG = 0x1
  
  class zipimporter(_bootstrap_external._LoaderBasics):
      """zipimporter(archivepath) -> zipimporter object
@@ -356,49 +362,72 @@ def _read_directory(archive):
          # to not cause problems when some runs 'python3 /dev/fd/9 9<some_script'
          start_offset = fp.tell()
          try:
+            # Check if there's a comment.
              try:
-                fp.seek(-END_CENTRAL_DIR_SIZE, 2)
-                header_position = fp.tell()
-                buffer = fp.read(END_CENTRAL_DIR_SIZE)
+                fp.seek(0, 2)
+                file_size = fp.tell()
              except OSError:
-                raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
-            if len(buffer) != END_CENTRAL_DIR_SIZE:
-                raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
-            if buffer[:4] != STRING_END_ARCHIVE:
-                # Bad: End of Central Dir signature
-                # Check if there's a comment.
-                try:
-                    fp.seek(0, 2)
-                    file_size = fp.tell()
-                except OSError:
-                    raise ZipImportError(f"can't read Zip file: {archive!r}",
-                                         path=archive)
-                max_comment_start = max(file_size - MAX_COMMENT_LEN -
-                                        END_CENTRAL_DIR_SIZE, 0)
-                try:
-                    fp.seek(max_comment_start)
-                    data = fp.read()
-                except OSError:
-                    raise ZipImportError(f"can't read Zip file: {archive!r}",
-                                         path=archive)
-                pos = data.rfind(STRING_END_ARCHIVE)
-                if pos < 0:
-                    raise ZipImportError(f'not a Zip file: {archive!r}',
-                                         path=archive)
+                raise ZipImportError(f"can't read Zip file: {archive!r}",
+                                     path=archive)
+            max_comment_plus_dirs_size = (
+                MAX_COMMENT_LEN + END_CENTRAL_DIR_SIZE +
+                END_CENTRAL_DIR_SIZE_64 + END_CENTRAL_DIR_LOCATOR_SIZE_64)
+            max_comment_start = max(file_size - max_comment_plus_dirs_size, 0)
+            try:
+                fp.seek(max_comment_start)
+                data = fp.read(max_comment_plus_dirs_size)
+            except OSError:
+                raise ZipImportError(f"can't read Zip file: {archive!r}",
+                                     path=archive)
+            pos = data.rfind(STRING_END_ARCHIVE)
+            pos64 = data.rfind(STRING_END_ZIP_64)
+
+            if (pos64 >= 0 and pos64+END_CENTRAL_DIR_SIZE_64+END_CENTRAL_DIR_LOCATOR_SIZE_64==pos):
+                # Zip64 at "correct" offset from standard EOCD
+                buffer = data[pos64:pos64 + END_CENTRAL_DIR_SIZE_64]
+                if len(buffer) != END_CENTRAL_DIR_SIZE_64:
+                    raise ZipImportError(
+                        f"corrupt Zip64 file: Expected {END_CENTRAL_DIR_SIZE_64} byte "
+                        f"zip64 central directory, but read {len(buffer)} bytes.",
+                        path=archive)
+                header_position = file_size - len(data) + pos64
+
+                central_directory_size = _unpack_uint64(buffer[40:48])
+                central_directory_position = _unpack_uint64(buffer[48:56])
+                num_entries = _unpack_uint64(buffer[24:32])
+            elif pos >= 0:
                  buffer = data[pos:pos+END_CENTRAL_DIR_SIZE]
                  if len(buffer) != END_CENTRAL_DIR_SIZE:
                      raise ZipImportError(f"corrupt Zip file: {archive!r}",
                                           path=archive)
+
                  header_position = file_size - len(data) + pos
  
-            header_size = _unpack_uint32(buffer[12:16])
-            header_offset = _unpack_uint32(buffer[16:20])
-            if header_position < header_size:
+                # Buffer now contains a valid EOCD, and header_position gives the
+                # starting position of it.
+                central_directory_size = _unpack_uint32(buffer[12:16])
+                central_directory_position = _unpack_uint32(buffer[16:20])
+                num_entries = _unpack_uint16(buffer[8:10])
+
+                # N.b. if someday you want to prefer the standard (non-zip64) EOCD,
+                # you need to adjust position by 76 for arc to be 0.
+            else:
+                raise ZipImportError(f'not a Zip file: {archive!r}',
+                                     path=archive)
+
+            # Buffer now contains a valid EOCD, and header_position gives the
+            # starting position of it.
+            # XXX: These are cursory checks but are not as exact or strict as they
+            # could be.  Checking the arc-adjusted value is probably good too.
+            if header_position < central_directory_size:
                  raise ZipImportError(f'bad central directory size: {archive!r}', path=archive)
-            if header_position < header_offset:
+            if header_position < central_directory_position:
                  raise ZipImportError(f'bad central directory offset: {archive!r}', path=archive)
-            header_position -= header_size
-            arc_offset = header_position - header_offset
+            header_position -= central_directory_size
+            # On just-a-zipfile these values are the same and arc_offset is zero; if
+            # the file has some bytes prepended, `arc_offset` is the number of such
+            # bytes.  This is used for pex as well as self-extracting .exe.
+            arc_offset = header_position - central_directory_position
              if arc_offset < 0:
                  raise ZipImportError(f'bad central directory size or offset: {archive!r}', path=archive)
  
@@ -415,6 +444,11 @@ def _read_directory(archive):
                      raise EOFError('EOF read where not expected')
                  # Start of file header
                  if buffer[:4] != b'PK\x01\x02':
+                    if count != num_entries:
+                        raise ZipImportError(
+                            f"mismatched num_entries: {count} should be {num_entries} in {archive!r}",
+                            path=archive,
+                        )
                      break                                # Bad: Central Dir File Header
                  if len(buffer) != 46:
                      raise EOFError('EOF read where not expected')
@@ -430,9 +464,6 @@ def _read_directory(archive):
                  comment_size = _unpack_uint16(buffer[32:34])
                  file_offset = _unpack_uint32(buffer[42:46])
                  header_size = name_size + extra_size + comment_size
-                if file_offset > header_offset:
-                    raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
-                file_offset += arc_offset
  
                  try:
                      name = fp.read(name_size)
@@ -444,7 +475,10 @@ def _read_directory(archive):
                  # slower than reading the data because fseek flushes stdio's
                  # internal buffers.    See issue #8745.
                  try:
-                    if len(fp.read(header_size - name_size)) != header_size - name_size:
+                    extra_data_len = header_size - name_size
+                    extra_data = memoryview(fp.read(extra_data_len))
+
+                    if len(extra_data) != extra_data_len:
                          raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
                  except OSError:
                      raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
@@ -461,6 +495,60 @@ def _read_directory(archive):
  
                  name = name.replace('/', path_sep)
                  path = _bootstrap_external._path_join(archive, name)
+
+                # Ordering matches unpacking below.
+                if (
+                    file_size == MAX_UINT32 or
+                    data_size == MAX_UINT32 or
+                    file_offset == MAX_UINT32
+                ):
+                    # need to decode extra_data looking for a zip64 extra (which might not
+                    # be present)
+                    while extra_data:
+                        if len(extra_data) < 4:
+                            raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+                        tag = _unpack_uint16(extra_data[:2])
+                        size = _unpack_uint16(extra_data[2:4])
+                        if len(extra_data) < 4 + size:
+                            raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+                        if tag == ZIP64_EXTRA_TAG:
+                            if (len(extra_data) - 4) % 8 != 0:
+                                raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+                            num_extra_values = (len(extra_data) - 4) // 8
+                            if num_extra_values > 3:
+                                raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
+                            values = struct.unpack_from(f"<{min(num_extra_values, 3)}Q",
+                                                        extra_data, offset=4)
+
+                            # N.b. Here be dragons: the ordering of these is different than
+                            # the header fields, and it's really easy to get it wrong since
+                            # naturally-occurring zips that use all 3 are >4GB
+                            if file_size == MAX_UINT32:
+                                file_size = values.pop(0)
+                            if data_size == MAX_UINT32:
+                                data_size = values.pop(0)
+                            if file_offset == MAX_UINT32:
+                                file_offset = values.pop(0)
+
+                            break
+
+                        # For a typical zip, this bytes-slicing only happens 2-3 times, on
+                        # small data like timestamps and filesizes.
+                        extra_data = extra_data[4+size:]
+                    else:
+                        _bootstrap._verbose_message(
+                            "zipimport: suspected zip64 but no zip64 extra for {!r}",
+                            path,
+                        )
+                # XXX These two statements seem swapped because `central_directory_position`
+                # is a position within the actual file, but `file_offset` (when compared) is
+                # as encoded in the entry, not adjusted for this file.
+                # N.b. this must be after we've potentially read the zip64 extra which can
+                # change `file_offset`.
+                if file_offset > central_directory_position:
+                    raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
+                file_offset += arc_offset
+
                  t = (path, compress, data_size, file_size, file_offset, time, date, crc)
                  files[name] = t
                  count += 1
diff --git a/Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst b/Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst

new file mode 100644 (file)

index 0000000..0358c01
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst
@@ -0,0 +1 @@
+The :mod:`zipimport` module can now read ZIP64 files.
author	Tim Hatch <tim@timhatch.com>
	Thu, 28 Mar 2024 06:54:51 +0000 (23:54 -0700)
committer	GitHub <noreply@github.com>
	Thu, 28 Mar 2024 06:54:51 +0000 (06:54 +0000)
Doc/library/zipimport.rst		patch \| blob \| blame \| history
Doc/whatsnew/3.13.rst		patch \| blob \| blame \| history
Lib/importlib/_bootstrap_external.py		patch \| blob \| blame \| history
Lib/test/test_zipimport.py		patch \| blob \| blame \| history
Lib/zipimport.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2022-06-22-14-45-32.gh-issue-89739.CqZcRL.rst	[new file with mode: 0644]	patch \| blob