gh-86094: Add support for Unicode Path Extra Field in ZipFile (gh-102566)

author Yeojin Kim <yeojin.dev@gmail.com>

Wed, 5 Apr 2023 11:54:48 +0000 (20:54 +0900)

committer GitHub <noreply@github.com>

Wed, 5 Apr 2023 11:54:48 +0000 (20:54 +0900)
author Yeojin Kim <yeojin.dev@gmail.com>
Wed, 5 Apr 2023 11:54:48 +0000 (20:54 +0900)
committer GitHub <noreply@github.com>
Wed, 5 Apr 2023 11:54:48 +0000 (20:54 +0900)
diff --git a/Lib/test/test_zipfile/test_core.py b/Lib/test/test_zipfile/test_core.py

index e23f5c2a8556f2d5a0894e0fa1db357dca5f1e98..73c6b0185a1a0e4ebb4dce40921e37324138dba0 100644 (file)
--- a/Lib/test/test_zipfile/test_core.py
+++ b/Lib/test/test_zipfile/test_core.py
@@ -1616,6 +1616,33 @@ class OtherTests(unittest.TestCase):
              self.assertEqual(zf.filelist[0].filename, "foo.txt")
              self.assertEqual(zf.filelist[1].filename, "\xf6.txt")
  
+    @requires_zlib()
+    def test_read_zipfile_containing_unicode_path_extra_field(self):
+        with zipfile.ZipFile(TESTFN, mode='w') as zf:
+            # create a file with a non-ASCII name
+            filename = '이름.txt'
+            filename_encoded = filename.encode('utf-8')
+
+            # create a ZipInfo object with Unicode path extra field
+            zip_info = zipfile.ZipInfo(filename)
+
+            tag_for_unicode_path = b'\x75\x70'
+            version_of_unicode_path = b'\x01'
+
+            import zlib
+            filename_crc = struct.pack('<L', zlib.crc32(filename_encoded))
+
+            extra_data = version_of_unicode_path + filename_crc + filename_encoded
+            tsize = len(extra_data).to_bytes(2, 'little')
+
+            zip_info.extra = tag_for_unicode_path + tsize + extra_data
+
+            # add the file to the ZIP archive
+            zf.writestr(zip_info, b'Hello World!')
+
+        with zipfile.ZipFile(TESTFN, "r") as zf:
+            self.assertEqual(zf.filelist[0].filename, "이름.txt")
+
      def test_read_after_write_unicode_filenames(self):
          with zipfile.ZipFile(TESTFN2, 'w') as zipfp:
              zipfp.writestr('приклад', b'sample')
diff --git a/Lib/zipfile/__init__.py b/Lib/zipfile/__init__.py

index 6e6211de6b168428a5f334d9ac7f667c40eca0a6..95c047991f872b1d1635c5cb0908db24accbab59 100644 (file)
--- a/Lib/zipfile/__init__.py
+++ b/Lib/zipfile/__init__.py
@@ -338,6 +338,22 @@ def _EndRecData(fpin):
      # Unable to find a valid end of central directory structure
      return None
  
+def _sanitize_filename(filename):
+    """Terminate the file name at the first null byte and
+    ensure paths always use forward slashes as the directory separator."""
+
+    # Terminate the file name at the first null byte.  Null bytes in file
+    # names are used as tricks by viruses in archives.
+    null_byte = filename.find(chr(0))
+    if null_byte >= 0:
+        filename = filename[0:null_byte]
+    # This is used to ensure paths in generated ZIP files always use
+    # forward slashes as the directory separator, as required by the
+    # ZIP format specification.
+    if os.sep != "/" and os.sep in filename:
+        filename = filename.replace(os.sep, "/")
+    return filename
+
  
  class ZipInfo (object):
      """Class with attributes describing each file in the ZIP archive."""
@@ -368,16 +384,9 @@ class ZipInfo (object):
      def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
          self.orig_filename = filename   # Original file name in archive
  
-        # Terminate the file name at the first null byte.  Null bytes in file
-        # names are used as tricks by viruses in archives.
-        null_byte = filename.find(chr(0))
-        if null_byte >= 0:
-            filename = filename[0:null_byte]
-        # This is used to ensure paths in generated ZIP files always use
-        # forward slashes as the directory separator, as required by the
-        # ZIP format specification.
-        if os.sep != "/" and os.sep in filename:
-            filename = filename.replace(os.sep, "/")
+        # Terminate the file name at the first null byte and
+        # ensure paths always use forward slashes as the directory separator.
+        filename = _sanitize_filename(filename)
  
          self.filename = filename        # Normalized file name
          self.date_time = date_time      # year, month, day, hour, min, sec
@@ -482,7 +491,7 @@ class ZipInfo (object):
          except UnicodeEncodeError:
              return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
  
-    def _decodeExtra(self):
+    def _decodeExtra(self, filename_crc):
          # Try to decode the extra field.
          extra = self.extra
          unpack = struct.unpack
@@ -508,6 +517,21 @@ class ZipInfo (object):
                  except struct.error:
                      raise BadZipFile(f"Corrupt zip64 extra field. "
                                       f"{field} not found.") from None
+            elif tp == 0x7075:
+                data = extra[4:ln+4]
+                # Unicode Path Extra Field
+                try:
+                    up_version, up_name_crc = unpack('<BL', data[:5])
+                    if up_version == 1 and up_name_crc == filename_crc:
+                        up_unicode_name = data[5:].decode('utf-8')
+                        if up_unicode_name:
+                            self.filename = _sanitize_filename(up_unicode_name)
+                        else:
+                            warnings.warn("Empty unicode path extra field (0x7075)", stacklevel=2)
+                except struct.error as e:
+                    raise BadZipFile("Corrupt unicode path extra field (0x7075)") from e
+                except UnicodeDecodeError as e:
+                    raise BadZipFile('Corrupt unicode path extra field (0x7075): invalid utf-8 bytes') from e
  
              extra = extra[ln+4:]
  
@@ -1409,6 +1433,7 @@ class ZipFile:
              if self.debug > 2:
                  print(centdir)
              filename = fp.read(centdir[_CD_FILENAME_LENGTH])
+            orig_filename_crc = crc32(filename)
              flags = centdir[_CD_FLAG_BITS]
              if flags & _MASK_UTF_FILENAME:
                  # UTF-8 file names extension
@@ -1432,8 +1457,7 @@ class ZipFile:
              x._raw_time = t
              x.date_time = ( (d>>9)+1980, (d>>5)&0xF, d&0x1F,
                              t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
-
-            x._decodeExtra()
+            x._decodeExtra(orig_filename_crc)
              x.header_offset = x.header_offset + concat
              self.filelist.append(x)
              self.NameToInfo[x.filename] = x
diff --git a/Misc/ACKS b/Misc/ACKS

index 929e06a87cb794554d6be18009b8c31982ffde11..49f3692dfd6b8f3cdf737811c84628e35e2de4c2 100644 (file)
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -627,6 +627,7 @@ Julian Gindi
  Yannick Gingras
  Neil Girdhar
  Matt Giuca
+Andrea Giudiceandrea
  Franz Glasner
  Wim Glenn
  Michael Goderbauer
diff --git a/Misc/NEWS.d/next/Documentation/2023-03-10-04-59-35.gh-issue-86094.zOYdy8.rst b/Misc/NEWS.d/next/Documentation/2023-03-10-04-59-35.gh-issue-86094.zOYdy8.rst

new file mode 100644 (file)

index 0000000..39461f3
--- /dev/null
+++ b/Misc/NEWS.d/next/Documentation/2023-03-10-04-59-35.gh-issue-86094.zOYdy8.rst
@@ -0,0 +1,2 @@
+Add support for the Unicode Path Extra Field (0x7075) when reading archives
+with :class:`zipfile.ZipFile`. Patch by Yeojin Kim and Andrea Giudiceandrea.
author	Yeojin Kim <yeojin.dev@gmail.com>
	Wed, 5 Apr 2023 11:54:48 +0000 (20:54 +0900)
committer	GitHub <noreply@github.com>
	Wed, 5 Apr 2023 11:54:48 +0000 (20:54 +0900)
Lib/test/test_zipfile/test_core.py		patch \| blob \| blame \| history
Lib/zipfile/__init__.py		patch \| blob \| blame \| history
Misc/ACKS		patch \| blob \| blame \| history
Misc/NEWS.d/next/Documentation/2023-03-10-04-59-35.gh-issue-86094.zOYdy8.rst	[new file with mode: 0644]	patch \| blob