Pax parsing should consistently use the FIRST pathname/linkname (#2264)

author Tim Kientzle <kientzle@acm.org>

Tue, 9 Jul 2024 11:55:23 +0000 (04:55 -0700)

committer GitHub <noreply@github.com>

Tue, 9 Jul 2024 11:55:23 +0000 (13:55 +0200)
author Tim Kientzle <kientzle@acm.org>
Tue, 9 Jul 2024 11:55:23 +0000 (04:55 -0700)
committer GitHub <noreply@github.com>
Tue, 9 Jul 2024 11:55:23 +0000 (13:55 +0200)
diff --git a/libarchive/archive_read_support_format_tar.c b/libarchive/archive_read_support_format_tar.c

index 07475dba060c2e39b6da92094dcc495bde079446..f8005e1a238505e4ea893038c8c2b1e87bf9c0c0 100644 (file)
--- a/libarchive/archive_read_support_format_tar.c
+++ b/libarchive/archive_read_support_format_tar.c
@@ -1294,6 +1294,7 @@ header_common(struct archive_read *a, struct tar *tar,
  {
         const struct archive_entry_header_ustar *header;
         const char *existing_linkpath;
+       const wchar_t *existing_wcs_linkpath;
         int     err = ARCHIVE_OK;
  
         header = (const struct archive_entry_header_ustar *)h;
@@ -1346,8 +1347,10 @@ header_common(struct archive_read *a, struct tar *tar,
         switch (tar->filetype) {
         case '1': /* Hard link */
                 archive_entry_set_link_to_hardlink(entry);
+               existing_wcs_linkpath = archive_entry_hardlink_w(entry);
                 existing_linkpath = archive_entry_hardlink(entry);
-               if (existing_linkpath == NULL || existing_linkpath[0] == '\0') {
+               if ((existing_linkpath == NULL || existing_linkpath[0] == '\0')
+                   && (existing_wcs_linkpath == NULL || existing_wcs_linkpath[0] == '\0')) {
                         struct archive_string linkpath;
                         archive_string_init(&linkpath);
                         archive_strncpy(&linkpath,
@@ -1422,8 +1425,10 @@ header_common(struct archive_read *a, struct tar *tar,
                 break;
         case '2': /* Symlink */
                 archive_entry_set_link_to_symlink(entry);
+               existing_wcs_linkpath = archive_entry_symlink_w(entry);
                 existing_linkpath = archive_entry_symlink(entry);
-               if (existing_linkpath == NULL || existing_linkpath[0] == '\0') {
+               if ((existing_linkpath == NULL || existing_linkpath[0] == '\0')
+                   && (existing_wcs_linkpath == NULL || existing_wcs_linkpath[0] == '\0')) {
                         struct archive_string linkpath;
                         archive_string_init(&linkpath);
                         archive_strncpy(&linkpath,
@@ -1677,7 +1682,9 @@ header_ustar(struct archive_read *a, struct tar *tar,
  
         /* Copy name into an internal buffer to ensure null-termination. */
         const char *existing_pathname = archive_entry_pathname(entry);
-       if (existing_pathname == NULL || existing_pathname[0] == '\0') {
+       const wchar_t *existing_wcs_pathname = archive_entry_pathname_w(entry);
+       if ((existing_pathname == NULL || existing_pathname[0] == '\0')
+           && (existing_wcs_pathname == NULL || existing_wcs_pathname[0] == '\0')) {
                 archive_string_init(&as);
                 if (header->prefix[0]) {
                         archive_strncpy(&as, header->prefix, sizeof(header->prefix));
diff --git a/libarchive/test/test_pax_filename_encoding.c b/libarchive/test/test_pax_filename_encoding.c

index 98db4cda193ebc01d437a7a2168aad2e12ca5bb4..27e10eaffc6cb9e4a6308f3db6bd469d85841d5f 100644 (file)
--- a/libarchive/test/test_pax_filename_encoding.c
+++ b/libarchive/test/test_pax_filename_encoding.c
@@ -592,6 +592,7 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
         struct archive *a;
         struct archive_entry *entry;
         char buff[0x2000];
+       char *p;
         size_t used;
  
         /*
@@ -608,11 +609,11 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
                 archive_write_free(a);
                 return;
         }
-
-       /* Re-create a write archive object since filenames should be written
-        * in UTF-8 by default. */
         archive_write_free(a);
  
+       /*
+        * Create a new archive handle with default charset handling
+        */
         a = archive_write_new();
         assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
         assertEqualInt(ARCHIVE_OK,
@@ -650,11 +651,63 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
         archive_entry_free(entry);
         assertEqualInt(ARCHIVE_OK, archive_write_free(a));
  
-       /* Ensure that the names round trip properly */
+       /*
+        * Examine the bytes to ensure the filenames ended up UTF-8
+        * encoded as we expect.
+        */
+
+       /* Part 1: file */
+       p = buff + 0;
+       assertEqualString(p + 0, "PaxHeader/\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* File name */
+       assertEqualInt(p[156], 'x'); /* Pax extension header */
+       p += 512; /* Pax extension body */
+       assertEqualString(p + 0, "19 path=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n");
+       p += 512; /* Ustar header */
+       assertEqualString(p + 0, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* File name */
+       assertEqualInt(p[156], '0');
+
+       /* Part 2: directory */
+       p += 512; /* Pax extension header */
+       assertEqualString(p + 0, "PaxHeader/\xD0\xBF\xD1\x80\xD0\xB8"); /* File name */
+       assertEqualInt(p[156], 'x');
+       p += 512; /* Pax extension body */
+       assertEqualString(p + 0, "16 path=\xD0\xBF\xD1\x80\xD0\xB8/\n");
+       p += 512; /* Ustar header */
+       assertEqualString(p + 0, "\xD0\xBF\xD1\x80\xD0\xB8/"); /* File name */
+       assertEqualInt(p[156], '5'); /* directory */
+
+       /* Part 3: symlink */
+       p += 512; /* Pax Extension Header */
+       assertEqualString(p + 0, "PaxHeader/\xE5\x86\x8D\xE8\xA7\x81.txt"); /* File name */
+       p += 512; /* Pax extension body */
+       assertEqualString(p + 0,
+                         "19 path=\xE5\x86\x8D\xE8\xA7\x81.txt\n"
+                         "23 linkpath=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n"
+                         "31 LIBARCHIVE.symlinktype=file\n");
+       p += 512; /* Ustar header */
+       assertEqualString(p + 0, "\xE5\x86\x8D\xE8\xA7\x81.txt"); /* File name */
+       assertEqualInt(p[156], '2'); /* symlink */
+       assertEqualString(p + 157, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* link name */
+
+       /* Part 4: hardlink */
+       p += 512; /* Pax extension header */
+       assertEqualString(p + 0, "PaxHeader/\xE6\x99\x9A\xE5\xAE\x89.txt"); /* File name */
+       p += 512; /* Pax extension body */
+       assertEqualString(p + 0,
+                         "19 path=\xE6\x99\x9A\xE5\xAE\x89.txt\n"
+                         "23 linkpath=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n"
+                         "31 LIBARCHIVE.symlinktype=file\n");
+       p += 512; /* Ustar header */
+       assertEqualString(p + 0, "\xE6\x99\x9A\xE5\xAE\x89.txt"); /* File name */
+       assertEqualInt(p[156], '1'); /* hard link */
+       assertEqualString(p + 157, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* link name */
+
+       /*
+        * Read back the archive to see if we get the original names
+        */
         a = archive_read_new();
         archive_read_support_format_all(a);
         archive_read_support_filter_all(a);
-       assertEqualInt(ARCHIVE_OK, archive_read_set_options(a, "hdrcharset=UTF-8"));
         assertEqualInt(0, archive_read_open_memory(a, buff, used));
  
         /* Read part 1: file */
@@ -674,6 +727,8 @@ DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
         assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry));
         assertEqualWString(L"\u665a\u5b89.txt", archive_entry_pathname_w(entry));
         assertEqualWString(L"\u4f60\u597d.txt", archive_entry_hardlink_w(entry));
+
+       archive_free(a);
  #endif
  }
author	Tim Kientzle <kientzle@acm.org>
	Tue, 9 Jul 2024 11:55:23 +0000 (04:55 -0700)
committer	GitHub <noreply@github.com>
	Tue, 9 Jul 2024 11:55:23 +0000 (13:55 +0200)
libarchive/archive_read_support_format_tar.c		patch \| blob \| blame \| history
libarchive/test/test_pax_filename_encoding.c		patch \| blob \| blame \| history