From 2121da4bbbdb976af69f6d913e4a2cd19267ff06 Mon Sep 17 00:00:00 2001 From: Duncan Horn <40036384+dunhor@users.noreply.github.com> Date: Thu, 10 Oct 2024 23:30:25 -0700 Subject: [PATCH] [7zip] Read/write symlink paths as UTF-8 (#2252) I previously tried to find documentation on how symlinks are expected to be stored in 7zip files; however, the best reference I could find was [here](https://py7zr.readthedocs.io/en/latest/archive_format.html). That site suggests that symlink paths are stored as UTF-8 encoded strings. --- Makefile.am | 1 + libarchive/archive_read_support_format_7zip.c | 15 ++- libarchive/archive_write_set_format_7zip.c | 16 ++- libarchive/test/CMakeLists.txt | 1 + libarchive/test/test_7zip_filename_encoding.c | 100 ++++++++++++++++++ 5 files changed, 128 insertions(+), 5 deletions(-) create mode 100644 libarchive/test/test_7zip_filename_encoding.c diff --git a/Makefile.am b/Makefile.am index e3dbdb1d0..a36126c47 100644 --- a/Makefile.am +++ b/Makefile.am @@ -365,6 +365,7 @@ libarchive_test_SOURCES= \ $(test_utils_SOURCES) \ libarchive/test/read_open_memory.c \ libarchive/test/test.h \ + libarchive/test/test_7zip_filename_encoding.c \ libarchive/test/test_acl_nfs4.c \ libarchive/test/test_acl_pax.c \ libarchive/test/test_acl_platform_nfs4.c \ diff --git a/libarchive/archive_read_support_format_7zip.c b/libarchive/archive_read_support_format_7zip.c index fd5792d16..b4e34d68d 100644 --- a/libarchive/archive_read_support_format_7zip.c +++ b/libarchive/archive_read_support_format_7zip.c @@ -833,9 +833,20 @@ archive_read_format_7zip_read_header(struct archive_read *a, zip_entry->mode |= AE_IFREG; archive_entry_set_mode(entry, zip_entry->mode); } else { + struct archive_string_conv* utf8_conv; + symname[symsize] = '\0'; - archive_entry_copy_symlink(entry, - (const char *)symname); + + /* Symbolic links are embedded as UTF-8 strings */ + utf8_conv = archive_string_conversion_from_charset(&a->archive, + "UTF-8", 1); + if (utf8_conv == NULL) { + free(symname);
+ return ARCHIVE_FATAL; + } + + archive_entry_copy_symlink_l(entry, (const char*)symname, symsize, + utf8_conv); } free(symname); archive_entry_set_size(entry, 0); diff --git a/libarchive/archive_write_set_format_7zip.c b/libarchive/archive_write_set_format_7zip.c index c0ea9d6b1..b870338fc 100644 --- a/libarchive/archive_write_set_format_7zip.c +++ b/libarchive/archive_write_set_format_7zip.c @@ -521,7 +521,7 @@ _7z_write_header(struct archive_write *a, struct archive_entry *entry) */ if (archive_entry_filetype(entry) == AE_IFLNK) { ssize_t bytes; - const void *p = (const void *)archive_entry_symlink(entry); + const void *p = (const void *)archive_entry_symlink_utf8(entry); bytes = compress_out(a, p, (size_t)file->size, ARCHIVE_Z_RUN); if (bytes < 0) return ((int)bytes); @@ -1563,8 +1563,18 @@ file_new(struct archive_write *a, struct archive_entry *entry, archive_entry_set_size(entry, 0); if (archive_entry_filetype(entry) == AE_IFDIR) file->dir = 1; - else if (archive_entry_filetype(entry) == AE_IFLNK) - file->size = strlen(archive_entry_symlink(entry)); + else if (archive_entry_filetype(entry) == AE_IFLNK) { + const char* linkpath; + linkpath = archive_entry_symlink_utf8(entry); + if (linkpath == NULL) { + free(file); + archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, + "symlink path could not be converted to UTF-8"); + return (ARCHIVE_FAILED); + } + else + file->size = strlen(linkpath); + } if (archive_entry_mtime_is_set(entry)) { file->flg |= MTIME_IS_SET; file->times[MTIME].time = archive_entry_mtime(entry); diff --git a/libarchive/test/CMakeLists.txt b/libarchive/test/CMakeLists.txt index 7d5bc3626..314c972d2 100644 --- a/libarchive/test/CMakeLists.txt +++ b/libarchive/test/CMakeLists.txt @@ -9,6 +9,7 @@ IF(ENABLE_TEST) ../../test_utils/test_main.c read_open_memory.c test.h + test_7zip_filename_encoding.c test_acl_nfs4.c test_acl_pax.c test_acl_platform_nfs4.c diff --git a/libarchive/test/test_7zip_filename_encoding.c 
b/libarchive/test/test_7zip_filename_encoding.c new file mode 100644 index 000000000..cf562d39f --- /dev/null +++ b/libarchive/test/test_7zip_filename_encoding.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2003-2018 + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "test.h" + +DEFINE_TEST(test_7zip_filename_encoding_UTF16_win) +{ +#if !defined(_WIN32) || defined(__CYGWIN__) + skipping("This test is meant to verify unicode string handling" + " on Windows with UTF-16 names"); + return; +#else + struct archive *a; + struct archive_entry *entry; + char buff[4096]; + size_t used; + + /* + * Write a file, a directory and a symlink that use the non-ASCII character U+8868 to an in-memory 7zip archive, then read them back. + * Don't call setlocale because we're verifying that the '_w' functions work as expected. + */ + + a = archive_write_new(); + assertEqualInt(ARCHIVE_OK, archive_write_set_format_7zip(a)); + assertEqualInt(ARCHIVE_OK, + archive_write_open_memory(a, buff, sizeof(buff), &used)); + + /* Part 1: empty regular file whose wide-character pathname is non-ASCII */ + entry = archive_entry_new2(a); + archive_entry_copy_pathname_w(entry, L"\u8868.txt"); + archive_entry_set_filetype(entry, AE_IFREG); + archive_entry_set_size(entry, 0); + assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); + + /* Part 2: directory whose wide-character pathname is non-ASCII */ + archive_entry_clear(entry); + archive_entry_copy_pathname_w(entry, L"\u8868"); + archive_entry_set_filetype(entry, AE_IFDIR); + archive_entry_set_size(entry, 0); + assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); + + /* Part 3: symlink with an ASCII pathname and a non-ASCII wide-character target */ + archive_entry_clear(entry); + archive_entry_set_pathname(entry, "link.txt"); + archive_entry_copy_symlink_w(entry, L"\u8868.txt"); + archive_entry_set_filetype(entry, AE_IFLNK); + archive_entry_set_symlink_type(entry, AE_SYMLINK_TYPE_FILE); + archive_entry_set_size(entry, 0); + assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry)); + + /* NOTE: 7zip does not support hardlinks */ + + archive_entry_free(entry); + assertEqualInt(ARCHIVE_OK, archive_write_free(a)); + + /* Ensure that the archive contents can be read properly */ + /* NOTE: 7zip file contents are not in the order we wrote them! 
*/ + a = archive_read_new(); + archive_read_support_format_all(a); + archive_read_support_filter_all(a); + assertEqualIntA(a, ARCHIVE_OK, read_open_memory_seek(a, buff, used, 7)); + + /* Read part 3: symlink (only the link target is checked, not its pathname) */ + assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry)); + assertEqualWString(L"\u8868.txt", archive_entry_symlink_w(entry)); + + /* Read part 1: file */ + assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry)); + assertEqualWString(L"\u8868.txt", archive_entry_pathname_w(entry)); + + /* Read part 2: directory */ + assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry)); + /* NOTE: Trailing slash added automatically for us */ + assertEqualWString(L"\u8868/", archive_entry_pathname_w(entry)); + + archive_read_free(a); +#endif +} \ No newline at end of file -- 2.47.2