]> git.ipfire.org Git - thirdparty/libarchive.git/commitdiff
Update 'archive_mstring_update_utf8' to attempt UTF8->WCS conversion on Windows if...
author Duncan Horn <40036384+dunhor@users.noreply.github.com>
Mon, 18 Mar 2024 02:49:56 +0000 (19:49 -0700)
committer GitHub <noreply@github.com>
Mon, 18 Mar 2024 02:49:56 +0000 (19:49 -0700)
Currently, functions like `archive_entry_pathname_w` etc. fail on
Windows for .rar files that contain entries with Unicode filenames that
cannot be represented by the active codepage. This is because
`archive_mstring_update_utf8` first attempts to perform a UTF8->MBS
conversion before doing an MBS->WCS conversion. The first conversion (to
MBS) fails, which short-circuits to return failure. Later, when we try
to read the string, `archive_mstring_get_wcs` will fail because it
_also_ tries to do a UTF8->MBS followed by an MBS->WCS conversion. The
conversion to MBS will of course fail again.

One possible workaround is to call `setlocale` with something like
`"en_US.utf8"`, however this is not feasible for some consumers.

This change fixes the issue by adding a "fallback" in
`archive_mstring_update_utf8` that attempts a direct UTF8->WCS
conversion on Windows if the MBS conversion failed. This is not too
dissimilar from the path through `archive_mstring_copy_mbs_len_l`
that most - if not all - other archive formats seem to take, which
by default calls `archive_wstring_append_from_mbs_in_codepage` if the
passed-in `archive_string_conv` object is non-null.

Fixes #1971

libarchive/archive_string.c
libarchive/test/test_read_format_rar5.c
libarchive/test/test_read_format_rar5_unicode.rar.uu [new file with mode: 0644]

index d206db5f97e205d4942493580915195d8d40bc4f..17cfd384cd58a40411b7e57b0993371c8df01f21 100644 (file)
@@ -4226,6 +4226,17 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
        if (sc == NULL)
                return (-1);/* Couldn't allocate memory for sc. */
        r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
+
+#if defined(_WIN32) && !defined(__CYGWIN__)
+       /* On failure, make an effort to convert UTF8 to WCS as the active code page
+        * may not be able to represent all characters in the string */
+       if (r != 0) {
+               if (archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
+                       aes->aes_utf8.s, aes->aes_utf8.length, sc) == 0)
+                       aes->aes_set = AES_SET_UTF8 | AES_SET_WCS;
+       }
+#endif
+
        if (a == NULL)
                free_sconv_object(sc);
        if (r != 0)
index 34f33ccb3f931cb5f8d0b11eea2513cc07e47239..705913b04cc23c174ec9892dd12494d3f50c70b4 100644 (file)
@@ -806,6 +806,36 @@ DEFINE_TEST(test_read_format_rar5_extract_win32)
        EPILOGUE();
 }
 
+DEFINE_TEST(test_read_format_rar5_unicode)
+{
+#if !defined(WIN32) || defined(__CYGWIN__)
+       skipping("Skipping test on non-Windows");
+       return;
+#else
+       /* The escaped wide-string literals below correspond to the names:
+        * 👋🌎.txt
+        * 𝒮𝓎𝓂𝒷𝑜𝓁𝒾𝒸 𝐿𝒾𝓃𝓀.txt
+        * Ⓗⓐⓡⓓ Ⓛⓘⓝⓚ.txt */
+       const wchar_t* emoji_name = L"\U0001f44b\U0001f30e.txt";
+       const wchar_t* italic_name = L"\U0001d4ae\U0001d4ce\U0001d4c2\U0001d4b7\U0001d45c\U0001d4c1\U0001d4be\U0001d4b8 \U0001d43f\U0001d4be\U0001d4c3\U0001d4c0.txt";
+       const wchar_t* circle_name = L"\u24bd\u24d0\u24e1\u24d3 \u24c1\u24d8\u24dd\u24da.txt";
+
+       PROLOGUE("test_read_format_rar5_unicode.rar");
+       assertA(0 == archive_read_next_header(a, &ae));
+       assertEqualWString(emoji_name, archive_entry_pathname_w(ae));
+       assertEqualInt(archive_entry_mode(ae), AE_IFREG | 0644);
+       assertA(0 == archive_read_next_header(a, &ae));
+       assertEqualWString(circle_name, archive_entry_pathname_w(ae));
+       assertEqualInt(archive_entry_mode(ae), AE_IFREG | 0644);
+       assertEqualWString(emoji_name, archive_entry_hardlink_w(ae));
+       assertA(0 == archive_read_next_header(a, &ae));
+       assertEqualWString(italic_name, archive_entry_pathname_w(ae));
+       assertEqualInt(archive_entry_mode(ae), AE_IFLNK | 0644);
+       assertEqualWString(emoji_name, archive_entry_symlink_w(ae));
+       EPILOGUE();
+#endif
+}
+
 DEFINE_TEST(test_read_format_rar5_block_by_block)
 {
        /* This test uses strange buffer sizes intentionally. */
diff --git a/libarchive/test/test_read_format_rar5_unicode.rar.uu b/libarchive/test/test_read_format_rar5_unicode.rar.uu
new file mode 100644 (file)
index 0000000..e011ab6
--- /dev/null
@@ -0,0 +1,9 @@
+begin 0744 test_read_format_rar5_unicode.rar\r
+M4F%R(1H'`0#SX8+K"P$%!P`&`0&`@(``//TR$"@"`PN-``2-`"#FQN;K@```\r
+M#/"?D8OPGXR.+G1X=`H#`D:)>%;RZ]D!2&5L;&\L('=O<FQD(2/9BPA(`@,<\r
+M``0-(`````"````=XI*]XI.0XI.AXI.3(.*3@>*3F.*3G>*3FBYT>'0*`P)&\r
+MB7A6\NO9`1`%!``,\)^1B_"?C(XN='ATD-'.1V$"`QP`!`"@"`````"````U\r
+M\)V2KO"=DX[PG9."\)V2M_"=D9SPG9.!\)V2OO"=DK@@\)V0O_"=DK[PG9.#\r
+L\)V3@"YT>'0*`P(>7'J>!.S9`1`%`@`,\)^1B_"?C(XN='AT'7=640,%!```\r
+`\r
+end\r