From aafb078b7cfaed3af4edbf9cb31d3ed930323aca Mon Sep 17 00:00:00 2001 From: Duncan Horn <40036384+dunhor@users.noreply.github.com> Date: Sun, 17 Mar 2024 19:49:56 -0700 Subject: [PATCH] Update 'archive_mstring_update_utf8' to attempt UTF8->WCS conversion on Windows if MBS conversion fails (#1978) Currently, functions such as `archive_entry_pathname_w` fail on Windows for .rar files that contain entries with Unicode filenames that cannot be represented by the active codepage. This is because `archive_mstring_update_utf8` first attempts to perform a UTF8->MBS conversion before doing an MBS->WCS conversion. The first conversion (to MBS) fails, so the function short-circuits and returns failure. Later, when we try to read the string, `archive_mstring_get_wcs` will fail because it _also_ tries to do a UTF8->MBS conversion followed by an MBS->WCS conversion. The conversion to MBS will of course fail again. One possible workaround is to call `setlocale` with something like `"en_US.utf8"`; however, this is not feasible for some consumers. This change fixes this issue by adding a "fallback" in `archive_mstring_update_utf8` which will attempt to do a UTF8->WCS conversion on Windows if the MBS conversion fails. 
This is not too dissimilar from the implementation of `archive_mstring_copy_mbs_len_l` which most - if not all - other archive formats seem to take, which will by default call `archive_wstring_append_from_mbs_in_codepage` if the passed in `archive_string_conv` object is non-null Fixes #1971 --- libarchive/archive_string.c | 11 +++++++ libarchive/test/test_read_format_rar5.c | 30 +++++++++++++++++++ .../test/test_read_format_rar5_unicode.rar.uu | 9 ++++++ 3 files changed, 50 insertions(+) create mode 100644 libarchive/test/test_read_format_rar5_unicode.rar.uu diff --git a/libarchive/archive_string.c b/libarchive/archive_string.c index d206db5f9..17cfd384c 100644 --- a/libarchive/archive_string.c +++ b/libarchive/archive_string.c @@ -4226,6 +4226,17 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes, if (sc == NULL) return (-1);/* Couldn't allocate memory for sc. */ r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc); + +#if defined(_WIN32) && !defined(__CYGWIN__) + /* On failure, make an effort to convert UTF8 to WCS as the active code page + * may not be able to represent all characters in the string */ + if (r != 0) { + if (archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), + aes->aes_utf8.s, aes->aes_utf8.length, sc) == 0) + aes->aes_set = AES_SET_UTF8 | AES_SET_WCS; + } +#endif + if (a == NULL) free_sconv_object(sc); if (r != 0) diff --git a/libarchive/test/test_read_format_rar5.c b/libarchive/test/test_read_format_rar5.c index 34f33ccb3..705913b04 100644 --- a/libarchive/test/test_read_format_rar5.c +++ b/libarchive/test/test_read_format_rar5.c @@ -806,6 +806,36 @@ DEFINE_TEST(test_read_format_rar5_extract_win32) EPILOGUE(); } +DEFINE_TEST(test_read_format_rar5_unicode) +{ +#if !defined(WIN32) || defined(__CYGWIN__) + skipping("Skipping test on non-Windows"); + return; +#else + /* Corresponds to the names: + * 👋🌎.txt + * 𝒮𝓎𝓂𝒷𝑜𝓁𝒾𝒸 𝐿𝒾𝓃𝓀.txt + * Ⓗⓐⓡⓓ Ⓛⓘⓝⓚ.txt */ + const wchar_t* emoji_name = L"\U0001f44b\U0001f30e.txt"; + const 
wchar_t* italic_name = L"\U0001d4ae\U0001d4ce\U0001d4c2\U0001d4b7\U0001d45c\U0001d4c1\U0001d4be\U0001d4b8 \U0001d43f\U0001d4be\U0001d4c3\U0001d4c0.txt"; + const wchar_t* circle_name = L"\u24bd\u24d0\u24e1\u24d3 \u24c1\u24d8\u24dd\u24da.txt"; + + PROLOGUE("test_read_format_rar5_unicode.rar"); + assertA(0 == archive_read_next_header(a, &ae)); + assertEqualWString(emoji_name, archive_entry_pathname_w(ae)); + assertEqualInt(archive_entry_mode(ae), AE_IFREG | 0644); + assertA(0 == archive_read_next_header(a, &ae)); + assertEqualWString(circle_name, archive_entry_pathname_w(ae)); + assertEqualInt(archive_entry_mode(ae), AE_IFREG | 0644); + assertEqualWString(emoji_name, archive_entry_hardlink_w(ae)); + assertA(0 == archive_read_next_header(a, &ae)); + assertEqualWString(italic_name, archive_entry_pathname_w(ae)); + assertEqualInt(archive_entry_mode(ae), AE_IFLNK | 0644); + assertEqualWString(emoji_name, archive_entry_symlink_w(ae)); + EPILOGUE(); +#endif +} + DEFINE_TEST(test_read_format_rar5_block_by_block) { /* This test uses strange buffer sizes intentionally. */ diff --git a/libarchive/test/test_read_format_rar5_unicode.rar.uu b/libarchive/test/test_read_format_rar5_unicode.rar.uu new file mode 100644 index 000000000..e011ab64e --- /dev/null +++ b/libarchive/test/test_read_format_rar5_unicode.rar.uu @@ -0,0 +1,9 @@ +begin 0744 test_read_format_rar5_unicode.rar +M4F%R(1H'`0#SX8+K"P$%!P`&`0&`@(``//TR$"@"`PN-``2-`"#FQN;K@``` +M#/"?D8OPGXR.+G1X=`H#`D:)>%;RZ]D!2&5L;&\L('=O*3F.*3G>*3FBYT>'0*`P)& +MB7A6\NO9`1`%!``,\)^1B_"?C(XN='ATD-'.1V$"`QP`!`"@"`````"````U +M\)V2KO"=DX[PG9."\)V2M_"=D9SPG9.!\)V2OO"=DK@@\)V0O_"=DK[PG9.# +L\)V3@"YT>'0*`P(>7'J>!.S9`1`%`@`,\)^1B_"?C(XN='AT'7=640,%!``` +` +end -- 2.47.2