From: Duncan Horn <40036384+dunhor@users.noreply.github.com> Date: Thu, 20 Jun 2024 21:01:47 +0000 (-0700) Subject: Fix & optimize string conversion functions for Windows (#2226) X-Git-Tag: v3.7.5~30 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=56e023631298f6b4988c5ca1c04a45857b886e91;p=thirdparty%2Flibarchive.git Fix & optimize string conversion functions for Windows (#2226) All three parts of this change effectively stem from the same assumption: most of the code in `archive_string.c` assumes that MBS <-> UTF-8 string conversion can be done directly and efficiently. This is not quite true on Windows, where conversion looks more like MBS <-> WCS <-> UTF-8. This results in a few inefficiencies currently present in the code. First, if the caller is asking for either the MBS or UTF-8 string, but it's not currently set on the `archive_mstring`, then on Windows, it's more efficient to first check if the WCS is set and do the conversion with that. Otherwise, we'll end up doing a wasteful intermediate step of converting either the MBS or UTF-8 string to WCS, which we already have. Second, in the `archive_mstring_update_utf8` function, it's more efficient on Windows to first convert to WCS and use that result to convert to MBS, as opposed to the fallback I introduced in a previous change, which converts UTF-8 to MBS first and disposes of the intermediate WCS, only to re-calculate it. --- diff --git a/libarchive/archive_string.c b/libarchive/archive_string.c index be6c39600..41bfe7af1 100644 --- a/libarchive/archive_string.c +++ b/libarchive/archive_string.c @@ -3874,6 +3874,30 @@ archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes, } *p = NULL; +#if defined(_WIN32) && !defined(__CYGWIN__) + /* + * On Windows, first try converting from WCS because (1) there's no + * guarantee that the conversion to MBS will succeed, e.g. when using + * CP_ACP, and (2) that's more efficient than converting to MBS, just to + * convert back to WCS again before finally converting to UTF-8 + */ + if ((aes->aes_set & AES_SET_WCS) != 0) { + sc = archive_string_conversion_to_charset(a, "UTF-8", 1); + if (sc == NULL) + return (-1);/* Couldn't allocate memory for sc. */ + archive_string_empty(&(aes->aes_utf8)); + r = archive_string_append_from_wcs_in_codepage(&(aes->aes_utf8), + aes->aes_wcs.s, aes->aes_wcs.length, sc); + if (a == NULL) + free_sconv_object(sc); + if (r == 0) { + aes->aes_set |= AES_SET_UTF8; + *p = aes->aes_utf8.s; + return (0);/* success. */ + } else + return (-1);/* failure. */ + } +#endif /* Try converting WCS to MBS first if MBS does not exist yet. */ if ((aes->aes_set & AES_SET_MBS) == 0) { const char *pm; /* unused */ @@ -3958,6 +3982,32 @@ archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes, } *wp = NULL; +#if defined(_WIN32) && !defined(__CYGWIN__) + /* + * On Windows, prefer converting from UTF-8 directly to WCS because: + * (1) there's no guarantee that the string can be represented in MBS (e.g. + * with CP_ACP), and (2) in order to convert from UTF-8 to MBS, we're going + * to need to convert from UTF-8 to WCS anyway and its wasteful to throw + * away that intermediate result + */ + if (aes->aes_set & AES_SET_UTF8) { + struct archive_string_conv *sc; + + sc = archive_string_conversion_from_charset(a, "UTF-8", 1); + if (sc != NULL) { + archive_wstring_empty((&aes->aes_wcs)); + r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), + aes->aes_utf8.s, aes->aes_utf8.length, sc); + if (a == NULL) + free_sconv_object(sc); + if (r == 0) { + aes->aes_set |= AES_SET_WCS; + *wp = aes->aes_wcs.s; + return (0); + } + } + } +#endif /* Try converting UTF8 to MBS first if MBS does not exist yet. */ if ((aes->aes_set & AES_SET_MBS) == 0) { const char *p; /* unused */ @@ -4211,21 +4261,31 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes, aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */ - /* Try converting UTF-8 to MBS, return false on failure. */ sc = archive_string_conversion_from_charset(a, "UTF-8", 1); if (sc == NULL) return (-1);/* Couldn't allocate memory for sc. */ - r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc); #if defined(_WIN32) && !defined(__CYGWIN__) - /* On failure, make an effort to convert UTF8 to WCS as the active code page - * may not be able to represent all characters in the string */ - if (r != 0) { - if (archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), - aes->aes_utf8.s, aes->aes_utf8.length, sc) == 0) - aes->aes_set = AES_SET_UTF8 | AES_SET_WCS; - } -#endif + /* On Windows, there's no good way to convert from UTF8 -> MBS directly, so + * prefer to first convert to WCS as (1) it's wasteful to throw away the + * intermediate result, and (2) WCS will still be set even if we fail to + * convert to MBS (e.g. with ACP that can't represent the characters) */ + r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs), + aes->aes_utf8.s, aes->aes_utf8.length, sc); + + if (a == NULL) + free_sconv_object(sc); + if (r != 0) + return (-1); /* This will guarantee we can't convert to MBS */ + aes->aes_set = AES_SET_UTF8 | AES_SET_WCS; /* Both UTF8 and WCS set. */ + + /* Try converting WCS to MBS, return false on failure. */ + if (archive_string_append_from_wcs(&(aes->aes_mbs), aes->aes_wcs.s, + aes->aes_wcs.length)) + return (-1); +#else + /* Try converting UTF-8 to MBS, return false on failure. */ + r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc); if (a == NULL) free_sconv_object(sc); @@ -4237,8 +4297,10 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes, if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s, aes->aes_mbs.length)) return (-1); - aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS; +#endif /* All conversions succeeded. */ + aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS; + return (0); } diff --git a/libarchive/test/test_archive_string_conversion.c b/libarchive/test/test_archive_string_conversion.c index d8c75888a..67e9b762a 100644 --- a/libarchive/test/test_archive_string_conversion.c +++ b/libarchive/test/test_archive_string_conversion.c @@ -882,3 +882,138 @@ DEFINE_TEST(test_archive_string_conversion) test_archive_string_canonicalization(); test_archive_string_set_get(); } + +DEFINE_TEST(test_archive_string_conversion_utf16_utf8) +{ +#if !defined(_WIN32) || defined(__CYGWIN__) + skipping("This test is meant to verify unicode string handling on Windows"); +#else + struct archive_mstring mstr; + const char* utf8_string; + + memset(&mstr, 0, sizeof(mstr)); + + assertEqualInt(ARCHIVE_OK, + archive_mstring_copy_wcs(&mstr, L"\U0000043f\U00000440\U00000438")); + + /* Conversion from WCS to UTF-8 should always succeed */ + assertEqualInt(ARCHIVE_OK, + archive_mstring_get_utf8(NULL, &mstr, &utf8_string)); + assertEqualString("\xD0\xBF\xD1\x80\xD0\xB8", utf8_string); + + archive_mstring_clean(&mstr); +#endif +} + +DEFINE_TEST(test_archive_string_conversion_utf8_utf16) +{ +#if !defined(_WIN32) || defined(__CYGWIN__) + skipping("This test is meant to verify unicode string handling on Windows"); +#else + struct archive_mstring mstr; + const wchar_t* wcs_string; + + memset(&mstr, 0, sizeof(mstr)); + + assertEqualInt(6, + archive_mstring_copy_utf8(&mstr, "\xD0\xBF\xD1\x80\xD0\xB8")); + + /* Conversion from UTF-8 to WCS should always succeed */ + assertEqualInt(ARCHIVE_OK, + archive_mstring_get_wcs(NULL, &mstr, &wcs_string)); + assertEqualWString(L"\U0000043f\U00000440\U00000438", wcs_string); + + archive_mstring_clean(&mstr); +#endif +} + +DEFINE_TEST(test_archive_string_update_utf8_win) +{ +#if !defined(_WIN32) || defined(__CYGWIN__) + skipping("This test is meant to verify unicode string handling on Windows" + " with the C locale"); +#else + static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8"; + static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438"; + struct archive_mstring mstr; + int r; + + memset(&mstr, 0, sizeof(mstr)); + + r = archive_mstring_update_utf8(NULL, &mstr, utf8_string); + + /* On Windows, this should reliably fail with the C locale */ + assertEqualInt(-1, r); + assertEqualInt(0, mstr.aes_set & AES_SET_MBS); + + /* NOTE: We access the internals to validate that they were set by the + * 'archive_mstring_update_utf8' function */ + /* UTF-8 should always be set */ + assertEqualInt(AES_SET_UTF8, mstr.aes_set & AES_SET_UTF8); + assertEqualString(utf8_string, mstr.aes_utf8.s); + /* WCS should always be set as well */ + assertEqualInt(AES_SET_WCS, mstr.aes_set & AES_SET_WCS); + assertEqualWString(wcs_string, mstr.aes_wcs.s); + + archive_mstring_clean(&mstr); +#endif +} + +DEFINE_TEST(test_archive_string_update_utf8_utf8) +{ + static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8"; + static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438"; + struct archive_mstring mstr; + int r; + + memset(&mstr, 0, sizeof(mstr)); + + if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) { + skipping("UTF-8 not supported on this system."); + return; + } + + r = archive_mstring_update_utf8(NULL, &mstr, utf8_string); + + /* All conversions should have succeeded */ + assertEqualInt(0, r); + assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set); + assertEqualString(utf8_string, mstr.aes_utf8.s); + assertEqualString(utf8_string, mstr.aes_mbs.s); + assertEqualWString(wcs_string, mstr.aes_wcs.s); + + archive_mstring_clean(&mstr); +} + +DEFINE_TEST(test_archive_string_update_utf8_koi8) +{ + static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8"; + static const char koi8_string[] = "\xD0\xD2\xC9"; + static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438"; + struct archive_mstring mstr; + int r; + + memset(&mstr, 0, sizeof(mstr)); + + if (setlocale(LC_ALL, "ru_RU.KOI8-R") == NULL) { + skipping("KOI8-R locale not available on this system."); + return; + } + + r = archive_mstring_update_utf8(NULL, &mstr, utf8_string); + + /* All conversions should have succeeded */ + assertEqualInt(0, r); + assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set); + assertEqualString(utf8_string, mstr.aes_utf8.s); + assertEqualString(koi8_string, mstr.aes_mbs.s); +#if defined(_WIN32) && !defined(__CYGWIN__) + assertEqualWString(wcs_string, mstr.aes_wcs.s); +#else + /* No guarantee of how WCS strings behave, however this test test is + * primarily meant for Windows */ + (void)wcs_string; +#endif + + archive_mstring_clean(&mstr); +}