From: Duncan Horn <40036384+dunhor@users.noreply.github.com>
Date: Thu, 20 Jun 2024 21:01:47 +0000 (-0700)
Subject: Fix & optimize string conversion functions for Windows (#2226)
X-Git-Tag: v3.7.5~30
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=56e023631298f6b4988c5ca1c04a45857b886e91;p=thirdparty%2Flibarchive.git

Fix & optimize string conversion functions for Windows (#2226)

All three parts of this change effectively stem from the same
assumption: most of the code in `archive_string.c` assumes that MBS <->
UTF-8 string conversion can be done directly and efficiently. This is
not quite true on Windows, where conversion looks more like MBS <-> WCS
<-> UTF-8. This results in a few inefficiencies currently present in the
code.

First, if the caller is asking for either the MBS or UTF-8 string, but
it's not currently set on the `archive_mstring`, then on Windows, it's
more efficient to first check if the WCS is set and do the conversion
with that. Otherwise, we'll end up doing a wasteful intermediate step of
converting either the MBS or UTF-8 string to WCS, which we already have.

Second, in the `archive_mstring_update_utf8` function, it's more
efficient on Windows to first convert to WCS and use that result to
convert to MBS, as opposed to the fallback I introduced in a previous
change, which converts UTF-8 to MBS first and disposes of the
intermediate WCS, only to re-calculate it.
---

diff --git a/libarchive/archive_string.c b/libarchive/archive_string.c
index be6c39600..41bfe7af1 100644
--- a/libarchive/archive_string.c
+++ b/libarchive/archive_string.c
@@ -3874,6 +3874,30 @@ archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes,
 	}
 
 	*p = NULL;
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	/*
+	 * On Windows, first try converting from WCS because (1) there's no
+	 * guarantee that the conversion to MBS will succeed, e.g. when using
+	 * CP_ACP, and (2) that's more efficient than converting to MBS, just to
+	 * convert back to WCS again before finally converting to UTF-8
+	 */
+	if ((aes->aes_set & AES_SET_WCS) != 0) {
+		sc = archive_string_conversion_to_charset(a, "UTF-8", 1);
+		if (sc == NULL)
+			return (-1);/* Couldn't allocate memory for sc. */
+		archive_string_empty(&(aes->aes_utf8));
+		r = archive_string_append_from_wcs_in_codepage(&(aes->aes_utf8),
+			aes->aes_wcs.s, aes->aes_wcs.length, sc);
+		if (a == NULL)
+			free_sconv_object(sc);
+		if (r == 0) {
+			aes->aes_set |= AES_SET_UTF8;
+			*p = aes->aes_utf8.s;
+			return (0);/* success. */
+		} else
+			return (-1);/* failure. */
+	}
+#endif
 	/* Try converting WCS to MBS first if MBS does not exist yet. */
 	if ((aes->aes_set & AES_SET_MBS) == 0) {
 		const char *pm; /* unused */
@@ -3958,6 +3982,32 @@ archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes,
 	}
 
 	*wp = NULL;
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	/*
+	 * On Windows, prefer converting from UTF-8 directly to WCS because:
+	 * (1) there's no guarantee that the string can be represented in MBS (e.g.
+	 * with CP_ACP), and (2) in order to convert from UTF-8 to MBS, we're going
+	 * to need to convert from UTF-8 to WCS anyway and its wasteful to throw
+	 * away that intermediate result
+	 */
+	if (aes->aes_set & AES_SET_UTF8) {
+		struct archive_string_conv *sc;
+
+		sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
+		if (sc != NULL) {
+			archive_wstring_empty((&aes->aes_wcs));
+			r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
+			    aes->aes_utf8.s, aes->aes_utf8.length, sc);
+			if (a == NULL)
+				free_sconv_object(sc);
+			if (r == 0) {
+				aes->aes_set |= AES_SET_WCS;
+				*wp = aes->aes_wcs.s;
+				return (0);
+			}
+		}
+	}
+#endif
 	/* Try converting UTF8 to MBS first if MBS does not exist yet. */
 	if ((aes->aes_set & AES_SET_MBS) == 0) {
 		const char *p; /* unused */
@@ -4211,21 +4261,31 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
 
 	aes->aes_set = AES_SET_UTF8;	/* Only UTF8 is set now. */
 
-	/* Try converting UTF-8 to MBS, return false on failure. */
 	sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
 	if (sc == NULL)
 		return (-1);/* Couldn't allocate memory for sc. */
-	r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
 
 #if defined(_WIN32) && !defined(__CYGWIN__)
-	/* On failure, make an effort to convert UTF8 to WCS as the active code page
-	 * may not be able to represent all characters in the string */
-	if (r != 0) {
-		if (archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
-			aes->aes_utf8.s, aes->aes_utf8.length, sc) == 0)
-			aes->aes_set = AES_SET_UTF8 | AES_SET_WCS;
-	}
-#endif
+	/* On Windows, there's no good way to convert from UTF8 -> MBS directly, so
+	 * prefer to first convert to WCS as (1) it's wasteful to throw away the
+	 * intermediate result, and (2) WCS will still be set even if we fail to
+	 * convert to MBS (e.g. with ACP that can't represent the characters) */
+	r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
+		aes->aes_utf8.s, aes->aes_utf8.length, sc);
+
+	if (a == NULL)
+		free_sconv_object(sc);
+	if (r != 0)
+		return (-1); /* This will guarantee we can't convert to MBS */
+	aes->aes_set = AES_SET_UTF8 | AES_SET_WCS; /* Both UTF8 and WCS set. */
+
+	/* Try converting WCS to MBS, return false on failure. */
+	if (archive_string_append_from_wcs(&(aes->aes_mbs), aes->aes_wcs.s,
+	    aes->aes_wcs.length))
+		return (-1);
+#else
+	/* Try converting UTF-8 to MBS, return false on failure. */
+	r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
 
 	if (a == NULL)
 		free_sconv_object(sc);
@@ -4237,8 +4297,10 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
 	if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s,
 	    aes->aes_mbs.length))
 		return (-1);
-	aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
+#endif
 
 	/* All conversions succeeded. */
+	aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
+
 	return (0);
 }
diff --git a/libarchive/test/test_archive_string_conversion.c b/libarchive/test/test_archive_string_conversion.c
index d8c75888a..67e9b762a 100644
--- a/libarchive/test/test_archive_string_conversion.c
+++ b/libarchive/test/test_archive_string_conversion.c
@@ -882,3 +882,138 @@ DEFINE_TEST(test_archive_string_conversion)
 	test_archive_string_canonicalization();
 	test_archive_string_set_get();
 }
+
+DEFINE_TEST(test_archive_string_conversion_utf16_utf8)
+{
+#if !defined(_WIN32) || defined(__CYGWIN__)
+	skipping("This test is meant to verify unicode string handling on Windows");
+#else
+	struct archive_mstring mstr;
+	const char* utf8_string;
+
+	memset(&mstr, 0, sizeof(mstr));
+
+	assertEqualInt(ARCHIVE_OK,
+	    archive_mstring_copy_wcs(&mstr, L"\U0000043f\U00000440\U00000438"));
+
+	/* Conversion from WCS to UTF-8 should always succeed */
+	assertEqualInt(ARCHIVE_OK,
+	    archive_mstring_get_utf8(NULL, &mstr, &utf8_string));
+	assertEqualString("\xD0\xBF\xD1\x80\xD0\xB8", utf8_string);
+
+	archive_mstring_clean(&mstr);
+#endif
+}
+
+DEFINE_TEST(test_archive_string_conversion_utf8_utf16)
+{
+#if !defined(_WIN32) || defined(__CYGWIN__)
+	skipping("This test is meant to verify unicode string handling on Windows");
+#else
+	struct archive_mstring mstr;
+	const wchar_t* wcs_string;
+
+	memset(&mstr, 0, sizeof(mstr));
+
+	assertEqualInt(6,
+	    archive_mstring_copy_utf8(&mstr, "\xD0\xBF\xD1\x80\xD0\xB8"));
+
+	/* Conversion from UTF-8 to WCS should always succeed */
+	assertEqualInt(ARCHIVE_OK,
+	    archive_mstring_get_wcs(NULL, &mstr, &wcs_string));
+	assertEqualWString(L"\U0000043f\U00000440\U00000438", wcs_string);
+
+	archive_mstring_clean(&mstr);
+#endif
+}
+
+DEFINE_TEST(test_archive_string_update_utf8_win)
+{
+#if !defined(_WIN32) || defined(__CYGWIN__)
+	skipping("This test is meant to verify unicode string handling on Windows"
+	    " with the C locale");
+#else
+	static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
+	static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
+	struct archive_mstring mstr;
+	int r;
+
+	memset(&mstr, 0, sizeof(mstr));
+
+	r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
+
+	/* On Windows, this should reliably fail with the C locale */
+	assertEqualInt(-1, r);
+	assertEqualInt(0, mstr.aes_set & AES_SET_MBS);
+
+	/* NOTE: We access the internals to validate that they were set by the
+	 *       'archive_mstring_update_utf8' function */
+	/* UTF-8 should always be set */
+	assertEqualInt(AES_SET_UTF8, mstr.aes_set & AES_SET_UTF8);
+	assertEqualString(utf8_string, mstr.aes_utf8.s);
+	/* WCS should always be set as well */
+	assertEqualInt(AES_SET_WCS, mstr.aes_set & AES_SET_WCS);
+	assertEqualWString(wcs_string, mstr.aes_wcs.s);
+
+	archive_mstring_clean(&mstr);
+#endif
+}
+
+DEFINE_TEST(test_archive_string_update_utf8_utf8)
+{
+	static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
+	static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
+	struct archive_mstring mstr;
+	int r;
+
+	memset(&mstr, 0, sizeof(mstr));
+
+	if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
+		skipping("UTF-8 not supported on this system.");
+		return;
+	}
+
+	r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
+
+	/* All conversions should have succeeded */
+	assertEqualInt(0, r);
+	assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set);
+	assertEqualString(utf8_string, mstr.aes_utf8.s);
+	assertEqualString(utf8_string, mstr.aes_mbs.s);
+	assertEqualWString(wcs_string, mstr.aes_wcs.s);
+
+	archive_mstring_clean(&mstr);
+}
+
+DEFINE_TEST(test_archive_string_update_utf8_koi8)
+{
+	static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
+	static const char koi8_string[] = "\xD0\xD2\xC9";
+	static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
+	struct archive_mstring mstr;
+	int r;
+
+	memset(&mstr, 0, sizeof(mstr));
+
+	if (setlocale(LC_ALL, "ru_RU.KOI8-R") == NULL) {
+		skipping("KOI8-R locale not available on this system.");
+		return;
+	}
+
+	r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
+
+	/* All conversions should have succeeded */
+	assertEqualInt(0, r);
+	assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set);
+	assertEqualString(utf8_string, mstr.aes_utf8.s);
+	assertEqualString(koi8_string, mstr.aes_mbs.s);
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	assertEqualWString(wcs_string, mstr.aes_wcs.s);
+#else
+	/* No guarantee of how WCS strings behave, however this test test is
+	 * primarily meant for Windows */
+	(void)wcs_string;
+#endif
+
+	archive_mstring_clean(&mstr);
+}