}
*p = NULL;
+#if defined(_WIN32) && !defined(__CYGWIN__)
+ /*
+ * On Windows, first try converting from WCS because (1) there's no
+ * guarantee that the conversion to MBS will succeed, e.g. when using
+ * CP_ACP, and (2) that's more efficient than converting to MBS, just to
+ * convert back to WCS again before finally converting to UTF-8
+ */
+ if ((aes->aes_set & AES_SET_WCS) != 0) {
+ sc = archive_string_conversion_to_charset(a, "UTF-8", 1);
+ if (sc == NULL)
+ return (-1);/* Couldn't allocate memory for sc. */
+ archive_string_empty(&(aes->aes_utf8));
+ r = archive_string_append_from_wcs_in_codepage(&(aes->aes_utf8),
+ aes->aes_wcs.s, aes->aes_wcs.length, sc);
+ if (a == NULL)
+ free_sconv_object(sc);
+ if (r == 0) {
+ aes->aes_set |= AES_SET_UTF8;
+ *p = aes->aes_utf8.s;
+ return (0);/* success. */
+ } else
+ return (-1);/* failure. */
+ }
+#endif
/* Try converting WCS to MBS first if MBS does not exist yet. */
if ((aes->aes_set & AES_SET_MBS) == 0) {
const char *pm; /* unused */
}
*wp = NULL;
+#if defined(_WIN32) && !defined(__CYGWIN__)
+ /*
+ * On Windows, prefer converting from UTF-8 directly to WCS because:
+ * (1) there's no guarantee that the string can be represented in MBS (e.g.
+ * with CP_ACP), and (2) in order to convert from UTF-8 to MBS, we're going
+ * to need to convert from UTF-8 to WCS anyway and it's wasteful to throw
+ * away that intermediate result
+ */
+ if (aes->aes_set & AES_SET_UTF8) {
+ struct archive_string_conv *sc;
+
+ sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
+ if (sc != NULL) {
+ archive_wstring_empty((&aes->aes_wcs));
+ r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
+ aes->aes_utf8.s, aes->aes_utf8.length, sc);
+ if (a == NULL)
+ free_sconv_object(sc);
+ if (r == 0) {
+ aes->aes_set |= AES_SET_WCS;
+ *wp = aes->aes_wcs.s;
+ return (0);
+ }
+ }
+ }
+#endif
/* Try converting UTF8 to MBS first if MBS does not exist yet. */
if ((aes->aes_set & AES_SET_MBS) == 0) {
const char *p; /* unused */
aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */
- /* Try converting UTF-8 to MBS, return false on failure. */
sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
if (sc == NULL)
return (-1);/* Couldn't allocate memory for sc. */
- r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
#if defined(_WIN32) && !defined(__CYGWIN__)
- /* On failure, make an effort to convert UTF8 to WCS as the active code page
- * may not be able to represent all characters in the string */
- if (r != 0) {
- if (archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
- aes->aes_utf8.s, aes->aes_utf8.length, sc) == 0)
- aes->aes_set = AES_SET_UTF8 | AES_SET_WCS;
- }
-#endif
+ /* On Windows, there's no good way to convert from UTF8 -> MBS directly, so
+ * prefer to first convert to WCS as (1) it's wasteful to throw away the
+ * intermediate result, and (2) WCS will still be set even if we fail to
+ * convert to MBS (e.g. with ACP that can't represent the characters) */
+ r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
+ aes->aes_utf8.s, aes->aes_utf8.length, sc);
+
+ if (a == NULL)
+ free_sconv_object(sc);
+ if (r != 0)
+ return (-1); /* This will guarantee we can't convert to MBS */
+ aes->aes_set = AES_SET_UTF8 | AES_SET_WCS; /* Both UTF8 and WCS set. */
+
+ /* Try converting WCS to MBS, return false on failure. */
+ if (archive_string_append_from_wcs(&(aes->aes_mbs), aes->aes_wcs.s,
+ aes->aes_wcs.length))
+ return (-1);
+#else
+ /* Try converting UTF-8 to MBS, return false on failure. */
+ r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
if (a == NULL)
free_sconv_object(sc);
if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s,
aes->aes_mbs.length))
return (-1);
- aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
+#endif
/* All conversions succeeded. */
+ aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
+
return (0);
}
test_archive_string_canonicalization();
test_archive_string_set_get();
}
+
+/*
+ * Windows only: store three Cyrillic letters (U+043F U+0440 U+0438) as a
+ * wide-character string and check that they can be read back as UTF-8.
+ * Every other platform skips the test.
+ */
+DEFINE_TEST(test_archive_string_conversion_utf16_utf8)
+{
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	const char *utf8_out;
+	struct archive_mstring ms;
+
+	/* Start from an all-zero mstring: nothing stored, no flags set. */
+	memset(&ms, 0, sizeof(ms));
+
+	/* Seed the mstring with the wide-character form only. */
+	assertEqualInt(ARCHIVE_OK,
+	    archive_mstring_copy_wcs(&ms, L"\U0000043f\U00000440\U00000438"));
+
+	/* Going from WCS to UTF-8 is expected to succeed unconditionally. */
+	assertEqualInt(ARCHIVE_OK,
+	    archive_mstring_get_utf8(NULL, &ms, &utf8_out));
+	assertEqualString("\xD0\xBF\xD1\x80\xD0\xB8", utf8_out);
+
+	archive_mstring_clean(&ms);
+#else
+	skipping("This test is meant to verify unicode string handling on Windows");
+#endif
+}
+
+/*
+ * Windows only: store three Cyrillic letters as UTF-8 and check that they
+ * can be read back as a wide-character string (U+043F U+0440 U+0438).
+ * Every other platform skips the test.
+ */
+DEFINE_TEST(test_archive_string_conversion_utf8_utf16)
+{
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	const wchar_t *wcs_out;
+	struct archive_mstring ms;
+
+	/* Start from an all-zero mstring: nothing stored, no flags set. */
+	memset(&ms, 0, sizeof(ms));
+
+	/*
+	 * Seed the mstring with the UTF-8 form only.  The call is expected
+	 * to return 6, which equals the length of the input in bytes.
+	 */
+	assertEqualInt(6,
+	    archive_mstring_copy_utf8(&ms, "\xD0\xBF\xD1\x80\xD0\xB8"));
+
+	/* Going from UTF-8 to WCS is expected to succeed unconditionally. */
+	assertEqualInt(ARCHIVE_OK,
+	    archive_mstring_get_wcs(NULL, &ms, &wcs_out));
+	assertEqualWString(L"\U0000043f\U00000440\U00000438", wcs_out);
+
+	archive_mstring_clean(&ms);
+#else
+	skipping("This test is meant to verify unicode string handling on Windows");
+#endif
+}
+
+/*
+ * Windows only: archive_mstring_update_utf8() is expected to report failure
+ * for Cyrillic input because the MBS form cannot be produced, while still
+ * leaving both the UTF-8 and the WCS forms populated.
+ *
+ * NOTE(review): the test never sets the locale itself; it presumably relies
+ * on the process already being in the "C" locale on entry -- confirm that
+ * the test harness guarantees this.
+ */
+DEFINE_TEST(test_archive_string_update_utf8_win)
+{
+#if defined(_WIN32) && !defined(__CYGWIN__)
+	static const wchar_t expected_wcs[] = L"\U0000043f\U00000440\U00000438";
+	static const char input_utf8[] = "\xD0\xBF\xD1\x80\xD0\xB8";
+	struct archive_mstring ms;
+	int rc;
+
+	/* Start from an all-zero mstring: nothing stored, no flags set. */
+	memset(&ms, 0, sizeof(ms));
+
+	rc = archive_mstring_update_utf8(NULL, &ms, input_utf8);
+
+	/* The update as a whole must fail and MBS must not be flagged. */
+	assertEqualInt(-1, rc);
+	assertEqualInt(0, ms.aes_set & AES_SET_MBS);
+
+	/*
+	 * The struct internals are inspected on purpose, to confirm what
+	 * archive_mstring_update_utf8() itself stored.  Both the UTF-8 and
+	 * the WCS forms must be present despite the failed return value.
+	 */
+	assertEqualInt(AES_SET_UTF8, ms.aes_set & AES_SET_UTF8);
+	assertEqualString(input_utf8, ms.aes_utf8.s);
+	assertEqualInt(AES_SET_WCS, ms.aes_set & AES_SET_WCS);
+	assertEqualWString(expected_wcs, ms.aes_wcs.s);
+
+	archive_mstring_clean(&ms);
+#else
+	skipping("This test is meant to verify unicode string handling on Windows"
+	    " with the C locale");
+#endif
+}
+
+/*
+ * Under the en_US.UTF-8 locale, archive_mstring_update_utf8() is expected
+ * to fill in all three forms (UTF-8, MBS and WCS), with the MBS bytes equal
+ * to the UTF-8 input.  The test is skipped when that locale cannot be
+ * selected.
+ */
+DEFINE_TEST(test_archive_string_update_utf8_utf8)
+{
+	static const wchar_t expected_wcs[] = L"\U0000043f\U00000440\U00000438";
+	static const char input_utf8[] = "\xD0\xBF\xD1\x80\xD0\xB8";
+	struct archive_mstring ms;
+	int rc;
+
+	/* Nothing has been allocated yet, so bailing out here is safe. */
+	if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
+		skipping("UTF-8 not supported on this system.");
+		return;
+	}
+
+	/* Start from an all-zero mstring: nothing stored, no flags set. */
+	memset(&ms, 0, sizeof(ms));
+
+	rc = archive_mstring_update_utf8(NULL, &ms, input_utf8);
+
+	/* Success means every form is flagged and holds the expected text. */
+	assertEqualInt(0, rc);
+	assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, ms.aes_set);
+	assertEqualString(input_utf8, ms.aes_utf8.s);
+	assertEqualString(input_utf8, ms.aes_mbs.s);
+	assertEqualWString(expected_wcs, ms.aes_wcs.s);
+
+	archive_mstring_clean(&ms);
+}
+
+/*
+ * Under the ru_RU.KOI8-R locale, archive_mstring_update_utf8() is expected
+ * to succeed and to store the KOI8-R encoding of the input as the MBS form.
+ * The WCS contents are only verified on native Windows.  The test is
+ * skipped when the locale cannot be selected.
+ */
+DEFINE_TEST(test_archive_string_update_utf8_koi8)
+{
+	static const wchar_t expected_wcs[] = L"\U0000043f\U00000440\U00000438";
+	static const char expected_koi8[] = "\xD0\xD2\xC9";
+	static const char input_utf8[] = "\xD0\xBF\xD1\x80\xD0\xB8";
+	struct archive_mstring ms;
+	int rc;
+
+	/* Nothing has been allocated yet, so bailing out here is safe. */
+	if (setlocale(LC_ALL, "ru_RU.KOI8-R") == NULL) {
+		skipping("KOI8-R locale not available on this system.");
+		return;
+	}
+
+	/* Start from an all-zero mstring: nothing stored, no flags set. */
+	memset(&ms, 0, sizeof(ms));
+
+	rc = archive_mstring_update_utf8(NULL, &ms, input_utf8);
+
+	/* Success means every form is flagged as set. */
+	assertEqualInt(0, rc);
+	assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, ms.aes_set);
+	assertEqualString(input_utf8, ms.aes_utf8.s);
+	assertEqualString(expected_koi8, ms.aes_mbs.s);
+#if !defined(_WIN32) || defined(__CYGWIN__)
+	/*
+	 * Off Windows there is no guarantee about the wide-character
+	 * contents, and this test is primarily meant for Windows anyway;
+	 * just keep the expected value referenced so it is not unused.
+	 */
+	(void)expected_wcs;
+#else
+	assertEqualWString(expected_wcs, ms.aes_wcs.s);
+#endif
+
+	archive_mstring_clean(&ms);
+}