From: Yichao Yu Date: Sun, 31 May 2020 18:30:10 +0000 (-0400) Subject: Complete support for UTF8 encoding conversion X-Git-Tag: v3.5.0~32^2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c30f279475e2afd39f380622d2b53b157eb746d8;p=thirdparty%2Flibarchive.git Complete support for UTF8 encoding conversion All the conversions already always go through the MBS form so simply always convert to MBS before trying to convert to any other encoding in the `archive_mstring_get_*` functions. * Do this by calling `archive_mstring_get_mbs` to avoid duplicating code * Add `struct archive*` parameter to `archive_mstring_get_mbs_l` * Add test for set/get/update utf8 functions on entry * Add test for encoding conversion. Fix #931 --- diff --git a/libarchive/archive_acl.c b/libarchive/archive_acl.c index 952e20df4..eb307aea2 100644 --- a/libarchive/archive_acl.c +++ b/libarchive/archive_acl.c @@ -595,7 +595,7 @@ archive_acl_text_len(struct archive_acl *acl, int want_type, int flags, else length += sizeof(uid_t) * 3 + 1; } else { - r = archive_mstring_get_mbs_l(&ap->name, &name, + r = archive_mstring_get_mbs_l(a, &ap->name, &name, &len, sc); if (r != 0) return (0); @@ -968,7 +968,7 @@ archive_acl_to_text_l(struct archive_acl *acl, ssize_t *text_len, int flags, else prefix = NULL; r = archive_mstring_get_mbs_l( - &ap->name, &name, &len, sc); + NULL, &ap->name, &name, &len, sc); if (r != 0) { free(s); return (NULL); diff --git a/libarchive/archive_entry.c b/libarchive/archive_entry.c index 124600c98..5be85710d 100644 --- a/libarchive/archive_entry.c +++ b/libarchive/archive_entry.c @@ -450,7 +450,7 @@ int _archive_entry_gname_l(struct archive_entry *entry, const char **p, size_t *len, struct archive_string_conv *sc) { - return (archive_mstring_get_mbs_l(&entry->ae_gname, p, len, sc)); + return (archive_mstring_get_mbs_l(entry->archive, &entry->ae_gname, p, len, sc)); } const char * @@ -504,7 +504,7 @@ _archive_entry_hardlink_l(struct archive_entry *entry, *len = 0; return (0); } 
- return (archive_mstring_get_mbs_l(&entry->ae_hardlink, p, len, sc)); + return (archive_mstring_get_mbs_l(entry->archive, &entry->ae_hardlink, p, len, sc)); } la_int64_t @@ -595,7 +595,7 @@ int _archive_entry_pathname_l(struct archive_entry *entry, const char **p, size_t *len, struct archive_string_conv *sc) { - return (archive_mstring_get_mbs_l(&entry->ae_pathname, p, len, sc)); + return (archive_mstring_get_mbs_l(entry->archive, &entry->ae_pathname, p, len, sc)); } __LA_MODE_T @@ -723,7 +723,7 @@ _archive_entry_symlink_l(struct archive_entry *entry, *len = 0; return (0); } - return (archive_mstring_get_mbs_l( &entry->ae_symlink, p, len, sc)); + return (archive_mstring_get_mbs_l(entry->archive, &entry->ae_symlink, p, len, sc)); } la_int64_t @@ -769,7 +769,7 @@ int _archive_entry_uname_l(struct archive_entry *entry, const char **p, size_t *len, struct archive_string_conv *sc) { - return (archive_mstring_get_mbs_l(&entry->ae_uname, p, len, sc)); + return (archive_mstring_get_mbs_l(entry->archive, &entry->ae_uname, p, len, sc)); } int diff --git a/libarchive/archive_string.c b/libarchive/archive_string.c index c77dcf52c..7460ded00 100644 --- a/libarchive/archive_string.c +++ b/libarchive/archive_string.c @@ -3881,6 +3881,11 @@ archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes, } *p = NULL; + /* Try converting WCS to MBS first if MBS does not exist yet. */ + if ((aes->aes_set & AES_SET_MBS) == 0) { + const char *pm; /* unused */ + archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */ + } if (aes->aes_set & AES_SET_MBS) { sc = archive_string_conversion_to_charset(a, "UTF-8", 1); if (sc == NULL) @@ -3903,9 +3908,9 @@ int archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes, const char **p) { + struct archive_string_conv *sc; int r, ret = 0; - (void)a; /* UNUSED */ /* If we already have an MBS form, return that immediately. 
*/ if (aes->aes_set & AES_SET_MBS) { *p = aes->aes_mbs.s; @@ -3926,10 +3931,23 @@ archive_mstring_get_mbs(struct archive *a, struct archive_mstring *aes, ret = -1; } - /* - * Only a UTF-8 form cannot avail because its conversion already - * failed at archive_mstring_update_utf8(). - */ + /* If there's a UTF-8 form, try converting with the native locale. */ + if (aes->aes_set & AES_SET_UTF8) { + archive_string_empty(&(aes->aes_mbs)); + sc = archive_string_conversion_from_charset(a, "UTF-8", 1); + if (sc == NULL) + return (-1);/* Couldn't allocate memory for sc. */ + r = archive_strncpy_l(&(aes->aes_mbs), + aes->aes_utf8.s, aes->aes_utf8.length, sc); + if (a == NULL) + free_sconv_object(sc); + *p = aes->aes_mbs.s; + if (r == 0) { + aes->aes_set |= AES_SET_MBS; + ret = 0;/* success; overwrite previous error. */ + } else + ret = -1;/* failure. */ + } return (ret); } @@ -3947,6 +3965,11 @@ archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes, } *wp = NULL; + /* Try converting UTF8 to MBS first if MBS does not exist yet. */ + if ((aes->aes_set & AES_SET_MBS) == 0) { + const char *p; /* unused */ + archive_mstring_get_mbs(a, aes, &p); /* ignore errors, we'll handle it later */ + } /* Try converting MBS to WCS using native locale. 
*/ if (aes->aes_set & AES_SET_MBS) { archive_wstring_empty(&(aes->aes_wcs)); @@ -3962,11 +3985,12 @@ archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes, } int -archive_mstring_get_mbs_l(struct archive_mstring *aes, +archive_mstring_get_mbs_l(struct archive *a, struct archive_mstring *aes, const char **p, size_t *length, struct archive_string_conv *sc) { int r, ret = 0; + (void)r; /* UNUSED */ #if defined(_WIN32) && !defined(__CYGWIN__) /* * Internationalization programming on Windows must use Wide @@ -3989,20 +4013,12 @@ archive_mstring_get_mbs_l(struct archive_mstring *aes, } #endif - /* If there is not an MBS form but is a WCS form, try converting + /* If there is not an MBS form but there is a WCS or UTF8 form, try converting * with the native locale to be used for translating it to specified * character-set. */ - if ((aes->aes_set & AES_SET_MBS) == 0 && - (aes->aes_set & AES_SET_WCS) != 0) { - archive_string_empty(&(aes->aes_mbs)); - r = archive_string_append_from_wcs(&(aes->aes_mbs), - aes->aes_wcs.s, aes->aes_wcs.length); - if (r == 0) - aes->aes_set |= AES_SET_MBS; - else if (errno == ENOMEM) - return (-1); - else - ret = -1; + if ((aes->aes_set & AES_SET_MBS) == 0) { + const char *pm; /* unused */ + archive_mstring_get_mbs(a, aes, &pm); /* ignore errors, we'll handle it later */ } /* If we already have an MBS form, use it to be translated to * specified character-set. 
*/ diff --git a/libarchive/archive_string.h b/libarchive/archive_string.h index 27e1ad69c..49d7d3064 100644 --- a/libarchive/archive_string.h +++ b/libarchive/archive_string.h @@ -226,7 +226,7 @@ void archive_mstring_copy(struct archive_mstring *dest, struct archive_mstring * int archive_mstring_get_mbs(struct archive *, struct archive_mstring *, const char **); int archive_mstring_get_utf8(struct archive *, struct archive_mstring *, const char **); int archive_mstring_get_wcs(struct archive *, struct archive_mstring *, const wchar_t **); -int archive_mstring_get_mbs_l(struct archive_mstring *, const char **, +int archive_mstring_get_mbs_l(struct archive *, struct archive_mstring *, const char **, size_t *, struct archive_string_conv *); int archive_mstring_copy_mbs(struct archive_mstring *, const char *mbs); int archive_mstring_copy_mbs_len(struct archive_mstring *, const char *mbs, diff --git a/libarchive/test/test_archive_string_conversion.c b/libarchive/test/test_archive_string_conversion.c index e86f97c8a..fb5359b6f 100644 --- a/libarchive/test/test_archive_string_conversion.c +++ b/libarchive/test/test_archive_string_conversion.c @@ -445,7 +445,7 @@ test_archive_string_normalization_nfc(const char *testdata) assertEqualInt(0, archive_mstring_copy_wcs(&mstr, wc_nfc)); assertEqualInt(0, archive_mstring_get_mbs_l( - &mstr, &mp, &mplen, t_sconv8)); + a, &mstr, &mp, &mplen, t_sconv8)); failure("WCS NFC(%s) should be UTF-8 NFC:%d" ,nfc, line); assertEqualUTF8String(utf8_nfc, mp); @@ -695,7 +695,7 @@ test_archive_string_normalization_mac_nfd(const char *testdata) assertEqualInt(0, archive_mstring_copy_wcs( &mstr, wc_nfd)); assertEqualInt(0, archive_mstring_get_mbs_l( - &mstr, &mp, &mplen, t_sconv8)); + a, &mstr, &mp, &mplen, t_sconv8)); failure("WCS NFD(%s) should be UTF-8 NFD:%d" ,nfd, line); assertEqualUTF8String(utf8_nfd, mp); @@ -777,6 +777,80 @@ test_archive_string_canonicalization(void) } +static void +check_string(struct archive *a, struct archive_mstring 
*mstr, struct archive_string_conv *sc, + const char *exp, const wchar_t *wexp) +{ + /* Do all the tests on a copy so that we can have a clear initial state every time */ + struct archive_mstring mstr2; + const char *p = NULL; + const wchar_t *wp = NULL; + size_t len = 0; + + memset(&mstr2, 0, sizeof(mstr2)); + + archive_mstring_copy(&mstr2, mstr); + assertEqualInt(0, archive_mstring_get_mbs(a, &mstr2, &p)); + assertEqualString(exp, p); + p = NULL; + + archive_mstring_copy(&mstr2, mstr); + assertEqualInt(0, archive_mstring_get_utf8(a, &mstr2, &p)); + assertEqualString(exp, p); + p = NULL; + + archive_mstring_copy(&mstr2, mstr); + assertEqualInt(0, archive_mstring_get_wcs(a, &mstr2, &wp)); + assertEqualWString(wexp, wp); + wp = NULL; + + archive_mstring_copy(&mstr2, mstr); + assertEqualInt(0, archive_mstring_get_mbs_l(a, &mstr2, &p, &len, sc)); + assertEqualString(exp, p); + assertEqualInt(len, strlen(exp)); + p = NULL; + len = 0; + + archive_mstring_clean(&mstr2); +} + +/* + * Make sure no matter what the input encoding is, the string can be + * converted to all the output encodings. 
+ */ +static void +test_archive_string_set_get(void) +{ + struct archive *a; + struct archive_mstring mstr; + struct archive_string_conv *sc; + + setlocale(LC_ALL, "en_US.UTF-8"); + + assert((a = archive_read_new()) != NULL); + memset(&mstr, 0, sizeof(mstr)); + + assertA(NULL != (sc = + archive_string_conversion_to_charset(a, "UTF-8", 1))); + failure("Charset name should be UTF-8"); + assertEqualString("UTF-8", + archive_string_conversion_charset_name(sc)); + + assertEqualInt(0, archive_mstring_copy_mbs(&mstr, "AAA")); + check_string(a, &mstr, sc, "AAA", L"AAA"); + assertEqualInt(4, archive_mstring_copy_utf8(&mstr, "BBBB")); + check_string(a, &mstr, sc, "BBBB", L"BBBB"); + assertEqualInt(0, archive_mstring_copy_wcs(&mstr, L"CCC12")); + check_string(a, &mstr, sc, "CCC12", L"CCC12"); + assertEqualInt(0, archive_mstring_copy_mbs_len_l(&mstr, "DDDD-l", 6, sc)); + check_string(a, &mstr, sc, "DDDD-l", L"DDDD-l"); + assertEqualInt(0, archive_mstring_update_utf8(a, &mstr, "EEEEE---H")); + check_string(a, &mstr, sc, "EEEEE---H", L"EEEEE---H"); + + assertEqualInt(ARCHIVE_OK, archive_read_free(a)); + +} + DEFINE_TEST(test_archive_string_conversion) { static const char reffile[] = "test_archive_string_conversion.txt.Z"; @@ -807,4 +881,5 @@ DEFINE_TEST(test_archive_string_conversion) test_archive_string_normalization_nfc(testdata); test_archive_string_normalization_mac_nfd(testdata); test_archive_string_canonicalization(); + test_archive_string_set_get(); } diff --git a/libarchive/test/test_entry.c b/libarchive/test/test_entry.c index 0cf13e825..f20576490 100644 --- a/libarchive/test/test_entry.c +++ b/libarchive/test/test_entry.c @@ -177,31 +177,60 @@ DEFINE_TEST(test_entry) /* gname */ archive_entry_set_gname(e, "group"); assertEqualString(archive_entry_gname(e), "group"); + assertEqualString(archive_entry_gname_utf8(e), "group"); + assertEqualWString(archive_entry_gname_w(e), L"group"); wcscpy(wbuff, L"wgroup"); archive_entry_copy_gname_w(e, wbuff); 
assertEqualWString(archive_entry_gname_w(e), L"wgroup"); memset(wbuff, 0, sizeof(wbuff)); assertEqualWString(archive_entry_gname_w(e), L"wgroup"); + assertEqualString(archive_entry_gname_utf8(e), "wgroup"); + assertEqualString(archive_entry_gname(e), "wgroup"); + archive_entry_set_gname_utf8(e, "group"); + assertEqualString(archive_entry_gname_utf8(e), "group"); + assertEqualWString(archive_entry_gname_w(e), L"group"); + assertEqualString(archive_entry_gname(e), "group"); + archive_entry_update_gname_utf8(e, "group2"); + assertEqualString(archive_entry_gname_utf8(e), "group2"); + assertEqualWString(archive_entry_gname_w(e), L"group2"); + assertEqualString(archive_entry_gname(e), "group2"); /* hardlink */ archive_entry_set_hardlink(e, "hardlinkname"); assertEqualString(archive_entry_hardlink(e), "hardlinkname"); + assertEqualString(archive_entry_hardlink_utf8(e), "hardlinkname"); + assertEqualWString(archive_entry_hardlink_w(e), L"hardlinkname"); strcpy(buff, "hardlinkname2"); archive_entry_copy_hardlink(e, buff); assertEqualString(archive_entry_hardlink(e), "hardlinkname2"); + assertEqualWString(archive_entry_hardlink_w(e), L"hardlinkname2"); + assertEqualString(archive_entry_hardlink_utf8(e), "hardlinkname2"); memset(buff, 0, sizeof(buff)); assertEqualString(archive_entry_hardlink(e), "hardlinkname2"); + assertEqualString(archive_entry_hardlink_utf8(e), "hardlinkname2"); + assertEqualWString(archive_entry_hardlink_w(e), L"hardlinkname2"); archive_entry_copy_hardlink(e, NULL); assertEqualString(archive_entry_hardlink(e), NULL); assertEqualWString(archive_entry_hardlink_w(e), NULL); + assertEqualString(archive_entry_hardlink_utf8(e), NULL); wcscpy(wbuff, L"whardlink"); archive_entry_copy_hardlink_w(e, wbuff); assertEqualWString(archive_entry_hardlink_w(e), L"whardlink"); + assertEqualString(archive_entry_hardlink_utf8(e), "whardlink"); + assertEqualString(archive_entry_hardlink(e), "whardlink"); memset(wbuff, 0, sizeof(wbuff)); 
assertEqualWString(archive_entry_hardlink_w(e), L"whardlink"); archive_entry_copy_hardlink_w(e, NULL); assertEqualString(archive_entry_hardlink(e), NULL); assertEqualWString(archive_entry_hardlink_w(e), NULL); + archive_entry_set_hardlink_utf8(e, "hardlinkname"); + assertEqualString(archive_entry_hardlink_utf8(e), "hardlinkname"); + assertEqualWString(archive_entry_hardlink_w(e), L"hardlinkname"); + assertEqualString(archive_entry_hardlink(e), "hardlinkname"); + archive_entry_update_hardlink_utf8(e, "hardlinkname2"); + assertEqualString(archive_entry_hardlink_utf8(e), "hardlinkname2"); + assertEqualWString(archive_entry_hardlink_w(e), L"hardlinkname2"); + assertEqualString(archive_entry_hardlink(e), "hardlinkname2"); /* ino */ assert(!archive_entry_ino_is_set(e)); @@ -270,18 +299,38 @@ DEFINE_TEST(test_entry) /* pathname */ archive_entry_set_pathname(e, "path"); assertEqualString(archive_entry_pathname(e), "path"); + assertEqualString(archive_entry_pathname_utf8(e), "path"); + assertEqualWString(archive_entry_pathname_w(e), L"path"); archive_entry_set_pathname(e, "path"); assertEqualString(archive_entry_pathname(e), "path"); + assertEqualWString(archive_entry_pathname_w(e), L"path"); + assertEqualString(archive_entry_pathname_utf8(e), "path"); strcpy(buff, "path2"); archive_entry_copy_pathname(e, buff); assertEqualString(archive_entry_pathname(e), "path2"); + assertEqualWString(archive_entry_pathname_w(e), L"path2"); + assertEqualString(archive_entry_pathname_utf8(e), "path2"); memset(buff, 0, sizeof(buff)); assertEqualString(archive_entry_pathname(e), "path2"); + assertEqualString(archive_entry_pathname_utf8(e), "path2"); + assertEqualWString(archive_entry_pathname_w(e), L"path2"); wcscpy(wbuff, L"wpath"); archive_entry_copy_pathname_w(e, wbuff); assertEqualWString(archive_entry_pathname_w(e), L"wpath"); + assertEqualString(archive_entry_pathname_utf8(e), "wpath"); + assertEqualString(archive_entry_pathname(e), "wpath"); memset(wbuff, 0, sizeof(wbuff)); 
assertEqualWString(archive_entry_pathname_w(e), L"wpath"); + assertEqualString(archive_entry_pathname(e), "wpath"); + assertEqualString(archive_entry_pathname_utf8(e), "wpath"); + archive_entry_set_pathname_utf8(e, "path"); + assertEqualWString(archive_entry_pathname_w(e), L"path"); + assertEqualString(archive_entry_pathname(e), "path"); + assertEqualString(archive_entry_pathname_utf8(e), "path"); + archive_entry_update_pathname_utf8(e, "path2"); + assertEqualWString(archive_entry_pathname_w(e), L"path2"); + assertEqualString(archive_entry_pathname(e), "path2"); + assertEqualString(archive_entry_pathname_utf8(e), "path2"); /* rdev */ archive_entry_set_rdev(e, 532); @@ -302,19 +351,37 @@ DEFINE_TEST(test_entry) /* symlink */ archive_entry_set_symlink(e, "symlinkname"); assertEqualString(archive_entry_symlink(e), "symlinkname"); + assertEqualString(archive_entry_symlink_utf8(e), "symlinkname"); + assertEqualWString(archive_entry_symlink_w(e), L"symlinkname"); strcpy(buff, "symlinkname2"); archive_entry_copy_symlink(e, buff); assertEqualString(archive_entry_symlink(e), "symlinkname2"); + assertEqualWString(archive_entry_symlink_w(e), L"symlinkname2"); + assertEqualString(archive_entry_symlink_utf8(e), "symlinkname2"); memset(buff, 0, sizeof(buff)); assertEqualString(archive_entry_symlink(e), "symlinkname2"); + assertEqualString(archive_entry_symlink_utf8(e), "symlinkname2"); + assertEqualWString(archive_entry_symlink_w(e), L"symlinkname2"); archive_entry_copy_symlink_w(e, NULL); assertEqualWString(archive_entry_symlink_w(e), NULL); assertEqualString(archive_entry_symlink(e), NULL); + assertEqualString(archive_entry_symlink_utf8(e), NULL); archive_entry_copy_symlink_w(e, L"wsymlink"); assertEqualWString(archive_entry_symlink_w(e), L"wsymlink"); + assertEqualString(archive_entry_symlink_utf8(e), "wsymlink"); + assertEqualString(archive_entry_symlink(e), "wsymlink"); archive_entry_copy_symlink(e, NULL); assertEqualWString(archive_entry_symlink_w(e), NULL); 
assertEqualString(archive_entry_symlink(e), NULL); + assertEqualString(archive_entry_symlink_utf8(e), NULL); + archive_entry_set_symlink_utf8(e, "symlinkname"); + assertEqualWString(archive_entry_symlink_w(e), L"symlinkname"); + assertEqualString(archive_entry_symlink(e), "symlinkname"); + assertEqualString(archive_entry_symlink_utf8(e), "symlinkname"); + archive_entry_update_symlink_utf8(e, "symlinkname2"); + assertEqualWString(archive_entry_symlink_w(e), L"symlinkname2"); + assertEqualString(archive_entry_symlink(e), "symlinkname2"); + assertEqualString(archive_entry_symlink_utf8(e), "symlinkname2"); /* uid */ archive_entry_set_uid(e, 83); @@ -323,11 +390,27 @@ DEFINE_TEST(test_entry) /* uname */ archive_entry_set_uname(e, "user"); assertEqualString(archive_entry_uname(e), "user"); + assertEqualString(archive_entry_uname_utf8(e), "user"); + assertEqualWString(archive_entry_uname_w(e), L"user"); wcscpy(wbuff, L"wuser"); - archive_entry_copy_gname_w(e, wbuff); - assertEqualWString(archive_entry_gname_w(e), L"wuser"); + archive_entry_copy_uname_w(e, wbuff); + assertEqualWString(archive_entry_uname_w(e), L"wuser"); memset(wbuff, 0, sizeof(wbuff)); - assertEqualWString(archive_entry_gname_w(e), L"wuser"); + assertEqualWString(archive_entry_uname_w(e), L"wuser"); + assertEqualString(archive_entry_uname_utf8(e), "wuser"); + assertEqualString(archive_entry_uname(e), "wuser"); + archive_entry_set_uname_utf8(e, "user"); + assertEqualString(archive_entry_uname_utf8(e), "user"); + assertEqualWString(archive_entry_uname_w(e), L"user"); + assertEqualString(archive_entry_uname(e), "user"); + archive_entry_set_uname_utf8(e, "user"); + assertEqualWString(archive_entry_uname_w(e), L"user"); + assertEqualString(archive_entry_uname(e), "user"); + assertEqualString(archive_entry_uname_utf8(e), "user"); + archive_entry_update_uname_utf8(e, "user2"); + assertEqualWString(archive_entry_uname_w(e), L"user2"); + assertEqualString(archive_entry_uname(e), "user2"); + 
assertEqualString(archive_entry_uname_utf8(e), "user2"); /* Test fflags interface. */ archive_entry_set_fflags(e, 0x55, 0xAA);