From: Yasuhiro Matsumoto
Date: Fri, 12 Jun 2026 10:00:37 +0000 (+0000)
Subject: patch 9.2.0622: str2blob() does not work with wide UTF-16 encoding
X-Git-Tag: v9.2.0622^0
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=26dc90a21079a5f5ae472d98c05770ba2eb7868e;p=thirdparty%2Fvim.git

patch 9.2.0622: str2blob() does not work with wide UTF-16 encoding

Problem: str2blob() does not work with wide UTF-16 encoding
Solution: Use iconv() and convert the UTF-16 and similar encodings directly (Yasuhiro Matsumoto)

str2blob() routed every target encoding through convert_string(), which treats all Unicode encodings as utf-8 and therefore left the bytes unconverted. As a result str2blob(['Hello'], {'encoding': 'utf-16le'}) returned 0z48656C6C6F instead of 0z480065006C006C006F00, breaking the round-trip with blob2str().

Add the same wide-encoding handling blob2str() uses: join the list items with a newline, convert the whole string at once with the endianness-preserving encoding name, and append the raw bytes.
closes: #20466 Signed-off-by: Yasuhiro Matsumoto Signed-off-by: Christian Brabandt --- diff --git a/src/strings.c b/src/strings.c index ff63a3ef40..71f9d83e21 100644 --- a/src/strings.c +++ b/src/strings.c @@ -1584,6 +1584,7 @@ f_str2blob(typval_T *argvars, typval_T *rettv) return; char_u *to_encoding = NULL; + char_u *to_encoding_raw = NULL; // Encoding name with endianness preserved for iconv if (argvars[1].v_type != VAR_UNKNOWN) { dict_T *d = argvars[1].vval.v_dict; @@ -1591,50 +1592,144 @@ f_str2blob(typval_T *argvars, typval_T *rettv) { char_u *enc = dict_get_string(d, "encoding", FALSE); if (enc != NULL) - to_encoding = enc_canonize(enc_skip(enc)); + { + char_u *enc_skipped = enc_skip(enc); + to_encoding = enc_canonize(enc_skipped); + + // For iconv, preserve the endianness suffix by creating a + // normalized version with hyphens: "utf16le" -> "utf-16le" + to_encoding_raw = normalize_encoding_name(enc_skipped); + if (to_encoding_raw == NULL) + { + emsg(_(e_out_of_memory)); + VIM_CLEAR(to_encoding); + return; + } + } } } - FOR_ALL_LIST_ITEMS(list, li) + // Special handling for UTF-16/UCS-2/UTF-32/UCS-4 target encodings: join the + // list items with a newline and convert the whole string at once, so that + // the wide-encoded newline separators and embedded NUL bytes are preserved + // (mirrors blob2str()). convert_string() cannot be used here because it + // treats every Unicode encoding as utf-8, leaving the bytes unconverted. 
+ int to_prop = 0; + if (to_encoding != NULL) + to_prop = enc_canon_props(to_encoding); + if (to_encoding != NULL && (to_prop & (ENC_2BYTE | ENC_4BYTE | ENC_2WORD))) { - if (li->li_tv.v_type != VAR_STRING) - continue; + garray_T str_ga; - string_T str = {li->li_tv.vval.v_string, 0}; + ga_init2(&str_ga, 1, 256); + FOR_ALL_LIST_ITEMS(list, li) + { + char_u *s; - if (str.string == NULL) - STR_LITERAL_SET(str, ""); - else - str.length = STRLEN(str.string); + if (li->li_tv.v_type != VAR_STRING) + continue; + + s = li->li_tv.vval.v_string; - if (to_encoding != NULL) + // Each list string item is separated by a newline in the blob + if (li != list->lv_first) + ga_append(&str_ga, NL); + if (s != NULL && *s != NUL) + { + int slen = (int)STRLEN(s); + + if (ga_grow(&str_ga, slen) == FAIL) + { + ga_clear(&str_ga); + goto done; + } + mch_memmove((char_u *)str_ga.ga_data + str_ga.ga_len, s, + (size_t)slen); + str_ga.ga_len += slen; + } + } + + if (str_ga.ga_len > 0) { - int res; - string_T converted; + vimconv_T vimconv; - res = convert_string(&str, p_enc, to_encoding, &converted); - if (res != OK) + vimconv.vc_type = CONV_NONE; + if (convert_setup_ext(&vimconv, p_enc, FALSE, to_encoding_raw, FALSE) + == FAIL) { + ga_clear(&str_ga); semsg(_(e_str_encoding_to_failed), to_encoding); goto done; } - str.string = converted.string; - str.length = converted.length; + vimconv.vc_fail = TRUE; + + int len = str_ga.ga_len; + char_u *converted = string_convert_ext(&vimconv, + (char_u *)str_ga.ga_data, &len, NULL); + convert_setup(&vimconv, NULL, NULL); + ga_clear(&str_ga); + + if (converted == NULL) + { + semsg(_(e_str_encoding_to_failed), to_encoding); + goto done; + } + if (len > 0 && ga_grow(&blob->bv_ga, len) == OK) + { + mch_memmove((char_u *)blob->bv_ga.ga_data + blob->bv_ga.ga_len, + converted, (size_t)len); + blob->bv_ga.ga_len += len; + } + vim_free(converted); } + else + ga_clear(&str_ga); + } + else + { + FOR_ALL_LIST_ITEMS(list, li) + { + if (li->li_tv.v_type != VAR_STRING) + 
continue; - if (li != list->lv_first) - // Each list string item is separated by a newline in the blob - ga_append(&blob->bv_ga, NL); + string_T str = {li->li_tv.vval.v_string, 0}; - blob_from_string(str.string, blob); + if (str.string == NULL) + STR_LITERAL_SET(str, ""); + else + str.length = STRLEN(str.string); - if (to_encoding != NULL) - vim_free(str.string); + if (to_encoding != NULL) + { + int res; + string_T converted; + + res = convert_string(&str, p_enc, to_encoding, &converted); + if (res != OK) + { + semsg(_(e_str_encoding_to_failed), to_encoding); + goto done; + } + str.string = converted.string; + str.length = converted.length; + } + + if (li != list->lv_first) + // Each list string item is separated by a newline in the blob + ga_append(&blob->bv_ga, NL); + + blob_from_string(str.string, blob); + + if (to_encoding != NULL) + vim_free(str.string); + } } done: if (to_encoding != NULL) vim_free(to_encoding); + if (to_encoding_raw != NULL) + vim_free(to_encoding_raw); } /* diff --git a/src/testdir/test_functions.vim b/src/testdir/test_functions.vim index 375359527d..8ca73a62ae 100644 --- a/src/testdir/test_functions.vim +++ b/src/testdir/test_functions.vim @@ -4513,6 +4513,20 @@ func Test_str2blob() call assert_equal(0zABBB0AABBB, str2blob(['«»', '«»'], {'encoding': 'latin1'})) call assert_equal(0zC2ABC2BB, str2blob(['«»'], {'encoding': 'utf8'})) + if has('iconv') + call assert_equal(0z480065006C006C006F00, str2blob(['Hello'], {'encoding': 'utf-16le'})) + call assert_equal(0z480065006C006C006F00, str2blob(['Hello'], {'encoding': 'utf16le'})) + call assert_equal(0z00480065006C006C006F, str2blob(['Hello'], {'encoding': 'utf-16be'})) + call assert_equal(0z48006900.0A004200.79006500, str2blob(['Hi', 'Bye'], {'encoding': 'utf-16le'})) + call assert_equal(0z61000A006200, str2blob(["a\nb"], {'encoding': 'utf-16le'})) + call assert_equal(0z, str2blob([''], {'encoding': 'utf-16le'})) + call assert_equal(0z0A00, str2blob(['', ''], {'encoding': 'utf-16le'})) + for 
enc in ['utf-16le', 'utf-16be', 'ucs-2le', 'utf-32le', 'utf-32be'] + call assert_equal(['Hello', 'World'], + \ blob2str(str2blob(['Hello', 'World'], {'encoding': enc}), {'encoding': enc}), enc) + endfor + endif + call assert_equal(0z62, str2blob(["b"], test_null_dict())) call assert_equal(0z63, str2blob(["c"], {'encoding': test_null_string()})) @@ -4581,12 +4595,14 @@ func Test_blob2str() call assert_fails("call blob2str(0z6162, {'encoding': []})", 'E730: Using a List as a String') call assert_fails("call blob2str(0z6162, {'encoding': 'ab12xy'})", 'E1515: Unable to convert from ''ab12xy'' encoding') - #" UTF-16LE encoding - call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf-16le'})) - call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf16le'})) - #" UCS-2LE encoding - call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs-2le'})) - call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs2le'})) + if has("iconv") + #" UTF-16LE encoding + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf-16le'})) + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf16le'})) + #" UCS-2LE encoding + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs-2le'})) + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs2le'})) + endif END call v9.CheckLegacyAndVim9Success(lines) endfunc diff --git a/src/version.c b/src/version.c index fac15c00f3..57bd82493c 100644 --- a/src/version.c +++ b/src/version.c @@ -754,6 +754,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ +/**/ + 622, /**/ 621, /**/