From: Yasuhiro Matsumoto <mattn.jp@gmail.com>
Date: Fri, 12 Jun 2026 10:00:37 +0000 (+0000)
Subject: patch 9.2.0622: str2blob() does not work with wide UTF-16 encoding
X-Git-Tag: v9.2.0622^0
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=26dc90a21079a5f5ae472d98c05770ba2eb7868e;p=thirdparty%2Fvim.git

patch 9.2.0622: str2blob() does not work with wide UTF-16 encoding

Problem:  str2blob() does not convert to wide encodings such as UTF-16
Solution: Use iconv() to convert to UTF-16 and similar wide encodings
          (UCS-2, UTF-32, UCS-4) directly (Yasuhiro Matsumoto)

str2blob() routed every target encoding through convert_string(), which
treats all Unicode encodings as utf-8 and therefore left the bytes
unconverted. As a result str2blob(['Hello'], {'encoding': 'utf-16le'})
returned 0z48656C6C6F instead of 0z480065006C006C006F00, breaking the
round-trip with blob2str(). Add the same wide-encoding handling blob2str()
uses: join the list items with a newline, convert the whole string at once
with the endianness-preserving encoding name, and append the raw bytes.

closes: #20466

Signed-off-by: Yasuhiro Matsumoto <mattn.jp@gmail.com>
Signed-off-by: Christian Brabandt <cb@256bit.org>
---

diff --git a/src/strings.c b/src/strings.c
index ff63a3ef40..71f9d83e21 100644
--- a/src/strings.c
+++ b/src/strings.c
@@ -1584,6 +1584,7 @@ f_str2blob(typval_T *argvars, typval_T *rettv)
 	return;
 
     char_u	*to_encoding = NULL;
+    char_u	*to_encoding_raw = NULL;  // Encoding name with endianness preserved for iconv
     if (argvars[1].v_type != VAR_UNKNOWN)
     {
 	dict_T *d = argvars[1].vval.v_dict;
@@ -1591,50 +1592,144 @@ f_str2blob(typval_T *argvars, typval_T *rettv)
 	{
 	    char_u *enc = dict_get_string(d, "encoding", FALSE);
 	    if (enc != NULL)
-		to_encoding = enc_canonize(enc_skip(enc));
+	    {
+		char_u *enc_skipped = enc_skip(enc);
+		to_encoding = enc_canonize(enc_skipped);
+
+		// For iconv, preserve the endianness suffix by creating a
+		// normalized version with hyphens: "utf16le" -> "utf-16le"
+		to_encoding_raw = normalize_encoding_name(enc_skipped);
+		if (to_encoding_raw == NULL)
+		{
+		    emsg(_(e_out_of_memory));
+		    VIM_CLEAR(to_encoding);
+		    return;
+		}
+	    }
 	}
     }
 
-    FOR_ALL_LIST_ITEMS(list, li)
+    // Special handling for UTF-16/UCS-2/UTF-32/UCS-4 target encodings: join the
+    // list items with a newline and convert the whole string at once, so that
+    // the wide-encoded newline separators and embedded NUL bytes are preserved
+    // (mirrors blob2str()).  convert_string() cannot be used here because it
+    // treats every Unicode encoding as utf-8, leaving the bytes unconverted.
+    int to_prop = 0;
+    if (to_encoding != NULL)
+	to_prop = enc_canon_props(to_encoding);
+    if (to_encoding != NULL && (to_prop & (ENC_2BYTE | ENC_4BYTE | ENC_2WORD)))
     {
-	if (li->li_tv.v_type != VAR_STRING)
-	    continue;
+	garray_T	str_ga;
 
-	string_T    str = {li->li_tv.vval.v_string, 0};
+	ga_init2(&str_ga, 1, 256);
+	FOR_ALL_LIST_ITEMS(list, li)
+	{
+	    char_u *s;
 
-	if (str.string == NULL)
-	    STR_LITERAL_SET(str, "");
-	else
-	    str.length = STRLEN(str.string);
+	    if (li->li_tv.v_type != VAR_STRING)
+		continue;
+
+	    s = li->li_tv.vval.v_string;
 
-	if (to_encoding != NULL)
+	    // Each list string item is separated by a newline in the blob
+	    if (li != list->lv_first)
+		ga_append(&str_ga, NL);
+	    if (s != NULL && *s != NUL)
+	    {
+		int slen = (int)STRLEN(s);
+
+		if (ga_grow(&str_ga, slen) == FAIL)
+		{
+		    ga_clear(&str_ga);
+		    goto done;
+		}
+		mch_memmove((char_u *)str_ga.ga_data + str_ga.ga_len, s,
+								(size_t)slen);
+		str_ga.ga_len += slen;
+	    }
+	}
+
+	if (str_ga.ga_len > 0)
 	{
-	    int		res;
-	    string_T	converted;
+	    vimconv_T	vimconv;
 
-	    res = convert_string(&str, p_enc, to_encoding, &converted);
-	    if (res != OK)
+	    vimconv.vc_type = CONV_NONE;
+	    if (convert_setup_ext(&vimconv, p_enc, FALSE, to_encoding_raw, FALSE)
+								    == FAIL)
 	    {
+		ga_clear(&str_ga);
 		semsg(_(e_str_encoding_to_failed), to_encoding);
 		goto done;
 	    }
-	    str.string = converted.string;
-	    str.length = converted.length;
+	    vimconv.vc_fail = TRUE;
+
+	    int		len = str_ga.ga_len;
+	    char_u	*converted = string_convert_ext(&vimconv,
+				    (char_u *)str_ga.ga_data, &len, NULL);
+	    convert_setup(&vimconv, NULL, NULL);
+	    ga_clear(&str_ga);
+
+	    if (converted == NULL)
+	    {
+		semsg(_(e_str_encoding_to_failed), to_encoding);
+		goto done;
+	    }
+	    if (len > 0 && ga_grow(&blob->bv_ga, len) == OK)
+	    {
+		mch_memmove((char_u *)blob->bv_ga.ga_data + blob->bv_ga.ga_len,
+						    converted, (size_t)len);
+		blob->bv_ga.ga_len += len;
+	    }
+	    vim_free(converted);
 	}
+	else
+	    ga_clear(&str_ga);
+    }
+    else
+    {
+	FOR_ALL_LIST_ITEMS(list, li)
+	{
+	    if (li->li_tv.v_type != VAR_STRING)
+		continue;
 
-	if (li != list->lv_first)
-	    // Each list string item is separated by a newline in the blob
-	    ga_append(&blob->bv_ga, NL);
+	    string_T	str = {li->li_tv.vval.v_string, 0};
 
-	blob_from_string(str.string, blob);
+	    if (str.string == NULL)
+		STR_LITERAL_SET(str, "");
+	    else
+		str.length = STRLEN(str.string);
 
-	if (to_encoding != NULL)
-	    vim_free(str.string);
+	    if (to_encoding != NULL)
+	    {
+		int	    res;
+		string_T    converted;
+
+		res = convert_string(&str, p_enc, to_encoding, &converted);
+		if (res != OK)
+		{
+		    semsg(_(e_str_encoding_to_failed), to_encoding);
+		    goto done;
+		}
+		str.string = converted.string;
+		str.length = converted.length;
+	    }
+
+	    if (li != list->lv_first)
+		// Each list string item is separated by a newline in the blob
+		ga_append(&blob->bv_ga, NL);
+
+	    blob_from_string(str.string, blob);
+
+	    if (to_encoding != NULL)
+		vim_free(str.string);
+	}
     }
 
 done:
     if (to_encoding != NULL)
 	vim_free(to_encoding);
+    if (to_encoding_raw != NULL)
+	vim_free(to_encoding_raw);
 }
 
 /*
diff --git a/src/testdir/test_functions.vim b/src/testdir/test_functions.vim
index 375359527d..8ca73a62ae 100644
--- a/src/testdir/test_functions.vim
+++ b/src/testdir/test_functions.vim
@@ -4513,6 +4513,20 @@ func Test_str2blob()
     call assert_equal(0zABBB0AABBB, str2blob(['«»', '«»'], {'encoding': 'latin1'}))
     call assert_equal(0zC2ABC2BB, str2blob(['«»'], {'encoding': 'utf8'}))
 
+    if has('iconv')
+      call assert_equal(0z480065006C006C006F00, str2blob(['Hello'], {'encoding': 'utf-16le'}))
+      call assert_equal(0z480065006C006C006F00, str2blob(['Hello'], {'encoding': 'utf16le'}))
+      call assert_equal(0z00480065006C006C006F, str2blob(['Hello'], {'encoding': 'utf-16be'}))
+      call assert_equal(0z48006900.0A004200.79006500, str2blob(['Hi', 'Bye'], {'encoding': 'utf-16le'}))
+      call assert_equal(0z61000A006200, str2blob(["a\nb"], {'encoding': 'utf-16le'}))
+      call assert_equal(0z, str2blob([''], {'encoding': 'utf-16le'}))
+      call assert_equal(0z0A00, str2blob(['', ''], {'encoding': 'utf-16le'}))
+      for enc in ['utf-16le', 'utf-16be', 'ucs-2le', 'utf-32le', 'utf-32be']
+        call assert_equal(['Hello', 'World'],
+              \ blob2str(str2blob(['Hello', 'World'], {'encoding': enc}), {'encoding': enc}), enc)
+      endfor
+    endif
+
     call assert_equal(0z62, str2blob(["b"], test_null_dict()))
     call assert_equal(0z63, str2blob(["c"], {'encoding': test_null_string()}))
 
@@ -4581,12 +4595,14 @@ func Test_blob2str()
     call assert_fails("call blob2str(0z6162, {'encoding': []})", 'E730: Using a List as a String')
     call assert_fails("call blob2str(0z6162, {'encoding': 'ab12xy'})", 'E1515: Unable to convert from ''ab12xy'' encoding')
 
-    #" UTF-16LE encoding
-    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf-16le'}))
-    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf16le'}))
-    #" UCS-2LE encoding
-    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs-2le'}))
-    call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs2le'}))
+    if has("iconv")
+      #" UTF-16LE encoding
+      call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf-16le'}))
+      call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf16le'}))
+      #" UCS-2LE encoding
+      call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs-2le'}))
+      call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs2le'}))
+    endif
   END
   call v9.CheckLegacyAndVim9Success(lines)
 endfunc
diff --git a/src/version.c b/src/version.c
index fac15c00f3..57bd82493c 100644
--- a/src/version.c
+++ b/src/version.c
@@ -754,6 +754,8 @@ static char *(features[]) =
 
 static int included_patches[] =
 {   /* Add new patch number below this line */
+/**/
+    622,
 /**/
     621,
 /**/