From: Yasuhiro Matsumoto Date: Sat, 31 Jan 2026 15:53:26 +0000 (+0000) Subject: patch 9.1.2124: blob2str() does not handle UTF-16 encoding X-Git-Tag: v9.1.2124^0 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2b184d4b97c370179a33aeb832cd77e737bbc480;p=thirdparty%2Fvim.git patch 9.1.2124: blob2str() does not handle UTF-16 encoding Problem: blob2str() does not handle UTF-16 encoding (Hirohito Higashi) Solution: Refactor the code and fix remaining issues, see below (Yasuhiro Matsumoto). The blob2str() function did not properly handle UTF-16/UCS-2/UTF-32/UCS-4 encodings with endianness suffixes (e.g., utf-16le, utf-16be, ucs-2le). The encoding name was canonicalized too aggressively, losing the endianness information needed by iconv. This change includes a few fixes: - Preserve the raw encoding name with endianness suffix for iconv calls - Normalize encoding names properly: "ucs2be" → "ucs-2be", "utf16le" → "utf-16le" - For multi-byte encodings (UTF-16/32, UCS-2/4), convert the entire blob first, then split by newlines convert_string() cannot handle UTF-16 because it uses string_convert() which expects NUL-terminated strings. UTF-16 contains 0x00 bytes within characters (e.g., "H" = 0x48 0x00), causing premature termination. Therefore, for UTF-16/32 encodings, the fix uses string_convert_ext() with an explicit input length to convert the entire blob at once. The code appends a NUL terminator as wide as one code unit by calling ga_append(&blob_ga, NUL) repeatedly (two bytes for UTF-16/UCS-2, four bytes for UTF-32/UCS-4), because these encodings require a multi-byte NUL terminator (e.g., 0x00 0x00 for UTF-16), not a single-byte NUL. - src/strings.c: Add from_encoding_raw to preserve endianness, special handling for UTF-16/32 and UCS-2/4 - src/mbyte.c: Fix convert_setup_ext() to use == ENC_UNICODE instead of & ENC_UNICODE. The bitwise AND was incorrectly treating UTF-16/UCS-2 (which have ENC_UNICODE + ENC_2BYTE etc.) as UTF-8, causing iconv setup to be skipped. 
fixes: #19198 closes: #19246 Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Yasuhiro Matsumoto Signed-off-by: Christian Brabandt --- diff --git a/runtime/doc/builtin.txt b/runtime/doc/builtin.txt index c940ee50c8..18712b4037 100644 --- a/runtime/doc/builtin.txt +++ b/runtime/doc/builtin.txt @@ -1,4 +1,4 @@ -*builtin.txt* For Vim version 9.1. Last change: 2026 Jan 17 +*builtin.txt* For Vim version 9.1. Last change: 2026 Jan 31 VIM REFERENCE MANUAL by Bram Moolenaar @@ -1389,6 +1389,9 @@ blob2str({blob} [, {options}]) *blob2str()* Can also be used as a |method|: > GetBlob()->blob2str() < + If `iconv` is not available and the encoding cannot be converted + using built-in conversion rules, an error will be reported. + Return type: list diff --git a/src/strings.c b/src/strings.c index 4d878cb7db..f4b335676f 100644 --- a/src/strings.c +++ b/src/strings.c @@ -1275,10 +1275,123 @@ string_from_blob(blob_T *blob, long *start_idx) return ret_str; } +/* + * Normalize encoding name for iconv by adding hyphens. + * For example: "ucs2be" -> "ucs-2be", "utf16le" -> "utf-16le" + * Returns allocated string or NULL on allocation failure. 
+ */ + static char_u * +normalize_encoding_name(char_u *enc_skipped) +{ + char_u *from_encoding_raw = alloc(STRLEN(enc_skipped) + 3); + if (from_encoding_raw == NULL) + return NULL; + + char_u *s = enc_skipped; + char_u *pe = from_encoding_raw; + + // Convert to lowercase and replace '_' with '-' + while (*s != NUL) + { + if (*s == '_') + *pe++ = '-'; + else + *pe++ = TOLOWER_ASC(*s); + ++s; + } + *pe = NUL; + + // Add hyphen before digit: "ucs2be" -> "ucs-2be", "utf16le" -> "utf-16le" + char_u *p = from_encoding_raw; + if ((STRNCMP(p, "ucs", 3) == 0 && VIM_ISDIGIT(p[3]) && p[3] != NUL && p[4] != '-') || + (STRNCMP(p, "utf", 3) == 0 && VIM_ISDIGIT(p[3]) && p[3] != NUL && p[4] != '-')) + { + // Insert hyphen after "ucs" or "utf": "ucs2" -> "ucs-2" + mch_memmove(p + 4, p + 3, STRLEN(p + 3) + 1); + p[3] = '-'; + } + + return from_encoding_raw; +} + /* * "blob2str()" function * Converts a blob to a string, ensuring valid UTF-8 encoding. */ + static void +append_converted_string_to_list( + char_u *converted, + int validate_utf8, + list_T *list, + char_u *from_encoding) +{ + if (converted != NULL) + { + // After conversion, the output is a valid UTF-8 string (NUL-terminated) + int converted_len = (int)STRLEN(converted); + + // Split by newlines and add to list + char_u *p = converted; + char_u *end = converted + converted_len; + while (p < end) + { + char_u *line_start = p; + while (p < end && *p != NL) + p++; + + // Add this line to the result list + char_u *line = vim_strnsave(line_start, p - line_start); + if (line != NULL) + { + if (validate_utf8 && !utf_valid_string(line, NULL)) + { + vim_free(line); + semsg(_(e_str_encoding_from_failed), p_enc); + vim_free(converted); + return; // Stop processing + } + if (list_append_string(list, line, -1) == FAIL) + { + vim_free(line); + vim_free(converted); + return; // Stop processing on append failure + } + vim_free(line); + } + else + { + // Allocation failure: report error and stop processing + emsg(_(e_out_of_memory)); + 
vim_free(converted); + return; + } + + if (*p == NL) + p++; + } + vim_free(converted); + } + else + { + semsg(_(e_str_encoding_from_failed), from_encoding); + } +} + + static int +append_validated_line_to_list(char_u *line, int validate_utf8, list_T *list) +{ + if (validate_utf8 && !utf_valid_string(line, NULL)) + { + semsg(_(e_str_encoding_from_failed), p_enc); + vim_free(line); + return FAIL; + } + + int ret = list_append_string(list, line, -1); + vim_free(line); + return ret; +} + void f_blob2str(typval_T *argvars, typval_T *rettv) { @@ -1300,6 +1413,7 @@ f_blob2str(typval_T *argvars, typval_T *rettv) blen = blob_len(blob); char_u *from_encoding = NULL; + char_u *from_encoding_raw = NULL; // Encoding name with endianness preserved for iconv if (argvars[1].v_type != VAR_UNKNOWN) { dict_T *d = argvars[1].vval.v_dict; @@ -1307,7 +1421,20 @@ f_blob2str(typval_T *argvars, typval_T *rettv) { char_u *enc = dict_get_string(d, "encoding", FALSE); if (enc != NULL) - from_encoding = enc_canonize(enc_skip(enc)); + { + char_u *enc_skipped = enc_skip(enc); + from_encoding = enc_canonize(enc_skipped); + + // For iconv, preserve the endianness suffix by creating a normalized + // version with hyphens: "ucs2be" -> "ucs-2be", "utf16le" -> "utf-16le" + from_encoding_raw = normalize_encoding_name(enc_skipped); + if (from_encoding_raw == NULL) + { + emsg(_(e_out_of_memory)); + VIM_CLEAR(from_encoding); + return; + } + } } } @@ -1317,46 +1444,74 @@ f_blob2str(typval_T *argvars, typval_T *rettv) if (from_encoding != NULL && STRCMP(from_encoding, "none") == 0) { validate_utf8 = FALSE; - vim_free(from_encoding); - from_encoding = NULL; + VIM_CLEAR(from_encoding); + VIM_CLEAR(from_encoding_raw); } - idx = 0; - while (idx < blen) + // Special handling for UTF-16/UCS-2/UTF-32/UCS-4 encodings: convert entire blob before splitting by newlines + int from_prop = 0; + if (from_encoding != NULL) + from_prop = enc_canon_props(from_encoding); + if (from_encoding != NULL && (from_prop & (ENC_2BYTE 
| ENC_4BYTE | ENC_2WORD))) { - char_u *str; - char_u *converted_str; + // Build a temporary buffer from the blob as a whole + // Don't use string_from_blob() because it treats NUL as line separator + garray_T blob_ga; + int nul_size = (from_prop & ENC_4BYTE) ? 4 : 2; + ga_init2(&blob_ga, 1, blen + nul_size); + for (long i = 0; i < blen; i++) + ga_append(&blob_ga, (int)(unsigned char)blob_get(blob, i)); + // Add NUL terminator (2 bytes for UTF-16/UCS-2, 4 bytes for UTF-32/UCS-4) + for (int i = 0; i < nul_size; i++) + ga_append(&blob_ga, NUL); + + // Convert the entire blob at once + vimconv_T vimconv; + vimconv.vc_type = CONV_NONE; + // Use raw encoding name for iconv to preserve endianness (utf-16be vs utf-16) + if (convert_setup_ext(&vimconv, from_encoding_raw ? from_encoding_raw : from_encoding, FALSE, p_enc, FALSE) == FAIL) + { + ga_clear(&blob_ga); + semsg(_(e_str_encoding_from_failed), from_encoding); + goto done; + } + vimconv.vc_fail = TRUE; + // Use string_convert_ext with explicit input length + int inlen = blen; + char_u *converted = string_convert_ext(&vimconv, (char_u *)blob_ga.ga_data, &inlen, NULL); + convert_setup(&vimconv, NULL, NULL); + ga_clear(&blob_ga); + append_converted_string_to_list(converted, validate_utf8, rettv->vval.v_list, from_encoding); + } + else + { + // Original logic for non-UTF-16 encodings + idx = 0; + while (idx < blen) + { + char_u *str; - str = string_from_blob(blob, &idx); - if (str == NULL) - break; + str = string_from_blob(blob, &idx); + if (str == NULL) + break; - converted_str = str; - if (from_encoding != NULL) - { - converted_str = convert_string(str, from_encoding, p_enc); - vim_free(str); - if (converted_str == NULL) + if (from_encoding != NULL) { - semsg(_(e_str_encoding_from_failed), from_encoding); - goto done; + char_u *converted = convert_string(str, + from_encoding_raw ? 
from_encoding_raw : from_encoding, p_enc); + vim_free(str); + str = converted; } - } - if (validate_utf8) - { - if (!utf_valid_string(converted_str, NULL)) + if (str == NULL) { - semsg(_(e_str_encoding_from_failed), p_enc); - vim_free(converted_str); + semsg(_(e_str_encoding_from_failed), from_encoding); goto done; } - } - int ret = list_append_string(rettv->vval.v_list, converted_str, -1); - vim_free(converted_str); - if (ret == FAIL) - break; + if (append_validated_line_to_list(str, validate_utf8, rettv->vval.v_list) == FAIL) + goto done; + } } // If the blob ends with a newline, we need to add another empty string. @@ -1365,6 +1520,7 @@ f_blob2str(typval_T *argvars, typval_T *rettv) done: vim_free(from_encoding); + vim_free(from_encoding_raw); } /* diff --git a/src/testdir/test_blob.vim b/src/testdir/test_blob.vim index 1ce227d5cf..34a5cb7509 100644 --- a/src/testdir/test_blob.vim +++ b/src/testdir/test_blob.vim @@ -898,4 +898,44 @@ func Test_blob2str_empty_line() call assert_equal(['Hello', '', 'World!'], blob2str(b)) endfunc +func Test_blob2str_multi_byte_encodings() + " UTF-16LE: "Hello" = 48 00 65 00 6C 00 6C 00 6F 00 + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf-16le'})) + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf16le'})) + + " UTF-16BE: "Hello" = 00 48 00 65 00 6C 00 6C 00 6F + call assert_equal(['Hello'], blob2str(0z00480065006C006C006F, {'encoding': 'utf-16be'})) + call assert_equal(['Hello'], blob2str(0z00480065006C006C006F, {'encoding': 'utf16be'})) + + " UCS-2LE: "Hello" = 48 00 65 00 6C 00 6C 00 6F 00 + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs-2le'})) + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs2le'})) + + " UCS-2BE: "Hello" = 00 48 00 65 00 6C 00 6C 00 6F + call assert_equal(['Hello'], blob2str(0z00480065006C006C006F, {'encoding': 'ucs-2be'})) + call assert_equal(['Hello'], 
blob2str(0z00480065006C006C006F, {'encoding': 'ucs2be'})) + + " UTF-32LE: "Hi" = 48 00 00 00 69 00 00 00 + call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': 'utf-32le'})) + call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': 'utf32le'})) + + " UTF-32BE: "Hi" = 00 00 00 48 00 00 00 69 + call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': 'utf-32be'})) + call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': 'utf32be'})) + + " UCS-4LE: "Hi" = 48 00 00 00 69 00 00 00 + call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': 'ucs-4le'})) + call assert_equal(['Hi'], blob2str(0z4800000069000000, {'encoding': 'ucs4le'})) + + " UCS-4BE: "Hi" = 00 00 00 48 00 00 00 69 + call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': 'ucs-4be'})) + call assert_equal(['Hi'], blob2str(0z0000004800000069, {'encoding': 'ucs4be'})) + + " UTF-16LE with newlines: "Hi\nBye" = 48 00 69 00 0A 00 42 00 79 00 65 00 + call assert_equal(['Hi', 'Bye'], blob2str(0z48006900.0A004200.79006500, {'encoding': 'utf-16le'})) + + " UTF-32LE with newlines: "A\nB" = 41 00 00 00 0A 00 00 00 42 00 00 00 + call assert_equal(['A', 'B'], blob2str(0z41000000.0A000000.42000000, {'encoding': 'utf-32le'})) +endfunc + " vim: shiftwidth=2 sts=2 expandtab diff --git a/src/testdir/test_functions.vim b/src/testdir/test_functions.vim index 2fa09eddc0..0f3d30e51b 100644 --- a/src/testdir/test_functions.vim +++ b/src/testdir/test_functions.vim @@ -4557,6 +4557,13 @@ func Test_blob2str() call assert_fails("call blob2str(0z6162, [])", 'E1206: Dictionary required for argument 2') call assert_fails("call blob2str(0z6162, {'encoding': []})", 'E730: Using a List as a String') call assert_fails("call blob2str(0z6162, {'encoding': 'ab12xy'})", 'E1515: Unable to convert from ''ab12xy'' encoding') + + #" UTF-16LE encoding + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'utf-16le'})) + call assert_equal(['Hello'], 
blob2str(0z480065006C006C006F00, {'encoding': 'utf16le'})) + #" UCS-2LE encoding + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs-2le'})) + call assert_equal(['Hello'], blob2str(0z480065006C006C006F00, {'encoding': 'ucs2le'})) END call v9.CheckLegacyAndVim9Success(lines) endfunc diff --git a/src/version.c b/src/version.c index 5dc95056bb..011a4d1e6d 100644 --- a/src/version.c +++ b/src/version.c @@ -734,6 +734,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ +/**/ + 2124, /**/ 2123, /**/