utf8: handle systems that don't write BOM for UTF-16

author brian m. carlson <sandals@crustytoothpaste.net>

Tue, 12 Feb 2019 00:52:06 +0000 (00:52 +0000)

committer Junio C Hamano <gitster@pobox.com>

Tue, 12 Feb 2019 02:20:07 +0000 (18:20 -0800)
author brian m. carlson <sandals@crustytoothpaste.net>
Tue, 12 Feb 2019 00:52:06 +0000 (00:52 +0000)
committer Junio C Hamano <gitster@pobox.com>
Tue, 12 Feb 2019 02:20:07 +0000 (18:20 -0800)
diff --git a/Makefile b/Makefile

index 0e13a5b4698a02e93c9f2c95e06b4193fe229567..457311bc31b54c48aa17f6abdc487baa174786e1 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -259,6 +259,10 @@ all::
  # Define OLD_ICONV if your library has an old iconv(), where the second
  # (input buffer pointer) parameter is declared with type (const char **).
  #
+# Define ICONV_OMITS_BOM if your iconv implementation does not write a
+# byte-order mark (BOM) when writing UTF-16 or UTF-32 and always writes in
+# big-endian format.
+#
  # Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
  #
  # Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
@@ -1415,6 +1419,9 @@ ifndef NO_ICONV
                 EXTLIBS += $(ICONV_LINK) -liconv
         endif
  endif
+ifdef ICONV_OMITS_BOM
+       BASIC_CFLAGS += -DICONV_OMITS_BOM
+endif
  ifdef NEEDS_LIBGEN
         EXTLIBS += -lgen
  endif
diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh

index e58ecbfc44037fa1d38707b30545e803e1401b4f..500229a9bd6245c8a93ddc10b008bc6afc2d9f71 100755 (executable)
--- a/t/t0028-working-tree-encoding.sh
+++ b/t/t0028-working-tree-encoding.sh
@@ -6,6 +6,30 @@ test_description='working-tree-encoding conversion via gitattributes'
  
  GIT_TRACE_WORKING_TREE_ENCODING=1 && export GIT_TRACE_WORKING_TREE_ENCODING
  
+test_lazy_prereq NO_UTF16_BOM '
+       test $(printf abc | iconv -f UTF-8 -t UTF-16 | wc -c) = 6
+'
+
+test_lazy_prereq NO_UTF32_BOM '
+       test $(printf abc | iconv -f UTF-8 -t UTF-32 | wc -c) = 12
+'
+
+write_utf16 () { # stdin (UTF-8) -> stdout (UTF-16), always starting with a BOM
+       if test_have_prereq NO_UTF16_BOM
+       then
+               printf '\376\377' # big-endian BOM; octal because \xHH is not POSIX printf
+       fi &&
+       iconv -f UTF-8 -t UTF-16
+}
+
+write_utf32 () { # stdin (UTF-8) -> stdout (UTF-32), always starting with a BOM
+       if test_have_prereq NO_UTF32_BOM
+       then
+               printf '\000\000\376\377' # big-endian BOM; octal because \xHH is not POSIX printf
+       fi &&
+       iconv -f UTF-8 -t UTF-32
+}
+
  test_expect_success 'setup test files' '
         git config core.eol lf &&
  
@@ -13,8 +37,8 @@ test_expect_success 'setup test files' '
         echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes &&
         echo "*.utf16lebom text working-tree-encoding=UTF-16LE-BOM" >>.gitattributes &&
         printf "$text" >test.utf8.raw &&
-       printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw &&
-       printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw &&
+       printf "$text" | write_utf16 >test.utf16.raw &&
+       printf "$text" | write_utf32 >test.utf32.raw &&
         printf "\377\376"                         >test.utf16lebom.raw &&
         printf "$text" | iconv -f UTF-8 -t UTF-32LE >>test.utf16lebom.raw &&
  
@@ -124,8 +148,8 @@ do
                 test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" &&
                 test_when_finished "git reset --hard HEAD^" &&
  
-               cat lf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >lf.utf${i}.raw &&
-               cat crlf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >crlf.utf${i}.raw &&
+               cat lf.utf8.raw | write_utf${i} >lf.utf${i}.raw &&
+               cat crlf.utf8.raw | write_utf${i} >crlf.utf${i}.raw &&
                 cp crlf.utf${i}.raw eol.utf${i} &&
  
                 cat >expectIndexLF <<-EOF &&
@@ -223,7 +247,7 @@ test_expect_success ICONV_SHIFT_JIS 'check roundtrip encoding' '
  
         text="hallo there!\nroundtrip test here!" &&
         printf "$text" | iconv -f UTF-8 -t SHIFT-JIS >roundtrip.shift &&
-       printf "$text" | iconv -f UTF-8 -t UTF-16 >roundtrip.utf16 &&
+       printf "$text" | write_utf16 >roundtrip.utf16 &&
         echo "*.shift text working-tree-encoding=SHIFT-JIS" >>.gitattributes &&
  
         # SHIFT-JIS encoded files are round-trip checked by default...
diff --git a/utf8.c b/utf8.c

index 83824dc2f4ab151a19418c61c46e0c1ffbb0e42c..3b42fadffd7ccb89a5658fdf8d314014f299a769 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -559,6 +559,10 @@ char *reencode_string_len(const char *in, size_t insz,
         /*
          * For writing, UTF-16 iconv typically creates "UTF-16BE-BOM"
          * Some users under Windows want the little endian version
+        *
+        * We handle UTF-16 and UTF-32 ourselves only if the platform does not
+        * provide a BOM (which we require), since we want to match the behavior
+        * of the system tools and libc as much as possible.
          */
         if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) {
                 bom_str = utf16_le_bom;
@@ -568,6 +572,16 @@ char *reencode_string_len(const char *in, size_t insz,
                 bom_str = utf16_be_bom;
                 bom_len = sizeof(utf16_be_bom);
                 out_encoding = "UTF-16BE";
+#ifdef ICONV_OMITS_BOM
+       } else if (same_utf_encoding("UTF-16", out_encoding)) {
+               bom_str = utf16_be_bom;
+               bom_len = sizeof(utf16_be_bom);
+               out_encoding = "UTF-16BE";
+       } else if (same_utf_encoding("UTF-32", out_encoding)) {
+               bom_str = utf32_be_bom;
+               bom_len = sizeof(utf32_be_bom);
+               out_encoding = "UTF-32BE";
+#endif
         }
  
         conv = iconv_open(out_encoding, in_encoding);
author	brian m. carlson <sandals@crustytoothpaste.net>
	Tue, 12 Feb 2019 00:52:06 +0000 (00:52 +0000)
committer	Junio C Hamano <gitster@pobox.com>
	Tue, 12 Feb 2019 02:20:07 +0000 (18:20 -0800)
Makefile		patch \| blob \| blame \| history
t/t0028-working-tree-encoding.sh		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history