printf: with \U, support all valid unicode points

author Pádraig Brady <P@draigBrady.com>

Thu, 27 Oct 2022 14:17:07 +0000 (15:17 +0100)

committer Pádraig Brady <P@draigBrady.com>

Fri, 28 Oct 2022 12:45:27 +0000 (13:45 +0100)
author Pádraig Brady <P@draigBrady.com>
Thu, 27 Oct 2022 14:17:07 +0000 (15:17 +0100)
committer Pádraig Brady <P@draigBrady.com>
Fri, 28 Oct 2022 12:45:27 +0000 (13:45 +0100)
diff --git a/NEWS b/NEWS

index 31b04fe9fbcbca127e0f895ac5df88cccda669f4..b6b5201e7c74f143bf4fbd2e8db7a81d714eb21e 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -41,6 +41,10 @@ GNU coreutils NEWS                                    -*- outline -*-
    reverting to the behavior in coreutils-9.0 and earlier.
    This behavior is now documented.
  
+  printf unicode \uNNNN, \UNNNNNNNN syntax, now supports all valid
+  unicode code points.  Previously it was restricted to the C
+  universal character subset, which excluded most code points <= 0x9F.
+
    runcon now exits with status 125 for internal errors.  Previously upon
    internal errors it would exit with status 1, which was less distinguishable
    from errors from the invoked command.
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index 8ce23898b7ab2de7be554cc8b7625f38055825ac..d52754f50f46d131dbe35a0c1b4b2fb67a543d34 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -13209,16 +13209,16 @@ For example, @samp{printf '\400'} is equivalent to @samp{printf '\0'}.
  @cindex Unicode
  @cindex ISO/IEC 10646
  @vindex LC_CTYPE
-@command{printf} interprets two character syntaxes introduced in
-ISO C 99:
-@samp{\u} for 16-bit Unicode (ISO/IEC 10646)
-characters, specified as
+@command{printf} interprets two syntaxes for specifying Unicode
+(ISO/IEC 10646) characters:
+@samp{\u} for 16-bit Unicode characters, specified as
  four hexadecimal digits @var{hhhh}, and @samp{\U} for 32-bit Unicode
  characters, specified as eight hexadecimal digits @var{hhhhhhhh}.
  @command{printf} outputs the Unicode characters
-according to the @env{LC_CTYPE} locale.  Unicode characters in the ranges
-U+0000@dots{}U+009F, U+D800@dots{}U+DFFF cannot be specified by this syntax,
-except for U+0024 ($), U+0040 (@@), and U+0060 (@`).
+according to the @env{LC_CTYPE} locale.  Unicode characters in the range
+U+D800@dots{}U+DFFF cannot be specified by this syntax.
+This syntax fully supports the universal character subset
+introduced in ISO C 99.
  
  The processing of @samp{\u} and @samp{\U} requires a full-featured
  @code{iconv} facility.  It is activated on systems with glibc 2.2 (or newer),
diff --git a/src/printf.c b/src/printf.c

index 68c3883419b93a051a6ff3469f8d8d7089343270..5d420cef2d6b08c4b0dc6b764999f12e7262db59 100644 (file)
--- a/src/printf.c
+++ b/src/printf.c
@@ -298,14 +298,9 @@ print_esc (char const *escstart, bool octal_0)
            uni_value = uni_value * 16 + hextobin (*p);
          }
  
-      /* A universal character name shall not specify a character short
-         identifier in the range 00000000 through 00000020, 0000007F through
-         0000009F, or 0000D800 through 0000DFFF inclusive. A universal
-         character name shall not designate a character in the required
-         character set.  */
-      if ((uni_value <= 0x9f
-           && uni_value != 0x24 && uni_value != 0x40 && uni_value != 0x60)
-          || (uni_value >= 0xd800 && uni_value <= 0xdfff))
+      /* Error for invalid code points 0000D800 through 0000DFFF inclusive.
+         Note print_unicode_char() would print the literal \u.. in this case. */
+      if (uni_value >= 0xd800 && uni_value <= 0xdfff)
          die (EXIT_FAILURE, 0, _("invalid universal character name \\%c%0*x"),
               esc_char, (esc_char == 'u' ? 4 : 8), uni_value);
  
diff --git a/tests/misc/printf-cov.pl b/tests/misc/printf-cov.pl

index ab6bd159b96bd5368964a1caf8d12f87db8e67ae..59c85cb669534cd86768ca24953c90204b2500cb 100755 (executable)
--- a/tests/misc/printf-cov.pl
+++ b/tests/misc/printf-cov.pl
@@ -66,9 +66,14 @@ my @Tests =
    ['esc', q('\xaa\0377'),  {OUT=>"\xaa\0377"}],
    ['esc-bad-hex', q('\x'), {EXIT=>1},
      {ERR=>"$prog: missing hexadecimal number in escape\n"}],
-  # ['u4', q('\u09ac'), {OUT=>"\xe0a6ac"}],
-  ['u-invalid', q('\u0000'), {EXIT=>1},
-    {ERR=>"$prog: invalid universal character name \\u0000\n"}],
+  ['u-bad-hex', q('\u00'), {EXIT=>1},
+    {ERR=>"$prog: missing hexadecimal number in escape\n"}],
+  ['U-bad-hex', q('\U0000'), {EXIT=>1},
+    {ERR=>"$prog: missing hexadecimal number in escape\n"}],
+  ['u4', q('\u0030'), {OUT=>"0"}],
+  ['U8', q('\U00000030'), {OUT=>"0"}],
+  ['u-invalid', q('\ud800'), {EXIT=>1},
+    {ERR=>"$prog: invalid universal character name \\ud800\n"}],
    ['u-missing', q('\u'), {EXIT=>1},
      {ERR=>"$prog: missing hexadecimal number in escape\n"}],
    ['d-invalid', '%d no-num', {OUT=>'0'}, {EXIT=>1},
author	Pádraig Brady <P@draigBrady.com>
	Thu, 27 Oct 2022 14:17:07 +0000 (15:17 +0100)
committer	Pádraig Brady <P@draigBrady.com>
	Fri, 28 Oct 2022 12:45:27 +0000 (13:45 +0100)
NEWS		patch \| blob \| blame \| history
doc/coreutils.texi		patch \| blob \| blame \| history
src/printf.c		patch \| blob \| blame \| history
tests/misc/printf-cov.pl		patch \| blob \| blame \| history