xgettext: In language Tcl, support \x, \u, \U escapes as specified in Tcl 8.6.

author Bruno Haible <bruno@clisp.org>

Tue, 14 Mar 2023 11:59:22 +0000 (12:59 +0100)

committer Bruno Haible <bruno@clisp.org>

Tue, 14 Mar 2023 12:10:10 +0000 (13:10 +0100)
author Bruno Haible <bruno@clisp.org>
Tue, 14 Mar 2023 11:59:22 +0000 (12:59 +0100)
committer Bruno Haible <bruno@clisp.org>
Tue, 14 Mar 2023 12:10:10 +0000 (13:10 +0100)
diff --git a/NEWS b/NEWS

index 828853bc32b98f5cc70f346136bdfe92024a7f93..be252ca1d3c98c577e70c1dfde8dc34592803289 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -15,6 +15,8 @@ Version 0.21.2 - February 2023
    - C, C++: xgettext now supports gettext-like functions that take wide strings
      (of type 'const wchar_t *', 'const char16_t *', or 'const char32_t *') as
      arguments.
+  - Tcl: xgettext now supports the \x, \u, and \U escapes as defined in
+    Tcl 8.6.
  
  * xgettext:
    - The xgettext option '--sorted-output' is now deprecated.
diff --git a/gettext-tools/src/x-tcl.c b/gettext-tools/src/x-tcl.c

index 5fcfe4f851c5c123cd7e0dbf463b3eb668da64c9..182ece6e836bc1bbeaf98a0cd9193fa8969e78b0 100644 (file)
--- a/gettext-tools/src/x-tcl.c
+++ b/gettext-tools/src/x-tcl.c
@@ -55,7 +55,8 @@
  #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  
  
-/* The Tcl syntax is defined in the Tcl.n manual page.
+/* The Tcl syntax is defined in the Tcl.n manual page, see
+   https://www.tcl-lang.org/man/tcl8.6/TclCmd/Tcl.htm .
     Summary of Tcl syntax:
     Like sh syntax, except that `...` is replaced with [...]. In detail:
     - In a preprocessing pass, backslash-newline-anywhitespace is replaced
@@ -69,7 +70,7 @@
     - The list of resulting words is split into commands by semicolon and
       newline.
     - '#' at the beginning of a command introduces a comment until end of line.
-   The parser is implemented in tcl8.3.3/generic/tclParse.c.  */
+   The parser is implemented in tcl8.6/generic/tclParse.c.  */
  
  
  /* ====================== Keyword set customization.  ====================== */
@@ -174,7 +175,7 @@ do_ungetc (int c)
  /* An int that becomes a space when casted to 'unsigned char'.  */
  #define BS_NL (UCHAR_MAX + 1 + ' ')
  
-static int phase1_pushback[1];
+static int phase1_pushback[5];
  static int phase1_pushback_length;
  
  static int
@@ -472,7 +473,7 @@ static int brace_nesting_depth;
  
  
  /* Read an escape sequence.  The value is an ISO-8859-1 character (in the
-   range 0x00..0xff) or a Unicode character (in the range 0x0000..0xffff).  */
+   range 0x00..0xff) or a Unicode character (in the range 0x0000..0x10FFFF).  */
  static int
  do_getc_escaped ()
  {
@@ -499,14 +500,17 @@ do_getc_escaped ()
        return '\v';
      case 'x':
        {
-        int n = 0;
+        unsigned int n = 0;
          unsigned int i;
  
-        for (i = 0;; i++)
+        for (i = 0; i < 2; i++)
            {
              c = phase1_getc ();
              if (c == EOF || !c_isxdigit ((unsigned char) c))
-              break;
+              {
+                phase1_ungetc (c);
+                break;
+              }
  
              if (c >= '0' && c <= '9')
                n = (n << 4) + (c - '0');
@@ -515,12 +519,11 @@ do_getc_escaped ()
              else if (c >= 'a' && c <= 'f')
                n = (n << 4) + (c - 'a' + 10);
            }
-        phase1_ungetc (c);
          return (i > 0 ? (unsigned char) n : 'x');
        }
      case 'u':
        {
-        int n = 0;
+        unsigned int n = 0;
          unsigned int i;
  
          for (i = 0; i < 4; i++)
@@ -532,6 +535,29 @@ do_getc_escaped ()
                  break;
                }
  
+            if (c >= '0' && c <= '9')
+              n = (n << 4) + (c - '0');
+            else if (c >= 'A' && c <= 'F')
+              n = (n << 4) + (c - 'A' + 10);
+            else if (c >= 'a' && c <= 'f')
+              n = (n << 4) + (c - 'a' + 10);
+          }
+        return (i > 0 ? n : 'u');
+      }
+    case 'U':
+      {
+        unsigned int n = 0;
+        unsigned int i;
+
+        for (i = 0; i < 8; i++)
+          {
+            c = phase1_getc ();
+            if (c == EOF || !c_isxdigit ((unsigned char) c) || n >= 0x11000)
+              {
+                phase1_ungetc (c);
+                break;
+              }
+
              if (c >= '0' && c <= '9')
                n = (n << 4) + (c - '0');
              else if (c >= 'A' && c <= 'F')
@@ -572,6 +598,58 @@ do_getc_escaped ()
      }
  }
  
+/* Read an escape sequence for a low surrogate Unicode character.
+   The value is in the range 0xDC00..0xDFFF.
+   Return -1 when none was seen.  */
+static int
+do_getc_escaped_low_surrogate ()
+{
+  int c;
+
+  c = phase1_getc ();
+  switch (c)
+    {
+    case 'u':
+      {
+        unsigned char buf[4];
+        unsigned int n = 0;
+        unsigned int i;
+
+        for (i = 0; i < 4; i++)
+          {
+            c = phase1_getc ();
+            if (c == EOF || !c_isxdigit ((unsigned char) c))
+              {
+                phase1_ungetc (c);
+                while (i > 0)
+                  phase1_ungetc (buf[--i]);
+                phase1_ungetc ('u');
+                return -1;
+              }
+
+            if (c >= '0' && c <= '9')
+              n = (n << 4) + (c - '0');
+            else if (c >= 'A' && c <= 'F')
+              n = (n << 4) + (c - 'A' + 10);
+            else if (c >= 'a' && c <= 'f')
+              n = (n << 4) + (c - 'a' + 10);
+          }
+        if (n >= 0xdc00 && n <= 0xdfff)
+          return n;
+        else
+          {
+            while (i > 0)
+              phase1_ungetc (buf[--i]);
+            phase1_ungetc ('u');
+            return -1;
+          }
+      }
+    default:
+      phase1_ungetc (c);
+      return -1;
+    }
+}
+
  
  enum terminator
  {
@@ -699,31 +777,48 @@ accumulate_word (struct word *wp, enum terminator looking_for,
          }
        else if (c == '\\')
          {
-          unsigned int uc;
-          unsigned char utf8buf[6];
-          int count;
-          int i;
-
-          uc = do_getc_escaped ();
-          assert (uc < 0x10000);
-          count = u8_uctomb (utf8buf, uc, 6);
-          if (count < 0)
+          unsigned int uc = do_getc_escaped ();
+          assert (uc < 0x110000);
+          if (uc >= 0xd800 && uc <= 0xdfff)
              {
+              if (uc < 0xdc00)
+                {
+                  /* Saw a high surrogate Unicode character.
+                     Is it followed by a low surrogate Unicode character?  */
+                  c = phase2_getc ();
+                  if (c == '\\')
+                    {
+                      int uc2 = do_getc_escaped_low_surrogate ();
+                      if (uc2 >= 0)
+                        {
+                          /* Saw a low surrogate Unicode character.  */
+                          assert (uc2 >= 0xdc00 && uc2 <= 0xdfff);
+                          uc = 0x10000 + ((uc - 0xd800) << 10) + (uc2 - 0xdc00);
+                          goto saw_unicode_escape;
+                        }
+                    }
+                  phase2_ungetc (c);
+                }
                error_with_progname = false;
                error (0, 0, _("%s:%d: warning: invalid Unicode character"),
                       logical_file_name, line_number);
                error_with_progname = true;
+              goto done_escape;
              }
-          else
-            {
-              assert (count > 0);
-              if (wp->type == t_string)
-                for (i = 0; i < count; i++)
-                  {
-                    grow_token (wp->token);
-                    wp->token->chars[wp->token->charcount++] = utf8buf[i];
-                  }
-            }
+         saw_unicode_escape:
+          {
+            unsigned char utf8buf[6];
+            int count = u8_uctomb (utf8buf, uc, 6);
+            int i;
+            assert (count > 0);
+            if (wp->type == t_string)
+              for (i = 0; i < count; i++)
+                {
+                  grow_token (wp->token);
+                  wp->token->chars[wp->token->charcount++] = utf8buf[i];
+                }
+          }
+         done_escape: ;
          }
        else
          {
diff --git a/gettext-tools/tests/xgettext-tcl-4 b/gettext-tools/tests/xgettext-tcl-4

index d1582f6f9bb9c741d4b054294c9d399d58c3741c..2419def60bb35e83f02a7406a0f9ac7f89ecd71f 100755 (executable)
--- a/gettext-tools/tests/xgettext-tcl-4
+++ b/gettext-tools/tests/xgettext-tcl-4
@@ -7,11 +7,17 @@ cat <<\EOF > xg-t-4.tcl
  puts [_ "Hello\u200e\u201cWorld\u201d"]
  puts [_ "x\u20y\x20z"]
  puts [_ "\xFF20"]
+puts [_ "\UFF20"]
+puts [_ "\uD83D\udc1c"]
+# Does not work yet in Tcl 8.6:
+# puts [_ "\U0001F41C"]
  EOF
  
  : ${XGETTEXT=xgettext}
  ${XGETTEXT} --add-comments --no-location -k_ -o xg-t-4.tmp xg-t-4.tcl 2>xg-t-4.err
-test $? = 0 || { cat xg-t-4.err; Exit 1; }
+result=$?
+cat xg-t-4.err
+test $result = 0 || Exit 1
  func_filter_POT_Creation_Date xg-t-4.tmp xg-t-4.pot
  
  cat <<\EOF > xg-t-4.ok
@@ -39,7 +45,13 @@ msgstr ""
  msgid "x y z"
  msgstr ""
  
-msgid " "
+msgid "ÿ20"
+msgstr ""
+
+msgid "＠"
+msgstr ""
+
+msgid "🐜"
  msgstr ""
  EOF
  
diff --git a/gettext-tools/tests/xgettext-tcl-5 b/gettext-tools/tests/xgettext-tcl-5

index 1c0ee9fdd1c69eb0e838ce68a314a777e15bec67..612f5cf2189f00e37f89a5e5517af9b802f97e9d 100755 (executable)
--- a/gettext-tools/tests/xgettext-tcl-5
+++ b/gettext-tools/tests/xgettext-tcl-5
@@ -11,6 +11,26 @@ cat <<\EOF > xg-t-5b.tcl
  puts [_ "\udc1c"]
  EOF
  
+cat <<\EOF > xg-t-5c.tcl
+puts [_ "\uD83D\n"]
+EOF
+
+cat <<\EOF > xg-t-5d.tcl
+puts [_ "\uD83D\u"]
+EOF
+
+cat <<\EOF > xg-t-5e.tcl
+puts [_ "\uD83D\u9843"]
+EOF
+
+cat <<\EOF > xg-t-5f.tcl
+puts [_ "\uD83D\ud913"]
+EOF
+
+cat <<\EOF > xg-t-5g.tcl
+puts [_ "\udc1c\ud83d"]
+EOF
+
  : ${XGETTEXT=xgettext}
  LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5a.tcl 2>xg-t-5.err
  result=$?
@@ -23,4 +43,34 @@ result=$?
  cat xg-t-5.err
  test $result = 0 || Exit 1
  
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5c.tcl 2>xg-t-5.err
+result=$?
+cat xg-t-5.err
+test $result = 0 || Exit 1
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5d.tcl 2>xg-t-5.err
+result=$?
+cat xg-t-5.err
+test $result = 0 || Exit 1
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5e.tcl 2>xg-t-5.err
+result=$?
+cat xg-t-5.err
+test $result = 0 || Exit 1
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5f.tcl 2>xg-t-5.err
+result=$?
+cat xg-t-5.err
+test $result = 0 || Exit 1
+
+: ${XGETTEXT=xgettext}
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5g.tcl 2>xg-t-5.err
+result=$?
+cat xg-t-5.err
+test $result = 0 || Exit 1
+
  exit 0
author	Bruno Haible <bruno@clisp.org>
	Tue, 14 Mar 2023 11:59:22 +0000 (12:59 +0100)
committer	Bruno Haible <bruno@clisp.org>
	Tue, 14 Mar 2023 12:10:10 +0000 (13:10 +0100)
NEWS		patch \| blob \| blame \| history
gettext-tools/src/x-tcl.c		patch \| blob \| blame \| history
gettext-tools/tests/xgettext-tcl-4		patch \| blob \| blame \| history
gettext-tools/tests/xgettext-tcl-5		patch \| blob \| blame \| history