vala: Support C99-style Unicode escapes

author Daiki Ueno <ueno@gnu.org>

Fri, 2 May 2014 08:39:37 +0000 (17:39 +0900)

committer Daiki Ueno <ueno@gnu.org>

Fri, 2 May 2014 09:02:33 +0000 (18:02 +0900)
author Daiki Ueno <ueno@gnu.org>
Fri, 2 May 2014 08:39:37 +0000 (17:39 +0900)
committer Daiki Ueno <ueno@gnu.org>
Fri, 2 May 2014 09:02:33 +0000 (18:02 +0900)
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog

index f4ead255a835a4953f802c55bbc3e84f2348c410..29a1f216467a5cabcf55d926a7d3b0729ddd0da0 100644 (file)
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,15 @@
+2014-05-02  Daiki Ueno  <ueno@gnu.org>
+
+       vala: Support C99-style Unicode character escapes
+       * x-vala.c: Include assert.h and po-charset.h.
+       (P7_QUOTES, P7_QUOTE, P7_NEWLINE): Redefine as a negative integer.
+       (P7_EOF, P7_STRING_END): New definitions.
+       (UNICODE): New macro.
+       (IS_UNICODE): New macro.
+       (UNICODE_VALUE): New macro.
+       (phase7_getc): Recognize "\unnnn" and "\Unnnnnnnn".
+       (phase3_get): Use mixed_string_buffer to parse string literals.
+
  2014-05-02  Daiki Ueno  <ueno@gnu.org>
  
         xgettext: Factor out commonly used mixed_string_buffer
diff --git a/gettext-tools/src/x-vala.c b/gettext-tools/src/x-vala.c

index 68c8d9cebea30108e85a0467b1bc4709a9160680..cee1deb6c15d2475ad92cef5178d3aa25f2ad8e7 100644 (file)
--- a/gettext-tools/src/x-vala.c
+++ b/gettext-tools/src/x-vala.c
@@ -23,6 +23,7 @@
  /* Specification.  */
  #include "x-vala.h"
  
+#include <assert.h>
  #include <errno.h>
  #include <stdbool.h>
  #include <stdio.h>
@@ -36,6 +37,7 @@
  #include "xalloc.h"
  #include "xvasprintf.h"
  #include "hash.h"
+#include "po-charset.h"
  #include "gettext.h"
  
  #define _(s) gettext(s)
@@ -377,12 +379,28 @@ free_token (token_ty *tp)
  }
  
  
+/* Return value of phase7_getc when EOF is reached.  */
+#define P7_EOF (-1)
+#define P7_STRING_END (-2)
+
  /* Replace escape sequences within character strings with their single
     character equivalents.  */
+#define P7_QUOTES (-3)
+#define P7_QUOTE (-4)
+#define P7_NEWLINE (-5)
+
+/* Convert a UTF-16 or UTF-32 code point to a return value that can be
+   distinguished from a single-byte return value.  */
+#define UNICODE(code) (0x100 + (code))
+
+/* Test whether a return value of phase7_getc designates a UTF-16 or
+   UTF-32 code point.  */
+#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
+
+/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
+   IS_UNICODE.  */
+#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
  
-#define P7_QUOTES (1000 + '"')
-#define P7_QUOTE (1000 + '\'')
-#define P7_NEWLINE (1000 + '\n')
  
  static int
  phase7_getc ()
@@ -514,6 +532,47 @@ phase7_getc ()
          }
        phase1_ungetc (c);
        return n;
+
+    case 'U': case 'u':
+      {
+        unsigned char buf[8];
+
+        n = 0;
+        for (j = 0; j < (c == 'u' ? 4 : 8); j++)
+          {
+            int c1 = phase1_getc ();
+
+            if (c1 >= '0' && c1 <= '9')
+              n = (n << 4) + (c1 - '0');
+            else if (c1 >= 'A' && c1 <= 'F')
+              n = (n << 4) + (c1 - 'A' + 10);
+            else if (c1 >= 'a' && c1 <= 'f')
+              n = (n << 4) + (c1 - 'a' + 10);
+            else
+              {
+                phase1_ungetc (c1);
+                while (--j >= 0)
+                  phase1_ungetc (buf[j]);
+                phase1_ungetc (c);
+                return '\\';
+              }
+
+            buf[j] = c1;
+          }
+
+        if (n < 0x110000)
+          return UNICODE (n);
+
+        error_with_progname = false;
+        error (0, 0, _("%s:%d: warning: invalid Unicode character"),
+               logical_file_name, line_number);
+        error_with_progname = true;
+
+        while (--j >= 0)
+          phase1_ungetc (buf[j]);
+        phase1_ungetc (c);
+        return '\\';
+      }
      }
  }
  
@@ -802,7 +861,9 @@ phase3_get (token_ty *tp)
            /* FALLTHROUGH */
          case '"':
            {
+            struct mixed_string_buffer *bp;
              int c2 = phase2_getc ();
+
              if (c2 == '"')
                {
                  int c3 = phase2_getc ();
@@ -816,65 +877,67 @@ phase3_get (token_ty *tp)
                }
              else
                phase2_ungetc (c2);
-          }
  
-          bufpos = 0;
-          for (;;)
-            {
-              c = phase7_getc ();
-              if (c == P7_NEWLINE)
-                {
-                  if (verbatim)
-                    c = '\n';
-                  else
-                    {
-                      error_with_progname = false;
-                      error (0, 0, _("%s:%d: warning: unterminated string literal"),
-                             logical_file_name, line_number - 1);
-                      error_with_progname = true;
-                      phase7_ungetc ('\n');
+            /* Start accumulating the string.  */
+            bp = mixed_string_buffer_alloc (lc_string,
+                                            logical_file_name,
+                                            line_number);
+            for (;;)
+              {
+                c = phase7_getc ();
+                if (c == P7_NEWLINE)
+                  {
+                    if (verbatim)
+                      c = '\n';
+                    else
+                      {
+                        error_with_progname = false;
+                        error (0, 0, _("\
+%s:%d: warning: unterminated string literal"),
+                               logical_file_name, line_number - 1);
+                        error_with_progname = true;
+                        phase7_ungetc ('\n');
+                        break;
+                      }
+                  }
+                if (c == P7_QUOTES)
+                  {
+                    if (verbatim)
+                      {
+                        int c2 = phase2_getc ();
+                        if (c2 == '"')
+                          {
+                            int c3 = phase2_getc ();
+                            if (c3 == '"')
+                              break;
+                            phase2_ungetc (c3);
+                          }
+                        phase2_ungetc (c2);
+                        c = '"';
+                      }
+                    else
                        break;
-                    }
-                }
-              if (c == P7_QUOTES)
-                {
-                  if (verbatim)
-                    {
-                      int c2 = phase2_getc ();
-                      if (c2 == '"')
-                        {
-                          int c3 = phase2_getc ();
-                          if (c3 == '"')
-                            break;
-                          phase2_ungetc (c3);
-                        }
-                      phase2_ungetc (c2);
-                      c = '"';
-                    }
-                  else
-                    break;
-                }
-              if (c == EOF)
-                break;
-              if (c == P7_QUOTE)
-                c = '\'';
-              if (bufpos >= bufmax)
-                {
-                  bufmax = 2 * bufmax + 10;
-                  buffer = xrealloc (buffer, bufmax);
-                }
-              buffer[bufpos++] = c;
-            }
-          if (bufpos >= bufmax)
-            {
-              bufmax = 2 * bufmax + 10;
-              buffer = xrealloc (buffer, bufmax);
-            }
-          buffer[bufpos] = 0;
-          tp->type = last_token_type = template ? token_type_string_template : token_type_string_literal;
-          tp->string = xstrdup (buffer);
-          tp->comment = add_reference (savable_comment);
-          return;
+                  }
+                if (c == EOF)
+                  break;
+                if (c == P7_QUOTE)
+                  c = '\'';
+                if (IS_UNICODE (c))
+                  {
+                    assert (UNICODE_VALUE (c) >= 0
+                            && UNICODE_VALUE (c) < 0x110000);
+                    mixed_string_buffer_append_unicode (bp,
+                                                        UNICODE_VALUE (c));
+                  }
+                else
+                  mixed_string_buffer_append_char (bp, c);
+              }
+            tp->type = last_token_type = template
+              ? token_type_string_template : token_type_string_literal;
+            tp->string = xstrdup (mixed_string_buffer_done (bp));
+            tp->comment = add_reference (savable_comment);
+            return;
+          }
  
          case '/':
            switch (last_token_type)
@@ -1192,7 +1255,9 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
                                  arglist_parser_alloc (mlp,
                                                        state ? next_shapes : NULL)))
              {
+              xgettext_current_source_encoding = po_charset_utf8;
                arglist_parser_done (argparser, arg);
+              xgettext_current_source_encoding = xgettext_global_source_encoding;
                return true;
              }
            next_context_iter = null_context_list_iterator;
@@ -1202,7 +1267,9 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
          case token_type_rparen:
            if (delim == token_type_rparen || delim == token_type_eof)
              {
+              xgettext_current_source_encoding = po_charset_utf8;
                arglist_parser_done (argparser, arg);
+              xgettext_current_source_encoding = xgettext_global_source_encoding;
                return false;
              }
  
@@ -1221,7 +1288,9 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
            continue;
  
          case token_type_eof:
+          xgettext_current_source_encoding = po_charset_utf8;
            arglist_parser_done (argparser, arg);
+          xgettext_current_source_encoding = xgettext_global_source_encoding;
            return true;
  
          case token_type_string_literal:
@@ -1230,6 +1299,7 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
              pos.file_name = logical_file_name;
              pos.line_number = token.line_number;
  
+            xgettext_current_source_encoding = po_charset_utf8;
              if (extract_all)
                remember_a_message (mlp, NULL, token.string, inner_context,
                                    &pos, NULL, token.comment);
@@ -1251,6 +1321,7 @@ extract_balanced (message_list_ty *mlp, token_type_ty delim,
                                             inner_context, pos.file_name,
                                             pos.line_number, token.comment);
                }
+            xgettext_current_source_encoding = xgettext_global_source_encoding;
            }
            drop_reference (token.comment);
            next_context_iter = null_context_list_iterator;
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog

index 740bf5a7bfacfce099cba8f7b9d368fc9354d02f..ddfdd6266c9a4d8fceec09d6a20178af562b1efd 100644 (file)
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,7 @@
+2014-05-02  Daiki Ueno  <ueno@gnu.org>
+
+       * xgettext-vala-1: Test Unicode character escapes.
+
  2014-04-30  Daiki Ueno  <ueno@gnu.org>
  
         * xgettext-scheme-4: New file.
diff --git a/gettext-tools/tests/xgettext-vala-1 b/gettext-tools/tests/xgettext-vala-1

index ebc769af74e3ba8411fdc6ed11d0cee767e47467..e176d17492f17764f1db4f00f3adfcd036716b07 100755 (executable)
--- a/gettext-tools/tests/xgettext-vala-1
+++ b/gettext-tools/tests/xgettext-vala-1
@@ -16,6 +16,8 @@ int main (string[] args) {
      var s4 = _("""Extract this
      ""
      fourth string""");
+
+    var s5 = _("Extract this \u2464th string");
      return 0;
  }
  EOF
@@ -43,7 +45,7 @@ msgstr ""
  "Language-Team: LANGUAGE <LL@li.org>\n"
  "Language: \n"
  "MIME-Version: 1.0\n"
-"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Type: text/plain; charset=UTF-8\n"
  "Content-Transfer-Encoding: 8bit\n"
  
  msgid "Extract this first string"
@@ -60,6 +62,9 @@ msgid ""
  "    \"\"\n"
  "    fourth string"
  msgstr ""
+
+msgid "Extract this ⑤th string"
+msgstr ""
  EOF
  
  : ${DIFF=diff}
author	Daiki Ueno <ueno@gnu.org>
	Fri, 2 May 2014 08:39:37 +0000 (17:39 +0900)
committer	Daiki Ueno <ueno@gnu.org>
	Fri, 2 May 2014 09:02:33 +0000 (18:02 +0900)
gettext-tools/src/ChangeLog		patch \| blob \| blame \| history
gettext-tools/src/x-vala.c		patch \| blob \| blame \| history
gettext-tools/tests/ChangeLog		patch \| blob \| blame \| history
gettext-tools/tests/xgettext-vala-1		patch \| blob \| blame \| history