xgettext: Add E4X support to JavaScript scanner

author Daiki Ueno <ueno@gnu.org>

Wed, 20 Nov 2013 03:41:20 +0000 (12:41 +0900)

committer Daiki Ueno <ueno@gnu.org>

Wed, 4 Dec 2013 10:53:11 +0000 (19:53 +0900)
author Daiki Ueno <ueno@gnu.org>
Wed, 20 Nov 2013 03:41:20 +0000 (12:41 +0900)
committer Daiki Ueno <ueno@gnu.org>
Wed, 4 Dec 2013 10:53:11 +0000 (19:53 +0900)
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog

index 97c3c1f3216d0131a68730753ede0332066d6b10..64ea86dc93f23c0df6c82bde8785a55023c2c5c4 100644 (file)
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,17 @@
+2013-11-20  Daiki Ueno  <ueno@gnu.org>
+
+       xgettext: Add E4X support to JavaScript scanner
+       Reported by Piotr Drąg at: <https://savannah.gnu.org/bugs/?40125>.
+       * xgettext.h (enum lexical_context_ty): New enumeration items
+       lc_xml_open_tag, lc_xml_close_tag, lc_xml_content.
+       * x-javascript.c (phase5_scan_xml_markup): New function.
+       (phase5_get): Handle '<', '>', '/', '=', '{', and '}' specially
+       to support E4X.
+       (enum token_type_ty): New enumeration item token_type_equal.
+       (xml_element_depth): New variable.
+       (inside_embedded_js_in_xml): New variable.
+       (extract_javascript): Initialize those variables.
+
  2013-11-14  Daiki Ueno  <ueno@gnu.org>
  
         * x-javascript.c (phase3_getc): Make sure to call comment_line_end
diff --git a/gettext-tools/src/x-javascript.c b/gettext-tools/src/x-javascript.c

index c1503cef5bbfedd97b3110c6738c5053a9ea866e..59bbcbe2eada1c86515b9c5643dc972e62a0a85e 100644 (file)
--- a/gettext-tools/src/x-javascript.c
+++ b/gettext-tools/src/x-javascript.c
@@ -926,6 +926,7 @@ enum token_type_ty
    token_type_plus,              /* + */
    token_type_regexp,            /* /.../ */
    token_type_operator,          /* - * / % . < > = ~ ! | & ? : ^ */
+  token_type_equal,             /* = */
    token_type_string,            /* "abc", 'abc' */
    token_type_keyword,           /* return, else */
    token_type_symbol,            /* symbol, number */
@@ -1160,6 +1161,114 @@ phase5_scan_regexp ()
        phase2_ungetc (c);
  }
  
+static int xml_element_depth = 0;  /* Incremented per XML opening tag, decremented per closing tag.  */
+static bool inside_embedded_js_in_xml = false;  /* Set by '{' and cleared by '}' while xml_element_depth > 0.  */
+
+static bool
+phase5_scan_xml_markup (token_ty *tp)
+{
+  struct
+  {
+    const char *start;
+    const char *end;
+  } markers[] =
+      {
+        { "!--", "--" },  /* Comment: <!-- ... --> */
+        { "![CDATA[", "]]" },  /* CDATA section: <![CDATA[ ... ]]> */
+        { "?", "?" }  /* Processing instruction: <? ... ?> */
+      };
+  int i;
+
+  for (i = 0; i < SIZEOF (markers); i++)
+    {
+      const char *start = markers[i].start;
+      const char *end = markers[i].end;
+      int j;
+
+      /* Look for a start marker; on a mismatch, push back what was read.  */
+      for (j = 0; start[j] != '\0'; j++)
+        {
+          int c;
+
+          assert (phase2_pushback_length + j < SIZEOF (phase2_pushback));
+          c = phase2_getc ();
+          if (c == UEOF)
+            goto eof;
+          if (c != start[j])
+            {
+              int k = j;
+
+              phase2_ungetc (c);
+              k--;
+
+              for (; k >= 0; k--)
+                phase2_ungetc (start[k]);
+              break;
+            }
+        }
+
+      if (start[j] != '\0')
+        continue;
+
+      /* Skip until the end marker, which must be followed by '>'.  */
+      for (;;)
+        {
+          int c;
+
+          for (j = 0; end[j] != '\0'; j++)
+            {
+              assert (phase2_pushback_length + 1 < SIZEOF (phase2_pushback));
+              c = phase2_getc ();
+              if (c == UEOF)
+                goto eof;
+              if (c != end[j])
+                {
+                  /* Don't push the first character back, so that the next
+                     iteration starts from the second character.  */
+                  if (j > 0)
+                    {
+                      int k = j;
+
+                      phase2_ungetc (c);
+                      k--;
+
+                      for (; k > 0; k--)
+                        phase2_ungetc (end[k]);
+                    }
+                  break;
+                }
+            }
+
+          if (end[j] != '\0')
+            continue;
+
+          c = phase2_getc ();
+          if (c == UEOF)
+            goto eof;
+          if (c != '>')
+            {
+              error_with_progname = false;
+              error (0, 0,
+                     _("%s:%d: warning: %s is not allowed"),
+                     logical_file_name, line_number,
+                     end);
+              error_with_progname = true;
+              return false;  /* End marker seen but not followed by '>'.  */
+            }
+          return true;  /* A complete markup section was consumed.  */
+        }
+    }
+  return false;  /* None of the start markers matched.  */
+
+ eof:
+  error_with_progname = false;
+  error (0, 0,
+         _("%s:%d: warning: unterminated XML markup"),
+         logical_file_name, line_number);
+  error_with_progname = true;
+  return false;
+}
+
  static void
  phase5_get (token_ty *tp)
  {
@@ -1314,13 +1423,93 @@ phase5_get (token_ty *tp)
          /* Identify operators. The multiple character ones are simply ignored
           * as they are recognized here and are otherwise not relevant. */
          case '-': case '*': /* '+' and '/' are not listed here! */
-        case '%': case '<': case '>': case '=':
+        case '%':
          case '~': case '!': case '|': case '&': case '^':
          case '?': case ':':
            tp->type = last_token_type = token_type_operator;
            return;
  
+        case '=':
+          tp->type = last_token_type = token_type_equal;
+          return;
+
+        case '<':
+          {
+            /* We assume:
+               - XMLMarkup and XMLElement are only allowed after '=' or '('
+               - embedded JavaScript expressions in XML do not recurse
+             */
+            if (xml_element_depth > 0
+                || (!inside_embedded_js_in_xml
+                    && (last_token_type == token_type_equal
+                        || last_token_type == token_type_lparen)))
+              {
+                /* Comments, PI, or CDATA.  */
+                if (phase5_scan_xml_markup (tp))
+                  return;
+                c = phase2_getc ();
+
+                /* Closing tag.  */
+                if (c == '/')
+                  lexical_context = lc_xml_close_tag;
+
+                /* Opening element.  */
+                else
+                  {
+                    phase2_ungetc (c);
+                    lexical_context = lc_xml_open_tag;
+                    xml_element_depth++;
+                  }
+
+                tp->type = last_token_type = token_type_other;
+              }
+            else
+              tp->type = last_token_type = token_type_operator;
+          }
+          return;
+
+        case '>':
+          if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
+            {
+              switch (lexical_context)
+                {
+                case lc_xml_open_tag:
+                  lexical_context = lc_xml_content;
+                  break;
+
+                case lc_xml_close_tag:
+                  if (xml_element_depth-- > 0)
+                    lexical_context = lc_xml_content;
+                  else
+                    lexical_context = lc_outside;
+                  break;
+
+                default:
+                  break;
+                }
+              tp->type = last_token_type = token_type_other;
+            }
+          else
+            tp->type = last_token_type = token_type_operator;
+          return;
+
          case '/':
+          if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
+            {
+              /* If it appears in an opening tag of an XML element, it's
+                 part of '/>'.  */
+              if (lexical_context == lc_xml_open_tag)
+                {
+                  c = phase2_getc ();
+                  if (c == '>')
+                    lexical_context = lc_outside;
+                  else
+                    phase2_ungetc (c);
+                }
+              tp->type = last_token_type = token_type_other;
+              return;
+            }
+
            /* Either a division operator or the start of a regular
               expression literal.  If the '/' token is spotted after a
               symbol it's a division, otherwise it's a regular
@@ -1336,6 +1525,18 @@ phase5_get (token_ty *tp)
              }
            return;
  
+        case '{':
+          if (xml_element_depth > 0 && !inside_embedded_js_in_xml)
+            inside_embedded_js_in_xml = true;
+          tp->type = last_token_type = token_type_other;
+          return;
+
+        case '}':
+          if (xml_element_depth > 0 && inside_embedded_js_in_xml)
+            inside_embedded_js_in_xml = false;
+          tp->type = last_token_type = token_type_other;
+          return;
+
          case '(':
            tp->type = last_token_type = token_type_lparen;
            return;
@@ -1598,6 +1799,7 @@ extract_balanced (message_list_ty *mlp,
          case token_type_plus:
          case token_type_regexp:
          case token_type_operator:
+        case token_type_equal:
          case token_type_other:
            next_context_iter = null_context_list_iterator;
            state = 0;
@@ -1628,6 +1830,8 @@ extract_javascript (FILE *f,
    last_comment_line = -1;
    last_non_comment_line = -1;
  
+  xml_element_depth = 0;
+
    xgettext_current_file_source_encoding = xgettext_global_source_encoding;
  #if HAVE_ICONV
    xgettext_current_file_source_iconv = xgettext_global_source_iconv;
diff --git a/gettext-tools/src/xgettext.h b/gettext-tools/src/xgettext.h

index 16540fe9ccf188818cc552bc92f433a2fc32a6e9..2f8a084869f5998695fa831cba36ea9f03580d3d 100644 (file)
--- a/gettext-tools/src/xgettext.h
+++ b/gettext-tools/src/xgettext.h
@@ -144,7 +144,12 @@ typedef enum
    {
      lc_outside, /* Initial context: outside of comments and strings.  */
      lc_comment, /* Inside a comment.  */
-    lc_string   /* Inside a string literal.  */
+    lc_string,  /* Inside a string literal.  */
+
+    /* For embedded XML in programming code, like E4X in JavaScript.  */
+    lc_xml_open_tag,   /* Inside an opening tag of an XML element.  */
+    lc_xml_close_tag,  /* Inside a closing tag of an XML element.  */
+    lc_xml_content     /* Inside an XML text node.  */
    }
    lexical_context_ty;
  
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog

index 33efe8da745fc386933ba4ef5ca6b50a4f13db59..42c680bcecd5052e1d4881dda620569885f95637 100644 (file)
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,9 @@
+2013-11-20  Daiki Ueno  <ueno@gnu.org>
+
+       xgettext: Add E4X support to JavaScript scanner
+       * Makefile.am (TESTS): Add xgettext-javascript-6.
+       * xgettext-javascript-6: New file.
+
  2013-11-14  Daiki Ueno  <ueno@gnu.org>
  
         * xgettext-javascript-1: Add a test to extract translator comments.
diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am

index e1e0df26520f2b2a3f3780dc69ba7a76e75779af..d48aa7b4245ff4c56d2f88127baebc78dd992d25 100644 (file)
--- a/gettext-tools/tests/Makefile.am
+++ b/gettext-tools/tests/Makefile.am
@@ -102,7 +102,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \
         xgettext-ycp-1 xgettext-ycp-2 xgettext-ycp-3 xgettext-ycp-4 \
         xgettext-lua-1 xgettext-lua-2 \
         xgettext-javascript-1 xgettext-javascript-2 xgettext-javascript-3 \
-       xgettext-javascript-4 xgettext-javascript-5 \
+       xgettext-javascript-4 xgettext-javascript-5 xgettext-javascript-6 \
         xgettext-vala-1 \
         xgettext-gsettings-1 \
         format-awk-1 format-awk-2 \
diff --git a/gettext-tools/tests/xgettext-javascript-6 b/gettext-tools/tests/xgettext-javascript-6

new file mode 100644 (file)

index 0000000..a891ebe
--- /dev/null
+++ b/gettext-tools/tests/xgettext-javascript-6
@@ -0,0 +1,75 @@
+#!/bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test of JavaScript E4X support: strings inside XML comments, CDATA sections, and processing instructions must not be extracted.
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles xg-js-6.js"
+cat <<\EOF > xg-js-6.js
+var x1 = <x1></x1>;
+var s1 = _("Expected translation string #1");
+var s2 = "foo";
+var x2 = <{s2}>foo {s2} bar</{s2}>;
+var x3 = <x3 a1="/"><x4>{_("Expected translation string #2")}</x4></x3>;
+var x4 = <x5 a2='/'><x{_("Expected translation string #3")}>
+</x{_("Expected translation string #3")}></x5>;
+var x4 = <![CDATA[
+  _("Unexpected translation string #1")
+]]>;
+var x5 = <!-- - _("Unexpected translation string #2") - -->;
+var s6 = _("Expected translation string #4");
+var x6 = <? _("Unexpected translation string #3") ?>;
+var x7 = <!--- this is a comment --> <foo>
+</foo>;
+EOF
+
+tmpfiles="$tmpfiles xg-js-6.err xg-js-6.tmp xg-js-6.pot"
+: ${XGETTEXT=xgettext}
+${XGETTEXT} --add-comments --no-location -o xg-js-6.tmp xg-js-6.js 2>xg-js-6.err
+test $? = 0 || { cat xg-js-6.err; rm -fr $tmpfiles; exit 1; }
+# Don't simplify this to "grep ... < xg-js-6.tmp", otherwise OpenBSD 4.0 grep
+# only outputs "Binary file (standard input) matches".
+cat xg-js-6.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-js-6.pot
+
+tmpfiles="$tmpfiles xg-js-6.ok"
+cat <<\EOF > xg-js-6.ok
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"Report-Msgid-Bugs-To: \n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"Language: \n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+msgid "Expected translation string #1"
+msgstr ""
+
+msgid "Expected translation string #2"
+msgstr ""
+
+msgid "Expected translation string #3"
+msgstr ""
+
+msgid "Expected translation string #4"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} xg-js-6.ok xg-js-6.pot
+result=$?
+
+rm -fr $tmpfiles
+
+exit $result
author	Daiki Ueno <ueno@gnu.org>
	Wed, 20 Nov 2013 03:41:20 +0000 (12:41 +0900)
committer	Daiki Ueno <ueno@gnu.org>
	Wed, 4 Dec 2013 10:53:11 +0000 (19:53 +0900)
gettext-tools/src/ChangeLog		patch \| blob \| blame \| history
gettext-tools/src/x-javascript.c		patch \| blob \| blame \| history
gettext-tools/src/xgettext.h		patch \| blob \| blame \| history
gettext-tools/tests/ChangeLog		patch \| blob \| blame \| history
gettext-tools/tests/Makefile.am		patch \| blob \| blame \| history
gettext-tools/tests/xgettext-javascript-6	[new file with mode: 0644]	patch \| blob