From: Bruno Haible <bruno@clisp.org>
Date: Mon, 6 Apr 2020 21:29:56 +0000 (+0200)
Subject: xgettext: Improve JSX support in JavaScript.
X-Git-Tag: v0.21~108
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f6fd2276ef2dab61f6120055f5b06c8d5aa899b1;p=thirdparty%2Fgettext.git

xgettext: Improve JSX support in JavaScript.

Reported by Vaclav Slavik <vaclav@slavik.io> in
<https://savannah.gnu.org/bugs/?57927>.

* gettext-tools/src/x-javascript.c (enum token_type_ty): Add some more types.
(is_after_expression): New function.
(phase5_get): Distinguish the "other" tokens in more detail. Use
'is_after_expression ()' when interpreting a '<' or '/' character.
(extract_balanced): Don't let commas in braced expressions and in XML elements
disturb the argument processing outside.
(extract_javascript): Update.
* gettext-tools/tests/xgettext-javascript-6: Enhance the test.
* NEWS: Mention the change.
---

diff --git a/NEWS b/NEWS
index ded2f208f..9ec77fde8 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,8 @@ Version 0.20.2 - April 2020
     o xgettext now recognizes format strings in the Formatter syntax.  They
       are marked as 'java-printf-format' in POT and PO files.
     o xgettext now recognizes text blocks as string literals.
+  - JavaScript:
+    xgettext parses JSX expressions more reliably.
   - Desktop Entry:
     The value of the 'Icon' property is no longer extracted into the POT file
     by xgettext.  The documentation explains how to localize icons.
diff --git a/gettext-tools/src/x-javascript.c b/gettext-tools/src/x-javascript.c
index 564abd3a9..34da71760 100644
--- a/gettext-tools/src/x-javascript.c
+++ b/gettext-tools/src/x-javascript.c
@@ -62,7 +62,16 @@
 
 /* The JavaScript aka ECMA-Script syntax is defined in ECMA-262
    specification:
-   https://www.ecma-international.org/publications/standards/Ecma-262.htm */
+   <https://www.ecma-international.org/publications/standards/Ecma-262.htm>
+
+   Regarding the XML element support:
+   The earlier standard E4X
+   <https://en.wikipedia.org/wiki/ECMAScript_for_XML>
+   <https://web.archive.org/web/20131104082608/http://www.ecma-international.org/publications/standards/Ecma-357.htm>
+   is no longer widely supported.
+   Instead, nowadays, JSX is widely used.
+   <https://facebook.github.io/jsx/>
+*/
 
 /* ====================== Keyword set customization.  ====================== */
 
@@ -654,9 +663,13 @@ phase3_ungetc (int c)
 enum token_type_ty
 {
   token_type_eof,
+  token_type_start,
   token_type_lparen,            /* ( */
   token_type_rparen,            /* ) */
+  token_type_lbrace,            /* { */
+  token_type_rbrace,            /* } */
   token_type_comma,             /* , */
+  token_type_dot,               /* . */
   token_type_lbracket,          /* [ */
   token_type_rbracket,          /* ] */
   token_type_plus,              /* + */
@@ -668,6 +681,10 @@ enum token_type_ty
   token_type_ltemplate,         /* left part of template: `abc${ */
   token_type_mtemplate,         /* middle part of template: }abc${ */
   token_type_rtemplate,         /* right part of template: }abc` */
+  token_type_xml_tag,           /* < or </ */
+  token_type_xml_element_start, /* last token of < ... > */
+  token_type_xml_element_end,   /* last token of </ ... > */
+  token_type_xml_empty_element, /* last token of < ... /> */
   token_type_keyword,           /* return, else */
   token_type_symbol,            /* symbol, number */
   token_type_other              /* misc. operator */
@@ -894,39 +911,83 @@ static int phase5_pushback_length;
 
 static token_type_ty last_token_type;
 
+/* Returns true if last_token_type indicates that we have just seen a token
+   that may end an expression.  In this case, '<', '>', and '/' need to be
+   interpreted as operators, rather than as XML markup or the start of a
+   regular expression.  */
+static bool
+is_after_expression (void)
+{
+  switch (last_token_type)
+    {
+    case token_type_rparen:
+    case token_type_rbrace:
+    case token_type_rbracket:
+    case token_type_regexp:
+    case token_type_string:
+    case token_type_template:
+    case token_type_rtemplate:
+    case token_type_xml_element_end:
+    case token_type_xml_empty_element:
+    case token_type_symbol:
+      return true;
+
+    case token_type_eof:
+    case token_type_start:
+    case token_type_lparen:
+    case token_type_lbrace:
+    case token_type_comma:
+    case token_type_dot:
+    case token_type_lbracket:
+    case token_type_plus:
+    case token_type_operator:
+    case token_type_equal:
+    case token_type_ltemplate:
+    case token_type_mtemplate:
+    case token_type_xml_tag:
+    case token_type_xml_element_start:
+    case token_type_keyword:
+    case token_type_other:
+      return false;
+
+    default:
+      abort ();
+    }
+}
+
 static void
 phase5_scan_regexp (void)
 {
-    int c;
+  int c;
 
-    /* Scan for end of RegExp literal ('/').  */
-    for (;;)
-      {
-        /* Must use phase2 as there can't be comments.  */
-        c = phase2_getc ();
-        if (c == '/')
-          break;
-        if (c == '\\')
-          {
-            c = phase2_getc ();
-            if (c != UEOF)
-              continue;
-          }
-        if (c == UEOF)
-          {
-            error_with_progname = false;
-            error (0, 0,
-                   _("%s:%d: warning: RegExp literal terminated too early"),
-                   logical_file_name, line_number);
-            error_with_progname = true;
-            return;
-          }
-      }
+  /* Scan for end of RegExp literal ('/').  */
+  for (;;)
+    {
+      /* Must use phase2 as there can't be comments.  */
+      c = phase2_getc ();
+      if (c == '/')
+        break;
+      if (c == '\\')
+        {
+          c = phase2_getc ();
+          if (c != UEOF)
+            continue;
+        }
+      if (c == UEOF)
+        {
+          error_with_progname = false;
+          error (0, 0,
+                 _("%s:%d: warning: RegExp literal terminated too early"),
+                 logical_file_name, line_number);
+          error_with_progname = true;
+          return;
+        }
+    }
 
-    /* Scan for modifier flags (ECMA-262 5th section 15.10.4.1).  */
-    c = phase2_getc ();
-    if (!(c == 'g' || c == 'i' || c == 'm'))
-      phase2_ungetc (c);
+  /* Scan for modifier flags (ECMA-262 5th section 15.10.4.1).  */
+  c = phase2_getc ();
+  if (!(c == 'g' || c == 'i' || c == 'm'))
+    phase2_ungetc (c);
 }
 
 /* Number of open template literals `...${  */
@@ -1105,7 +1166,7 @@ phase5_get (token_ty *tp)
             if (!(c1 >= '0' && c1 <= '9'))
               {
 
-                tp->type = last_token_type = token_type_other;
+                tp->type = last_token_type = token_type_dot;
                 return;
               }
           }
@@ -1281,32 +1342,32 @@ phase5_get (token_ty *tp)
         case '<':
           {
             /* We assume:
-               - XMLMarkup and XMLElement are only allowed after '=' or '('
-               - embedded JavaScript expressions in XML do not recurse
+               - XMLMarkup and XMLElement are not allowed after an expression,
+               - embedded JavaScript expressions in XML do not recurse.
              */
             if (xml_element_depth > 0
                 || (!inside_embedded_js_in_xml
-                    && (last_token_type == token_type_equal
-                        || last_token_type == token_type_lparen)))
+                    && ! is_after_expression ()))
               {
                 /* Comments, PI, or CDATA.  */
                 if (phase5_scan_xml_markup (tp))
+                  /* BUG: *tp is not filled in here!  */
                   return;
                 c = phase2_getc ();
 
-                /* Closing tag.  */
                 if (c == '/')
-                  lexical_context = lc_xml_close_tag;
-
-                /* Opening element.  */
+                  {
+                    /* Closing tag.  */
+                    lexical_context = lc_xml_close_tag;
+                  }
                 else
                   {
+                    /* Opening element.  */
                     phase2_ungetc (c);
                     lexical_context = lc_xml_open_tag;
                     xml_element_depth++;
                   }
-
-                tp->type = last_token_type = token_type_other;
+                tp->type = last_token_type = token_type_xml_tag;
               }
             else
               tp->type = last_token_type = token_type_operator;
@@ -1320,22 +1381,22 @@ phase5_get (token_ty *tp)
                 {
                 case lc_xml_open_tag:
                   lexical_context = lc_xml_content;
-                  break;
+                  tp->type = last_token_type = token_type_xml_element_start;
+                  return;
 
                 case lc_xml_close_tag:
                   if (--xml_element_depth > 0)
                     lexical_context = lc_xml_content;
                   else
                     lexical_context = lc_outside;
-                  break;
+                  tp->type = last_token_type = token_type_xml_element_end;
+                  return;
 
                 default:
                   break;
                 }
-              tp->type = last_token_type = token_type_other;
             }
-          else
-            tp->type = last_token_type = token_type_operator;
+          tp->type = last_token_type = token_type_operator;
           return;
 
         case '/':
@@ -1352,21 +1413,18 @@ phase5_get (token_ty *tp)
                         lexical_context = lc_xml_content;
                       else
                         lexical_context = lc_outside;
+                      tp->type = last_token_type = token_type_xml_empty_element;
+                      return;
                     }
                   else
                     phase2_ungetc (c);
                 }
-              tp->type = last_token_type = token_type_other;
-              return;
             }
 
-          /* Either a division operator or the start of a regular
-             expression literal.  If the '/' token is spotted after a
-             symbol it's a division, otherwise it's a regular
-             expression.  */
-          if (last_token_type == token_type_symbol
-              || last_token_type == token_type_rparen
-              || last_token_type == token_type_rbracket)
+          /* Either a division operator or the start of a regular expression
+             literal.  If the '/' token is spotted after an expression, it's a
+             division; otherwise it's a regular expression.  */
+          if (is_after_expression ())
             tp->type = last_token_type = token_type_operator;
           else
             {
@@ -1380,7 +1438,7 @@ phase5_get (token_ty *tp)
             inside_embedded_js_in_xml = true;
           else
             brace_depths[template_literal_depth]++;
-          tp->type = last_token_type = token_type_other;
+          tp->type = last_token_type = token_type_lbrace;
           return;
 
         case '}':
@@ -1410,7 +1468,7 @@ phase5_get (token_ty *tp)
                 }
               return;
             }
-          tp->type = last_token_type = token_type_other;
+          tp->type = last_token_type = token_type_rbrace;
           return;
 
         case '(':
@@ -1649,6 +1707,28 @@ extract_balanced (message_list_ty *mlp,
           state = 0;
           continue;
 
+        case token_type_lbrace:
+          if (extract_balanced (mlp, token_type_rbrace,
+                                null_context, null_context_list_iterator,
+                                arglist_parser_alloc (mlp, NULL)))
+            {
+              arglist_parser_done (argparser, arg);
+              return true;
+            }
+          next_context_iter = null_context_list_iterator;
+          state = 0;
+          continue;
+
+        case token_type_rbrace:
+          if (delim == token_type_rbrace || delim == token_type_eof)
+            {
+              arglist_parser_done (argparser, arg);
+              return false;
+            }
+          next_context_iter = null_context_list_iterator;
+          state = 0;
+          continue;
+
         case token_type_string:
         case token_type_template:
           {
@@ -1676,6 +1756,28 @@ extract_balanced (message_list_ty *mlp,
           state = 0;
           continue;
 
+        case token_type_xml_element_start:
+          if (extract_balanced (mlp, token_type_xml_element_end,
+                                null_context, null_context_list_iterator,
+                                arglist_parser_alloc (mlp, NULL)))
+            {
+              arglist_parser_done (argparser, arg);
+              return true;
+            }
+          next_context_iter = null_context_list_iterator;
+          state = 0;
+          continue;
+
+        case token_type_xml_element_end:
+          if (delim == token_type_xml_element_end || delim == token_type_eof)
+            {
+              arglist_parser_done (argparser, arg);
+              return false;
+            }
+          next_context_iter = null_context_list_iterator;
+          state = 0;
+          continue;
+
         case token_type_eof:
           arglist_parser_done (argparser, arg);
           return true;
@@ -1684,10 +1786,14 @@ extract_balanced (message_list_ty *mlp,
         case token_type_mtemplate:
         case token_type_rtemplate:
         case token_type_keyword:
+        case token_type_start:
+        case token_type_dot:
         case token_type_plus:
         case token_type_regexp:
         case token_type_operator:
         case token_type_equal:
+        case token_type_xml_tag:
+        case token_type_xml_empty_element:
         case token_type_other:
           next_context_iter = null_context_list_iterator;
           state = 0;
@@ -1737,7 +1843,7 @@ extract_javascript (FILE *f,
   continuation_or_nonblank_line = false;
 
   phase5_pushback_length = 0;
-  last_token_type = token_type_other;
+  last_token_type = token_type_start;
 
   template_literal_depth = 0;
   new_brace_depth_level ();
diff --git a/gettext-tools/tests/xgettext-javascript-6 b/gettext-tools/tests/xgettext-javascript-6
index 5193eba88..da2fe60b3 100755
--- a/gettext-tools/tests/xgettext-javascript-6
+++ b/gettext-tools/tests/xgettext-javascript-6
@@ -1,7 +1,7 @@
 #!/bin/sh
 . "${srcdir=.}/init.sh"; path_prepend_ . ../src
 
-# Test of JavaScript E4X support.
+# Test of JavaScript JSX support.
 
 cat <<\EOF > xg-js-6.js
 class Foo extends React.Component {
@@ -44,6 +44,10 @@ var x8 = <x8><x9>{_("<x8>{Expected translation string #7}</x8>")}</x9></x8>
 var x9 = <x10 attr='{_("Unexpected translation string #6")}'><x11>data</x11></x10>;
 var s9 = _("Expected translation string #8");
 </foo>;
+function foo() {
+  return <a>{ 'b' }</a>;
+}
+var s10 = _("Expected translation string #9");
 EOF
 
 : ${XGETTEXT=xgettext}
@@ -96,6 +100,9 @@ msgstr ""
 
 msgid "Expected translation string #8"
 msgstr ""
+
+msgid "Expected translation string #9"
+msgstr ""
 EOF
 
 : ${DIFF=diff}