git.ipfire.org Git - thirdparty/gettext.git/commitdiff
xgettext: PHP: Fix parsing of strings and heredocs with embedded expressions.
author Bruno Haible <bruno@clisp.org>
Wed, 28 Jan 2026 12:50:21 +0000 (13:50 +0100)
committer Bruno Haible <bruno@clisp.org>
Wed, 28 Jan 2026 12:50:21 +0000 (13:50 +0100)
Reported by Gert Jan Schoneveld <gertjan@facilityapps.com> in
<https://lists.gnu.org/archive/html/bug-gettext/2025-08/msg00007.html>
and by Benoit Waldmann <benoit.waldmann@siagilus.fr>
at <https://savannah.gnu.org/bugs/index.php?item_id=67948>.

* gettext-tools/src/x-php.c (enum token_type_ty): New enum values
token_type_lbrace, token_type_rbrace.
(struct php_extractor): New field brace_nesting_depth.
(php_extractor_init_rest): Initialize it.
(extract_balanced): Forward-declare.
(process_dquote_or_heredoc): Change handling of embedded expressions.
(phase4_get): Recognize token_type_lbrace, token_type_rbrace.
(extract_balanced): Recurse also for token_type_lbrace.
(extract_php_input): Remove function.
(extract_php): Inline it here.
* gettext-tools/tests/xgettext-php-1: Add more tests of embedded expressions.

gettext-tools/src/x-php.c
gettext-tools/tests/xgettext-php-1

index 83b9011e350a8823e3e90f9b935446246da17a66..071e188b171da183af5320b025fd9c076c4a3c8d 100644 (file)
@@ -151,6 +151,8 @@ enum token_type_ty
   token_type_dot,               /* . */
   token_type_operator1,         /* * / % ++ -- */
   token_type_operator2,         /* + - ! ~ @ */
+  token_type_lbrace,            /* { */
+  token_type_rbrace,            /* } */
   token_type_string_literal,    /* "abc" */
   token_type_symbol,            /* symbol, number */
   token_type_other              /* misc. operator */
@@ -211,6 +213,7 @@ struct php_extractor
   /* Current nesting depths.  */
   int paren_nesting_depth;
   int bracket_nesting_depth;
+  int brace_nesting_depth;
 };
 
 static inline void
@@ -235,10 +238,15 @@ php_extractor_init_rest (struct php_extractor *xp)
 
   xp->paren_nesting_depth = 0;
   xp->bracket_nesting_depth = 0;
+  xp->brace_nesting_depth = 0;
 }
 
 /* Forward declarations.  */
-static void extract_php_input (struct php_extractor *xp);
+static bool extract_balanced (struct php_extractor *xp,
+                              token_type_ty delim,
+                              flag_region_ty *outer_region,
+                              flag_context_list_iterator_ty context_iter,
+                              struct arglist_parser *argparser);
 
 
 /* ======================== Reading of characters.  ======================== */
@@ -973,81 +981,16 @@ process_dquote_or_heredoc (struct php_extractor *xp, bool heredoc)
 
  string_with_embedded_expressions:
   is_constant = false;
-  {
-    size_t nesting_stack_alloc = 10;
-    char *nesting_stack = xmalloc (nesting_stack_alloc);
-    size_t nesting_stack_depth = 0;
-    /* We just read a '{', so expect a matching '}'.  */
-    nesting_stack[nesting_stack_depth++] = '}';
-
-    /* Find the extent of the expression.  */
-    struct string_buffer buffer;
-    sb_init (&buffer);
-    for (;;)
-      {
-        int c;
-
-        c = phase1_getc (xp);
-        if (!heredoc && c == EOF)
-          break;
-        if (c == (heredoc ? EOF : '"'))
-          {
-            if (nesting_stack_depth > 0)
-              if_error (IF_SEVERITY_WARNING,
-                        logical_file_name, xp->line_number, (size_t)(-1), false,
-                        heredoc
-                        ? _("unterminated expression in heredoc, expected a '%c'")
-                        : _("unterminated expression in string literal, expected a '%c'"),
-                        nesting_stack[nesting_stack_depth - 1]);
-            break;
-          }
-        if (heredoc && c == '\n')
-          xp->line_number++;
-        if (c == '{' || c == '[' || c == '(')
-          {
-            if (nesting_stack_depth >= nesting_stack_alloc)
-              {
-                nesting_stack_alloc = 2 * nesting_stack_alloc;
-                nesting_stack =
-                  xrealloc (nesting_stack, nesting_stack_alloc);
-              }
-            nesting_stack[nesting_stack_depth++] =
-              (c == '{' ? '}' : c == '[' ? ']' : ')');
-          }
-        else if (c == '}' || c == ']' || c == ')')
-          {
-            if (nesting_stack_depth > 0
-                && c == nesting_stack[nesting_stack_depth - 1])
-              {
-                if (--nesting_stack_depth == 0)
-                  break;
-              }
-            else
-              if_error (IF_SEVERITY_WARNING,
-                        logical_file_name, xp->line_number, (size_t)(-1), false,
-                        heredoc
-                        ? _("unterminated expression in heredoc contains unbalanced '%c'")
-                        : _("unterminated expression in string literal contains unbalanced '%c'"),
-                        c);
-          }
-        sb_xappend1 (&buffer, c);
-      }
-
-    /* Recursively extract messages from the expression.  */
-    string_desc_t substring = sb_contents (&buffer);
-
-    struct php_extractor *rxp = XMALLOC (struct php_extractor);
-    rxp->mlp = xp->mlp;
-    sf_istream_init_from_string_desc (&rxp->input, substring);
-    rxp->line_number = xp->line_number;
-    php_extractor_init_rest (rxp);
-
-    extract_php_input (rxp);
-
-    free (rxp);
-    sb_free (&buffer);
-    free (nesting_stack);
-  }
+  if (++(xp->brace_nesting_depth) > MAX_NESTING_DEPTH)
+    if_error (IF_SEVERITY_FATAL_ERROR,
+              logical_file_name, xp->line_number, (size_t)(-1), false,
+              _("too many open braces"));
+  if (extract_balanced (xp, token_type_rbrace,
+                        null_context_region (),
+                        null_context_list_iterator,
+                        arglist_parser_alloc (xp->mlp, NULL)))
+    return NULL;
+  xp->brace_nesting_depth--;
   goto string_continued;
 }
 
@@ -1283,6 +1226,14 @@ phase4_get (struct php_extractor *xp, token_ty *tp)
           tp->type = token_type_operator2;
           return;
 
+        case '{':
+          tp->type = token_type_lbrace;
+          return;
+
+        case '}':
+          tp->type = token_type_rbrace;
+          return;
+
         case '<':
           {
             int c2 = phase1_getc (xp);
@@ -1698,8 +1649,8 @@ static flag_context_list_table_ty *flag_context_list_table;
 
 /* Extract messages until the next balanced closing parenthesis or bracket.
    Extracted messages are added to XP->MLP.
-   DELIM can be either token_type_rparen or token_type_rbracket, or
-   token_type_eof to accept both.
+   DELIM can be either token_type_rparen or token_type_rbracket or
+   token_type_rbrace, or token_type_eof to accept any of them.
    Return true upon eof, false upon closing parenthesis or bracket.  */
 static bool
 extract_balanced (struct php_extractor *xp,
@@ -1824,6 +1775,36 @@ extract_balanced (struct php_extractor *xp,
           state = 0;
           break;
 
+        case token_type_lbrace:
+          if (++(xp->brace_nesting_depth) > MAX_NESTING_DEPTH)
+            if_error (IF_SEVERITY_FATAL_ERROR,
+                      logical_file_name, xp->line_number, (size_t)(-1), false,
+                      _("too many open braces"));
+          if (extract_balanced (xp, token_type_rbrace,
+                                null_context_region (),
+                                null_context_list_iterator,
+                                arglist_parser_alloc (xp->mlp, NULL)))
+            {
+              arglist_parser_done (argparser, arg);
+              unref_region (inner_region);
+              return true;
+            }
+          xp->brace_nesting_depth--;
+          next_context_iter = null_context_list_iterator;
+          state = 0;
+          break;
+
+        case token_type_rbrace:
+          if (delim == token_type_rbrace || delim == token_type_eof)
+            {
+              arglist_parser_done (argparser, arg);
+              unref_region (inner_region);
+              return false;
+            }
+          next_context_iter = null_context_list_iterator;
+          state = 0;
+          break;
+
         case token_type_string_literal:
           {
             lex_pos_ty pos;
@@ -1870,18 +1851,6 @@ extract_balanced (struct php_extractor *xp,
 }
 
 
-static void
-extract_php_input (struct php_extractor *xp)
-{
-  /* Eat tokens until eof is seen.  When extract_balanced returns
-     due to an unbalanced closing parenthesis, just restart it.  */
-  while (!extract_balanced (xp, token_type_eof,
-                            null_context_region (), null_context_list_iterator,
-                            arglist_parser_alloc (xp->mlp, NULL)))
-    ;
-}
-
-
 void
 extract_php (FILE *f,
              const char *real_filename, const char *logical_filename,
@@ -1904,7 +1873,12 @@ extract_php (FILE *f,
   /* Initial mode is HTML mode, not PHP mode.  */
   skip_html (xp);
 
-  extract_php_input (xp);
+  /* Eat tokens until eof is seen.  When extract_balanced returns
+     due to an unbalanced closing parenthesis, just restart it.  */
+  while (!extract_balanced (xp, token_type_eof,
+                            null_context_region (), null_context_list_iterator,
+                            arglist_parser_alloc (xp->mlp, NULL)))
+    ;
 
   /* Close scanner.  */
   free (xp);
index 1b403bc8c3b65e655fc8618d608c94ffe7d78916..53dcb800709453b29ba80c27b89fd9c920c34af5 100755 (executable)
@@ -24,22 +24,34 @@ echo _("embedded_1_$foo bar");
 echo _("embedded_2_${foo}bar");
 echo _("embedded_3_{$foo}bar");
 echo _("embedded_4_{$array[func(_('embedded_4_sub1'))]}_bar_{$array[func(_('embedded_4_sub2'))]}_baz");
-echo _("embedded_5");
+echo _("embedded_5_{$array[func(_("embedded_5_sub1"))]}_bar_{$array[func(_("embedded_5_sub2"))]}_baz");
+echo _("embedded_6_{$array[func(_("embedded_6_sub]}1"))]}_bar_{$array[func(_("embedded_6_sub]}2"))]}_baz");
+echo _("embedded_7_{$array[func(_("embedded_7_sub\"1"))]}_bar_{$array[func(_("embedded_7_sub\"2"))]}_baz");
+echo _("embedded_8");
 // Heredoc with with embedded expressions.
 echo _(<<<EOT
-embedded_6_$foo bar
+embedded_11_$foo bar
 EOT);
 echo _(<<<EOT
-embedded_7_${foo}bar
+embedded_12_${foo}bar
 EOT);
 echo _(<<<EOT
-embedded_8_{$foo}bar
+embedded_13_{$foo}bar
 EOT);
 echo _(<<<EOT
-embedded_9_{$array[func(_('embedded_9_sub1'))]}_bar_{$array[func(_('embedded_9_sub2'))]}_baz
+embedded_14_{$array[func(_('embedded_14_sub1'))]}_bar_{$array[func(_('embedded_14_sub2'))]}_baz
 EOT);
 echo _(<<<EOT
-embedded_10
+embedded_15_{$array[func(_("embedded_15_sub1"))]}_bar_{$array[func(_("embedded_15_sub2"))]}_baz
+EOT);
+echo _(<<<EOT
+embedded_16_{$array[func(_("embedded_16_sub]}1"))]}_bar_{$array[func(_("embedded_16_sub]}2"))]}_baz
+EOT);
+echo _(<<<EOT
+embedded_17_{$array[func(_("embedded_17_sub\"1"))]}_bar_{$array[func(_("embedded_17_sub\"2"))]}_baz
+EOT);
+echo _(<<<EOT
+embedded_18
 EOT);
 ?>
 EOF
@@ -76,16 +88,52 @@ msgstr ""
 msgid "embedded_4_sub2"
 msgstr ""
 
-msgid "embedded_5"
+msgid "embedded_5_sub1"
+msgstr ""
+
+msgid "embedded_5_sub2"
+msgstr ""
+
+msgid "embedded_6_sub]}1"
+msgstr ""
+
+msgid "embedded_6_sub]}2"
+msgstr ""
+
+msgid "embedded_7_sub\"1"
+msgstr ""
+
+msgid "embedded_7_sub\"2"
+msgstr ""
+
+msgid "embedded_8"
+msgstr ""
+
+msgid "embedded_14_sub1"
+msgstr ""
+
+msgid "embedded_14_sub2"
+msgstr ""
+
+msgid "embedded_15_sub1"
+msgstr ""
+
+msgid "embedded_15_sub2"
+msgstr ""
+
+msgid "embedded_16_sub]}1"
+msgstr ""
+
+msgid "embedded_16_sub]}2"
 msgstr ""
 
-msgid "embedded_9_sub1"
+msgid "embedded_17_sub\"1"
 msgstr ""
 
-msgid "embedded_9_sub2"
+msgid "embedded_17_sub\"2"
 msgstr ""
 
-msgid "embedded_10"
+msgid "embedded_18"
 msgstr ""
 EOF