From: Bruno Haible Date: Wed, 28 Jan 2026 12:50:21 +0000 (+0100) Subject: xgettext: PHP: Fix parsing of strings and heredocs with embedded expressions. X-Git-Tag: v1.0~8 X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=053b5ef04dcdd077d66ed91470432be8c9c8a49e;p=thirdparty%2Fgettext.git xgettext: PHP: Fix parsing of strings and heredocs with embedded expressions. Reported by Gert Jan Schoneveld in [report URL missing] and by Benoit Waldmann at [report URL missing]. * gettext-tools/src/x-php.c (enum token_type_ty): New enum values token_type_lbrace, token_type_rbrace. (struct php_extractor): New field brace_nesting_depth. (php_extractor_init_rest): Initialize it. (extract_balanced): Forward-declare. (process_dquote_or_heredoc): Change handling of embedded expressions. (phase4_get): Recognize token_type_lbrace, token_type_rbrace. (extract_balanced): Recurse also for token_type_lbrace. (extract_php_input): Remove function. (extract_php): Inline it here. * gettext-tools/tests/xgettext-php-1: Add more tests of embedded expressions. --- diff --git a/gettext-tools/src/x-php.c b/gettext-tools/src/x-php.c index 83b9011e3..071e188b1 100644 --- a/gettext-tools/src/x-php.c +++ b/gettext-tools/src/x-php.c @@ -151,6 +151,8 @@ enum token_type_ty token_type_dot, /* . */ token_type_operator1, /* * / % ++ -- */ token_type_operator2, /* + - ! ~ @ */ + token_type_lbrace, /* { */ + token_type_rbrace, /* } */ token_type_string_literal, /* "abc" */ token_type_symbol, /* symbol, number */ token_type_other /* misc. operator */ @@ -211,6 +213,7 @@ struct php_extractor /* Current nesting depths. */ int paren_nesting_depth; int bracket_nesting_depth; + int brace_nesting_depth; }; static inline void @@ -235,10 +238,15 @@ php_extractor_init_rest (struct php_extractor *xp) xp->paren_nesting_depth = 0; xp->bracket_nesting_depth = 0; + xp->brace_nesting_depth = 0; } /* Forward declarations. 
*/ -static void extract_php_input (struct php_extractor *xp); +static bool extract_balanced (struct php_extractor *xp, + token_type_ty delim, + flag_region_ty *outer_region, + flag_context_list_iterator_ty context_iter, + struct arglist_parser *argparser); /* ======================== Reading of characters. ======================== */ @@ -973,81 +981,16 @@ process_dquote_or_heredoc (struct php_extractor *xp, bool heredoc) string_with_embedded_expressions: is_constant = false; - { - size_t nesting_stack_alloc = 10; - char *nesting_stack = xmalloc (nesting_stack_alloc); - size_t nesting_stack_depth = 0; - /* We just read a '{', so expect a matching '}'. */ - nesting_stack[nesting_stack_depth++] = '}'; - - /* Find the extent of the expression. */ - struct string_buffer buffer; - sb_init (&buffer); - for (;;) - { - int c; - - c = phase1_getc (xp); - if (!heredoc && c == EOF) - break; - if (c == (heredoc ? EOF : '"')) - { - if (nesting_stack_depth > 0) - if_error (IF_SEVERITY_WARNING, - logical_file_name, xp->line_number, (size_t)(-1), false, - heredoc - ? _("unterminated expression in heredoc, expected a '%c'") - : _("unterminated expression in string literal, expected a '%c'"), - nesting_stack[nesting_stack_depth - 1]); - break; - } - if (heredoc && c == '\n') - xp->line_number++; - if (c == '{' || c == '[' || c == '(') - { - if (nesting_stack_depth >= nesting_stack_alloc) - { - nesting_stack_alloc = 2 * nesting_stack_alloc; - nesting_stack = - xrealloc (nesting_stack, nesting_stack_alloc); - } - nesting_stack[nesting_stack_depth++] = - (c == '{' ? '}' : c == '[' ? ']' : ')'); - } - else if (c == '}' || c == ']' || c == ')') - { - if (nesting_stack_depth > 0 - && c == nesting_stack[nesting_stack_depth - 1]) - { - if (--nesting_stack_depth == 0) - break; - } - else - if_error (IF_SEVERITY_WARNING, - logical_file_name, xp->line_number, (size_t)(-1), false, - heredoc - ? 
_("unterminated expression in heredoc contains unbalanced '%c'") - : _("unterminated expression in string literal contains unbalanced '%c'"), - c); - } - sb_xappend1 (&buffer, c); - } - - /* Recursively extract messages from the expression. */ - string_desc_t substring = sb_contents (&buffer); - - struct php_extractor *rxp = XMALLOC (struct php_extractor); - rxp->mlp = xp->mlp; - sf_istream_init_from_string_desc (&rxp->input, substring); - rxp->line_number = xp->line_number; - php_extractor_init_rest (rxp); - - extract_php_input (rxp); - - free (rxp); - sb_free (&buffer); - free (nesting_stack); - } + if (++(xp->brace_nesting_depth) > MAX_NESTING_DEPTH) + if_error (IF_SEVERITY_FATAL_ERROR, + logical_file_name, xp->line_number, (size_t)(-1), false, + _("too many open braces")); + if (extract_balanced (xp, token_type_rbrace, + null_context_region (), + null_context_list_iterator, + arglist_parser_alloc (xp->mlp, NULL))) + return NULL; + xp->brace_nesting_depth--; goto string_continued; } @@ -1283,6 +1226,14 @@ phase4_get (struct php_extractor *xp, token_ty *tp) tp->type = token_type_operator2; return; + case '{': + tp->type = token_type_lbrace; + return; + + case '}': + tp->type = token_type_rbrace; + return; + case '<': { int c2 = phase1_getc (xp); @@ -1698,8 +1649,8 @@ static flag_context_list_table_ty *flag_context_list_table; /* Extract messages until the next balanced closing parenthesis or bracket. Extracted messages are added to XP->MLP. - DELIM can be either token_type_rparen or token_type_rbracket, or - token_type_eof to accept both. + DELIM can be either token_type_rparen or token_type_rbracket or + token_type_rbrace, or token_type_eof to accept any of them. Return true upon eof, false upon closing parenthesis or bracket. 
*/ static bool extract_balanced (struct php_extractor *xp, @@ -1824,6 +1775,36 @@ extract_balanced (struct php_extractor *xp, state = 0; break; + case token_type_lbrace: + if (++(xp->brace_nesting_depth) > MAX_NESTING_DEPTH) + if_error (IF_SEVERITY_FATAL_ERROR, + logical_file_name, xp->line_number, (size_t)(-1), false, + _("too many open braces")); + if (extract_balanced (xp, token_type_rbrace, + null_context_region (), + null_context_list_iterator, + arglist_parser_alloc (xp->mlp, NULL))) + { + arglist_parser_done (argparser, arg); + unref_region (inner_region); + return true; + } + xp->brace_nesting_depth--; + next_context_iter = null_context_list_iterator; + state = 0; + break; + + case token_type_rbrace: + if (delim == token_type_rbrace || delim == token_type_eof) + { + arglist_parser_done (argparser, arg); + unref_region (inner_region); + return false; + } + next_context_iter = null_context_list_iterator; + state = 0; + break; + case token_type_string_literal: { lex_pos_ty pos; @@ -1870,18 +1851,6 @@ extract_balanced (struct php_extractor *xp, } -static void -extract_php_input (struct php_extractor *xp) -{ - /* Eat tokens until eof is seen. When extract_balanced returns - due to an unbalanced closing parenthesis, just restart it. */ - while (!extract_balanced (xp, token_type_eof, - null_context_region (), null_context_list_iterator, - arglist_parser_alloc (xp->mlp, NULL))) - ; -} - - void extract_php (FILE *f, const char *real_filename, const char *logical_filename, @@ -1904,7 +1873,12 @@ extract_php (FILE *f, /* Initial mode is HTML mode, not PHP mode. */ skip_html (xp); - extract_php_input (xp); + /* Eat tokens until eof is seen. When extract_balanced returns + due to an unbalanced closing parenthesis, just restart it. */ + while (!extract_balanced (xp, token_type_eof, + null_context_region (), null_context_list_iterator, + arglist_parser_alloc (xp->mlp, NULL))) + ; /* Close scanner. 
*/ free (xp); diff --git a/gettext-tools/tests/xgettext-php-1 b/gettext-tools/tests/xgettext-php-1 index 1b403bc8c..53dcb8007 100755 --- a/gettext-tools/tests/xgettext-php-1 +++ b/gettext-tools/tests/xgettext-php-1 @@ -24,22 +24,34 @@ echo _("embedded_1_$foo bar"); echo _("embedded_2_${foo}bar"); echo _("embedded_3_{$foo}bar"); echo _("embedded_4_{$array[func(_('embedded_4_sub1'))]}_bar_{$array[func(_('embedded_4_sub2'))]}_baz"); -echo _("embedded_5"); +echo _("embedded_5_{$array[func(_("embedded_5_sub1"))]}_bar_{$array[func(_("embedded_5_sub2"))]}_baz"); +echo _("embedded_6_{$array[func(_("embedded_6_sub]}1"))]}_bar_{$array[func(_("embedded_6_sub]}2"))]}_baz"); +echo _("embedded_7_{$array[func(_("embedded_7_sub\"1"))]}_bar_{$array[func(_("embedded_7_sub\"2"))]}_baz"); +echo _("embedded_8"); // Heredoc with with embedded expressions. echo _(<< EOF @@ -76,16 +88,52 @@ msgstr "" msgid "embedded_4_sub2" msgstr "" -msgid "embedded_5" +msgid "embedded_5_sub1" +msgstr "" + +msgid "embedded_5_sub2" +msgstr "" + +msgid "embedded_6_sub]}1" +msgstr "" + +msgid "embedded_6_sub]}2" +msgstr "" + +msgid "embedded_7_sub\"1" +msgstr "" + +msgid "embedded_7_sub\"2" +msgstr "" + +msgid "embedded_8" +msgstr "" + +msgid "embedded_14_sub1" +msgstr "" + +msgid "embedded_14_sub2" +msgstr "" + +msgid "embedded_15_sub1" +msgstr "" + +msgid "embedded_15_sub2" +msgstr "" + +msgid "embedded_16_sub]}1" +msgstr "" + +msgid "embedded_16_sub]}2" msgstr "" -msgid "embedded_9_sub1" +msgid "embedded_17_sub\"1" msgstr "" -msgid "embedded_9_sub2" +msgid "embedded_17_sub\"2" msgstr "" -msgid "embedded_10" +msgid "embedded_18" msgstr "" EOF