From: Bruno Haible Date: Wed, 18 Sep 2024 21:37:32 +0000 (+0200) Subject: xgettext: PHP: Recognize strings with embedded expressions. X-Git-Tag: v0.23~126 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=28abce958bf1927776c157360f7b444a1199e995;p=thirdparty%2Fgettext.git xgettext: PHP: Recognize strings with embedded expressions. * gettext-tools/src/x-php.c (enum token_type_ty, struct token_ty): Moved. (struct php_extractor): New type. (fp, phase1_pushback, phase1_pushback_length, phase2_pushback, phase2_pushback_length, buffer, bufmax, buflen, last_comment_line, last_non_comment_line, phase3_pushback, phase3_pushback_length, phase4_pushback, phase4_pushback_length, phase5_last, paren_nesting_depth, bracket_nesting_depth): Remove variables. (php_extractor_init_rest): New function. (extract_php_input): New declaration. (phase1_getc): Add a 'struct php_extractor *' parameter. Read from a string if fp == NULL. (phase1_ungetc, skip_html, comment_start, comment_add, comment_line_end, phase3_getc, phase3_ungetc): Add a 'struct php_extractor *' parameter. (phase4_get): Likewise. Add handling of embedded expressions in strings. (phase4_unget, x_php_lex): Add a 'struct php_extractor *' parameter. (extract_balanced): Add a 'struct php_extractor *' parameter. Remove mlp parameter. (extract_php_input): New function, extracted from extract_php. (extract_php): Use it. Create a 'struct php_extractor'. * gettext-tools/tests/xgettext-php-1: Add tests of strings with embedded expressions. * NEWS: Mention the change. --- diff --git a/NEWS b/NEWS index 687f8ae3d..56b66c34a 100644 --- a/NEWS +++ b/NEWS @@ -26,6 +26,7 @@ Version 0.23 - September 2024 - Tcl: With the forthcoming Tcl 9.0, characters outside the Unicode BMP in Tcl message catalogs (.msg files) will work regardless of the locale's encoding. + - PHP: Strings with embedded expressions are now recognized. 
* Runtime behaviour: - In the C.UTF-8 locale, like in the C locale, the *gettext() functions diff --git a/gettext-tools/src/x-php.c b/gettext-tools/src/x-php.c index dd8a9f07b..2ea185d49 100644 --- a/gettext-tools/src/x-php.c +++ b/gettext-tools/src/x-php.c @@ -135,55 +135,162 @@ init_flag_table_php () } -/* ======================== Reading of characters. ======================== */ +/* =================== Variables used by the extractor. =================== */ -/* The input file stream. */ -static FILE *fp; +/* Type definitions needed for the variables. */ +enum token_type_ty +{ + token_type_eof, + token_type_lparen, /* ( */ + token_type_rparen, /* ) */ + token_type_comma, /* , */ + token_type_lbracket, /* [ */ + token_type_rbracket, /* ] */ + token_type_dot, /* . */ + token_type_operator1, /* * / % ++ -- */ + token_type_operator2, /* + - ! ~ @ */ + token_type_string_literal, /* "abc" */ + token_type_symbol, /* symbol, number */ + token_type_other /* misc. operator */ +}; +typedef enum token_type_ty token_type_ty; -/* 1. line_number handling. */ +typedef struct token_ty token_ty; +struct token_ty +{ + token_type_ty type; + char *string; /* for token_type_string_literal, token_type_symbol */ + refcounted_string_list_ty *comment; /* for token_type_string_literal */ + int line_number; +}; + +/* These variables are combined in a struct, so that we can invoke the + extractor in a reentrant way. */ + +struct php_extractor +{ + /* Accumulator for the output. */ + message_list_ty *mlp; + + /* The input file stream, when reading from a file. */ + FILE *fp; + /* The input area, when reading from a string. */ + const char *input; + const char *input_end; + + int line_number; + + unsigned char phase1_pushback[2]; + int phase1_pushback_length; + +#if 0 + unsigned char phase2_pushback[1]; + int phase2_pushback_length; +#endif + + /* For accumulating comments. 
*/ + char *buffer; + size_t bufmax; + size_t buflen; + + /* These are for tracking whether comments count as immediately before + keyword. */ + int last_comment_line; + int last_non_comment_line; + + unsigned char phase3_pushback[1]; + int phase3_pushback_length; + + token_ty phase4_pushback[3]; + int phase4_pushback_length; -static unsigned char phase1_pushback[2]; -static int phase1_pushback_length; + token_type_ty phase5_last; + + /* Maximum supported nesting depth. */ + #define MAX_NESTING_DEPTH 1000 + + /* Current nesting depths. */ + int paren_nesting_depth; + int bracket_nesting_depth; +}; + +static inline void +php_extractor_init_rest (struct php_extractor *xp) +{ + xp->phase1_pushback_length = 0; +#if 0 + xp->phase2_pushback_length = 0; +#endif + + xp->buffer = NULL; + xp->bufmax = 0; + xp->buflen = 0; + + xp->last_comment_line = -1; + xp->last_non_comment_line = -1; + + xp->phase3_pushback_length = 0; + xp->phase4_pushback_length = 0; + + xp->phase5_last = token_type_eof; + + xp->paren_nesting_depth = 0; + xp->bracket_nesting_depth = 0; +} + +/* Forward declarations. */ +static void extract_php_input (struct php_extractor *xp); + + +/* ======================== Reading of characters. ======================== */ + +/* 1. line_number handling. */ static int -phase1_getc () +phase1_getc (struct php_extractor *xp) { int c; - if (phase1_pushback_length) - c = phase1_pushback[--phase1_pushback_length]; - else + if (xp->phase1_pushback_length) + c = xp->phase1_pushback[--(xp->phase1_pushback_length)]; + else if (xp->fp != NULL) { - c = getc (fp); + c = getc (xp->fp); if (c == EOF) { - if (ferror (fp)) + if (ferror (xp->fp)) error (EXIT_FAILURE, errno, _("error while reading \"%s\""), real_file_name); return EOF; } } + else + { + if (xp->input == xp->input_end) + return EOF; + c = *(xp->input++); + } - if (c == '\n') - line_number++; + if (xp->fp != NULL && c == '\n') + xp->line_number++; return c; } /* Supports 2 characters of pushback. 
*/ static void -phase1_ungetc (int c) +phase1_ungetc (struct php_extractor *xp, int c) { if (c != EOF) { - if (c == '\n') - --line_number; + if (xp->fp != NULL && c == '\n') + --(xp->line_number); - if (phase1_pushback_length == SIZEOF (phase1_pushback)) + if (xp->phase1_pushback_length == SIZEOF (xp->phase1_pushback)) abort (); - phase1_pushback[phase1_pushback_length++] = c; + xp->phase1_pushback[xp->phase1_pushback_length++] = c; } } @@ -192,18 +299,18 @@ phase1_ungetc (int c) therefore don't contain translatable strings. */ static void -skip_html () +skip_html (struct php_extractor *xp) { for (;;) { - int c = phase1_getc (); + int c = phase1_getc (xp); if (c == EOF) return; if (c == '<') { - int c2 = phase1_getc (); + int c2 = phase1_getc (xp); if (c2 == EOF) break; @@ -212,10 +319,10 @@ skip_html () { /* are always recognized. */ while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 's' && c2 != 'S') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'c' && c2 != 'C') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'r' && c2 != 'R') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'i' && c2 != 'I') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'p' && c2 != 'P') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 't' && c2 != 'T') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (!(c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')) { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } do - c2 = phase1_getc (); + c2 = phase1_getc (xp); while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); if (c2 != 'l' && c2 
!= 'L') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'a' && c2 != 'A') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'n' && c2 != 'N') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'g' && c2 != 'G') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'u' && c2 != 'U') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'a' && c2 != 'A') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'g' && c2 != 'G') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'e' && c2 != 'E') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != '=') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == '"') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'p') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'h') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'p') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != '"') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } } else if (c2 == '\'') { - c2 = phase1_getc (); + c2 = 
phase1_getc (xp); if (c2 != 'p') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'h') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'p') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != '\'') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } } @@ -404,28 +511,28 @@ skip_html () { if (c2 != 'p') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'h') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != 'p') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } } - c2 = phase1_getc (); + c2 = phase1_getc (xp); while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 != '>') { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); continue; } return; @@ -435,70 +542,67 @@ skip_html () #if 0 -static unsigned char phase2_pushback[1]; -static int phase2_pushback_length; - static int -phase2_getc () +phase2_getc (struct php_extractor *xp) { int c; - if (phase2_pushback_length) - return phase2_pushback[--phase2_pushback_length]; + if (xp->phase2_pushback_length) + return xp->phase2_pushback[--(xp->phase2_pushback_length)]; - c = phase1_getc (); + c = phase1_getc (xp); switch (c) { case '?': case '%': { - int c2 = phase1_getc (); + int c2 = phase1_getc (xp); if (c2 == '>') { /* ?> and %> terminate PHP mode and switch back to HTML mode. */ skip_html (); return ' '; } - phase1_ungetc (c2); + phase1_ungetc (xp, c2); } break; case '<': { - int c2 = phase1_getc (); + int c2 = phase1_getc (xp); /* < / script > terminates PHP mode and switches back to HTML mode. 
*/ while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == '/') { do - c2 = phase1_getc (); + c2 = phase1_getc (xp); while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); if (c2 == 's' || c2 == 'S') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 'c' || c2 == 'C') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 'r' || c2 == 'R') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 'i' || c2 == 'I') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 'p' || c2 == 'P') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 't' || c2 == 'T') { do - c2 = phase1_getc (); + c2 = phase1_getc (xp); while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); if (c2 == '>') { - skip_html (); + skip_html (xp); return ' '; } } @@ -508,7 +612,7 @@ phase2_getc () } } } - phase1_ungetc (c2); + phase1_ungetc (xp, c2); } break; } @@ -517,13 +621,13 @@ phase2_getc () } static void -phase2_ungetc (int c) +phase2_ungetc (struct php_extractor *xp, int c) { if (c != EOF) { - if (phase2_pushback_length == SIZEOF (phase2_pushback)) + if (xp->phase2_pushback_length == SIZEOF (xp->phase2_pushback)) abort (); - phase2_pushback[phase2_pushback_length++] = c; + xp->phase2_pushback[xp->phase2_pushback_length++] = c; } } @@ -532,41 +636,38 @@ phase2_ungetc (int c) /* Accumulating comments. 
*/ -static char *buffer; -static size_t bufmax; -static size_t buflen; - static inline void -comment_start () +comment_start (struct php_extractor *xp) { - buflen = 0; + xp->buflen = 0; } static inline void -comment_add (int c) +comment_add (struct php_extractor *xp, int c) { - if (buflen >= bufmax) + if (xp->buflen >= xp->bufmax) { - bufmax = 2 * bufmax + 10; - buffer = xrealloc (buffer, bufmax); + xp->bufmax = 2 * xp->bufmax + 10; + xp->buffer = xrealloc (xp->buffer, xp->bufmax); } - buffer[buflen++] = c; + xp->buffer[xp->buflen++] = c; } static inline void -comment_line_end (size_t chars_to_remove) +comment_line_end (struct php_extractor *xp, size_t chars_to_remove) { - buflen -= chars_to_remove; - while (buflen >= 1 - && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) - --buflen; - if (chars_to_remove == 0 && buflen >= bufmax) + xp->buflen -= chars_to_remove; + while (xp->buflen >= 1 + && (xp->buffer[xp->buflen - 1] == ' ' + || xp->buffer[xp->buflen - 1] == '\t')) + --(xp->buflen); + if (chars_to_remove == 0 && xp->buflen >= xp->bufmax) { - bufmax = 2 * bufmax + 10; - buffer = xrealloc (buffer, bufmax); + xp->bufmax = 2 * xp->bufmax + 10; + xp->buffer = xrealloc (xp->buffer, xp->bufmax); } - buffer[buflen] = '\0'; - savable_comment_add (buffer); + xp->buffer[xp->buflen] = '\0'; + savable_comment_add (xp->buffer); } @@ -574,62 +675,54 @@ comment_line_end (size_t chars_to_remove) space character. We need to remember the comment for later, because it may be attached to a keyword string. */ -/* These are for tracking whether comments count as immediately before - keyword. 
*/ -static int last_comment_line; -static int last_non_comment_line; - -static unsigned char phase3_pushback[1]; -static int phase3_pushback_length; - static int -phase3_getc () +phase3_getc (struct php_extractor *xp) { int lineno; int c; - if (phase3_pushback_length) - return phase3_pushback[--phase3_pushback_length]; + if (xp->phase3_pushback_length) + return xp->phase3_pushback[--(xp->phase3_pushback_length)]; - c = phase1_getc (); + c = phase1_getc (xp); if (c == '#') { /* sh comment. */ bool last_was_qmark = false; - comment_start (); - lineno = line_number; + comment_start (xp); + lineno = xp->line_number; for (;;) { - c = phase1_getc (); + c = phase1_getc (xp); if (c == '\n' || c == EOF) { - comment_line_end (0); + comment_line_end (xp, 0); break; } if (last_was_qmark && c == '>') { - comment_line_end (1); - skip_html (); + comment_line_end (xp, 1); + skip_html (xp); break; } /* We skip all leading white space, but not EOLs. */ - if (!(buflen == 0 && (c == ' ' || c == '\t'))) - comment_add (c); + if (!(xp->buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (xp, c); last_was_qmark = (c == '?' || c == '%'); } - last_comment_line = lineno; + xp->last_comment_line = lineno; return '\n'; } else if (c == '/') { - c = phase1_getc (); + c = phase1_getc (xp); switch (c) { default: - phase1_ungetc (c); + phase1_ungetc (xp, c); return '/'; case '*': @@ -637,24 +730,24 @@ phase3_getc () /* C comment. */ bool last_was_star; - comment_start (); - lineno = line_number; + comment_start (xp); + lineno = xp->line_number; last_was_star = false; for (;;) { - c = phase1_getc (); + c = phase1_getc (xp); if (c == EOF) break; /* We skip all leading white space, but not EOLs. 
*/ - if (buflen == 0 && (c == ' ' || c == '\t')) + if (xp->buflen == 0 && (c == ' ' || c == '\t')) continue; - comment_add (c); + comment_add (xp, c); switch (c) { case '\n': - comment_line_end (1); - comment_start (); - lineno = line_number; + comment_line_end (xp, 1); + comment_start (xp); + lineno = xp->line_number; last_was_star = false; continue; @@ -665,7 +758,7 @@ phase3_getc () case '/': if (last_was_star) { - comment_line_end (2); + comment_line_end (xp, 2); break; } FALLTHROUGH; @@ -676,7 +769,7 @@ phase3_getc () } break; } - last_comment_line = lineno; + xp->last_comment_line = lineno; return ' '; } @@ -685,28 +778,28 @@ phase3_getc () /* C++ comment. */ bool last_was_qmark = false; - comment_start (); - lineno = line_number; + comment_start (xp); + lineno = xp->line_number; for (;;) { - c = phase1_getc (); + c = phase1_getc (xp); if (c == '\n' || c == EOF) { - comment_line_end (0); + comment_line_end (xp, 0); break; } if (last_was_qmark && c == '>') { - comment_line_end (1); - skip_html (); + comment_line_end (xp, 1); + skip_html (xp); break; } /* We skip all leading white space, but not EOLs. */ - if (!(buflen == 0 && (c == ' ' || c == '\t'))) - comment_add (c); + if (!(xp->buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (xp, c); last_was_qmark = (c == '?' || c == '%'); } - last_comment_line = lineno; + xp->last_comment_line = lineno; return '\n'; } } @@ -717,13 +810,13 @@ phase3_getc () #ifdef unused static void -phase3_ungetc (int c) +phase3_ungetc (struct php_extractor *xp, int c) { if (c != EOF) { - if (phase3_pushback_length == SIZEOF (phase3_pushback)) + if (xp->phase3_pushback_length == SIZEOF (xp->phase3_pushback)) abort (); - phase3_pushback[phase3_pushback_length++] = c; + xp->phase3_pushback[xp->phase3_pushback_length++] = c; } } #endif @@ -732,32 +825,7 @@ phase3_ungetc (int c) /* ========================== Reading of tokens. 
========================== */ -enum token_type_ty -{ - token_type_eof, - token_type_lparen, /* ( */ - token_type_rparen, /* ) */ - token_type_comma, /* , */ - token_type_lbracket, /* [ */ - token_type_rbracket, /* ] */ - token_type_dot, /* . */ - token_type_operator1, /* * / % ++ -- */ - token_type_operator2, /* + - ! ~ @ */ - token_type_string_literal, /* "abc" */ - token_type_symbol, /* symbol, number */ - token_type_other /* misc. operator */ -}; -typedef enum token_type_ty token_type_ty; - -typedef struct token_ty token_ty; -struct token_ty -{ - token_type_ty type; - char *string; /* for token_type_string_literal, token_type_symbol */ - refcounted_string_list_ty *comment; /* for token_type_string_literal */ - int line_number; -}; - +/* 'struct token_ty' is defined above. */ /* Free the memory pointed to by a 'struct token_ty'. */ static inline void @@ -772,28 +840,25 @@ free_token (token_ty *tp) /* 4. Combine characters into tokens. Discard whitespace. */ -static token_ty phase4_pushback[3]; -static int phase4_pushback_length; - static void -phase4_get (token_ty *tp) +phase4_get (struct php_extractor *xp, token_ty *tp) { static char *buffer; static int bufmax; int bufpos; int c; - if (phase4_pushback_length) + if (xp->phase4_pushback_length) { - *tp = phase4_pushback[--phase4_pushback_length]; + *tp = xp->phase4_pushback[--(xp->phase4_pushback_length)]; return; } tp->string = NULL; for (;;) { - tp->line_number = line_number; - c = phase3_getc (); + tp->line_number = xp->line_number; + c = phase3_getc (xp); switch (c) { case EOF: @@ -801,7 +866,7 @@ phase4_get (token_ty *tp) return; case '\n': - if (last_non_comment_line > last_comment_line) + if (xp->last_non_comment_line > xp->last_comment_line) savable_comment_reset (); FALLTHROUGH; case ' ': @@ -811,7 +876,7 @@ phase4_get (token_ty *tp) continue; } - last_non_comment_line = tp->line_number; + xp->last_non_comment_line = tp->line_number; switch (c) { @@ -852,7 +917,7 @@ phase4_get (token_ty *tp) buffer = 
xrealloc (buffer, bufmax); } buffer[bufpos++] = c; - c = phase1_getc (); + c = phase1_getc (xp); switch (c) { case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': @@ -893,7 +958,7 @@ phase4_get (token_ty *tp) continue; default: - phase1_ungetc (c); + phase1_ungetc (xp, c); break; } break; @@ -913,15 +978,15 @@ phase4_get (token_ty *tp) bufpos = 0; for (;;) { - c = phase1_getc (); + c = phase1_getc (xp); if (c == EOF || c == '\'') break; if (c == '\\') { - c = phase1_getc (); + c = phase1_getc (xp); if (c != '\\' && c != '\'') { - phase1_ungetc (c); + phase1_ungetc (xp, c); c = '\\'; } } @@ -946,42 +1011,43 @@ phase4_get (token_ty *tp) case '"': /* Double-quoted string literal. */ tp->type = token_type_string_literal; + string_literal_continued: bufpos = 0; for (;;) { - c = phase1_getc (); + c = phase1_getc (xp); if (c == EOF || c == '"') break; if (c == '$') { - c = phase1_getc (); + c = phase1_getc (xp); if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') - || c == '_' || c == '{' || c >= 0x7f) + || c == '_' || c >= 0x7f) { /* String with variables. */ tp->type = token_type_other; continue; } - phase1_ungetc (c); + if (c == '{') + /* String with embedded expressions. */ + goto string_with_embedded_expressions; + phase1_ungetc (xp, c); c = '$'; } if (c == '{') { - c = phase1_getc (); + c = phase1_getc (xp); if (c == '$') - { - /* String with expressions. */ - tp->type = token_type_other; - continue; - } - phase1_ungetc (c); + /* String with embedded expressions. 
*/ + goto string_with_embedded_expressions; + phase1_ungetc (xp, c); c = '{'; } if (c == '\\') { int n, j; - c = phase1_getc (); + c = phase1_getc (xp); switch (c) { case '"': @@ -995,7 +1061,7 @@ phase4_get (token_ty *tp) for (j = 0; j < 3; ++j) { n = n * 8 + c - '0'; - c = phase1_getc (); + c = phase1_getc (xp); switch (c) { default: @@ -1007,7 +1073,7 @@ phase4_get (token_ty *tp) } break; } - phase1_ungetc (c); + phase1_ungetc (xp, c); c = n; break; @@ -1015,7 +1081,7 @@ phase4_get (token_ty *tp) n = 0; for (j = 0; j < 2; ++j) { - c = phase1_getc (); + c = phase1_getc (xp); switch (c) { case '0': case '1': case '2': case '3': case '4': @@ -1031,7 +1097,7 @@ phase4_get (token_ty *tp) n = n * 16 + 10 + c - 'a'; break; default: - phase1_ungetc (c); + phase1_ungetc (xp, c); c = 0; break; } @@ -1040,7 +1106,7 @@ phase4_get (token_ty *tp) } if (j == 0) { - phase1_ungetc ('x'); + phase1_ungetc (xp, 'x'); c = '\\'; } else @@ -1058,7 +1124,7 @@ phase4_get (token_ty *tp) break; default: - phase1_ungetc (c); + phase1_ungetc (xp, c); c = '\\'; break; } @@ -1083,20 +1149,98 @@ phase4_get (token_ty *tp) } return; + string_with_embedded_expressions: + tp->type = token_type_other; + { + size_t nesting_stack_alloc = 10; + char *nesting_stack = xmalloc (nesting_stack_alloc); + size_t nesting_stack_depth = 0; + /* We just read a '{', so expect a matching '}'. */ + nesting_stack[nesting_stack_depth++] = '}'; + + /* Find the extent of the expression. 
*/ + bufpos = 0; + for (;;) + { + c = phase1_getc (xp); + if (c == EOF) + break; + if (c == '"') + { + if (nesting_stack_depth > 0) + if_error (IF_SEVERITY_WARNING, + logical_file_name, xp->line_number, (size_t)(-1), false, + _("unterminated expression in string literal, expected a '%c'"), + nesting_stack[nesting_stack_depth - 1]); + break; + } + if (c == '{' || c == '[' || c == '(') + { + if (nesting_stack_depth >= nesting_stack_alloc) + { + nesting_stack_alloc = 2 * nesting_stack_alloc; + nesting_stack = + xrealloc (nesting_stack, nesting_stack_alloc); + } + nesting_stack[nesting_stack_depth++] = + (c == '{' ? '}' : c == '[' ? ']' : ')'); + } + else if (c == '}' || c == ']' || c == ')') + { + if (nesting_stack_depth > 0 + && c == nesting_stack[nesting_stack_depth - 1]) + { + if (--nesting_stack_depth == 0) + break; + } + else + if_error (IF_SEVERITY_WARNING, + logical_file_name, xp->line_number, (size_t)(-1), false, + _("unterminated expression in string literal contains unbalanced '%c'"), + c); + } + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + } + + /* Recursively extract messages from the expression. */ + char *substring = xmalloc (bufpos); + memcpy (substring, buffer, bufpos); + + struct php_extractor *rxp = XMALLOC (struct php_extractor); + rxp->mlp = xp->mlp; + rxp->fp = NULL; + rxp->input = substring; + rxp->input_end = substring + bufpos; + rxp->line_number = xp->line_number; + php_extractor_init_rest (rxp); + + extract_php_input (rxp); + + free (rxp); + free (substring); + free (nesting_stack); + } + goto string_literal_continued; + case '?': case '%': { - int c2 = phase1_getc (); + int c2 = phase1_getc (xp); if (c2 == '>') { /* ?> and %> terminate PHP mode and switch back to HTML mode. */ - skip_html (); + skip_html (xp); tp->type = token_type_other; } else { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); tp->type = (c == '%' ? 
token_type_operator1 : token_type_other); } return; @@ -1134,14 +1278,14 @@ phase4_get (token_ty *tp) case '+': case '-': { - int c2 = phase1_getc (); + int c2 = phase1_getc (xp); if (c2 == c) /* ++ or -- */ tp->type = token_type_operator1; else /* + or - */ { - phase1_ungetc (c2); + phase1_ungetc (xp, c2); tp->type = token_type_operator2; } return; @@ -1155,10 +1299,10 @@ phase4_get (token_ty *tp) case '<': { - int c2 = phase1_getc (); + int c2 = phase1_getc (xp); if (c2 == '<') { - int c3 = phase1_getc (); + int c3 = phase1_getc (xp); if (c3 == '<') { int label_start = 0; @@ -1166,7 +1310,7 @@ phase4_get (token_ty *tp) /* Start of here and now document. Parse whitespace, then label, then newline. */ do - c = phase3_getc (); + c = phase3_getc (xp); while (c == ' ' || c == '\t' || c == '\n' || c == '\r'); bufpos = 0; @@ -1178,7 +1322,7 @@ phase4_get (token_ty *tp) buffer = xrealloc (buffer, bufmax); } buffer[bufpos++] = c; - c = phase3_getc (); + c = phase3_getc (xp); } while (c != EOF && c != '\n' && c != '\r'); /* buffer[0..bufpos-1] now contains the label @@ -1193,7 +1337,7 @@ phase4_get (token_ty *tp) /* Now skip the here document. */ for (;;) { - c = phase1_getc (); + c = phase1_getc (xp); if (c == EOF) break; if (c == '\n' || c == '\r') @@ -1202,22 +1346,22 @@ phase4_get (token_ty *tp) while (bufidx < bufpos) { - c = phase1_getc (); + c = phase1_getc (xp); if (c == EOF) break; if (c != buffer[bufidx]) { - phase1_ungetc (c); + phase1_ungetc (xp, c); break; } bufidx++; } if (bufidx == bufpos) { - c = phase1_getc (); + c = phase1_getc (xp); if (c != ';') - phase1_ungetc (c); - c = phase1_getc (); + phase1_ungetc (xp, c); + c = phase1_getc (xp); if (c == '\n' || c == '\r') break; } @@ -1231,66 +1375,66 @@ phase4_get (token_ty *tp) tp->type = token_type_other; return; } - phase1_ungetc (c3); + phase1_ungetc (xp, c3); } /* < / script > terminates PHP mode and switches back to HTML mode. 
*/ while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == '/') { do - c2 = phase1_getc (); + c2 = phase1_getc (xp); while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); if (c2 == 's' || c2 == 'S') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 'c' || c2 == 'C') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 'r' || c2 == 'R') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 'i' || c2 == 'I') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 'p' || c2 == 'P') { - c2 = phase1_getc (); + c2 = phase1_getc (xp); if (c2 == 't' || c2 == 'T') { do - c2 = phase1_getc (); + c2 = phase1_getc (xp); while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); if (c2 == '>') { - skip_html (); + skip_html (xp); } else - phase1_ungetc (c2); + phase1_ungetc (xp, c2); } else - phase1_ungetc (c2); + phase1_ungetc (xp, c2); } else - phase1_ungetc (c2); + phase1_ungetc (xp, c2); } else - phase1_ungetc (c2); + phase1_ungetc (xp, c2); } else - phase1_ungetc (c2); + phase1_ungetc (xp, c2); } else - phase1_ungetc (c2); + phase1_ungetc (xp, c2); } else - phase1_ungetc (c2); + phase1_ungetc (xp, c2); } else - phase1_ungetc (c2); + phase1_ungetc (xp, c2); tp->type = token_type_other; return; @@ -1310,13 +1454,13 @@ phase4_get (token_ty *tp) /* Supports 3 tokens of pushback. */ static void -phase4_unget (token_ty *tp) +phase4_unget (struct php_extractor *xp, token_ty *tp) { if (tp->type != token_type_eof) { - if (phase4_pushback_length == SIZEOF (phase4_pushback)) + if (xp->phase4_pushback_length == SIZEOF (xp->phase4_pushback)) abort (); - phase4_pushback[phase4_pushback_length++] = *tp; + xp->phase4_pushback[xp->phase4_pushback_length++] = *tp; } } @@ -1334,17 +1478,15 @@ phase4_unget (token_ty *tp) higher precedence as '.', such as a multiplicative or postincrement expression). 
*/ -static token_type_ty phase5_last; - static void -x_php_lex (token_ty *tp) +x_php_lex (struct php_extractor *xp, token_ty *tp) { - phase4_get (tp); + phase4_get (xp, tp); if (tp->type == token_type_string_literal - && !(phase5_last == token_type_dot - || phase5_last == token_type_operator1 - || phase5_last == token_type_operator2 - || phase5_last == token_type_rparen)) + && !(xp->phase5_last == token_type_dot + || xp->phase5_last == token_type_operator1 + || xp->phase5_last == token_type_operator2 + || xp->phase5_last == token_type_rparen)) { char *sum = tp->string; size_t sum_len = strlen (sum); @@ -1353,17 +1495,17 @@ x_php_lex (token_ty *tp) { token_ty token2; - phase4_get (&token2); + phase4_get (xp, &token2); if (token2.type == token_type_dot) { token_ty token3; - phase4_get (&token3); + phase4_get (xp, &token3); if (token3.type == token_type_string_literal) { token_ty token_after; - phase4_get (&token_after); + phase4_get (xp, &token_after); if (token_after.type != token_type_operator1) { char *addend = token3.string; @@ -1373,21 +1515,21 @@ x_php_lex (token_ty *tp) memcpy (sum + sum_len, addend, addend_len + 1); sum_len += addend_len; - phase4_unget (&token_after); + phase4_unget (xp, &token_after); free_token (&token3); free_token (&token2); continue; } - phase4_unget (&token_after); + phase4_unget (xp, &token_after); } - phase4_unget (&token3); + phase4_unget (xp, &token3); } - phase4_unget (&token2); + phase4_unget (xp, &token2); break; } tp->string = sum; } - phase5_last = tp->type; + xp->phase5_last = tp->type; } @@ -1398,14 +1540,6 @@ x_php_lex (token_ty *tp) static flag_context_list_table_ty *flag_context_list_table; -/* Maximum supported nesting depth. */ -#define MAX_NESTING_DEPTH 1000 - -/* Current nesting depths. */ -static int paren_nesting_depth; -static int bracket_nesting_depth; - - /* The file is broken into tokens. Scan the token stream, looking for a keyword, followed by a left paren, followed by a string. 
When we see this sequence, we have something to remember. We assume we are @@ -1422,12 +1556,12 @@ static int bracket_nesting_depth; /* Extract messages until the next balanced closing parenthesis or bracket. - Extracted messages are added to MLP. + Extracted messages are added to XP->MLP. DELIM can be either token_type_rparen or token_type_rbracket, or token_type_eof to accept both. Return true upon eof, false upon closing parenthesis or bracket. */ static bool -extract_balanced (message_list_ty *mlp, +extract_balanced (struct php_extractor *xp, token_type_ty delim, flag_region_ty *outer_region, flag_context_list_iterator_ty context_iter, @@ -1454,7 +1588,7 @@ extract_balanced (message_list_ty *mlp, { token_ty token; - x_php_lex (&token); + x_php_lex (xp, &token); switch (token.type) { case token_type_symbol: @@ -1480,20 +1614,20 @@ extract_balanced (message_list_ty *mlp, continue; case token_type_lparen: - if (++paren_nesting_depth > MAX_NESTING_DEPTH) + if (++(xp->paren_nesting_depth) > MAX_NESTING_DEPTH) if_error (IF_SEVERITY_FATAL_ERROR, - logical_file_name, line_number, (size_t)(-1), false, + logical_file_name, xp->line_number, (size_t)(-1), false, _("too many open parentheses")); - if (extract_balanced (mlp, token_type_rparen, + if (extract_balanced (xp, token_type_rparen, inner_region, next_context_iter, - arglist_parser_alloc (mlp, + arglist_parser_alloc (xp->mlp, state ? 
next_shapes : NULL))) { arglist_parser_done (argparser, arg); unref_region (inner_region); return true; } - paren_nesting_depth--; + xp->paren_nesting_depth--; next_context_iter = null_context_list_iterator; state = 0; continue; @@ -1521,20 +1655,20 @@ extract_balanced (message_list_ty *mlp, continue; case token_type_lbracket: - if (++bracket_nesting_depth > MAX_NESTING_DEPTH) + if (++(xp->bracket_nesting_depth) > MAX_NESTING_DEPTH) if_error (IF_SEVERITY_FATAL_ERROR, - logical_file_name, line_number, (size_t)(-1), false, + logical_file_name, xp->line_number, (size_t)(-1), false, _("too many open brackets")); - if (extract_balanced (mlp, token_type_rbracket, + if (extract_balanced (xp, token_type_rbracket, null_context_region (), null_context_list_iterator, - arglist_parser_alloc (mlp, NULL))) + arglist_parser_alloc (xp->mlp, NULL))) { arglist_parser_done (argparser, arg); unref_region (inner_region); return true; } - bracket_nesting_depth--; + xp->bracket_nesting_depth--; next_context_iter = null_context_list_iterator; state = 0; continue; @@ -1557,7 +1691,7 @@ extract_balanced (message_list_ty *mlp, pos.line_number = token.line_number; if (extract_all) - remember_a_message (mlp, NULL, token.string, false, false, + remember_a_message (xp->mlp, NULL, token.string, false, false, inner_region, &pos, NULL, token.comment, false); else @@ -1596,51 +1730,46 @@ extract_balanced (message_list_ty *mlp, } +static void +extract_php_input (struct php_extractor *xp) +{ + /* Eat tokens until eof is seen. When extract_balanced returns + due to an unbalanced closing parenthesis, just restart it. 
*/ + while (!extract_balanced (xp, token_type_eof, + null_context_region (), null_context_list_iterator, + arglist_parser_alloc (xp->mlp, NULL))) + ; +} + + void extract_php (FILE *f, const char *real_filename, const char *logical_filename, flag_context_list_table_ty *flag_table, msgdomain_list_ty *mdlp) { - message_list_ty *mlp = mdlp->item[0]->messages; - - fp = f; - real_file_name = real_filename; - logical_file_name = xstrdup (logical_filename); - line_number = 1; - - phase1_pushback_length = 0; -#if 0 - phase2_pushback_length = 0; -#endif - - last_comment_line = -1; - last_non_comment_line = -1; - - phase3_pushback_length = 0; - phase4_pushback_length = 0; - - phase5_last = token_type_eof; - flag_context_list_table = flag_table; - paren_nesting_depth = 0; - bracket_nesting_depth = 0; init_keywords (); + struct php_extractor *xp = XMALLOC (struct php_extractor); + + xp->mlp = mdlp->item[0]->messages; + xp->fp = f; + xp->input = NULL; + xp->input_end = NULL; + real_file_name = real_filename; + logical_file_name = xstrdup (logical_filename); + xp->line_number = 1; + php_extractor_init_rest (xp); + /* Initial mode is HTML mode, not PHP mode. */ - skip_html (); + skip_html (xp); - /* Eat tokens until eof is seen. When extract_balanced returns - due to an unbalanced closing parenthesis, just restart it. */ - while (!extract_balanced (mlp, token_type_eof, - null_context_region (), null_context_list_iterator, - arglist_parser_alloc (mlp, NULL))) - ; + extract_php_input (xp); /* Close scanner. */ - fp = NULL; + free (xp); real_file_name = NULL; logical_file_name = NULL; - line_number = 0; } diff --git a/gettext-tools/tests/xgettext-php-1 b/gettext-tools/tests/xgettext-php-1 index 9c9891c4d..ebf31a988 100755 --- a/gettext-tools/tests/xgettext-php-1 +++ b/gettext-tools/tests/xgettext-php-1 @@ -1,7 +1,7 @@ #!/bin/sh . "${srcdir=.}/init.sh"; path_prepend_ . ../src -# Test PHP support: --add-comments option. 
+# Test PHP support: --add-comments option, strings with embedded expressions. cat <<\EOF > xg-ph-1.php EOF @@ -47,6 +53,15 @@ msgstr "" #. msgid "The Fabulous Four" msgstr "" + +msgid "embedded_4_sub1" +msgstr "" + +msgid "embedded_4_sub2" +msgstr "" + +msgid "embedded_5" +msgstr "" EOF : ${DIFF=diff}