From: Bruno Haible Date: Sat, 21 Sep 2024 11:05:16 +0000 (+0200) Subject: xgettext: PHP: Support heredoc and nowdoc strings. X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9d4340a78812fad9d69a40b43fdac03417db4c8c;p=thirdparty%2Fgettext.git xgettext: PHP: Support heredoc and nowdoc strings. Reported: by Bodo Graumann at , at , by Cédric Anne at . * gettext-tools/src/x-php.c (TAB_WIDTH): New macro. (process_heredoc): New function. (phase4_get): Process the heredoc or nowdoc string, instead of blindly skipping it. * gettext-tools/tests/xgettext-php-1: Add tests of heredoc strings with embedded expressions. * gettext-tools/tests/xgettext-php-2: Expect spaces before a heredoc end label to be eaten. Add tests of heredoc strings and nowdoc strings. * NEWS: Mention the improvement. --- diff --git a/NEWS b/NEWS index bee082e1e..24be260ee 100644 --- a/NEWS +++ b/NEWS @@ -30,7 +30,9 @@ Version 0.23 - September 2024 o xgettext now reports warnings instead of fatal errors. o Strings with embedded expressions (a.k.a. interpolated strings) are now recognized. - - PHP: Strings with embedded expressions are now recognized. + - PHP: + o Strings with embedded expressions are now recognized. + o Heredoc and Nowdoc strings are now scanned correctly. * Runtime behaviour: - In the C.UTF-8 locale, like in the C locale, the *gettext() functions diff --git a/gettext-tools/src/x-php.c b/gettext-tools/src/x-php.c index 0dcc40001..b616ccd6f 100644 --- a/gettext-tools/src/x-php.c +++ b/gettext-tools/src/x-php.c @@ -840,8 +840,246 @@ free_token (token_ty *tp) } +/* In heredoc and nowdoc, assume a tab width of 8. */ +#define TAB_WIDTH 8 + + /* 4. Combine characters into tokens. Discard whitespace. */ +/* On a heredoc string, do the same processing as phase4_getc (below) does + on a double-quoted string (except for recognizing a double-quote as + end-of-string, of course). + Return the processed string, or NULL if it contains variables or embedded + expressions. 
*/ +static char * +process_heredoc (struct php_extractor *xp, const char *doc, int doc_line_number) +{ + bool is_constant = true; + int lineno = doc_line_number; + int bufmax = strlen (doc) + 1; + char *buffer = xmalloc (bufmax); + int bufpos; + + heredoc_continued: + bufpos = 0; + for (;;) + { + char c = *doc++; + if (c == '\0') + break; + if (c == '\n') + lineno++; + if (c == '$') + { + c = *doc++; + if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') + || c == '_' || c >= 0x7f) + { + /* String with variables. */ + is_constant = false; + continue; + } + if (c == '{') + /* Heredoc string with embedded expressions. */ + goto heredoc_with_embedded_expressions; + --doc; + c = '$'; + } + if (c == '{') + { + c = *doc++; + if (c == '$') + /* Heredoc string with embedded expressions. */ + goto heredoc_with_embedded_expressions; + --doc; + c = '{'; + } + if (c == '\\') + { + int n, j; + + c = *doc++; + switch (c) + { + case '\\': + case '$': + break; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + n = 0; + for (j = 0; j < 3; ++j) + { + n = n * 8 + c - '0'; + c = *doc++; + switch (c) + { + default: + break; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + continue; + } + break; + } + --doc; + c = n; + break; + + case 'x': + n = 0; + for (j = 0; j < 2; ++j) + { + c = *doc++; + switch (c) + { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + n = n * 16 + c - '0'; + break; + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': + n = n * 16 + 10 + c - 'A'; + break; + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': + n = n * 16 + 10 + c - 'a'; + break; + default: + --doc; + c = 0; + break; + } + if (c == 0) + break; + } + if (j == 0) + { + --doc; + c = '\\'; + } + else + c = n; + break; + + case 'n': + c = '\n'; + break; + case 't': + c = '\t'; + break; + case 'r': + c = '\r'; + break; + + default: + --doc; + c = '\\'; + 
break; + } + } + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + } + if (bufpos >= bufmax) + { + bufmax = bufmax + 1; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos] = '\0'; + if (is_constant) + return buffer; + else + { + free (buffer); + return NULL; + } + + heredoc_with_embedded_expressions: + is_constant = false; + { + size_t nesting_stack_alloc = 10; + char *nesting_stack = xmalloc (nesting_stack_alloc); + size_t nesting_stack_depth = 0; + /* We just read a '{', so expect a matching '}'. */ + nesting_stack[nesting_stack_depth++] = '}'; + + /* Find the extent of the expression. */ + bufpos = 0; + for (;;) + { + char c = *doc; + if (c == '\0') + { + if (nesting_stack_depth > 0) + if_error (IF_SEVERITY_WARNING, + logical_file_name, lineno, (size_t)(-1), false, + _("unterminated expression in heredoc, expected a '%c'"), + nesting_stack[nesting_stack_depth - 1]); + break; + } + doc++; + if (c == '\n') + lineno++; + if (c == '{' || c == '[' || c == '(') + { + if (nesting_stack_depth >= nesting_stack_alloc) + { + nesting_stack_alloc = 2 * nesting_stack_alloc; + nesting_stack = + xrealloc (nesting_stack, nesting_stack_alloc); + } + nesting_stack[nesting_stack_depth++] = + (c == '{' ? '}' : c == '[' ? ']' : ')'); + } + else if (c == '}' || c == ']' || c == ')') + { + if (nesting_stack_depth > 0 + && c == nesting_stack[nesting_stack_depth - 1]) + { + if (--nesting_stack_depth == 0) + break; + } + else + if_error (IF_SEVERITY_WARNING, + logical_file_name, lineno, (size_t)(-1), false, + _("unterminated expression in heredoc contains unbalanced '%c'"), + c); + } + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + } + + /* Recursively extract messages from the expression. 
*/ + char *substring = xmalloc (bufpos); + memcpy (substring, buffer, bufpos); + + struct php_extractor *rxp = XMALLOC (struct php_extractor); + rxp->mlp = xp->mlp; + rxp->fp = NULL; + rxp->input = substring; + rxp->input_end = substring + bufpos; + rxp->line_number = xp->line_number; + php_extractor_init_rest (rxp); + + extract_php_input (rxp); + + free (rxp); + free (substring); + free (nesting_stack); + } + goto heredoc_continued; +} + static void phase4_get (struct php_extractor *xp, token_ty *tp) { @@ -1155,7 +1393,7 @@ phase4_get (struct php_extractor *xp, token_ty *tp) tp->type = token_type_other; { size_t nesting_stack_alloc = 10; - char *nesting_stack = malloc (nesting_stack_alloc); + char *nesting_stack = xmalloc (nesting_stack_alloc); size_t nesting_stack_depth = 0; /* We just read a '{', so expect a matching '}'. */ nesting_stack[nesting_stack_depth++] = '}'; @@ -1307,9 +1545,7 @@ phase4_get (struct php_extractor *xp, token_ty *tp) int c3 = phase1_getc (xp); if (c3 == '<') { - int label_start = 0; - - /* Start of here and now document. + /* Start of heredoc or nowdoc. Parse whitespace, then label, then newline. */ do c = phase3_getc (xp); @@ -1330,51 +1566,201 @@ phase4_get (struct php_extractor *xp, token_ty *tp) /* buffer[0..bufpos-1] now contains the label (including single or double quotes). */ - if (*buffer == '\'' || *buffer == '"') + int doc_line_number = xp->line_number; + + bool heredoc = true; + int label_start = 0; + int label_end = bufpos; + if (bufpos >= 2 + && ((buffer[label_start] == '\'' && buffer[label_end - 1] == '\'') + || (buffer[label_start] == '"' && buffer[label_end - 1] == '"'))) { + heredoc = (buffer[label_start] == '"'); label_start++; - bufpos--; + label_end--; } - /* Now skip the here document. */ + /* Now read the heredoc or nowdoc. */ + size_t doc_alloc = 10; + char *doc = xmalloc (doc_alloc); + size_t doc_len = 0; + size_t doc_start_of_line = 0; + + /* These two variables keep track of the matching of the + end label. 
*/ + int in_label_pos = -1; /* <= label_end - label_start */ + int end_label_indent = 0; + for (;;) { c = phase1_getc (xp); if (c == EOF) break; - if (c == '\n' || c == '\r') + + if (doc_len >= doc_alloc) + { + doc_alloc = 2 * doc_alloc + 10; + doc = xrealloc (doc, doc_alloc); + } + doc[doc_len++] = c; + + if (c == '\n') + doc_start_of_line = doc_len; + + /* Incrementally match the label. */ + if (in_label_pos == 0 && (c == ' ' || c == '\t')) + { + if (c == '\t') + end_label_indent |= TAB_WIDTH - 1; + end_label_indent++; + } + else if (in_label_pos >= 0 + && in_label_pos < label_end - label_start + && c == buffer[label_start + in_label_pos]) + { + in_label_pos++; + } + else if (in_label_pos == label_end - label_start) { - int bufidx = label_start; + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case '_': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case 128: case 129: case 130: case 131: case 132: + case 133: case 134: case 135: case 136: case 137: + case 138: case 139: case 140: case 141: case 142: + case 143: case 144: case 145: case 146: case 147: + case 148: case 149: case 150: case 151: case 152: + case 153: case 154: case 155: case 156: case 157: + case 158: case 159: case 160: case 161: case 162: + case 163: case 164: case 165: case 166: case 167: + case 168: case 169: case 170: case 171: case 172: + case 173: case 174: case 175: case 176: case 177: + case 178: case 179: case 180: case 181: 
case 182: + case 183: case 184: case 185: case 186: case 187: + case 188: case 189: case 190: case 191: case 192: + case 193: case 194: case 195: case 196: case 197: + case 198: case 199: case 200: case 201: case 202: + case 203: case 204: case 205: case 206: case 207: + case 208: case 209: case 210: case 211: case 212: + case 213: case 214: case 215: case 216: case 217: + case 218: case 219: case 220: case 221: case 222: + case 223: case 224: case 225: case 226: case 227: + case 228: case 229: case 230: case 231: case 232: + case 233: case 234: case 235: case 236: case 237: + case 238: case 239: case 240: case 241: case 242: + case 243: case 244: case 245: case 246: case 247: + case 248: case 249: case 250: case 251: case 252: + case 253: case 254: case 255: + in_label_pos = -1; + break; + default: + break; + } + if (in_label_pos >= 0) + { + /* Finished recognizing the label. */ + phase1_ungetc (xp, c); + break; + } + } + else if (c == '\n' || c == '\r') + { + in_label_pos = 0; + end_label_indent = 0; + } + else + { + in_label_pos = -1; + end_label_indent = 0; + } + } + + /* The contents is the substring + [doc, doc + doc_start_of_line). */ + doc_len = doc_start_of_line; - while (bufidx < bufpos) + /* Discard leading indentation. */ + if (end_label_indent > 0) + { + /* Scan through the doc string, copying *q = *p. */ + const char *p; + char *q = doc; + int curr_line_indent = 0; + + for (p = doc; p < doc + doc_len; p++) + { + /* Invariant: doc <= q <= p <= doc + doc_len. 
*/ + char c = *p; + *q++ = c; + if (curr_line_indent < end_label_indent) { - c = phase1_getc (xp); - if (c == EOF) - break; - if (c != buffer[bufidx]) + if (c == ' ') { - phase1_ungetc (xp, c); - break; + curr_line_indent++; + --q; + } + else if (c == '\t') + { + curr_line_indent |= TAB_WIDTH - 1; + curr_line_indent++; + if (curr_line_indent <= end_label_indent) + --q; } - bufidx++; - } - if (bufidx == bufpos) - { - c = phase1_getc (xp); - if (c != ';') - phase1_ungetc (xp, c); - c = phase1_getc (xp); - if (c == '\n' || c == '\r') - break; } + if (c == '\n') + curr_line_indent = 0; } + doc_len = q - doc; + } + + /* Discard the trailing newline. */ + if (doc_len > 0 && doc[doc_len - 1] == '\n') + { + --doc_len; + if (doc_len > 0 && doc[doc_len - 1] == '\r') + --doc_len; + } + + /* NUL-terminate it. */ + if (doc_len >= doc_alloc) + { + doc_alloc = doc_alloc + 1; + doc = xrealloc (doc, doc_alloc); + } + doc[doc_len++] = '\0'; + + /* For a here document, do the same processing as in + double-quoted strings (see above). */ + if (heredoc) + { + char *processed_doc = + process_heredoc (xp, doc, doc_line_number); + free (doc); + doc = processed_doc; } - /* FIXME: Ideally we should turn the here document into a - string literal if it didn't contain $ substitution. And - we should also respect backslash escape sequences like - in double-quoted strings. 
*/ - tp->type = token_type_other; + if (doc != NULL) + { + tp->type = token_type_string_literal; + tp->string = doc; + tp->comment = add_reference (savable_comment); + } + else + tp->type = token_type_other; return; } phase1_ungetc (xp, c3); diff --git a/gettext-tools/tests/xgettext-php-1 b/gettext-tools/tests/xgettext-php-1 index ebf31a988..1b403bc8c 100755 --- a/gettext-tools/tests/xgettext-php-1 +++ b/gettext-tools/tests/xgettext-php-1 @@ -25,6 +25,22 @@ echo _("embedded_2_${foo}bar"); echo _("embedded_3_{$foo}bar"); echo _("embedded_4_{$array[func(_('embedded_4_sub1'))]}_bar_{$array[func(_('embedded_4_sub2'))]}_baz"); echo _("embedded_5"); +// Heredoc with embedded expressions. +echo _(<< EOF @@ -62,6 +78,15 @@ msgstr "" msgid "embedded_5" msgstr "" + +msgid "embedded_9_sub1" +msgstr "" + +msgid "embedded_9_sub2" +msgstr "" + +msgid "embedded_10" +msgstr "" EOF : ${DIFF=diff} diff --git a/gettext-tools/tests/xgettext-php-2 b/gettext-tools/tests/xgettext-php-2 index b343ed6a4..942d2cf12 100755 --- a/gettext-tools/tests/xgettext-php-2 +++ b/gettext-tools/tests/xgettext-php-2 @@ -1,7 +1,7 @@ #!/bin/sh . "${srcdir=.}/init.sh"; path_prepend_ . ../src -# Test PHP support: here documents. +# Test PHP support: heredoc strings, nowdoc strings. cat <<\EOF > xg-ph-2.php EOF @@ -70,8 +84,32 @@ msgstr "" msgid "Romans" msgstr "" +msgid "Goths" +msgstr "" + msgid "Franks" msgstr "" + +msgid "" +"Chlodovechus Rex Francorum\n" +"Carolus Rex Francorum et Langobardorum, Magnus Imperator Romanorum" +msgstr "" + +msgid "Hohenstaufens" +msgstr "" + +msgid "" +" Fridericus Imperator Romani Imperii\n" +" Fridericus II Imperator" +msgstr "" + +msgid "French" +msgstr "" + +msgid "" +"Louis XIV\n" +"Napoleon Bonaparte" +msgstr "" EOF : ${DIFF=diff}