git.ipfire.org Git - thirdparty/gettext.git/commitdiff
xgettext: PHP: Support heredoc and nowdoc strings.
author Bruno Haible <bruno@clisp.org>
Sat, 21 Sep 2024 11:05:16 +0000 (13:05 +0200)
committer Bruno Haible <bruno@clisp.org>
Sat, 21 Sep 2024 11:05:16 +0000 (13:05 +0200)
Reported:
by Bodo Graumann <mail@bodograumann.de> at <https://savannah.gnu.org/bugs/?27740>,
at <https://savannah.gnu.org/bugs/?35944>,
by Cédric Anne <canne@teclib.com> at <https://savannah.gnu.org/bugs/?62158>.

* gettext-tools/src/x-php.c (TAB_WIDTH): New macro.
(process_heredoc): New function.
(phase4_get): Process the heredoc or nowdoc string, instead of blindly skipping
it.
* gettext-tools/tests/xgettext-php-1: Add tests of heredoc strings with embedded
expressions.
* gettext-tools/tests/xgettext-php-2: Expect spaces before a heredoc end label
to be eaten. Add tests of heredoc strings and nowdoc strings.
* NEWS: Mention the improvement.

NEWS
gettext-tools/src/x-php.c
gettext-tools/tests/xgettext-php-1
gettext-tools/tests/xgettext-php-2

diff --git a/NEWS b/NEWS
index bee082e1e0fa9520fcb6fcab865bf52b6f4bbe52..24be260ee8edb24224bb213f2eaafda2c92d0699 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -30,7 +30,9 @@ Version 0.23 - September 2024
     o xgettext now reports warnings instead of fatal errors.
     o Strings with embedded expressions (a.k.a. interpolated strings) are now
       recognized.
-  - PHP: Strings with embedded expressions are now recognized.
+  - PHP:
+    o Strings with embedded expressions are now recognized.
+    o Heredoc and Nowdoc strings are now scanned correctly.
 
 * Runtime behaviour:
   - In the C.UTF-8 locale, like in the C locale, the *gettext() functions
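diff --git a/gettext-tools/src/x-php.c b/gettext-tools/src/x-php.c
index 0dcc400018ce9e8d9085336d8ec331933ac5450b..b616ccd6f87551f4f94184d97a016c0a857a25c4 100644 (file)
--- a/gettext-tools/src/x-php.c
+++ b/gettext-tools/src/x-php.c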
@@ -840,8 +840,246 @@ free_token (token_ty *tp)
 }
 
 
+/* In heredoc and nowdoc, assume a tab width of 8.  */
+#define TAB_WIDTH 8
+
+
 /* 4. Combine characters into tokens.  Discard whitespace.  */
 
+/* Process DOC, the NUL-terminated contents of a heredoc string, like a
+   double-quoted string (except that a double-quote does not end it).
+   Messages inside embedded expressions are extracted using XP's message list.
+   DOC_LINE_NUMBER is the line number assumed for the start of DOC.
+   Return a new string, or NULL if DOC has variables or expressions.  */
+static char *
+process_heredoc (struct php_extractor *xp, const char *doc, int doc_line_number)
+{
+  bool is_constant = true;
+  int lineno = doc_line_number;
+  int bufmax = strlen (doc) + 1;
+  char *buffer = xmalloc (bufmax);
+  int bufpos;
+
+ heredoc_continued:
+  bufpos = 0;
+  for (;;)
+    {
+      char c = *doc++;
+      if (c == '\0')
+        break;
+      if (c == '\n')
+        lineno++;
+      if (c == '$')
+        {
+          c = *doc++;
+          if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
+              || c == '_' || (unsigned char) c >= 0x7f)
+            {
+              /* String with variables.  (Cast: 'char' may be signed.)  */
+              is_constant = false;
+              continue;
+            }
+          if (c == '{')
+            /* Heredoc string with embedded expressions.  */
+            goto heredoc_with_embedded_expressions;
+          --doc;
+          c = '$';
+        }
+      if (c == '{')
+        {
+          c = *doc++;
+          if (c == '$')
+            /* Heredoc string with embedded expressions.  */
+            goto heredoc_with_embedded_expressions;
+          --doc;
+          c = '{';
+        }
+      if (c == '\\')
+        {
+          int n, j;
+
+          c = *doc++;
+          switch (c)
+            {
+            case '\\':
+            case '$':
+              break;
+
+            case '0': case '1': case '2': case '3':
+            case '4': case '5': case '6': case '7':
+              n = 0;
+              for (j = 0; j < 3; ++j)
+                {
+                  n = n * 8 + c - '0';
+                  c = *doc++;
+                  switch (c)
+                    {
+                    default:
+                      break;
+
+                    case '0': case '1': case '2': case '3':
+                    case '4': case '5': case '6': case '7':
+                      continue;
+                    }
+                  break;
+                }
+              --doc;
+              c = n;
+              break;
+
+            case 'x':
+              n = 0;
+              for (j = 0; j < 2; ++j)
+                {
+                  c = *doc++;
+                  switch (c)
+                    {
+                    case '0': case '1': case '2': case '3': case '4':
+                    case '5': case '6': case '7': case '8': case '9':
+                      n = n * 16 + c - '0';
+                      break;
+                    case 'A': case 'B': case 'C': case 'D': case 'E':
+                    case 'F':
+                      n = n * 16 + 10 + c - 'A';
+                      break;
+                    case 'a': case 'b': case 'c': case 'd': case 'e':
+                    case 'f':
+                      n = n * 16 + 10 + c - 'a';
+                      break;
+                    default:
+                      --doc;
+                      c = 0;
+                      break;
+                    }
+                  if (c == 0)
+                    break;
+                }
+              if (j == 0)
+                {
+                  --doc;
+                  c = '\\';
+                }
+              else
+                c = n;
+              break;
+
+            case 'n':
+              c = '\n';
+              break;
+            case 't':
+              c = '\t';
+              break;
+            case 'r':
+              c = '\r';
+              break;
+
+            default:
+              --doc;
+              c = '\\';
+              break;
+            }
+        }
+      if (bufpos >= bufmax)
+        {
+          bufmax = 2 * bufmax + 10;
+          buffer = xrealloc (buffer, bufmax);
+        }
+      buffer[bufpos++] = c;
+    }
+  if (bufpos >= bufmax)
+    {
+      bufmax = bufmax + 1;
+      buffer = xrealloc (buffer, bufmax);
+    }
+  buffer[bufpos] = '\0';
+  if (is_constant)
+    return buffer;
+  else
+    {
+      free (buffer);
+      return NULL;
+    }
+
+ heredoc_with_embedded_expressions:
+  is_constant = false;
+  {
+    size_t nesting_stack_alloc = 10;
+    char *nesting_stack = xmalloc (nesting_stack_alloc);
+    size_t nesting_stack_depth = 0;
+    /* We just read a '{', so expect a matching '}'.  */
+    nesting_stack[nesting_stack_depth++] = '}';
+
+    /* Find the extent of the expression.  */
+    bufpos = 0;
+    for (;;)
+      {
+        char c = *doc;
+        if (c == '\0')
+          {
+            if (nesting_stack_depth > 0)
+              if_error (IF_SEVERITY_WARNING,
+                        logical_file_name, lineno, (size_t)(-1), false,
+                        _("unterminated expression in heredoc, expected a '%c'"),
+                        nesting_stack[nesting_stack_depth - 1]);
+            break;
+          }
+        doc++;
+        if (c == '\n')
+          lineno++;
+        if (c == '{' || c == '[' || c == '(')
+          {
+            if (nesting_stack_depth >= nesting_stack_alloc)
+              {
+                nesting_stack_alloc = 2 * nesting_stack_alloc;
+                nesting_stack =
+                  xrealloc (nesting_stack, nesting_stack_alloc);
+              }
+            nesting_stack[nesting_stack_depth++] =
+              (c == '{' ? '}' : c == '[' ? ']' : ')');
+          }
+        else if (c == '}' || c == ']' || c == ')')
+          {
+            if (nesting_stack_depth > 0
+                && c == nesting_stack[nesting_stack_depth - 1])
+              {
+                if (--nesting_stack_depth == 0)
+                  break;
+              }
+            else
+              if_error (IF_SEVERITY_WARNING,
+                        logical_file_name, lineno, (size_t)(-1), false,
+                        _("unterminated expression in heredoc contains unbalanced '%c'"),
+                        c);
+          }
+        if (bufpos >= bufmax)
+          {
+            bufmax = 2 * bufmax + 10;
+            buffer = xrealloc (buffer, bufmax);
+          }
+        buffer[bufpos++] = c;
+      }
+
+    /* Recursively extract messages from the expression.  */
+    char *substring = xmalloc (bufpos);
+    memcpy (substring, buffer, bufpos);
+
+    struct php_extractor *rxp = XMALLOC (struct php_extractor);
+    rxp->mlp = xp->mlp;
+    rxp->fp = NULL;
+    rxp->input = substring;
+    rxp->input_end = substring + bufpos;
+    rxp->line_number = xp->line_number;
+    php_extractor_init_rest (rxp);
+
+    extract_php_input (rxp);
+
+    free (rxp);
+    free (substring);
+    free (nesting_stack);
+  }
+  goto heredoc_continued;
+}
+
 static void
 phase4_get (struct php_extractor *xp, token_ty *tp)
 {
@@ -1155,7 +1393,7 @@ phase4_get (struct php_extractor *xp, token_ty *tp)
           tp->type = token_type_other;
           {
             size_t nesting_stack_alloc = 10;
-            char *nesting_stack = malloc (nesting_stack_alloc);
+            char *nesting_stack = xmalloc (nesting_stack_alloc);
             size_t nesting_stack_depth = 0;
             /* We just read a '{', so expect a matching '}'.  */
             nesting_stack[nesting_stack_depth++] = '}';
@@ -1307,9 +1545,7 @@ phase4_get (struct php_extractor *xp, token_ty *tp)
                 int c3 = phase1_getc (xp);
                 if (c3 == '<')
                   {
-                    int label_start = 0;
-
-                    /* Start of here and now document.
+                    /* Start of heredoc or nowdoc.
                        Parse whitespace, then label, then newline.  */
                     do
                       c = phase3_getc (xp);
@@ -1330,51 +1566,201 @@ phase4_get (struct php_extractor *xp, token_ty *tp)
                     /* buffer[0..bufpos-1] now contains the label
                        (including single or double quotes).  */
 
-                    if (*buffer == '\'' || *buffer == '"')
+                    int doc_line_number = xp->line_number;
+
+                    bool heredoc = true;
+                    int label_start = 0;
+                    int label_end = bufpos;
+                    if (bufpos >= 2
+                        && ((buffer[label_start] == '\'' && buffer[label_end - 1] == '\'')
+                            || (buffer[label_start] == '"' && buffer[label_end - 1] == '"')))
                       {
+                        heredoc = (buffer[label_start] == '"');
                         label_start++;
-                        bufpos--;
+                        label_end--;
                       }
 
-                    /* Now skip the here document.  */
+                    /* Now read the heredoc or nowdoc.  */
+                    size_t doc_alloc = 10;
+                    char *doc = xmalloc (doc_alloc);
+                    size_t doc_len = 0;
+                    size_t doc_start_of_line = 0;
+
+                    /* These two variables keep track of the matching of the
+                       end label.  */
+                    int in_label_pos = -1; /* <= label_end - label_start */
+                    int end_label_indent = 0;
+
                     for (;;)
                       {
                         c = phase1_getc (xp);
                         if (c == EOF)
                           break;
-                        if (c == '\n' || c == '\r')
+
+                        if (doc_len >= doc_alloc)
+                          {
+                            doc_alloc = 2 * doc_alloc + 10;
+                            doc = xrealloc (doc, doc_alloc);
+                          }
+                        doc[doc_len++] = c;
+
+                        if (c == '\n')
+                          doc_start_of_line = doc_len;
+
+                        /* Incrementally match the label.  */
+                        if (in_label_pos == 0 && (c == ' ' || c == '\t'))
+                          {
+                            if (c == '\t')
+                              end_label_indent |= TAB_WIDTH - 1;
+                            end_label_indent++;
+                          }
+                        else if (in_label_pos >= 0
+                                 && in_label_pos < label_end - label_start
+                                 && c == buffer[label_start + in_label_pos])
+                          {
+                            in_label_pos++;
+                          }
+                        else if (in_label_pos == label_end - label_start)
                           {
-                            int bufidx = label_start;
+                            switch (c)
+                              {
+                              case 'A': case 'B': case 'C': case 'D': case 'E':
+                              case 'F': case 'G': case 'H': case 'I': case 'J':
+                              case 'K': case 'L': case 'M': case 'N': case 'O':
+                              case 'P': case 'Q': case 'R': case 'S': case 'T':
+                              case 'U': case 'V': case 'W': case 'X': case 'Y':
+                              case 'Z':
+                              case '_':
+                              case 'a': case 'b': case 'c': case 'd': case 'e':
+                              case 'f': case 'g': case 'h': case 'i': case 'j':
+                              case 'k': case 'l': case 'm': case 'n': case 'o':
+                              case 'p': case 'q': case 'r': case 's': case 't':
+                              case 'u': case 'v': case 'w': case 'x': case 'y':
+                              case 'z':
+                              case '0': case '1': case '2': case '3': case '4':
+                              case '5': case '6': case '7': case '8': case '9':
+                              case 128: case 129: case 130: case 131: case 132:
+                              case 133: case 134: case 135: case 136: case 137:
+                              case 138: case 139: case 140: case 141: case 142:
+                              case 143: case 144: case 145: case 146: case 147:
+                              case 148: case 149: case 150: case 151: case 152:
+                              case 153: case 154: case 155: case 156: case 157:
+                              case 158: case 159: case 160: case 161: case 162:
+                              case 163: case 164: case 165: case 166: case 167:
+                              case 168: case 169: case 170: case 171: case 172:
+                              case 173: case 174: case 175: case 176: case 177:
+                              case 178: case 179: case 180: case 181: case 182:
+                              case 183: case 184: case 185: case 186: case 187:
+                              case 188: case 189: case 190: case 191: case 192:
+                              case 193: case 194: case 195: case 196: case 197:
+                              case 198: case 199: case 200: case 201: case 202:
+                              case 203: case 204: case 205: case 206: case 207:
+                              case 208: case 209: case 210: case 211: case 212:
+                              case 213: case 214: case 215: case 216: case 217:
+                              case 218: case 219: case 220: case 221: case 222:
+                              case 223: case 224: case 225: case 226: case 227:
+                              case 228: case 229: case 230: case 231: case 232:
+                              case 233: case 234: case 235: case 236: case 237:
+                              case 238: case 239: case 240: case 241: case 242:
+                              case 243: case 244: case 245: case 246: case 247:
+                              case 248: case 249: case 250: case 251: case 252:
+                              case 253: case 254: case 255:
+                                in_label_pos = -1;
+                                break;
+                              default:
+                                break;
+                              }
+                            if (in_label_pos >= 0)
+                              {
+                                /* Finished recognizing the label.  */
+                                phase1_ungetc (xp, c);
+                                break;
+                              }
+                          }
+                        else if (c == '\n' || c == '\r')
+                          {
+                            in_label_pos = 0;
+                            end_label_indent = 0;
+                          }
+                        else
+                          {
+                            in_label_pos = -1;
+                            end_label_indent = 0;
+                          }
+                      }
+
+                    /* The contents is the substring
+                       [doc, doc + doc_start_of_line).  */
+                    doc_len = doc_start_of_line;
 
-                            while (bufidx < bufpos)
+                    /* Discard leading indentation.  */
+                    if (end_label_indent > 0)
+                      {
+                        /* Scan through the doc string, copying *q = *p.  */
+                        const char *p;
+                        char *q = doc;
+                        int curr_line_indent = 0;
+
+                        for (p = doc; p < doc + doc_len; p++)
+                          {
+                            /* Invariant: doc <= q <= p <= doc + doc_len.  */
+                            char c = *p;
+                            *q++ = c;
+                            if (curr_line_indent < end_label_indent)
                               {
-                                c = phase1_getc (xp);
-                                if (c == EOF)
-                                  break;
-                                if (c != buffer[bufidx])
+                                if (c == ' ')
                                   {
-                                    phase1_ungetc (xp, c);
-                                    break;
+                                    curr_line_indent++;
+                                    --q;
+                                  }
+                                else if (c == '\t')
+                                  {
+                                    curr_line_indent |= TAB_WIDTH - 1;
+                                    curr_line_indent++;
+                                    if (curr_line_indent <= end_label_indent)
+                                      --q;
                                   }
-                                bufidx++;
-                              }
-                            if (bufidx == bufpos)
-                              {
-                                c = phase1_getc (xp);
-                                if (c != ';')
-                                  phase1_ungetc (xp, c);
-                                c = phase1_getc (xp);
-                                if (c == '\n' || c == '\r')
-                                  break;
                               }
+                            if (c == '\n')
+                              curr_line_indent = 0;
                           }
+                        doc_len = q - doc;
+                      }
+
+                    /* Discard the trailing newline.  */
+                    if (doc_len > 0 && doc[doc_len - 1] == '\n')
+                      {
+                        --doc_len;
+                        if (doc_len > 0 && doc[doc_len - 1] == '\r')
+                          --doc_len;
+                      }
+
+                    /* NUL-terminate it.  */
+                    if (doc_len >= doc_alloc)
+                      {
+                        doc_alloc = doc_alloc + 1;
+                        doc = xrealloc (doc, doc_alloc);
+                      }
+                    doc[doc_len++] = '\0';
+
+                    /* For a here document, do the same processing as in
+                       double-quoted strings (see above).  */
+                    if (heredoc)
+                      {
+                        char *processed_doc =
+                          process_heredoc (xp, doc, doc_line_number);
+                        free (doc);
+                        doc = processed_doc;
                       }
 
-                    /* FIXME: Ideally we should turn the here document into a
-                       string literal if it didn't contain $ substitution.  And
-                       we should also respect backslash escape sequences like
-                       in double-quoted strings.  */
-                    tp->type = token_type_other;
+                    if (doc != NULL)
+                      {
+                        tp->type = token_type_string_literal;
+                        tp->string = doc;
+                        tp->comment = add_reference (savable_comment);
+                      }
+                    else
+                      tp->type = token_type_other;
                     return;
                   }
                 phase1_ungetc (xp, c3);
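diff --git a/gettext-tools/tests/xgettext-php-1 b/gettext-tools/tests/xgettext-php-1
index ebf31a988b9774a195bc31bbfd0d8ffe7bbbdad0..1b403bc8c3b65e655fc8618d608c94ffe7d78916 100755 (executable)
--- a/gettext-tools/tests/xgettext-php-1
+++ b/gettext-tools/tests/xgettext-php-1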
@@ -25,6 +25,22 @@ echo _("embedded_2_${foo}bar");
 echo _("embedded_3_{$foo}bar");
 echo _("embedded_4_{$array[func(_('embedded_4_sub1'))]}_bar_{$array[func(_('embedded_4_sub2'))]}_baz");
 echo _("embedded_5");
+// Heredoc with embedded expressions.
+echo _(<<<EOT
+embedded_6_$foo bar
+EOT);
+echo _(<<<EOT
+embedded_7_${foo}bar
+EOT);
+echo _(<<<EOT
+embedded_8_{$foo}bar
+EOT);
+echo _(<<<EOT
+embedded_9_{$array[func(_('embedded_9_sub1'))]}_bar_{$array[func(_('embedded_9_sub2'))]}_baz
+EOT);
+echo _(<<<EOT
+embedded_10
+EOT);
 ?>
 EOF
 
@@ -62,6 +78,15 @@ msgstr ""
 
 msgid "embedded_5"
 msgstr ""
+
+msgid "embedded_9_sub1"
+msgstr ""
+
+msgid "embedded_9_sub2"
+msgstr ""
+
+msgid "embedded_10"
+msgstr ""
 EOF
 
 : ${DIFF=diff}
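diff --git a/gettext-tools/tests/xgettext-php-2 b/gettext-tools/tests/xgettext-php-2
index b343ed6a4fe6e484e8d65fe089aa3c8f8877957d..942d2cf1224e16f16338f4ad279a131aa8907497 100755 (executable)
--- a/gettext-tools/tests/xgettext-php-2
+++ b/gettext-tools/tests/xgettext-php-2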
@@ -1,7 +1,7 @@
 #!/bin/sh
 . "${srcdir=.}/init.sh"; path_prepend_ . ../src
 
-# Test PHP support: here documents.
+# Test PHP support: heredoc strings, nowdoc strings.
 
 cat <<\EOF > xg-ph-2.php
 <?
@@ -36,6 +36,20 @@ Odoakar
 Theoderich
 EOTMARKER
 echo _("Franks");
+echo _(<<<EOTMARKER
+Chlodovechus Rex Francorum
+Carolus Rex Francorum et Langobardorum, Magnus Imperator Romanorum
+EOTMARKER);
+echo _("Hohenstaufens");
+echo _(<<<"EOTMARKER"
+   Fridericus Imperator Romani Imperii
+     Fridericus II Imperator
+  EOTMARKER);
+echo _("French");
+echo _(<<<'EOTMARKER'
+Louis XIV
+Napoleon Bonaparte
+EOTMARKER);
 ?>
 EOF
 
@@ -70,8 +84,32 @@ msgstr ""
 msgid "Romans"
 msgstr ""
 
+msgid "Goths"
+msgstr ""
+
 msgid "Franks"
 msgstr ""
+
+msgid ""
+"Chlodovechus Rex Francorum\n"
+"Carolus Rex Francorum et Langobardorum, Magnus Imperator Romanorum"
+msgstr ""
+
+msgid "Hohenstaufens"
+msgstr ""
+
+msgid ""
+" Fridericus Imperator Romani Imperii\n"
+"   Fridericus II Imperator"
+msgstr ""
+
+msgid "French"
+msgstr ""
+
+msgid ""
+"Louis XIV\n"
+"Napoleon Bonaparte"
+msgstr ""
 EOF
 
 : ${DIFF=diff}