From: Bruno Haible <bruno@clisp.org>
Date: Wed, 18 Sep 2024 21:37:32 +0000 (+0200)
Subject: xgettext: PHP: Recognize strings with embedded expressions.
X-Git-Tag: v0.23~126
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=28abce958bf1927776c157360f7b444a1199e995;p=thirdparty%2Fgettext.git

xgettext: PHP: Recognize strings with embedded expressions.

* gettext-tools/src/x-php.c (enum token_type_ty, struct token_ty): Moved.
(struct php_extractor): New type.
(fp, phase1_pushback, phase1_pushback_length, phase2_pushback,
phase2_pushback_length, buffer, bufmax, buflen, last_comment_line,
last_non_comment_line, phase3_pushback, phase3_pushback_length,
phase4_pushback, phase4_pushback_length, phase5_last, paren_nesting_depth,
bracket_nesting_depth): Remove variables.
(php_extractor_init_rest): New function.
(extract_php_input): New declaration.
(phase1_getc): Add a 'struct php_extractor *' parameter. Read from a string
if fp == NULL.
(phase1_ungetc, skip_html, comment_start, comment_add, comment_line_end,
phase3_getc, phase3_ungetc): Add a 'struct php_extractor *' parameter.
(phase4_get): Likewise. Add handling of embedded expressions in strings.
(phase4_unget, x_php_lex): Add a 'struct php_extractor *' parameter.
(extract_balanced): Add a 'struct php_extractor *' parameter. Remove mlp
parameter.
(extract_php_input): New function, extracted from extract_php.
(extract_php): Use it. Create a 'struct php_extractor'.
* gettext-tools/tests/xgettext-php-1: Add tests of strings with embedded
expressions.
* NEWS: Mention the change.
---

diff --git a/NEWS b/NEWS
index 687f8ae3d..56b66c34a 100644
--- a/NEWS
+++ b/NEWS
@@ -26,6 +26,7 @@ Version 0.23 - September 2024
   - Tcl: With the forthcoming Tcl 9.0, characters outside the Unicode BMP
     in Tcl message catalogs (.msg files) will work regardless of the locale's
     encoding.
+  - PHP: Strings with embedded expressions are now recognized.
 
 * Runtime behaviour:
   - In the C.UTF-8 locale, like in the C locale, the *gettext() functions
diff --git a/gettext-tools/src/x-php.c b/gettext-tools/src/x-php.c
index dd8a9f07b..2ea185d49 100644
--- a/gettext-tools/src/x-php.c
+++ b/gettext-tools/src/x-php.c
@@ -135,55 +135,162 @@ init_flag_table_php ()
 }
 
 
-/* ======================== Reading of characters.  ======================== */
+/* =================== Variables used by the extractor.  =================== */
 
-/* The input file stream.  */
-static FILE *fp;
+/* Type definitions needed for the variables.  */
 
+enum token_type_ty
+{
+  token_type_eof,
+  token_type_lparen,            /* ( */
+  token_type_rparen,            /* ) */
+  token_type_comma,             /* , */
+  token_type_lbracket,          /* [ */
+  token_type_rbracket,          /* ] */
+  token_type_dot,               /* . */
+  token_type_operator1,         /* * / % ++ -- */
+  token_type_operator2,         /* + - ! ~ @ */
+  token_type_string_literal,    /* "abc" */
+  token_type_symbol,            /* symbol, number */
+  token_type_other              /* misc. operator */
+};
+typedef enum token_type_ty token_type_ty;
 
-/* 1. line_number handling.  */
+typedef struct token_ty token_ty;
+struct token_ty
+{
+  token_type_ty type;
+  char *string;         /* for token_type_string_literal, token_type_symbol */
+  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
+  int line_number;
+};
+
+/* These variables are combined in a struct, so that we can invoke the
+   extractor in a reentrant way.  */
+
+struct php_extractor
+{
+  /* Accumulator for the output.  */
+  message_list_ty *mlp;
+
+  /* The input file stream, when reading from a file.  */
+  FILE *fp;
+  /* The input area, when reading from a string.  */
+  const char *input;
+  const char *input_end;
+
+  int line_number;
+
+  unsigned char phase1_pushback[2];
+  int phase1_pushback_length;
+
+#if 0
+  unsigned char phase2_pushback[1];
+  int phase2_pushback_length;
+#endif
+
+  /* For accumulating comments.  */
+  char *buffer;
+  size_t bufmax;
+  size_t buflen;
+
+  /* These are for tracking whether comments count as immediately before
+     keyword.  */
+  int last_comment_line;
+  int last_non_comment_line;
+
+  unsigned char phase3_pushback[1];
+  int phase3_pushback_length;
+
+  token_ty phase4_pushback[3];
+  int phase4_pushback_length;
 
-static unsigned char phase1_pushback[2];
-static int phase1_pushback_length;
+  token_type_ty phase5_last;
+
+  /* Maximum supported nesting depth.  */
+  #define MAX_NESTING_DEPTH 1000
+
+  /* Current nesting depths.  */
+  int paren_nesting_depth;
+  int bracket_nesting_depth;
+};
+
+static inline void
+php_extractor_init_rest (struct php_extractor *xp)
+{
+  xp->phase1_pushback_length = 0;
+#if 0
+  xp->phase2_pushback_length = 0;
+#endif
+
+  xp->buffer = NULL;
+  xp->bufmax = 0;
+  xp->buflen = 0;
+
+  xp->last_comment_line = -1;
+  xp->last_non_comment_line = -1;
+
+  xp->phase3_pushback_length = 0;
+  xp->phase4_pushback_length = 0;
+
+  xp->phase5_last = token_type_eof;
+
+  xp->paren_nesting_depth = 0;
+  xp->bracket_nesting_depth = 0;
+}
+
+/* Forward declarations.  */
+static void extract_php_input (struct php_extractor *xp);
+
+
+/* ======================== Reading of characters.  ======================== */
+
+/* 1. line_number handling.  */
 
 static int
-phase1_getc ()
+phase1_getc (struct php_extractor *xp)
 {
   int c;
 
-  if (phase1_pushback_length)
-    c = phase1_pushback[--phase1_pushback_length];
-  else
+  if (xp->phase1_pushback_length)
+    c = xp->phase1_pushback[--(xp->phase1_pushback_length)];
+  else if (xp->fp != NULL)
     {
-      c = getc (fp);
+      c = getc (xp->fp);
 
       if (c == EOF)
         {
-          if (ferror (fp))
+          if (ferror (xp->fp))
             error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
                    real_file_name);
           return EOF;
         }
     }
+  else
+    {
+      if (xp->input == xp->input_end)
+        return EOF;
+      c = *(xp->input++);
+    }
 
-  if (c == '\n')
-    line_number++;
+  if (xp->fp != NULL && c == '\n')
+    xp->line_number++;
 
   return c;
 }
 
 /* Supports 2 characters of pushback.  */
 static void
-phase1_ungetc (int c)
+phase1_ungetc (struct php_extractor *xp, int c)
 {
   if (c != EOF)
     {
       if (c == '\n')
-        --line_number;
+        --(xp->line_number);
 
-      if (phase1_pushback_length == SIZEOF (phase1_pushback))
+      if (xp->phase1_pushback_length == SIZEOF (xp->phase1_pushback))
         abort ();
-      phase1_pushback[phase1_pushback_length++] = c;
+      xp->phase1_pushback[xp->phase1_pushback_length++] = c;
     }
 }
 
@@ -192,18 +299,18 @@ phase1_ungetc (int c)
    therefore don't contain translatable strings.  */
 
 static void
-skip_html ()
+skip_html (struct php_extractor *xp)
 {
   for (;;)
     {
-      int c = phase1_getc ();
+      int c = phase1_getc (xp);
 
       if (c == EOF)
         return;
 
       if (c == '<')
         {
-          int c2 = phase1_getc ();
+          int c2 = phase1_getc (xp);
 
           if (c2 == EOF)
             break;
@@ -212,10 +319,10 @@ skip_html ()
             {
               /* <?php is the normal way to enter PHP mode. <? and <?= are
                  recognized by PHP depending on a configuration setting.  */
-              int c3 = phase1_getc ();
+              int c3 = phase1_getc (xp);
 
               if (c3 != '=')
-                phase1_ungetc (c3);
+                phase1_ungetc (xp, c3);
 
               return;
             }
@@ -224,17 +331,17 @@ skip_html ()
             {
               /* <% and <%= are recognized by PHP depending on a configuration
                  setting.  */
-              int c3 = phase1_getc ();
+              int c3 = phase1_getc (xp);
 
               if (c3 != '=')
-                phase1_ungetc (c3);
+                phase1_ungetc (xp, c3);
 
               return;
             }
 
           if (c2 == '<')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
 
@@ -243,160 +350,160 @@ skip_html ()
              < script language = 'php' >
              are always recognized.  */
           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
-            c2 = phase1_getc ();
+            c2 = phase1_getc (xp);
           if (c2 != 's' && c2 != 'S')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'c' && c2 != 'C')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'r' && c2 != 'R')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'i' && c2 != 'I')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'p' && c2 != 'P')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 't' && c2 != 'T')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (!(c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'))
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
           do
-            c2 = phase1_getc ();
+            c2 = phase1_getc (xp);
           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
           if (c2 != 'l' && c2 != 'L')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'a' && c2 != 'A')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'n' && c2 != 'N')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'g' && c2 != 'G')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'u' && c2 != 'U')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'a' && c2 != 'A')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'g' && c2 != 'G')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           if (c2 != 'e' && c2 != 'E')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
-            c2 = phase1_getc ();
+            c2 = phase1_getc (xp);
           if (c2 != '=')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
-            c2 = phase1_getc ();
+            c2 = phase1_getc (xp);
           if (c2 == '"')
             {
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != 'p')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != 'h')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != 'p')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != '"')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
             }
           else if (c2 == '\'')
             {
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != 'p')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != 'h')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != 'p')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != '\'')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
             }
@@ -404,28 +511,28 @@ skip_html ()
             {
               if (c2 != 'p')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != 'h')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
               if (c2 != 'p')
                 {
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
                   continue;
                 }
             }
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
           while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
-            c2 = phase1_getc ();
+            c2 = phase1_getc (xp);
           if (c2 != '>')
             {
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
               continue;
             }
           return;
@@ -435,70 +542,67 @@ skip_html ()
 
 #if 0
 
-static unsigned char phase2_pushback[1];
-static int phase2_pushback_length;
-
 static int
-phase2_getc ()
+phase2_getc (struct php_extractor *xp)
 {
   int c;
 
-  if (phase2_pushback_length)
-    return phase2_pushback[--phase2_pushback_length];
+  if (xp->phase2_pushback_length)
+    return xp->phase2_pushback[--(xp->phase2_pushback_length)];
 
-  c = phase1_getc ();
+  c = phase1_getc (xp);
   switch (c)
     {
     case '?':
     case '%':
       {
-        int c2 = phase1_getc ();
+        int c2 = phase1_getc (xp);
         if (c2 == '>')
           {
             /* ?> and %> terminate PHP mode and switch back to HTML mode.  */
             skip_html ();
             return ' ';
           }
-        phase1_ungetc (c2);
+        phase1_ungetc (xp, c2);
       }
       break;
 
     case '<':
       {
-        int c2 = phase1_getc ();
+        int c2 = phase1_getc (xp);
 
         /* < / script > terminates PHP mode and switches back to HTML mode.  */
         while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
-          c2 = phase1_getc ();
+          c2 = phase1_getc (xp);
         if (c2 == '/')
           {
             do
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
             while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
             if (c2 == 's' || c2 == 'S')
               {
-                c2 = phase1_getc ();
+                c2 = phase1_getc (xp);
                 if (c2 == 'c' || c2 == 'C')
                   {
-                    c2 = phase1_getc ();
+                    c2 = phase1_getc (xp);
                     if (c2 == 'r' || c2 == 'R')
                       {
-                        c2 = phase1_getc ();
+                        c2 = phase1_getc (xp);
                         if (c2 == 'i' || c2 == 'I')
                           {
-                            c2 = phase1_getc ();
+                            c2 = phase1_getc (xp);
                             if (c2 == 'p' || c2 == 'P')
                               {
-                                c2 = phase1_getc ();
+                                c2 = phase1_getc (xp);
                                 if (c2 == 't' || c2 == 'T')
                                   {
                                     do
-                                      c2 = phase1_getc ();
+                                      c2 = phase1_getc (xp);
                                     while (c2 == ' ' || c2 == '\t'
                                            || c2 == '\n' || c2 == '\r');
                                     if (c2 == '>')
                                       {
-                                        skip_html ();
+                                        skip_html (xp);
                                         return ' ';
                                       }
                                   }
@@ -508,7 +612,7 @@ phase2_getc ()
                   }
               }
           }
-        phase1_ungetc (c2);
+        phase1_ungetc (xp, c2);
       }
       break;
     }
@@ -517,13 +621,13 @@ phase2_getc ()
 }
 
 static void
-phase2_ungetc (int c)
+phase2_ungetc (struct php_extractor *xp, int c)
 {
   if (c != EOF)
     {
-      if (phase2_pushback_length == SIZEOF (phase2_pushback))
+      if (xp->phase2_pushback_length == SIZEOF (xp->phase2_pushback))
         abort ();
-      phase2_pushback[phase2_pushback_length++] = c;
+      xp->phase2_pushback[xp->phase2_pushback_length++] = c;
     }
 }
 
@@ -532,41 +636,38 @@ phase2_ungetc (int c)
 
 /* Accumulating comments.  */
 
-static char *buffer;
-static size_t bufmax;
-static size_t buflen;
-
 static inline void
-comment_start ()
+comment_start (struct php_extractor *xp)
 {
-  buflen = 0;
+  xp->buflen = 0;
 }
 
 static inline void
-comment_add (int c)
+comment_add (struct php_extractor *xp, int c)
 {
-  if (buflen >= bufmax)
+  if (xp->buflen >= xp->bufmax)
     {
-      bufmax = 2 * bufmax + 10;
-      buffer = xrealloc (buffer, bufmax);
+      xp->bufmax = 2 * xp->bufmax + 10;
+      xp->buffer = xrealloc (xp->buffer, xp->bufmax);
     }
-  buffer[buflen++] = c;
+  xp->buffer[xp->buflen++] = c;
 }
 
 static inline void
-comment_line_end (size_t chars_to_remove)
+comment_line_end (struct php_extractor *xp, size_t chars_to_remove)
 {
-  buflen -= chars_to_remove;
-  while (buflen >= 1
-         && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
-    --buflen;
-  if (chars_to_remove == 0 && buflen >= bufmax)
+  xp->buflen -= chars_to_remove;
+  while (xp->buflen >= 1
+         && (xp->buffer[xp->buflen - 1] == ' '
+             || xp->buffer[xp->buflen - 1] == '\t'))
+    --(xp->buflen);
+  if (chars_to_remove == 0 && xp->buflen >= xp->bufmax)
     {
-      bufmax = 2 * bufmax + 10;
-      buffer = xrealloc (buffer, bufmax);
+      xp->bufmax = 2 * xp->bufmax + 10;
+      xp->buffer = xrealloc (xp->buffer, xp->bufmax);
     }
-  buffer[buflen] = '\0';
-  savable_comment_add (buffer);
+  xp->buffer[xp->buflen] = '\0';
+  savable_comment_add (xp->buffer);
 }
 
 
@@ -574,62 +675,54 @@ comment_line_end (size_t chars_to_remove)
    space character.  We need to remember the comment for later, because
    it may be attached to a keyword string.  */
 
-/* These are for tracking whether comments count as immediately before
-   keyword.  */
-static int last_comment_line;
-static int last_non_comment_line;
-
-static unsigned char phase3_pushback[1];
-static int phase3_pushback_length;
-
 static int
-phase3_getc ()
+phase3_getc (struct php_extractor *xp)
 {
   int lineno;
   int c;
 
-  if (phase3_pushback_length)
-    return phase3_pushback[--phase3_pushback_length];
+  if (xp->phase3_pushback_length)
+    return xp->phase3_pushback[--(xp->phase3_pushback_length)];
 
-  c = phase1_getc ();
+  c = phase1_getc (xp);
 
   if (c == '#')
     {
       /* sh comment.  */
       bool last_was_qmark = false;
 
-      comment_start ();
-      lineno = line_number;
+      comment_start (xp);
+      lineno = xp->line_number;
       for (;;)
         {
-          c = phase1_getc ();
+          c = phase1_getc (xp);
           if (c == '\n' || c == EOF)
             {
-              comment_line_end (0);
+              comment_line_end (xp, 0);
               break;
             }
           if (last_was_qmark && c == '>')
             {
-              comment_line_end (1);
-              skip_html ();
+              comment_line_end (xp, 1);
+              skip_html (xp);
               break;
             }
           /* We skip all leading white space, but not EOLs.  */
-          if (!(buflen == 0 && (c == ' ' || c == '\t')))
-            comment_add (c);
+          if (!(xp->buflen == 0 && (c == ' ' || c == '\t')))
+            comment_add (xp, c);
           last_was_qmark = (c == '?' || c == '%');
         }
-      last_comment_line = lineno;
+      xp->last_comment_line = lineno;
       return '\n';
     }
   else if (c == '/')
     {
-      c = phase1_getc ();
+      c = phase1_getc (xp);
 
       switch (c)
         {
         default:
-          phase1_ungetc (c);
+          phase1_ungetc (xp, c);
           return '/';
 
         case '*':
@@ -637,24 +730,24 @@ phase3_getc ()
             /* C comment.  */
             bool last_was_star;
 
-            comment_start ();
-            lineno = line_number;
+            comment_start (xp);
+            lineno = xp->line_number;
             last_was_star = false;
             for (;;)
               {
-                c = phase1_getc ();
+                c = phase1_getc (xp);
                 if (c == EOF)
                   break;
                 /* We skip all leading white space, but not EOLs.  */
-                if (buflen == 0 && (c == ' ' || c == '\t'))
+                if (xp->buflen == 0 && (c == ' ' || c == '\t'))
                   continue;
-                comment_add (c);
+                comment_add (xp, c);
                 switch (c)
                   {
                   case '\n':
-                    comment_line_end (1);
-                    comment_start ();
-                    lineno = line_number;
+                    comment_line_end (xp, 1);
+                    comment_start (xp);
+                    lineno = xp->line_number;
                     last_was_star = false;
                     continue;
 
@@ -665,7 +758,7 @@ phase3_getc ()
                   case '/':
                     if (last_was_star)
                       {
-                        comment_line_end (2);
+                        comment_line_end (xp, 2);
                         break;
                       }
                     FALLTHROUGH;
@@ -676,7 +769,7 @@ phase3_getc ()
                   }
                 break;
               }
-            last_comment_line = lineno;
+            xp->last_comment_line = lineno;
             return ' ';
           }
 
@@ -685,28 +778,28 @@ phase3_getc ()
             /* C++ comment.  */
             bool last_was_qmark = false;
 
-            comment_start ();
-            lineno = line_number;
+            comment_start (xp);
+            lineno = xp->line_number;
             for (;;)
               {
-                c = phase1_getc ();
+                c = phase1_getc (xp);
                 if (c == '\n' || c == EOF)
                   {
-                    comment_line_end (0);
+                    comment_line_end (xp, 0);
                     break;
                   }
                 if (last_was_qmark && c == '>')
                   {
-                    comment_line_end (1);
-                    skip_html ();
+                    comment_line_end (xp, 1);
+                    skip_html (xp);
                     break;
                   }
                 /* We skip all leading white space, but not EOLs.  */
-                if (!(buflen == 0 && (c == ' ' || c == '\t')))
-                  comment_add (c);
+                if (!(xp->buflen == 0 && (c == ' ' || c == '\t')))
+                  comment_add (xp, c);
                 last_was_qmark = (c == '?' || c == '%');
               }
-            last_comment_line = lineno;
+            xp->last_comment_line = lineno;
             return '\n';
           }
         }
@@ -717,13 +810,13 @@ phase3_getc ()
 
 #ifdef unused
 static void
-phase3_ungetc (int c)
+phase3_ungetc (struct php_extractor *xp, int c)
 {
   if (c != EOF)
     {
-      if (phase3_pushback_length == SIZEOF (phase3_pushback))
+      if (xp->phase3_pushback_length == SIZEOF (xp->phase3_pushback))
         abort ();
-      phase3_pushback[phase3_pushback_length++] = c;
+      xp->phase3_pushback[xp->phase3_pushback_length++] = c;
     }
 }
 #endif
@@ -732,32 +825,7 @@ phase3_ungetc (int c)
 /* ========================== Reading of tokens.  ========================== */
 
 
-enum token_type_ty
-{
-  token_type_eof,
-  token_type_lparen,            /* ( */
-  token_type_rparen,            /* ) */
-  token_type_comma,             /* , */
-  token_type_lbracket,          /* [ */
-  token_type_rbracket,          /* ] */
-  token_type_dot,               /* . */
-  token_type_operator1,         /* * / % ++ -- */
-  token_type_operator2,         /* + - ! ~ @ */
-  token_type_string_literal,    /* "abc" */
-  token_type_symbol,            /* symbol, number */
-  token_type_other              /* misc. operator */
-};
-typedef enum token_type_ty token_type_ty;
-
-typedef struct token_ty token_ty;
-struct token_ty
-{
-  token_type_ty type;
-  char *string;         /* for token_type_string_literal, token_type_symbol */
-  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
-  int line_number;
-};
-
+/* 'struct token_ty' is defined above.  */
 
 /* Free the memory pointed to by a 'struct token_ty'.  */
 static inline void
@@ -772,28 +840,25 @@ free_token (token_ty *tp)
 
 /* 4. Combine characters into tokens.  Discard whitespace.  */
 
-static token_ty phase4_pushback[3];
-static int phase4_pushback_length;
-
 static void
-phase4_get (token_ty *tp)
+phase4_get (struct php_extractor *xp, token_ty *tp)
 {
   static char *buffer;
   static int bufmax;
   int bufpos;
   int c;
 
-  if (phase4_pushback_length)
+  if (xp->phase4_pushback_length)
     {
-      *tp = phase4_pushback[--phase4_pushback_length];
+      *tp = xp->phase4_pushback[--(xp->phase4_pushback_length)];
       return;
     }
   tp->string = NULL;
 
   for (;;)
     {
-      tp->line_number = line_number;
-      c = phase3_getc ();
+      tp->line_number = xp->line_number;
+      c = phase3_getc (xp);
       switch (c)
         {
         case EOF:
@@ -801,7 +866,7 @@ phase4_get (token_ty *tp)
           return;
 
         case '\n':
-          if (last_non_comment_line > last_comment_line)
+          if (xp->last_non_comment_line > xp->last_comment_line)
             savable_comment_reset ();
           FALLTHROUGH;
         case ' ':
@@ -811,7 +876,7 @@ phase4_get (token_ty *tp)
           continue;
         }
 
-      last_non_comment_line = tp->line_number;
+      xp->last_non_comment_line = tp->line_number;
 
       switch (c)
         {
@@ -852,7 +917,7 @@ phase4_get (token_ty *tp)
                   buffer = xrealloc (buffer, bufmax);
                 }
               buffer[bufpos++] = c;
-              c = phase1_getc ();
+              c = phase1_getc (xp);
               switch (c)
                 {
                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
@@ -893,7 +958,7 @@ phase4_get (token_ty *tp)
                   continue;
 
                 default:
-                  phase1_ungetc (c);
+                  phase1_ungetc (xp, c);
                   break;
                 }
               break;
@@ -913,15 +978,15 @@ phase4_get (token_ty *tp)
           bufpos = 0;
           for (;;)
             {
-              c = phase1_getc ();
+              c = phase1_getc (xp);
               if (c == EOF || c == '\'')
                 break;
               if (c == '\\')
                 {
-                  c = phase1_getc ();
+                  c = phase1_getc (xp);
                   if (c != '\\' && c != '\'')
                     {
-                      phase1_ungetc (c);
+                      phase1_ungetc (xp, c);
                       c = '\\';
                     }
                 }
@@ -946,42 +1011,43 @@ phase4_get (token_ty *tp)
         case '"':
           /* Double-quoted string literal.  */
           tp->type = token_type_string_literal;
+        string_literal_continued:
           bufpos = 0;
           for (;;)
             {
-              c = phase1_getc ();
+              c = phase1_getc (xp);
               if (c == EOF || c == '"')
                 break;
               if (c == '$')
                 {
-                  c = phase1_getc ();
+                  c = phase1_getc (xp);
                   if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
-                      || c == '_' || c == '{' || c >= 0x7f)
+                      || c == '_' || c >= 0x7f)
                     {
                       /* String with variables.  */
                       tp->type = token_type_other;
                       continue;
                     }
-                  phase1_ungetc (c);
+                  if (c == '{')
+                    /* String with embedded expressions.  */
+                    goto string_with_embedded_expressions;
+                  phase1_ungetc (xp, c);
                   c = '$';
                 }
               if (c == '{')
                 {
-                  c = phase1_getc ();
+                  c = phase1_getc (xp);
                   if (c == '$')
-                    {
-                      /* String with expressions.  */
-                      tp->type = token_type_other;
-                      continue;
-                    }
-                  phase1_ungetc (c);
+                    /* String with embedded expressions.  */
+                    goto string_with_embedded_expressions;
+                  phase1_ungetc (xp, c);
                   c = '{';
                 }
               if (c == '\\')
                 {
                   int n, j;
 
-                  c = phase1_getc ();
+                  c = phase1_getc (xp);
                   switch (c)
                     {
                     case '"':
@@ -995,7 +1061,7 @@ phase4_get (token_ty *tp)
                       for (j = 0; j < 3; ++j)
                         {
                           n = n * 8 + c - '0';
-                          c = phase1_getc ();
+                          c = phase1_getc (xp);
                           switch (c)
                             {
                             default:
@@ -1007,7 +1073,7 @@ phase4_get (token_ty *tp)
                             }
                           break;
                         }
-                      phase1_ungetc (c);
+                      phase1_ungetc (xp, c);
                       c = n;
                       break;
 
@@ -1015,7 +1081,7 @@ phase4_get (token_ty *tp)
                       n = 0;
                       for (j = 0; j < 2; ++j)
                         {
-                          c = phase1_getc ();
+                          c = phase1_getc (xp);
                           switch (c)
                             {
                             case '0': case '1': case '2': case '3': case '4':
@@ -1031,7 +1097,7 @@ phase4_get (token_ty *tp)
                               n = n * 16 + 10 + c - 'a';
                               break;
                             default:
-                              phase1_ungetc (c);
+                              phase1_ungetc (xp, c);
                               c = 0;
                               break;
                             }
@@ -1040,7 +1106,7 @@ phase4_get (token_ty *tp)
                         }
                       if (j == 0)
                         {
-                          phase1_ungetc ('x');
+                          phase1_ungetc (xp, 'x');
                           c = '\\';
                         }
                       else
@@ -1058,7 +1124,7 @@ phase4_get (token_ty *tp)
                       break;
 
                     default:
-                      phase1_ungetc (c);
+                      phase1_ungetc (xp, c);
                       c = '\\';
                       break;
                     }
@@ -1083,20 +1149,98 @@ phase4_get (token_ty *tp)
             }
           return;
 
+        string_with_embedded_expressions:
+          tp->type = token_type_other;
+          {
+            size_t nesting_stack_alloc = 10;
+            char *nesting_stack = malloc (nesting_stack_alloc);
+            size_t nesting_stack_depth = 0;
+            /* We just read a '{', so expect a matching '}'.  */
+            nesting_stack[nesting_stack_depth++] = '}';
+
+            /* Find the extent of the expression.  */
+            bufpos = 0;
+            for (;;)
+              {
+                c = phase1_getc (xp);
+                if (c == EOF)
+                  break;
+                if (c == '"')
+                  {
+                    if (nesting_stack_depth > 0)
+                      if_error (IF_SEVERITY_WARNING,
+                                logical_file_name, xp->line_number, (size_t)(-1), false,
+                                _("unterminated expression in string literal, expected a '%c'"),
+                                nesting_stack[nesting_stack_depth - 1]);
+                    break;
+                  }
+                if (c == '{' || c == '[' || c == '(')
+                  {
+                    if (nesting_stack_depth >= nesting_stack_alloc)
+                      {
+                        nesting_stack_alloc = 2 * nesting_stack_alloc;
+                        nesting_stack =
+                          xrealloc (nesting_stack, nesting_stack_alloc);
+                      }
+                    nesting_stack[nesting_stack_depth++] =
+                      (c == '{' ? '}' : c == '[' ? ']' : ')');
+                  }
+                else if (c == '}' || c == ']' || c == ')')
+                  {
+                    if (nesting_stack_depth > 0
+                        && c == nesting_stack[nesting_stack_depth - 1])
+                      {
+                        if (--nesting_stack_depth == 0)
+                          break;
+                      }
+                    else
+                      if_error (IF_SEVERITY_WARNING,
+                                logical_file_name, xp->line_number, (size_t)(-1), false,
+                                _("unterminated expression in string literal contains unbalanced '%c'"),
+                                c);
+                  }
+                if (bufpos >= bufmax)
+                  {
+                    bufmax = 2 * bufmax + 10;
+                    buffer = xrealloc (buffer, bufmax);
+                  }
+                buffer[bufpos++] = c;
+              }
+
+            /* Recursively extract messages from the expression.  */
+            char *substring = xmalloc (bufpos);
+            memcpy (substring, buffer, bufpos);
+
+            struct php_extractor *rxp = XMALLOC (struct php_extractor);
+            rxp->mlp = xp->mlp;
+            rxp->fp = NULL;
+            rxp->input = substring;
+            rxp->input_end = substring + bufpos;
+            rxp->line_number = xp->line_number;
+            php_extractor_init_rest (rxp);
+
+            extract_php_input (rxp);
+
+            free (rxp);
+            free (substring);
+            free (nesting_stack);
+          }
+          goto string_literal_continued;
+
         case '?':
         case '%':
           {
-            int c2 = phase1_getc ();
+            int c2 = phase1_getc (xp);
             if (c2 == '>')
               {
                 /* ?> and %> terminate PHP mode and switch back to HTML
                    mode.  */
-                skip_html ();
+                skip_html (xp);
                 tp->type = token_type_other;
               }
             else
               {
-                phase1_ungetc (c2);
+                phase1_ungetc (xp, c2);
                 tp->type = (c == '%' ? token_type_operator1 : token_type_other);
               }
             return;
@@ -1134,14 +1278,14 @@ phase4_get (token_ty *tp)
         case '+':
         case '-':
           {
-            int c2 = phase1_getc ();
+            int c2 = phase1_getc (xp);
             if (c2 == c)
               /* ++ or -- */
               tp->type = token_type_operator1;
             else
               /* + or - */
               {
-                phase1_ungetc (c2);
+                phase1_ungetc (xp, c2);
                 tp->type = token_type_operator2;
               }
             return;
@@ -1155,10 +1299,10 @@ phase4_get (token_ty *tp)
 
         case '<':
           {
-            int c2 = phase1_getc ();
+            int c2 = phase1_getc (xp);
             if (c2 == '<')
               {
-                int c3 = phase1_getc ();
+                int c3 = phase1_getc (xp);
                 if (c3 == '<')
                   {
                     int label_start = 0;
@@ -1166,7 +1310,7 @@ phase4_get (token_ty *tp)
                     /* Start of here and now document.
                        Parse whitespace, then label, then newline.  */
                     do
-                      c = phase3_getc ();
+                      c = phase3_getc (xp);
                     while (c == ' ' || c == '\t' || c == '\n' || c == '\r');
 
                     bufpos = 0;
@@ -1178,7 +1322,7 @@ phase4_get (token_ty *tp)
                             buffer = xrealloc (buffer, bufmax);
                           }
                         buffer[bufpos++] = c;
-                        c = phase3_getc ();
+                        c = phase3_getc (xp);
                       }
                     while (c != EOF && c != '\n' && c != '\r');
                     /* buffer[0..bufpos-1] now contains the label
@@ -1193,7 +1337,7 @@ phase4_get (token_ty *tp)
                     /* Now skip the here document.  */
                     for (;;)
                       {
-                        c = phase1_getc ();
+                        c = phase1_getc (xp);
                         if (c == EOF)
                           break;
                         if (c == '\n' || c == '\r')
@@ -1202,22 +1346,22 @@ phase4_get (token_ty *tp)
 
                             while (bufidx < bufpos)
                               {
-                                c = phase1_getc ();
+                                c = phase1_getc (xp);
                                 if (c == EOF)
                                   break;
                                 if (c != buffer[bufidx])
                                   {
-                                    phase1_ungetc (c);
+                                    phase1_ungetc (xp, c);
                                     break;
                                   }
                                 bufidx++;
                               }
                             if (bufidx == bufpos)
                               {
-                                c = phase1_getc ();
+                                c = phase1_getc (xp);
                                 if (c != ';')
-                                  phase1_ungetc (c);
-                                c = phase1_getc ();
+                                  phase1_ungetc (xp, c);
+                                c = phase1_getc (xp);
                                 if (c == '\n' || c == '\r')
                                   break;
                               }
@@ -1231,66 +1375,66 @@ phase4_get (token_ty *tp)
                     tp->type = token_type_other;
                     return;
                   }
-                phase1_ungetc (c3);
+                phase1_ungetc (xp, c3);
               }
 
             /* < / script > terminates PHP mode and switches back to HTML
                mode.  */
             while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
-              c2 = phase1_getc ();
+              c2 = phase1_getc (xp);
             if (c2 == '/')
               {
                 do
-                  c2 = phase1_getc ();
+                  c2 = phase1_getc (xp);
                 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
                 if (c2 == 's' || c2 == 'S')
                   {
-                    c2 = phase1_getc ();
+                    c2 = phase1_getc (xp);
                     if (c2 == 'c' || c2 == 'C')
                       {
-                        c2 = phase1_getc ();
+                        c2 = phase1_getc (xp);
                         if (c2 == 'r' || c2 == 'R')
                           {
-                            c2 = phase1_getc ();
+                            c2 = phase1_getc (xp);
                             if (c2 == 'i' || c2 == 'I')
                               {
-                                c2 = phase1_getc ();
+                                c2 = phase1_getc (xp);
                                 if (c2 == 'p' || c2 == 'P')
                                   {
-                                    c2 = phase1_getc ();
+                                    c2 = phase1_getc (xp);
                                     if (c2 == 't' || c2 == 'T')
                                       {
                                         do
-                                          c2 = phase1_getc ();
+                                          c2 = phase1_getc (xp);
                                         while (c2 == ' ' || c2 == '\t'
                                                || c2 == '\n' || c2 == '\r');
                                         if (c2 == '>')
                                           {
-                                            skip_html ();
+                                            skip_html (xp);
                                           }
                                         else
-                                          phase1_ungetc (c2);
+                                          phase1_ungetc (xp, c2);
                                       }
                                     else
-                                      phase1_ungetc (c2);
+                                      phase1_ungetc (xp, c2);
                                   }
                                 else
-                                  phase1_ungetc (c2);
+                                  phase1_ungetc (xp, c2);
                               }
                             else
-                              phase1_ungetc (c2);
+                              phase1_ungetc (xp, c2);
                           }
                         else
-                          phase1_ungetc (c2);
+                          phase1_ungetc (xp, c2);
                       }
                     else
-                      phase1_ungetc (c2);
+                      phase1_ungetc (xp, c2);
                   }
                 else
-                  phase1_ungetc (c2);
+                  phase1_ungetc (xp, c2);
               }
             else
-              phase1_ungetc (c2);
+              phase1_ungetc (xp, c2);
 
             tp->type = token_type_other;
             return;
@@ -1310,13 +1454,13 @@ phase4_get (token_ty *tp)
 
 /* Supports 3 tokens of pushback.  */
 static void
-phase4_unget (token_ty *tp)
+phase4_unget (struct php_extractor *xp, token_ty *tp)
 {
   if (tp->type != token_type_eof)
     {
-      if (phase4_pushback_length == SIZEOF (phase4_pushback))
+      if (xp->phase4_pushback_length == SIZEOF (xp->phase4_pushback))
         abort ();
-      phase4_pushback[phase4_pushback_length++] = *tp;
+      xp->phase4_pushback[xp->phase4_pushback_length++] = *tp;
     }
 }
 
@@ -1334,17 +1478,15 @@ phase4_unget (token_ty *tp)
        higher precedence as '.', such as a multiplicative or postincrement
        expression).  */
 
-static token_type_ty phase5_last;
-
 static void
-x_php_lex (token_ty *tp)
+x_php_lex (struct php_extractor *xp, token_ty *tp)
 {
-  phase4_get (tp);
+  phase4_get (xp, tp);
   if (tp->type == token_type_string_literal
-      && !(phase5_last == token_type_dot
-           || phase5_last == token_type_operator1
-           || phase5_last == token_type_operator2
-           || phase5_last == token_type_rparen))
+      && !(xp->phase5_last == token_type_dot
+           || xp->phase5_last == token_type_operator1
+           || xp->phase5_last == token_type_operator2
+           || xp->phase5_last == token_type_rparen))
     {
       char *sum = tp->string;
       size_t sum_len = strlen (sum);
@@ -1353,17 +1495,17 @@ x_php_lex (token_ty *tp)
         {
           token_ty token2;
 
-          phase4_get (&token2);
+          phase4_get (xp, &token2);
           if (token2.type == token_type_dot)
             {
               token_ty token3;
 
-              phase4_get (&token3);
+              phase4_get (xp, &token3);
               if (token3.type == token_type_string_literal)
                 {
                   token_ty token_after;
 
-                  phase4_get (&token_after);
+                  phase4_get (xp, &token_after);
                   if (token_after.type != token_type_operator1)
                     {
                       char *addend = token3.string;
@@ -1373,21 +1515,21 @@ x_php_lex (token_ty *tp)
                       memcpy (sum + sum_len, addend, addend_len + 1);
                       sum_len += addend_len;
 
-                      phase4_unget (&token_after);
+                      phase4_unget (xp, &token_after);
                       free_token (&token3);
                       free_token (&token2);
                       continue;
                     }
-                  phase4_unget (&token_after);
+                  phase4_unget (xp, &token_after);
                 }
-              phase4_unget (&token3);
+              phase4_unget (xp, &token3);
             }
-          phase4_unget (&token2);
+          phase4_unget (xp, &token2);
           break;
         }
       tp->string = sum;
     }
-  phase5_last = tp->type;
+  xp->phase5_last = tp->type;
 }
 
 
@@ -1398,14 +1540,6 @@ x_php_lex (token_ty *tp)
 static flag_context_list_table_ty *flag_context_list_table;
 
 
-/* Maximum supported nesting depth.  */
-#define MAX_NESTING_DEPTH 1000
-
-/* Current nesting depths.  */
-static int paren_nesting_depth;
-static int bracket_nesting_depth;
-
-
 /* The file is broken into tokens.  Scan the token stream, looking for
    a keyword, followed by a left paren, followed by a string.  When we
    see this sequence, we have something to remember.  We assume we are
@@ -1422,12 +1556,12 @@ static int bracket_nesting_depth;
 
 
 /* Extract messages until the next balanced closing parenthesis or bracket.
-   Extracted messages are added to MLP.
+   Extracted messages are added to XP->MLP.
    DELIM can be either token_type_rparen or token_type_rbracket, or
    token_type_eof to accept both.
    Return true upon eof, false upon closing parenthesis or bracket.  */
 static bool
-extract_balanced (message_list_ty *mlp,
+extract_balanced (struct php_extractor *xp,
                   token_type_ty delim,
                   flag_region_ty *outer_region,
                   flag_context_list_iterator_ty context_iter,
@@ -1454,7 +1588,7 @@ extract_balanced (message_list_ty *mlp,
     {
       token_ty token;
 
-      x_php_lex (&token);
+      x_php_lex (xp, &token);
       switch (token.type)
         {
         case token_type_symbol:
@@ -1480,20 +1614,20 @@ extract_balanced (message_list_ty *mlp,
           continue;
 
         case token_type_lparen:
-          if (++paren_nesting_depth > MAX_NESTING_DEPTH)
+          if (++(xp->paren_nesting_depth) > MAX_NESTING_DEPTH)
             if_error (IF_SEVERITY_FATAL_ERROR,
-                      logical_file_name, line_number, (size_t)(-1), false,
+                      logical_file_name, xp->line_number, (size_t)(-1), false,
                       _("too many open parentheses"));
-          if (extract_balanced (mlp, token_type_rparen,
+          if (extract_balanced (xp, token_type_rparen,
                                 inner_region, next_context_iter,
-                                arglist_parser_alloc (mlp,
+                                arglist_parser_alloc (xp->mlp,
                                                       state ? next_shapes : NULL)))
             {
               arglist_parser_done (argparser, arg);
               unref_region (inner_region);
               return true;
             }
-          paren_nesting_depth--;
+          xp->paren_nesting_depth--;
           next_context_iter = null_context_list_iterator;
           state = 0;
           continue;
@@ -1521,20 +1655,20 @@ extract_balanced (message_list_ty *mlp,
           continue;
 
         case token_type_lbracket:
-          if (++bracket_nesting_depth > MAX_NESTING_DEPTH)
+          if (++(xp->bracket_nesting_depth) > MAX_NESTING_DEPTH)
             if_error (IF_SEVERITY_FATAL_ERROR,
-                      logical_file_name, line_number, (size_t)(-1), false,
+                      logical_file_name, xp->line_number, (size_t)(-1), false,
                       _("too many open brackets"));
-          if (extract_balanced (mlp, token_type_rbracket,
+          if (extract_balanced (xp, token_type_rbracket,
                                 null_context_region (),
                                 null_context_list_iterator,
-                                arglist_parser_alloc (mlp, NULL)))
+                                arglist_parser_alloc (xp->mlp, NULL)))
             {
               arglist_parser_done (argparser, arg);
               unref_region (inner_region);
               return true;
             }
-          bracket_nesting_depth--;
+          xp->bracket_nesting_depth--;
           next_context_iter = null_context_list_iterator;
           state = 0;
           continue;
@@ -1557,7 +1691,7 @@ extract_balanced (message_list_ty *mlp,
             pos.line_number = token.line_number;
 
             if (extract_all)
-              remember_a_message (mlp, NULL, token.string, false, false,
+              remember_a_message (xp->mlp, NULL, token.string, false, false,
                                   inner_region, &pos,
                                   NULL, token.comment, false);
             else
@@ -1596,51 +1730,46 @@ extract_balanced (message_list_ty *mlp,
 }
 
 
+static void
+extract_php_input (struct php_extractor *xp)
+{
+  /* Eat tokens until eof is seen.  When extract_balanced returns
+     due to an unbalanced closing parenthesis, just restart it.  */
+  while (!extract_balanced (xp, token_type_eof,
+                            null_context_region (), null_context_list_iterator,
+                            arglist_parser_alloc (xp->mlp, NULL)))
+    ;
+}
+
+
 void
 extract_php (FILE *f,
              const char *real_filename, const char *logical_filename,
              flag_context_list_table_ty *flag_table,
              msgdomain_list_ty *mdlp)
 {
-  message_list_ty *mlp = mdlp->item[0]->messages;
-
-  fp = f;
-  real_file_name = real_filename;
-  logical_file_name = xstrdup (logical_filename);
-  line_number = 1;
-
-  phase1_pushback_length = 0;
-#if 0
-  phase2_pushback_length = 0;
-#endif
-
-  last_comment_line = -1;
-  last_non_comment_line = -1;
-
-  phase3_pushback_length = 0;
-  phase4_pushback_length = 0;
-
-  phase5_last = token_type_eof;
-
   flag_context_list_table = flag_table;
-  paren_nesting_depth = 0;
-  bracket_nesting_depth = 0;
 
   init_keywords ();
 
+  struct php_extractor *xp = XMALLOC (struct php_extractor);
+
+  xp->mlp = mdlp->item[0]->messages;
+  xp->fp = f;
+  xp->input = NULL;
+  xp->input_end = NULL;
+  real_file_name = real_filename;
+  logical_file_name = xstrdup (logical_filename);
+  xp->line_number = 1;
+  php_extractor_init_rest (xp);
+
   /* Initial mode is HTML mode, not PHP mode.  */
-  skip_html ();
+  skip_html (xp);
 
-  /* Eat tokens until eof is seen.  When extract_balanced returns
-     due to an unbalanced closing parenthesis, just restart it.  */
-  while (!extract_balanced (mlp, token_type_eof,
-                            null_context_region (), null_context_list_iterator,
-                            arglist_parser_alloc (mlp, NULL)))
-    ;
+  extract_php_input (xp);
 
   /* Close scanner.  */
-  fp = NULL;
+  free (xp);
   real_file_name = NULL;
   logical_file_name = NULL;
-  line_number = 0;
 }
diff --git a/gettext-tools/tests/xgettext-php-1 b/gettext-tools/tests/xgettext-php-1
index 9c9891c4d..ebf31a988 100755
--- a/gettext-tools/tests/xgettext-php-1
+++ b/gettext-tools/tests/xgettext-php-1
@@ -1,7 +1,7 @@
 #!/bin/sh
 . "${srcdir=.}/init.sh"; path_prepend_ . ../src
 
-# Test PHP support: --add-comments option.
+# Test PHP support: --add-comments option, strings with embedded expressions.
 
 cat <<\EOF > xg-ph-1.php
 <?
@@ -19,6 +19,12 @@ echo _("Hey Jude");
 echo _("The Fabulous Four");
 // This will not be extracted.
 echo _(table["key"]);
+// Strings with embedded expressions, a.k.a. variable parsing.
+echo _("embedded_1_$foo bar");
+echo _("embedded_2_${foo}bar");
+echo _("embedded_3_{$foo}bar");
+echo _("embedded_4_{$array[func(_('embedded_4_sub1'))]}_bar_{$array[func(_('embedded_4_sub2'))]}_baz");
+echo _("embedded_5");
 ?>
 EOF
 
@@ -47,6 +53,15 @@ msgstr ""
 #.
 msgid "The Fabulous Four"
 msgstr ""
+
+msgid "embedded_4_sub1"
+msgstr ""
+
+msgid "embedded_4_sub2"
+msgstr ""
+
+msgid "embedded_5"
+msgstr ""
 EOF
 
 : ${DIFF=diff}