From: Bruno Haible <bruno@clisp.org>
Date: Wed, 18 Sep 2024 15:00:04 +0000 (+0200)
Subject: xgettext: C#: Recognize strings with embedded expressions.
X-Git-Tag: v0.23~128
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4391a42f1540ad20aa6e1b7fdc87c8e0e79e34e6;p=thirdparty%2Fgettext.git

xgettext: C#: Recognize strings with embedded expressions.

* gettext-tools/src/x-csharp.c (token_type_template, token_type_ltemplate,
token_type_mtemplate, token_type_rtemplate): New enum items.
(free_token): Treat token_type_template like token_type_string_literal.
(accumulate_escaped): Accept a second delimiter argument. Return the delimiter.
(template_literal_depth, brace_depths, brace_depths_alloc): New variables,
copied from x-javascript.c.
(new_brace_depth_level): New function, copied from x-javascript.c.
(phase6_get): Recognize strings with embedded expressions. Handle braces in a
nested manner.
(extract_parenthesized): Handle the token types token_type_template,
token_type_ltemplate, token_type_mtemplate, token_type_rtemplate.
(extract_csharp): Initialize template_literal_depth and call
new_brace_depth_level.
* gettext-tools/tests/xgettext-csharp-5: Add tests of strings with embedded
expressions.
* NEWS: Mention the change.
---

diff --git a/NEWS b/NEWS
index 0f78e3644..687f8ae3d 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,8 @@ Version 0.23 - September 2024
     o xgettext now recognizes comments of the form '#; <expression>'.
   - Java: Improved recognition of format strings when the String.formatted
     method is used.
+  - C#: Strings with embedded expressions (a.k.a. interpolated strings) are now
+    recognized.
   - awk: String concatenation by juxtaposition is now recognized.
   - Smalltalk: The string concatenation operator ',' is now recognized.
   - Vala: Improved recognition of format strings when the string.printf method
diff --git a/gettext-tools/src/x-csharp.c b/gettext-tools/src/x-csharp.c
index 42f299c46..3d43e89bf 100644
--- a/gettext-tools/src/x-csharp.c
+++ b/gettext-tools/src/x-csharp.c
@@ -56,7 +56,9 @@
 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
 
 
-/* The C# syntax is defined in ECMA-334, second edition.  */
+/* The C# syntax is defined in ECMA-334, second edition.
+   Strings with embedded expressions are defined in
+   <https://learn.microsoft.com/en-us/dotnet/csharp/how-to/concatenate-multiple-strings#string-interpolation>.  */
 
 
 /* ====================== Keyword set customization.  ====================== */
@@ -1253,6 +1255,10 @@ enum token_type_ty
   token_type_comma,             /* , */
   token_type_dot,               /* . */
   token_type_string_literal,    /* "abc", @"abc" */
+  token_type_template,          /* $"abc" */
+  token_type_ltemplate,         /* left part of template: $"abc{ */
+  token_type_mtemplate,         /* middle part of template: }abc{ */
+  token_type_rtemplate,         /* right part of template: }abc" */
   token_type_number,            /* 1.23 */
   token_type_symbol,            /* identifier, keyword, null */
   token_type_plus,              /* + */
@@ -1265,8 +1271,8 @@ struct token_ty
 {
   token_type_ty type;
   char *string;                         /* for token_type_symbol */
-  mixed_string_ty *mixed_string;        /* for token_type_string_literal */
-  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
+  mixed_string_ty *mixed_string;        /* for token_type_string_literal, token_type_template */
+  refcounted_string_list_ty *comment;   /* for token_type_string_literal, token_type_template */
   int line_number;
   int logical_line_number;
 };
@@ -1278,7 +1284,7 @@ free_token (token_ty *tp)
 {
   if (tp->type == token_type_symbol)
     free (tp->string);
-  if (tp->type == token_type_string_literal)
+  if (tp->type == token_type_string_literal || tp->type == token_type_template)
     {
       mixed_string_free (tp->mixed_string);
       drop_reference (tp->comment);
@@ -1431,9 +1437,11 @@ do_getc_escaped ()
 }
 
 /* Read a regular string literal or character literal.
-   See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
-static void
-accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
+   See ECMA-334 sections 9.4.4.4., 9.4.4.5.
+   Returns one of UEOF, delimiter, delimiter2, UNL.  */
+static int
+accumulate_escaped (struct mixed_string_buffer *literal,
+                    int delimiter, int delimiter2)
 {
   int c;
 
@@ -1441,7 +1449,7 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
     {
       /* Use phase 3, because phase 4 elides comments.  */
       c = phase3_getc ();
-      if (c == UEOF || c == delimiter)
+      if (c == UEOF || c == delimiter || c == delimiter2)
         break;
       if (c == UNL)
         {
@@ -1461,6 +1469,7 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
       if (literal)
         mixed_string_buffer_append_unicode (literal, c);
     }
+  return c;
 }
 
 
@@ -1470,6 +1479,30 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
 static token_ty phase6_pushback[4];
 static int phase6_pushback_length;
 
+/* Number of open template literals $"...{  */
+static int template_literal_depth;
+
+/* Number of open '{' tokens, at each template literal level.
+   The "current" element is brace_depths[template_literal_depth].  */
+static int *brace_depths;
+/* Number of allocated elements in brace_depths.  */
+static size_t brace_depths_alloc;
+
+/* Call after template_literal_depth was set or incremented: grows brace_depths
+   if needed, then sets brace_depths[template_literal_depth] to 0.  */
+static void
+new_brace_depth_level (void)
+{
+  if (template_literal_depth == brace_depths_alloc)
+    {
+      brace_depths_alloc = 2 * brace_depths_alloc + 1;
+      /* Now template_literal_depth < brace_depths_alloc.  */
+      brace_depths =
+        (int *) xrealloc (brace_depths, brace_depths_alloc * sizeof (int));
+    }
+  brace_depths[template_literal_depth] = 0;
+}
+
 static void
 phase6_get (token_ty *tp)
 {
@@ -1519,14 +1552,6 @@ phase6_get (token_ty *tp)
           tp->type = token_type_rparen;
           return;
 
-        case '{':
-          tp->type = token_type_lbrace;
-          return;
-
-        case '}':
-          tp->type = token_type_rbrace;
-          return;
-
         case ',':
           tp->type = token_type_comma;
           return;
@@ -1587,7 +1612,7 @@ phase6_get (token_ty *tp)
                                       lexical_context,
                                       logical_file_name,
                                       logical_line_number);
-            accumulate_escaped (&literal, '"');
+            accumulate_escaped (&literal, '"', '"');
             tp->mixed_string = mixed_string_buffer_result (&literal);
             tp->comment = add_reference (savable_comment);
             lexical_context = lc_outside;
@@ -1595,10 +1620,46 @@ phase6_get (token_ty *tp)
             return;
           }
 
+        case '$':
+          c = phase4_getc ();
+          if (c != '"')
+            {
+              phase4_ungetc (c);
+              /* Misc. operator.  */
+              tp->type = token_type_other;
+              return;
+            }
+          /* String with embedded expressions, a.k.a. "interpolated string".  */
+          {
+            struct mixed_string_buffer msb;
+
+            lexical_context = lc_string;
+            /* Start accumulating the string.  */
+            mixed_string_buffer_init (&msb, lexical_context,
+                                      logical_file_name, logical_line_number);
+            c = accumulate_escaped (&msb, '"', '{');
+            /* Keep line_number in sync.  */
+            msb.line_number = logical_line_number;
+            if (c == '{')
+              {
+                mixed_string_buffer_destroy (&msb);
+                tp->type = token_type_ltemplate;
+                template_literal_depth++;
+                new_brace_depth_level ();
+              }
+            else
+              {
+                tp->mixed_string = mixed_string_buffer_result (&msb);
+                tp->comment = add_reference (savable_comment);
+                tp->type = token_type_template;
+              }
+            return;
+          }
+
         case '\'':
           /* Character literal.  */
           {
-            accumulate_escaped (NULL, '\'');
+            accumulate_escaped (NULL, '\'', '\'');
             tp->type = token_type_other;
             return;
           }
@@ -1619,6 +1680,30 @@ phase6_get (token_ty *tp)
             }
           return;
 
+        case '{':
+          brace_depths[template_literal_depth]++;
+          tp->type = token_type_lbrace;
+          return;
+
+        case '}':
+          if (brace_depths[template_literal_depth] > 0)
+            brace_depths[template_literal_depth]--;
+          else if (template_literal_depth > 0)
+            {
+              /* Middle or right part of string with embedded expressions.  */
+              c = accumulate_escaped (NULL, '"', '{');
+              if (c == '{')
+                tp->type = token_type_mtemplate;
+              else
+                {
+                  tp->type = token_type_rtemplate;
+                  template_literal_depth--;
+                }
+              return;
+            }
+          tp->type = token_type_rbrace;
+          return;
+
         case '@':
           c = phase4_getc ();
           if (c == '"')
@@ -2018,6 +2103,7 @@ extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
           continue;
 
         case token_type_string_literal:
+        case token_type_template:
           {
             lex_pos_ty pos;
 
@@ -2048,6 +2134,9 @@ extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
           unref_region (inner_region);
           return true;
 
+        case token_type_ltemplate:
+        case token_type_mtemplate:
+        case token_type_rtemplate:
         case token_type_dot:
         case token_type_number:
         case token_type_plus:
@@ -2091,6 +2180,8 @@ extract_csharp (FILE *f,
 
   phase5_pushback_length = 0;
   phase6_pushback_length = 0;
+  template_literal_depth = 0;
+  new_brace_depth_level ();
   phase7_pushback_length = 0;
 
   flag_context_list_table = flag_table;
diff --git a/gettext-tools/tests/xgettext-csharp-5 b/gettext-tools/tests/xgettext-csharp-5
index 0fcb5224b..5780f1e62 100755
--- a/gettext-tools/tests/xgettext-csharp-5
+++ b/gettext-tools/tests/xgettext-csharp-5
@@ -1,7 +1,8 @@
 #!/bin/sh
 . "${srcdir=.}/init.sh"; path_prepend_ . ../src
 
-# Test C# support: --add-comments option, string concatenation.
+# Test C# support: --add-comments option, string concatenation,
+# strings with embedded expressions.
 
 cat <<\EOF > xg-cs-5.cs
 // This comment will not be extracted.
@@ -18,6 +19,10 @@ Console.WriteLine(GetString("The Fabulous Four"));
 Console.WriteLine(GetString("there is not enough" +
 " room on a single line for this entire long, " // confusing, eh?
 + "verbose string"));
+// Strings with embedded expressions, a.k.a. interpolated strings.
+Console.WriteLine(GetString($"embedded_1_{foo}_bar"));
+Console.WriteLine(GetString($"embedded_2_{GetString("embedded_2_sub1")}_bar_{GetString("embedded_2_sub2")}_baz"));
+Console.WriteLine(GetString($"embedded_3"));
 EOF
 
 : ${XGETTEXT=xgettext}
@@ -47,6 +52,15 @@ msgid ""
 "there is not enough room on a single line for this entire long, verbose "
 "string"
 msgstr ""
+
+msgid "embedded_2_sub1"
+msgstr ""
+
+msgid "embedded_2_sub2"
+msgstr ""
+
+msgid "embedded_3"
+msgstr ""
 EOF
 
 : ${DIFF=diff}