xgettext: C#: Recognize strings with embedded expressions.

author Bruno Haible <bruno@clisp.org>
Wed, 18 Sep 2024 15:00:04 +0000 (17:00 +0200)
committer Bruno Haible <bruno@clisp.org>
Wed, 18 Sep 2024 15:00:04 +0000 (17:00 +0200)
diff --git a/NEWS b/NEWS

index 0f78e3644011e17ef20a9f692d70c78b64084845..687f8ae3dce243105367645b521fe7dfcebdf66a 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,8 @@ Version 0.23 - September 2024
      o xgettext now recognizes comments of the form '#; <expression>'.
    - Java: Improved recognition of format strings when the String.formatted
      method is used.
+  - C#: Strings with embedded expressions (a.k.a. interpolated strings) are now
+    recognized.
    - awk: String concatenation by juxtaposition is now recognized.
    - Smalltalk: The string concatenation operator ',' is now recognized.
    - Vala: Improved recognition of format strings when the string.printf method
diff --git a/gettext-tools/src/x-csharp.c b/gettext-tools/src/x-csharp.c

index 42f299c4645396b974735cb1d524702ee04893a8..3d43e89bf7413e602617b6f363c3dc7dc3c1a760 100644 (file)
--- a/gettext-tools/src/x-csharp.c
+++ b/gettext-tools/src/x-csharp.c
@@ -56,7 +56,9 @@
  #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  
  
-/* The C# syntax is defined in ECMA-334, second edition.  */
+/* The C# syntax is defined in ECMA-334, second edition.
+   Strings with embedded expressions are defined in
+   <https://learn.microsoft.com/en-us/dotnet/csharp/how-to/concatenate-multiple-strings#string-interpolation>.  */
  
  
  /* ====================== Keyword set customization.  ====================== */
@@ -1253,6 +1255,10 @@ enum token_type_ty
    token_type_comma,             /* , */
    token_type_dot,               /* . */
    token_type_string_literal,    /* "abc", @"abc" */
+  token_type_template,          /* $"abc" */
+  token_type_ltemplate,         /* left part of template: $"abc{ */
+  token_type_mtemplate,         /* middle part of template: }abc{ */
+  token_type_rtemplate,         /* right part of template: }abc" */
    token_type_number,            /* 1.23 */
    token_type_symbol,            /* identifier, keyword, null */
    token_type_plus,              /* + */
@@ -1265,8 +1271,8 @@ struct token_ty
  {
    token_type_ty type;
    char *string;                         /* for token_type_symbol */
-  mixed_string_ty *mixed_string;        /* for token_type_string_literal */
-  refcounted_string_list_ty *comment;   /* for token_type_string_literal */
+  mixed_string_ty *mixed_string;        /* for token_type_string_literal, token_type_template */
+  refcounted_string_list_ty *comment;   /* for token_type_string_literal, token_type_template */
    int line_number;
    int logical_line_number;
  };
@@ -1278,7 +1284,7 @@ free_token (token_ty *tp)
  {
    if (tp->type == token_type_symbol)
      free (tp->string);
-  if (tp->type == token_type_string_literal)
+  if (tp->type == token_type_string_literal || tp->type == token_type_template)
      {
        mixed_string_free (tp->mixed_string);
        drop_reference (tp->comment);
@@ -1431,9 +1437,11 @@ do_getc_escaped ()
  }
  
  /* Read a regular string literal or character literal.
-   See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
-static void
-accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
+   See ECMA-334 sections 9.4.4.4., 9.4.4.5.
+   Returns one of UEOF, delimiter, delimiter2, UNL.  */
+static int
+accumulate_escaped (struct mixed_string_buffer *literal,
+                    int delimiter, int delimiter2)
  {
    int c;
  
@@ -1441,7 +1449,7 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
      {
        /* Use phase 3, because phase 4 elides comments.  */
        c = phase3_getc ();
-      if (c == UEOF || c == delimiter)
+      if (c == UEOF || c == delimiter || c == delimiter2)
          break;
        if (c == UNL)
          {
@@ -1461,6 +1469,7 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
        if (literal)
          mixed_string_buffer_append_unicode (literal, c);
      }
+  return c;
  }
  
  
@@ -1470,6 +1479,30 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
  static token_ty phase6_pushback[4];
  static int phase6_pushback_length;
  
+/* Number of open template literals $"...{  */
+static int template_literal_depth;
+
+/* Number of open '{' tokens, at each template literal level.
+   The "current" element is brace_depths[template_literal_depth].  */
+static int *brace_depths;
+/* Number of allocated elements in brace_depths.  */
+static size_t brace_depths_alloc;
+
+/* Adds a new brace_depths level after template_literal_depth was
+   incremented.  */
+static void
+new_brace_depth_level (void)
+{
+  if (template_literal_depth == brace_depths_alloc)
+    {
+      brace_depths_alloc = 2 * brace_depths_alloc + 1;
+      /* Now template_literal_depth < brace_depths_alloc.  */
+      brace_depths =
+        (int *) xrealloc (brace_depths, brace_depths_alloc * sizeof (int));
+    }
+  brace_depths[template_literal_depth] = 0;
+}
+
  static void
  phase6_get (token_ty *tp)
  {
@@ -1519,14 +1552,6 @@ phase6_get (token_ty *tp)
            tp->type = token_type_rparen;
            return;
  
-        case '{':
-          tp->type = token_type_lbrace;
-          return;
-
-        case '}':
-          tp->type = token_type_rbrace;
-          return;
-
          case ',':
            tp->type = token_type_comma;
            return;
@@ -1587,7 +1612,7 @@ phase6_get (token_ty *tp)
                                        lexical_context,
                                        logical_file_name,
                                        logical_line_number);
-            accumulate_escaped (&literal, '"');
+            accumulate_escaped (&literal, '"', '"');
              tp->mixed_string = mixed_string_buffer_result (&literal);
              tp->comment = add_reference (savable_comment);
              lexical_context = lc_outside;
@@ -1595,10 +1620,46 @@ phase6_get (token_ty *tp)
              return;
            }
  
+        case '$':
+          c = phase4_getc ();
+          if (c != '"')
+            {
+              phase4_ungetc (c);
+              /* Misc. operator.  */
+              tp->type = token_type_other;
+              return;
+            }
+          /* String with embedded expressions, a.k.a. "interpolated string".  */
+          {
+            struct mixed_string_buffer msb;
+
+            lexical_context = lc_string;
+            /* Start accumulating the string.  */
+            mixed_string_buffer_init (&msb, lexical_context,
+                                      logical_file_name, logical_line_number);
+            c = accumulate_escaped (&msb, '"', '{');
+            /* Keep line_number in sync.  */
+            msb.line_number = logical_line_number;
+            if (c == '{')
+              {
+                mixed_string_buffer_destroy (&msb);
+                tp->type = token_type_ltemplate;
+                template_literal_depth++;
+                new_brace_depth_level ();
+              }
+            else
+              {
+                tp->mixed_string = mixed_string_buffer_result (&msb);
+                tp->comment = add_reference (savable_comment);
+                tp->type = token_type_template;
+              }
+            return;
+          }
+
          case '\'':
            /* Character literal.  */
            {
-            accumulate_escaped (NULL, '\'');
+            accumulate_escaped (NULL, '\'', '\'');
              tp->type = token_type_other;
              return;
            }
@@ -1619,6 +1680,30 @@ phase6_get (token_ty *tp)
              }
            return;
  
+        case '{':
+          brace_depths[template_literal_depth]++;
+          tp->type = token_type_lbrace;
+          return;
+
+        case '}':
+          if (brace_depths[template_literal_depth] > 0)
+            brace_depths[template_literal_depth]--;
+          else if (template_literal_depth > 0)
+            {
+              /* Middle or right part of string with embedded expressions.  */
+              c = accumulate_escaped (NULL, '"', '{');
+              if (c == '{')
+                tp->type = token_type_mtemplate;
+              else
+                {
+                  tp->type = token_type_rtemplate;
+                  template_literal_depth--;
+                }
+              return;
+            }
+          tp->type = token_type_rbrace;
+          return;
+
          case '@':
            c = phase4_getc ();
            if (c == '"')
@@ -2018,6 +2103,7 @@ extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
            continue;
  
          case token_type_string_literal:
+        case token_type_template:
            {
              lex_pos_ty pos;
  
@@ -2048,6 +2134,9 @@ extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
            unref_region (inner_region);
            return true;
  
+        case token_type_ltemplate:
+        case token_type_mtemplate:
+        case token_type_rtemplate:
          case token_type_dot:
          case token_type_number:
          case token_type_plus:
@@ -2091,6 +2180,8 @@ extract_csharp (FILE *f,
  
    phase5_pushback_length = 0;
    phase6_pushback_length = 0;
+  template_literal_depth = 0;
+  new_brace_depth_level ();
    phase7_pushback_length = 0;
  
    flag_context_list_table = flag_table;
diff --git a/gettext-tools/tests/xgettext-csharp-5 b/gettext-tools/tests/xgettext-csharp-5

index 0fcb5224b967ae5399a4da0594696910bfa89eab..5780f1e620c0e078a6131f058fd763587a4ab7fa 100755 (executable)
--- a/gettext-tools/tests/xgettext-csharp-5
+++ b/gettext-tools/tests/xgettext-csharp-5
@@ -1,7 +1,8 @@
  #!/bin/sh
  . "${srcdir=.}/init.sh"; path_prepend_ . ../src
  
-# Test C# support: --add-comments option, string concatenation.
+# Test C# support: --add-comments option, string concatenation,
+# strings with embedded expressions.
  
  cat <<\EOF > xg-cs-5.cs
  // This comment will not be extracted.
@@ -18,6 +19,10 @@ Console.WriteLine(GetString("The Fabulous Four"));
  Console.WriteLine(GetString("there is not enough" +
  " room on a single line for this entire long, " // confusing, eh?
  + "verbose string"));
+// Strings with embedded expressions, a.k.a. interpolated strings.
+Console.WriteLine(GetString($"embedded_1_{foo}_bar"));
+Console.WriteLine(GetString($"embedded_2_{GetString("embedded_2_sub1")}_bar_{GetString("embedded_2_sub2")}_baz"));
+Console.WriteLine(GetString($"embedded_3"));
  EOF
  
  : ${XGETTEXT=xgettext}
@@ -47,6 +52,15 @@ msgid ""
  "there is not enough room on a single line for this entire long, verbose "
  "string"
  msgstr ""
+
+msgid "embedded_2_sub1"
+msgstr ""
+
+msgid "embedded_2_sub2"
+msgstr ""
+
+msgid "embedded_3"
+msgstr ""
  EOF
  
  : ${DIFF=diff}
author	Bruno Haible <bruno@clisp.org>
	Wed, 18 Sep 2024 15:00:04 +0000 (17:00 +0200)
committer	Bruno Haible <bruno@clisp.org>
	Wed, 18 Sep 2024 15:00:04 +0000 (17:00 +0200)
NEWS		patch | blob | blame | history
gettext-tools/src/x-csharp.c		patch | blob | blame | history
gettext-tools/tests/xgettext-csharp-5		patch | blob | blame | history