From: Bruno Haible Date: Wed, 18 Sep 2024 15:00:04 +0000 (+0200) Subject: xgettext: C#: Recognize strings with embedded expressions. X-Git-Tag: v0.23~128 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4391a42f1540ad20aa6e1b7fdc87c8e0e79e34e6;p=thirdparty%2Fgettext.git xgettext: C#: Recognize strings with embedded expressions. * gettext-tools/src/x-csharp.c (token_type_template, token_type_ltemplate, token_type_mtemplate, token_type_rtemplate): New enum items. (free_token): Treat token_type_template like token_type_string_literal. (accumulate_escaped): Accept a second delimiter argument. Return the delimiter. (template_literal_depth, brace_depths, brace_depths_alloc): New variables, copied from x-javascript.c. (new_brace_depth_level): New function, copied from x-javascript.c. (phase6_get): Recognize strings with embedded expressions. Handle braces in a nested manner. (extract_parenthesized): Handle the token types token_type_template, token_type_ltemplate, token_type_mtemplate, token_type_rtemplate. (extract_csharp): Initialize template_literal_depth and call new_brace_depth_level. * gettext-tools/tests/xgettext-csharp-5: Add tests of strings with embedded expressions. * NEWS: Mention the change. --- diff --git a/NEWS b/NEWS index 0f78e3644..687f8ae3d 100644 --- a/NEWS +++ b/NEWS @@ -16,6 +16,8 @@ Version 0.23 - September 2024 o xgettext now recognizes comments of the form '#; '. - Java: Improved recognition of format strings when the String.formatted method is used. + - C#: Strings with embedded expressions (a.k.a. interpolated strings) are now + recognized. - awk: String concatenation by juxtaposition is now recognized. - Smalltalk: The string concatenation operator ',' is now recognized. 
- Vala: Improved recognition of format strings when the string.printf method diff --git a/gettext-tools/src/x-csharp.c b/gettext-tools/src/x-csharp.c index 42f299c46..3d43e89bf 100644 --- a/gettext-tools/src/x-csharp.c +++ b/gettext-tools/src/x-csharp.c @@ -56,7 +56,9 @@ #define SIZEOF(a) (sizeof(a) / sizeof(a[0])) -/* The C# syntax is defined in ECMA-334, second edition. */ +/* The C# syntax is defined in ECMA-334, second edition. + Strings with embedded expressions are defined in + <https://learn.microsoft.com/en-us/dotnet/csharp/language-reference/tokens/interpolated>. */ /* ====================== Keyword set customization. ====================== */ @@ -1253,6 +1255,10 @@ enum token_type_ty token_type_comma, /* , */ token_type_dot, /* . */ token_type_string_literal, /* "abc", @"abc" */ + token_type_template, /* $"abc" */ + token_type_ltemplate, /* left part of template: $"abc{ */ + token_type_mtemplate, /* middle part of template: }abc{ */ + token_type_rtemplate, /* right part of template: }abc" */ token_type_number, /* 1.23 */ token_type_symbol, /* identifier, keyword, null */ token_type_plus, /* + */ @@ -1265,8 +1271,8 @@ struct token_ty { token_type_ty type; char *string; /* for token_type_symbol */ - mixed_string_ty *mixed_string; /* for token_type_string_literal */ - refcounted_string_list_ty *comment; /* for token_type_string_literal */ + mixed_string_ty *mixed_string; /* for token_type_string_literal, token_type_template */ + refcounted_string_list_ty *comment; /* for token_type_string_literal, token_type_template */ int line_number; int logical_line_number; }; @@ -1278,7 +1284,7 @@ free_token (token_ty *tp) { if (tp->type == token_type_symbol) free (tp->string); - if (tp->type == token_type_string_literal) + if (tp->type == token_type_string_literal || tp->type == token_type_template) { mixed_string_free (tp->mixed_string); drop_reference (tp->comment); @@ -1431,9 +1437,11 @@ do_getc_escaped () } /* Read a regular string literal or character literal. - See ECMA-334 sections 9.4.4.4., 9.4.4.5. 
*/ -static void -accumulate_escaped (struct mixed_string_buffer *literal, int delimiter) + See ECMA-334 sections 9.4.4.4., 9.4.4.5. + Returns one of UEOF, delimiter, delimiter2, UNL. */ +static int +accumulate_escaped (struct mixed_string_buffer *literal, + int delimiter, int delimiter2) { int c; @@ -1441,7 +1449,7 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter) { /* Use phase 3, because phase 4 elides comments. */ c = phase3_getc (); - if (c == UEOF || c == delimiter) + if (c == UEOF || c == delimiter || c == delimiter2) break; if (c == UNL) { @@ -1461,6 +1469,7 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter) if (literal) mixed_string_buffer_append_unicode (literal, c); } + return c; } @@ -1470,6 +1479,30 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter) static token_ty phase6_pushback[4]; static int phase6_pushback_length; +/* Number of open template literals $"...{ */ +static int template_literal_depth; + +/* Number of open '{' tokens, at each template literal level. + The "current" element is brace_depths[template_literal_depth]. */ +static int *brace_depths; +/* Number of allocated elements in brace_depths. */ +static size_t brace_depths_alloc; + +/* Adds a new brace_depths level after template_literal_depth was + incremented. */ +static void +new_brace_depth_level (void) +{ + if (template_literal_depth == brace_depths_alloc) + { + brace_depths_alloc = 2 * brace_depths_alloc + 1; + /* Now template_literal_depth < brace_depths_alloc. 
*/ + brace_depths = + (int *) xrealloc (brace_depths, brace_depths_alloc * sizeof (int)); + } + brace_depths[template_literal_depth] = 0; +} + static void phase6_get (token_ty *tp) { @@ -1519,14 +1552,6 @@ phase6_get (token_ty *tp) tp->type = token_type_rparen; return; - case '{': - tp->type = token_type_lbrace; - return; - - case '}': - tp->type = token_type_rbrace; - return; - case ',': tp->type = token_type_comma; return; @@ -1587,7 +1612,7 @@ phase6_get (token_ty *tp) lexical_context, logical_file_name, logical_line_number); - accumulate_escaped (&literal, '"'); + accumulate_escaped (&literal, '"', '"'); tp->mixed_string = mixed_string_buffer_result (&literal); tp->comment = add_reference (savable_comment); lexical_context = lc_outside; @@ -1595,10 +1620,46 @@ phase6_get (token_ty *tp) return; } + case '$': + c = phase4_getc (); + if (c != '"') + { + phase4_ungetc (c); + /* Misc. operator. */ + tp->type = token_type_other; + return; + } + /* String with embedded expressions, a.k.a. "interpolated string". */ + { + struct mixed_string_buffer msb; + + lexical_context = lc_string; + /* Start accumulating the string. */ + mixed_string_buffer_init (&msb, lexical_context, + logical_file_name, logical_line_number); + c = accumulate_escaped (&msb, '"', '{'); + /* Keep line_number in sync. */ + msb.line_number = logical_line_number; + if (c == '{') + { + mixed_string_buffer_destroy (&msb); + tp->type = token_type_ltemplate; + template_literal_depth++; + new_brace_depth_level (); + } + else + { + tp->mixed_string = mixed_string_buffer_result (&msb); + tp->comment = add_reference (savable_comment); + tp->type = token_type_template; + } + return; + } + case '\'': /* Character literal. 
*/ { - accumulate_escaped (NULL, '\''); + accumulate_escaped (NULL, '\'', '\''); tp->type = token_type_other; return; } @@ -1619,6 +1680,30 @@ phase6_get (token_ty *tp) } return; + case '{': + brace_depths[template_literal_depth]++; + tp->type = token_type_lbrace; + return; + + case '}': + if (brace_depths[template_literal_depth] > 0) + brace_depths[template_literal_depth]--; + else if (template_literal_depth > 0) + { + /* Middle or right part of string with embedded expressions. */ + c = accumulate_escaped (NULL, '"', '{'); + if (c == '{') + tp->type = token_type_mtemplate; + else + { + tp->type = token_type_rtemplate; + template_literal_depth--; + } + return; + } + tp->type = token_type_rbrace; + return; + case '@': c = phase4_getc (); if (c == '"') @@ -2018,6 +2103,7 @@ extract_parenthesized (message_list_ty *mlp, token_type_ty terminator, continue; case token_type_string_literal: + case token_type_template: { lex_pos_ty pos; @@ -2048,6 +2134,9 @@ extract_parenthesized (message_list_ty *mlp, token_type_ty terminator, unref_region (inner_region); return true; + case token_type_ltemplate: + case token_type_mtemplate: + case token_type_rtemplate: case token_type_dot: case token_type_number: case token_type_plus: @@ -2091,6 +2180,8 @@ extract_csharp (FILE *f, phase5_pushback_length = 0; phase6_pushback_length = 0; + template_literal_depth = 0; + new_brace_depth_level (); phase7_pushback_length = 0; flag_context_list_table = flag_table; diff --git a/gettext-tools/tests/xgettext-csharp-5 b/gettext-tools/tests/xgettext-csharp-5 index 0fcb5224b..5780f1e62 100755 --- a/gettext-tools/tests/xgettext-csharp-5 +++ b/gettext-tools/tests/xgettext-csharp-5 @@ -1,7 +1,8 @@ #!/bin/sh . "${srcdir=.}/init.sh"; path_prepend_ . ../src -# Test C# support: --add-comments option, string concatenation. +# Test C# support: --add-comments option, string concatenation, +# strings with embedded expressions. cat <<\EOF > xg-cs-5.cs // This comment will not be extracted. 
@@ -18,6 +19,10 @@ Console.WriteLine(GetString("The Fabulous Four")); Console.WriteLine(GetString("there is not enough" + " room on a single line for this entire long, " // confusing, eh? + "verbose string")); +// Strings with embedded expressions, a.k.a. interpolated strings. +Console.WriteLine(GetString($"embedded_1_{foo}_bar")); +Console.WriteLine(GetString($"embedded_2_{GetString("embedded_2_sub1")}_bar_{GetString("embedded_2_sub2")}_baz")); +Console.WriteLine(GetString($"embedded_3")); EOF : ${XGETTEXT=xgettext} @@ -47,6 +52,15 @@ msgid "" "there is not enough room on a single line for this entire long, verbose " "string" msgstr "" + +msgid "embedded_2_sub1" +msgstr "" + +msgid "embedded_2_sub2" +msgstr "" + +msgid "embedded_3" +msgstr "" EOF : ${DIFF=diff}