From: Bruno Haible Date: Tue, 5 Dec 2023 13:15:42 +0000 (+0100) Subject: xgettext: Python: Add support for f-strings. X-Git-Tag: v0.23~283 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=994c5188cbf9f5b9fa9b6af8fba95d9d102f5140;p=thirdparty%2Fgettext.git xgettext: Python: Add support for f-strings. Reported by Ben at . * gettext-tools/src/x-python.c (P7_498_START_OF_EXPRESSION): New macro. (token_type_498, token_type_l498, token_type_m498, token_type_r498): New enum items. (free_token): Treat token_type_498 like token_type_string. (phase7_getuc): Accept an f_string parameter. Use it to combine {{ to { and }} to }. (f_string_depth): New variable. (struct f_string_level): New type. (f_string_stack, f_string_stack_alloc): New variables. (new_f_string_level): New function. (open_pb): Renamed from open_pbb. (phase5_get): Consider also the f_string_stack. Accept f"...", fr"...", rf"..." syntax. Recognize tokens of type token_type_498, token_type_l498, token_type_m498, token_type_r498. (x_python_lex): Treat token_type_498 like token_type_string. (extract_balanced): Handle the new token types. (extract_python): Initialize f_string_depth and the f_string_stack. * gettext-tools/tests/xgettext-python-1: Test also the backslash handling in f-strings. * gettext-tools/tests/xgettext-python-8: New file. * gettext-tools/tests/Makefile.am (TESTS): Add it. * gettext-tools/doc/lang-python.texi (Python): Explain the limitations of f-string support. * NEWS: Mention the change. --- diff --git a/NEWS b/NEWS index f9e7ae073..a99ecc85f 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,7 @@ Version 0.23 - December 2023 * Programming languages support: + - Python: xgettext now recognizes the f-string syntax. - Vala: Improved recognition of format strings when the string.printf method is used. - Glade: xgettext has improved support for GtkBuilder 4. 
diff --git a/gettext-tools/doc/lang-python.texi b/gettext-tools/doc/lang-python.texi index f2226b734..08e39171a 100644 --- a/gettext-tools/doc/lang-python.texi +++ b/gettext-tools/doc/lang-python.texi @@ -1,5 +1,5 @@ @c This file is part of the GNU gettext manual. -@c Copyright (C) 1995-2021 Free Software Foundation, Inc. +@c Copyright (C) 1995-2023 Free Software Foundation, Inc. @c See the file gettext.texi for copying conditions. @node Python @@ -92,3 +92,20 @@ individual arguments from format strings like this is only possible with the named argument syntax. (With unnamed arguments, Python -- unlike C -- verifies that the format string uses all supplied arguments.) @end itemize + +A note about f-strings (PEP 498): @code{xgettext} +@itemize @bullet +@item +syntactically recognizes f-strings, +@item +is able to extract f-strings that contain no sub-expressions. +@end itemize +@noindent +However, @code{xgettext} does not extract f-strings marked for translation +that contain sub-expressions. This will not work as expected: +@smallexample +_(f"The file @{file[i]@} does not exist.") +@end smallexample +@noindent +because the translator is generally not a programmer and should thus not be +confronted with expressions from the programming language. diff --git a/gettext-tools/src/x-python.c b/gettext-tools/src/x-python.c index 3873246de..581b5549b 100644 --- a/gettext-tools/src/x-python.c +++ b/gettext-tools/src/x-python.c @@ -67,7 +67,9 @@ /* The Python syntax is defined in the Python Reference Manual /usr/share/doc/packages/python/html/ref/index.html. See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c, - Python-2.0/Objects/unicodeobject.c. */ + Python-2.0/Objects/unicodeobject.c. + For the f-strings, refer to https://peps.python.org/pep-0498/ + and https://docs.python.org/3/reference/lexical_analysis.html#literals . */ /* ====================== Keyword set customization. 
====================== */ @@ -775,6 +777,7 @@ phase3_ungetc (int c) /* Return value of phase7_getuc when EOF is reached. */ #define P7_EOF (-1) #define P7_STRING_END (-2) +#define P7_498_START_OF_EXPRESSION (-3) /* { */ /* Convert an UTF-16 or UTF-32 code point to a return value that can be distinguished from a single-byte return value. */ @@ -801,6 +804,10 @@ enum token_type_ty token_type_lbracket, /* [ */ token_type_rbracket, /* ] */ token_type_string, /* "abc", 'abc', """abc""", '''abc''' */ + token_type_498, /* f"abc", f'abc', f"""abc""", f'''abc''' */ + token_type_l498, /* left part of f-string: f"abc{, f'abc{, f"""abc{, f'''abc{ */ + token_type_m498, /* middle part of f-string: }abc{ */ + token_type_r498, /* right part of f-string: }abc", }abc', }abc""", }abc''' */ token_type_symbol, /* symbol, number */ token_type_plus, /* + */ token_type_other /* misc. operator */ @@ -811,9 +818,9 @@ typedef struct token_ty token_ty; struct token_ty { token_type_ty type; - char *string; /* for token_type_symbol */ - mixed_string_ty *mixed_string; /* for token_type_string */ - refcounted_string_list_ty *comment; /* for token_type_string */ + char *string; /* for token_type_symbol */ + mixed_string_ty *mixed_string; /* for token_type_string, token_type_498 */ + refcounted_string_list_ty *comment; /* for token_type_string, token_type_498 */ int line_number; }; @@ -823,7 +830,7 @@ free_token (token_ty *tp) { if (tp->type == token_type_symbol) free (tp->string); - if (tp->type == token_type_string) + if (tp->type == token_type_string || tp->type == token_type_498) { mixed_string_free (tp->mixed_string); drop_reference (tp->comment); @@ -847,6 +854,7 @@ free_token (token_ty *tp) static int phase7_getuc (int quote_char, bool triple, bool interpret_ansic, bool interpret_unicode, + bool f_string, unsigned int *backslash_counter) { int c; @@ -900,6 +908,25 @@ phase7_getuc (int quote_char, return P7_STRING_END; } + if (f_string) + { + if (c == '{') + { + int c1 = phase2_getc (); + if 
(c1 == '{') + return UNICODE ('{'); + phase2_ungetc (c1); + return P7_498_START_OF_EXPRESSION; + } + if (c == '}') + { + int c1 = phase2_getc (); + if (c1 == '}') + return UNICODE ('}'); + phase2_ungetc (c1); + } + } + if (c != '\\') { *backslash_counter = 0; @@ -1168,8 +1195,49 @@ phase7_getuc (int quote_char, /* Combine characters into tokens. Discard whitespace except newlines at the end of logical lines. */ -/* Number of pending open parentheses/braces/brackets. */ -static int open_pbb; +/* Number of open f-strings f"...{ or f'...{ or f"""...{ or f'''...{ or + fr"...{ or fr'...{ or fr"""...{ or fr'''...{ */ +static int f_string_depth; + +/* Information per f-string nesting level. */ +struct f_string_level +{ + /* Describes the start and end sequence of the f-string. + Only relevant for levels > 0. */ + int quote_char; + bool interpret_ansic; + bool triple; + /* Number of open '{' tokens. */ + int brace_depth; +}; + +/* Stack of f-string nesting levels. + The "current" element is f_string_stack[f_string_depth]. */ +static struct f_string_level *f_string_stack; +/* Number of allocated elements in f_string_stack. */ +static size_t f_string_stack_alloc; + +/* Adds a new f_string_stack level after f_string_depth was incremented. */ +static void +new_f_string_level (int quote_char, bool interpret_ansic, bool triple) +{ + if (f_string_depth == f_string_stack_alloc) + { + f_string_stack_alloc = 2 * f_string_stack_alloc + 1; + /* Now f_string_depth < f_string_stack_alloc. */ + f_string_stack = + (struct f_string_level *) + xrealloc (f_string_stack, + f_string_stack_alloc * sizeof (struct f_string_level)); + } + f_string_stack[f_string_depth].quote_char = quote_char; + f_string_stack[f_string_depth].interpret_ansic = interpret_ansic; + f_string_stack[f_string_depth].triple = triple; + f_string_stack[f_string_depth].brace_depth = 0; +} + +/* Number of pending open parentheses/brackets. 
*/ +static int open_pb; static token_ty phase5_pushback[2]; static int phase5_pushback_length; @@ -1207,7 +1275,7 @@ phase5_get (token_ty *tp) savable_comment_reset (); /* Ignore newline if and only if it is used for implicit line joining. */ - if (open_pbb > 0) + if (open_pb > 0 || f_string_stack[f_string_depth].brace_depth > 0) continue; tp->type = token_type_other; return; @@ -1229,13 +1297,13 @@ phase5_get (token_ty *tp) } } FALLTHROUGH; - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'S': case 'T': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 's': case 't': case 'v': case 'w': case 'x': @@ -1297,9 +1365,38 @@ phase5_get (token_ty *tp) int quote_char; bool interpret_ansic; bool interpret_unicode; + bool f_string; bool triple; unsigned int backslash_counter; + case 'F': case 'f': + { + int c1 = phase2_getc (); + if (c1 == '"' || c1 == '\'') + { + quote_char = c1; + interpret_ansic = true; + interpret_unicode = false; + f_string = true; + goto string; + } + if (c1 == 'R' || c1 == 'r') + { + int c2 = phase2_getc (); + if (c2 == '"' || c2 == '\'') + { + quote_char = c2; + interpret_ansic = false; + interpret_unicode = false; + f_string = true; + goto string; + } + phase2_ungetc (c2); + } + phase2_ungetc (c1); + goto symbol; + } + case 'R': case 'r': { int c1 = phase2_getc (); @@ -1308,8 +1405,22 @@ phase5_get (token_ty *tp) quote_char = c1; interpret_ansic = false; interpret_unicode = false; + f_string = false; goto string; } + if (c1 == 'F' || c1 == 'f') + { + int c2 = phase2_getc (); + if (c2 == '"' || c2 == '\'') + { + quote_char = c2; + 
interpret_ansic = false; + interpret_unicode = false; + f_string = true; + goto string; + } + phase2_ungetc (c2); + } phase2_ungetc (c1); goto symbol; } @@ -1322,6 +1433,7 @@ phase5_get (token_ty *tp) quote_char = c1; interpret_ansic = true; interpret_unicode = true; + f_string = false; goto string; } if (c1 == 'R' || c1 == 'r') @@ -1332,6 +1444,7 @@ phase5_get (token_ty *tp) quote_char = c2; interpret_ansic = false; interpret_unicode = true; + f_string = false; goto string; } phase2_ungetc (c2); @@ -1344,6 +1457,7 @@ phase5_get (token_ty *tp) quote_char = c; interpret_ansic = true; interpret_unicode = false; + f_string = false; string: triple = false; lexical_context = lc_string; @@ -1373,13 +1487,28 @@ phase5_get (token_ty *tp) for (;;) { int uc = phase7_getuc (quote_char, triple, interpret_ansic, - interpret_unicode, &backslash_counter); + interpret_unicode, f_string, + &backslash_counter); /* Keep line_number in sync. */ msb.line_number = line_number; if (uc == P7_EOF || uc == P7_STRING_END) - break; + { + tp->mixed_string = mixed_string_buffer_result (&msb); + tp->comment = add_reference (savable_comment); + tp->type = (f_string ? 
token_type_498 : token_type_string); + break; + } + + if (uc == P7_498_START_OF_EXPRESSION) /* implies f_string */ + { + mixed_string_buffer_destroy (&msb); + tp->type = token_type_l498; + f_string_depth++; + new_f_string_level (quote_char, interpret_ansic, triple); + break; + } if (IS_UNICODE (uc)) { @@ -1391,22 +1520,58 @@ phase5_get (token_ty *tp) else mixed_string_buffer_append_char (&msb, uc); } - tp->mixed_string = mixed_string_buffer_result (&msb); - tp->comment = add_reference (savable_comment); lexical_context = lc_outside; - tp->type = token_type_string; } return; } + case '{': + f_string_stack[f_string_depth].brace_depth++; + tp->type = token_type_other; + return; + + case '}': + if (f_string_stack[f_string_depth].brace_depth > 0) + f_string_stack[f_string_depth].brace_depth--; + else if (f_string_depth > 0) + { + /* Middle or right part of f-string. */ + int quote_char = f_string_stack[f_string_depth].quote_char; + bool interpret_ansic = f_string_stack[f_string_depth].interpret_ansic; + bool triple = f_string_stack[f_string_depth].triple; + unsigned int backslash_counter = 0; + for (;;) + { + int uc = phase7_getuc (quote_char, triple, interpret_ansic, + false, true, + &backslash_counter); + + if (uc == P7_EOF || uc == P7_STRING_END) + { + tp->type = token_type_r498; + f_string_depth--; + break; + } + + if (uc == P7_498_START_OF_EXPRESSION) + { + tp->type = token_type_m498; + break; + } + } + return; + } + tp->type = token_type_other; + return; + case '(': - open_pbb++; + open_pb++; tp->type = token_type_lparen; return; case ')': - if (open_pbb > 0) - open_pbb--; + if (open_pb > 0) + open_pb--; tp->type = token_type_rparen; return; @@ -1414,15 +1579,15 @@ phase5_get (token_ty *tp) tp->type = token_type_comma; return; - case '[': case '{': - open_pbb++; - tp->type = (c == '[' ? 
token_type_lbracket : token_type_other); + case '[': + open_pb++; + tp->type = token_type_lbracket; return; - case ']': case '}': - if (open_pbb > 0) - open_pbb--; - tp->type = (c == ']' ? token_type_rbracket : token_type_other); + case ']': + if (open_pb > 0) + open_pb--; + tp->type = token_type_rbracket; return; case '+': @@ -1460,7 +1625,7 @@ static void x_python_lex (token_ty *tp) { phase5_get (tp); - if (tp->type == token_type_string) + if (tp->type == token_type_string || tp->type == token_type_498) { mixed_string_ty *sum = tp->mixed_string; @@ -1476,7 +1641,8 @@ x_python_lex (token_ty *tp) case token_type_plus: { phase5_get (&token3); - if (token3.type == token_type_string) + if (token3.type == token_type_string + || token3.type == token_type_498) { free_token (&token2); tp2 = &token3; @@ -1486,6 +1652,7 @@ x_python_lex (token_ty *tp) } break; case token_type_string: + case token_type_498: tp2 = &token2; break; default: @@ -1667,6 +1834,7 @@ extract_balanced (message_list_ty *mlp, continue; case token_type_string: + case token_type_498: { lex_pos_ty pos; @@ -1697,6 +1865,9 @@ extract_balanced (message_list_ty *mlp, unref_region (inner_region); return true; + case token_type_l498: + case token_type_m498: + case token_type_r498: case token_type_plus: case token_type_other: next_context_iter = null_context_list_iterator; @@ -1748,10 +1919,13 @@ extract_python (FILE *f, continuation_or_nonblank_line = false; - open_pbb = 0; + open_pb = 0; phase5_pushback_length = 0; + f_string_depth = 0; + new_f_string_level (0, false, false); + flag_context_list_table = flag_table; paren_nesting_depth = 0; bracket_nesting_depth = 0; diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index 4e432e63a..9e37ff142 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -146,7 +146,7 @@ TESTS = gettext-1 gettext-2 \ xgettext-rst-1 xgettext-rst-2 \ xgettext-python-1 xgettext-python-2 xgettext-python-3 \ xgettext-python-4 
xgettext-python-5 xgettext-python-6 \ - xgettext-python-7 \ + xgettext-python-7 xgettext-python-8 \ xgettext-python-stackovfl-1 xgettext-python-stackovfl-2 \ xgettext-python-stackovfl-3 xgettext-python-stackovfl-4 \ xgettext-ruby-1 \ diff --git a/gettext-tools/tests/xgettext-python-1 b/gettext-tools/tests/xgettext-python-1 index 9695abda4..ec7f92d9a 100755 --- a/gettext-tools/tests/xgettext-python-1 +++ b/gettext-tools/tests/xgettext-python-1 @@ -4,22 +4,30 @@ # Test of Python support. cat <<\EOF > xg-py-1.py -# interpret_ansic = true, interpret_unicode = false +# interpret_ansic = true, interpret_unicode = false, f_string = false _("abc\ \\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); -# interpret_ansic = false, interpret_unicode = false +# interpret_ansic = false, interpret_unicode = false, f_string = false _(r"abc\ \\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); -# interpret_ansic = true, interpret_unicode = true +# interpret_ansic = true, interpret_unicode = true, f_string = false _(u"abc\ \\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); -# interpret_ansic = false, interpret_unicode = true +# interpret_ansic = false, interpret_unicode = true, f_string = false _(ur"abc\ \\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); +# interpret_ansic = true, interpret_unicode = false, f_string = true +_(f"abc\ +\\def\'ghi\"jkl{{m\{{n\\{{o\\\{{p\\\\{{q\\\\\{{r}}s\}}t\\}}u\\\}}v\\\\}}w\\\\\}}x\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123"); + +# interpret_ansic = false, interpret_unicode = false, f_string = true +_(fr"Abc\ +\\def\'ghi\"jkl{{m\{{n\\{{o\\\{{p\\\\{{q\\\\\{{r}}s\}}t\\}}u\\\}}v\\\\}}w\\\\\}}x\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123"); + # This will not be extracted. _(CATEGORIES["default"]["name"]); @@ -52,32 +60,47 @@ msgstr "" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -#. 
interpret_ansic = true, interpret_unicode = false +#. interpret_ansic = true, interpret_unicode = false, f_string = false msgid "" "abc\\def'ghi\"jkl\a\b\f\n" "\r\t\v x x~y\\u0142\\U00010123\\N{LATIN SMALL LETTER Z}" msgstr "" -#. interpret_ansic = false, interpret_unicode = false +#. interpret_ansic = false, interpret_unicode = false, f_string = false msgid "" "abc\\\n" "\\\\def\\'ghi\\\"jkl\\a\\b\\f\\n\\r\\t\\v " "x\\040x\\x7ey\\u0142\\U00010123\\N{LATIN SMALL LETTER Z}" msgstr "" -#. interpret_ansic = true, interpret_unicode = true +#. interpret_ansic = true, interpret_unicode = true, f_string = false msgid "" "abc\\def'ghi\"jkl\a\b\f\n" "\r\t\v x x~ył𐄣z" msgstr "" -#. interpret_ansic = false, interpret_unicode = true +#. interpret_ansic = false, interpret_unicode = true, f_string = false msgid "" "abc\\\n" "\\\\def\\'ghi\\\"jkl\\a\\b\\f\\n\\r\\t\\v x\\040x\\x7eył\\U00010123\\N{LATIN " "SMALL LETTER Z}" msgstr "" +#. interpret_ansic = true, interpret_unicode = false, f_string = true +msgid "" +"abc\\def'ghi\"jkl{m\\{n\\{o\\\\{p\\\\{q\\\\\\{r}s\\}t\\}u\\\\}v\\\\}w\\\\\\}" +"x\a\b\f\n" +"\r\t\v x x~y\\u0142\\U00010123" +msgstr "" + +#. interpret_ansic = false, interpret_unicode = false, f_string = true +msgid "" +"Abc\\\n" +"\\\\def\\'ghi\\\"jkl{m\\{n\\\\{o\\\\\\{p\\\\\\\\{q\\\\\\\\\\{r}s\\}t\\\\}u\\" +"\\\\}v\\\\\\\\}w\\\\\\\\\\}x\\a\\b\\f\\n\\r\\t\\v " +"x\\040x\\x7ey\\u0142\\U00010123" +msgstr "" + #. string concatenation msgid "abcdefghijkl" msgstr "" diff --git a/gettext-tools/tests/xgettext-python-8 b/gettext-tools/tests/xgettext-python-8 new file mode 100755 index 000000000..9910be3c5 --- /dev/null +++ b/gettext-tools/tests/xgettext-python-8 @@ -0,0 +1,73 @@ +#!/bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test of Python f-string support. 
+ +cat <<\EOF > xg-py-8.py +s0 = _(fr'An f-string without substitutions'); +s1 = _(fr'''An f-string with +embedded +newlines'''); +s2 = _(fr'An f-string with {n} substitutions'); +s3 = _(fr'An f-string with several substitutions: {a} and {b} and {c} and so on'); +s4 = fr"that's a valid string. " + _('This too'); +s5 = fr'''a{fr'b{fr"c"+d}'}e'''; +s6 = _("a normal string"); +s7 = fr'abc{foo({},_('should be extracted'))}xyz'; +return _("first normal string") + fr'{foo}' + _("second normal string"); +EOF + +: ${XGETTEXT=xgettext} +${XGETTEXT} --add-comments --no-location -o xg-py-8.tmp xg-py-8.py 2>xg-py-8.err +test $? = 0 || { cat xg-py-8.err; Exit 1; } +func_filter_POT_Creation_Date xg-py-8.tmp xg-py-8.pot + +cat <<\EOF > xg-py-8.ok +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"Report-Msgid-Bugs-To: \n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=CHARSET\n" +"Content-Transfer-Encoding: 8bit\n" + +msgid "An f-string without substitutions" +msgstr "" + +msgid "" +"An f-string with\n" +"embedded\n" +"newlines" +msgstr "" + +msgid "This too" +msgstr "" + +msgid "a normal string" +msgstr "" + +msgid "should be extracted" +msgstr "" + +msgid "first normal string" +msgstr "" + +msgid "second normal string" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} xg-py-8.ok xg-py-8.pot +result=$? + +exit $result