From: Bruno Haible Date: Wed, 14 Aug 2024 16:18:37 +0000 (+0200) Subject: xgettext: Python: Change escape sequences interpretation to follow Python 3. X-Git-Tag: v0.23~167 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=d3ef74eee8be555cc83d19cb4c60fde8b36b6270;p=thirdparty%2Fgettext.git xgettext: Python: Change escape sequences interpretation to follow Python 3. Reported by Serhii Tereshchenko at . * gettext-tools/src/x-python.c (phase7_getuc): Update comments for Python 3. Don't interpret \u sequences in raw bytes and raw strings. (phase5_get): Recognize bytes prefixes 'b' and 'B'. For f-strings, invoke phase7_getuc with interpret_unicode = true, not false. * gettext-tools/tests/xgettext-python-1: Update for Python 3. * gettext-tools/tests/xgettext-python-5: Likewise. * NEWS: Mention the change. --- diff --git a/NEWS b/NEWS index ccd390ce6..ecbc45064 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,10 @@ Version 0.23 - August 2024 * Programming languages support: - - Python: xgettext now recognizes the f-string syntax. + - Python: + o xgettext now assumes source code for Python 3 rather than Python 2. + This affects the interpretation of escape sequences in string literals. + o xgettext now recognizes the f-string syntax. - Java: Improved recognition of format strings when the String.formatted method is used. - Vala: Improved recognition of format strings when the string.printf method diff --git a/gettext-tools/src/x-python.c b/gettext-tools/src/x-python.c index f1a157ec7..42bfd9d2d 100644 --- a/gettext-tools/src/x-python.c +++ b/gettext-tools/src/x-python.c @@ -66,8 +66,8 @@ /* The Python syntax is defined in the Python Reference Manual /usr/share/doc/packages/python/html/ref/index.html. - See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c, - Python-2.0/Objects/unicodeobject.c. + See also Python-3.7.17/Parser/tokenizer.c, Python-3.7.17/Python/compile.c, + Python-3.7.17/Objects/bytesobject.c, Python-3.7.17/Objects/unicodeobject.c. 
For the f-strings, refer to https://peps.python.org/pep-0498/ and https://docs.python.org/3/reference/lexical_analysis.html#literals . */ @@ -838,12 +838,13 @@ free_token (token_ty *tp) } -/* There are two different input syntaxes for strings, "abc" and r"abc", - and two different input syntaxes for Unicode strings, u"abc" and ur"abc". +/* There are two different input syntaxes for byte strings, b"abc" and br"abc", + and two different input syntaxes for Unicode strings, u"abc" and ur"abc"; + the 'u' may be omitted. Which escape sequences are understood, i.e. what is interpreted specially after backslash? - "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn - r"abc" + b"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn + br"abc" u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...} ur"abc" \unnnn The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two @@ -1005,6 +1006,11 @@ phase7_getuc (int quote_char, phase2_ungetc (c); } *backslash_counter = 0; + /* <https://docs.python.org/3/reference/lexical_analysis.html#literals> + says: "In a bytes literal, hexadecimal and octal escapes denote + the byte with the given value. + In a string literal, these escapes denote a Unicode + character with the given value." */ if (interpret_unicode) return UNICODE (n); else @@ -1042,6 +1048,12 @@ phase7_getuc (int quote_char, { int n = (n1 << 4) + n2; *backslash_counter = 0; + /* <https://docs.python.org/3/reference/lexical_analysis.html#literals> + says: + "In a bytes literal, hexadecimal and octal escapes denote + the byte with the given value. + In a string literal, these escapes denote a Unicode + character with the given value." 
*/ if (interpret_unicode) return UNICODE (n); else @@ -1057,7 +1069,7 @@ phase7_getuc (int quote_char, } } - if (interpret_unicode) + if (interpret_ansic && interpret_unicode) { if (c == 'u') { @@ -1091,97 +1103,94 @@ phase7_getuc (int quote_char, return UNICODE (n); } - if (interpret_ansic) + if (c == 'U') { - if (c == 'U') + unsigned char buf[8]; + unsigned int n = 0; + int i; + + for (i = 0; i < 8; i++) + { + int c1 = phase2_getc (); + + if (c1 >= '0' && c1 <= '9') + n = (n << 4) + (c1 - '0'); + else if (c1 >= 'A' && c1 <= 'F') + n = (n << 4) + (c1 - 'A' + 10); + else if (c1 >= 'a' && c1 <= 'f') + n = (n << 4) + (c1 - 'a' + 10); + else + { + phase2_ungetc (c1); + while (--i >= 0) + phase2_ungetc (buf[i]); + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); + } + + buf[i] = c1; + } + if (n < 0x110000) + { + *backslash_counter = 0; + return UNICODE (n); + } + + if_error (IF_SEVERITY_WARNING, + logical_file_name, line_number, (size_t)(-1), false, + _("invalid Unicode character")); + + while (--i >= 0) + phase2_ungetc (buf[i]); + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); + } + + if (c == 'N') + { + int c1 = phase2_getc (); + if (c1 == '{') { - unsigned char buf[8]; - unsigned int n = 0; + unsigned char buf[UNINAME_MAX + 1]; int i; + unsigned int n; - for (i = 0; i < 8; i++) + for (i = 0; i < UNINAME_MAX; i++) { - int c1 = phase2_getc (); - - if (c1 >= '0' && c1 <= '9') - n = (n << 4) + (c1 - '0'); - else if (c1 >= 'A' && c1 <= 'F') - n = (n << 4) + (c1 - 'A' + 10); - else if (c1 >= 'a' && c1 <= 'f') - n = (n << 4) + (c1 - 'a' + 10); - else + int c2 = phase2_getc (); + if (!(c2 >= ' ' && c2 <= '~')) { - phase2_ungetc (c1); + phase2_ungetc (c2); while (--i >= 0) phase2_ungetc (buf[i]); + phase2_ungetc (c1); phase2_ungetc (c); ++*backslash_counter; return UNICODE ('\\'); } - - buf[i] = c1; + if (c2 == '}') + break; + buf[i] = c2; } - if (n < 0x110000) + buf[i] = '\0'; + + n = unicode_name_character ((char *) buf); + if (n != 
UNINAME_INVALID) { *backslash_counter = 0; return UNICODE (n); } - if_error (IF_SEVERITY_WARNING, - logical_file_name, line_number, (size_t)(-1), false, - _("invalid Unicode character")); - + phase2_ungetc ('}'); while (--i >= 0) phase2_ungetc (buf[i]); - phase2_ungetc (c); - ++*backslash_counter; - return UNICODE ('\\'); - } - - if (c == 'N') - { - int c1 = phase2_getc (); - if (c1 == '{') - { - unsigned char buf[UNINAME_MAX + 1]; - int i; - unsigned int n; - - for (i = 0; i < UNINAME_MAX; i++) - { - int c2 = phase2_getc (); - if (!(c2 >= ' ' && c2 <= '~')) - { - phase2_ungetc (c2); - while (--i >= 0) - phase2_ungetc (buf[i]); - phase2_ungetc (c1); - phase2_ungetc (c); - ++*backslash_counter; - return UNICODE ('\\'); - } - if (c2 == '}') - break; - buf[i] = c2; - } - buf[i] = '\0'; - - n = unicode_name_character ((char *) buf); - if (n != UNINAME_INVALID) - { - *backslash_counter = 0; - return UNICODE (n); - } - - phase2_ungetc ('}'); - while (--i >= 0) - phase2_ungetc (buf[i]); - } - phase2_ungetc (c1); - phase2_ungetc (c); - ++*backslash_counter; - return UNICODE ('\\'); } + phase2_ungetc (c1); + phase2_ungetc (c); + ++*backslash_counter; + return UNICODE ('\\'); } } @@ -1297,13 +1306,13 @@ phase5_get (token_ty *tp) } } FALLTHROUGH; - case 'A': case 'B': case 'C': case 'D': case 'E': + case 'A': case 'C': case 'D': case 'E': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'S': case 'T': case 'V': case 'W': case 'X': case 'Y': case 'Z': case '_': - case 'a': case 'b': case 'c': case 'd': case 'e': + case 'a': case 'c': case 'd': case 'e': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 's': case 't': case 'v': case 'w': case 'x': @@ -1369,7 +1378,7 @@ phase5_get (token_ty *tp) bool triple; unsigned int backslash_counter; - case 'F': case 'f': + case 'B': case 'b': { int c1 = phase2_getc (); if (c1 == '"' || c1 == '\'') @@ -1377,7 
+1386,7 @@ phase5_get (token_ty *tp) quote_char = c1; interpret_ansic = true; interpret_unicode = false; - f_string = true; + f_string = false; goto string; } if (c1 == 'R' || c1 == 'r') @@ -1388,7 +1397,7 @@ phase5_get (token_ty *tp) quote_char = c2; interpret_ansic = false; interpret_unicode = false; - f_string = true; + f_string = false; goto string; } phase2_ungetc (c2); @@ -1397,25 +1406,25 @@ phase5_get (token_ty *tp) goto symbol; } - case 'R': case 'r': + case 'F': case 'f': { int c1 = phase2_getc (); if (c1 == '"' || c1 == '\'') { quote_char = c1; - interpret_ansic = false; - interpret_unicode = false; - f_string = false; + interpret_ansic = true; + interpret_unicode = true; + f_string = true; goto string; } - if (c1 == 'F' || c1 == 'f') + if (c1 == 'R' || c1 == 'r') { int c2 = phase2_getc (); if (c2 == '"' || c2 == '\'') { quote_char = c2; interpret_ansic = false; - interpret_unicode = false; + interpret_unicode = true; f_string = true; goto string; } @@ -1425,18 +1434,18 @@ phase5_get (token_ty *tp) goto symbol; } - case 'U': case 'u': + case 'R': case 'r': { int c1 = phase2_getc (); if (c1 == '"' || c1 == '\'') { quote_char = c1; - interpret_ansic = true; + interpret_ansic = false; interpret_unicode = true; f_string = false; goto string; } - if (c1 == 'R' || c1 == 'r') + if (c1 == 'F' || c1 == 'f') { int c2 = phase2_getc (); if (c2 == '"' || c2 == '\'') @@ -1444,7 +1453,7 @@ phase5_get (token_ty *tp) quote_char = c2; interpret_ansic = false; interpret_unicode = true; - f_string = false; + f_string = true; goto string; } phase2_ungetc (c2); @@ -1453,10 +1462,25 @@ phase5_get (token_ty *tp) goto symbol; } + case 'U': case 'u': + { + int c1 = phase2_getc (); + if (c1 == '"' || c1 == '\'') + { + quote_char = c1; + interpret_ansic = true; + interpret_unicode = true; + f_string = false; + goto string; + } + phase2_ungetc (c1); + goto symbol; + } + case '"': case '\'': quote_char = c; interpret_ansic = true; - interpret_unicode = false; + interpret_unicode = 
true; f_string = false; string: triple = false; @@ -1543,7 +1567,7 @@ phase5_get (token_ty *tp) for (;;) { int uc = phase7_getuc (quote_char, triple, interpret_ansic, - false, true, + true, true, &backslash_counter); if (uc == P7_EOF || uc == P7_STRING_END) diff --git a/gettext-tools/tests/xgettext-python-1 b/gettext-tools/tests/xgettext-python-1 index ec7f92d9a..abd3f996d 100755 --- a/gettext-tools/tests/xgettext-python-1 +++ b/gettext-tools/tests/xgettext-python-1 @@ -5,26 +5,30 @@ cat <<\EOF > xg-py-1.py # interpret_ansic = true, interpret_unicode = false, f_string = false -_("abc\ +_(b"abc\ \\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); # interpret_ansic = false, interpret_unicode = false, f_string = false -_(r"abc\ +_(br"abc\ \\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); # interpret_ansic = true, interpret_unicode = true, f_string = false -_(u"abc\ +_("abc\ \\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); # interpret_ansic = false, interpret_unicode = true, f_string = false -_(ur"abc\ +_(r"abc\ \\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); -# interpret_ansic = true, interpret_unicode = false, f_string = true +# interpret_ansic = true, interpret_unicode = true, f_string = false +_(u"abc\ +\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}"); + +# interpret_ansic = true, interpret_unicode = true, f_string = true _(f"abc\ \\def\'ghi\"jkl{{m\{{n\\{{o\\\{{p\\\\{{q\\\\\{{r}}s\}}t\\}}u\\\}}v\\\\}}w\\\\\}}x\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123"); -# interpret_ansic = false, interpret_unicode = false, f_string = true +# interpret_ansic = false, interpret_unicode = true, f_string = true _(fr"Abc\ \\def\'ghi\"jkl{{m\{{n\\{{o\\\{{p\\\\{{q\\\\\{{r}}s\}}t\\}}u\\\}}v\\\\}}w\\\\\}}x\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123"); @@ -67,6 +71,7 @@ msgid "" msgstr "" #. 
interpret_ansic = false, interpret_unicode = false, f_string = false +#. interpret_ansic = false, interpret_unicode = true, f_string = false msgid "" "abc\\\n" "\\\\def\\'ghi\\\"jkl\\a\\b\\f\\n\\r\\t\\v " @@ -79,21 +84,14 @@ msgid "" "\r\t\v x x~ył𐄣z" msgstr "" -#. interpret_ansic = false, interpret_unicode = true, f_string = false -msgid "" -"abc\\\n" -"\\\\def\\'ghi\\\"jkl\\a\\b\\f\\n\\r\\t\\v x\\040x\\x7eył\\U00010123\\N{LATIN " -"SMALL LETTER Z}" -msgstr "" - -#. interpret_ansic = true, interpret_unicode = false, f_string = true +#. interpret_ansic = true, interpret_unicode = true, f_string = true msgid "" "abc\\def'ghi\"jkl{m\\{n\\{o\\\\{p\\\\{q\\\\\\{r}s\\}t\\}u\\\\}v\\\\}w\\\\\\}" "x\a\b\f\n" -"\r\t\v x x~y\\u0142\\U00010123" +"\r\t\v x x~ył𐄣" msgstr "" -#. interpret_ansic = false, interpret_unicode = false, f_string = true +#. interpret_ansic = false, interpret_unicode = true, f_string = true msgid "" "Abc\\\n" "\\\\def\\'ghi\\\"jkl{m\\{n\\\\{o\\\\\\{p\\\\\\\\{q\\\\\\\\\\{r}s\\}t\\\\}u\\" diff --git a/gettext-tools/tests/xgettext-python-5 b/gettext-tools/tests/xgettext-python-5 index f480b4a28..b3d7fe076 100755 --- a/gettext-tools/tests/xgettext-python-5 +++ b/gettext-tools/tests/xgettext-python-5 @@ -1,18 +1,130 @@ #! /bin/sh . "${srcdir=.}/init.sh"; path_prepend_ . ../src -# Test Python support: strings with hexadecimal escape sequences that are -# invalid UTF-8. +# Test Python support: bytes and strings with hexadecimal escape sequences that +# are invalid UTF-8. -cat <<\EOF > xg-py-5.py -_("\xE0") +cat <<\EOF > xg-py-5-b.ok +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. 
+# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"Report-Msgid-Bugs-To: \n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +msgid "ŕ" +msgstr "" +EOF + +cat <<\EOF > xg-py-5-u.ok +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"Report-Msgid-Bugs-To: \n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +msgid "à" +msgstr "" EOF : ${XGETTEXT=xgettext} -LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-py-5.tmp xg-py-5.py 2>xg-py-5.err +: ${DIFF=diff} + +# With the encoding set to ISO-8859-2 on the command line: + +cat <<\EOF > xg-py-5-b1.py +_(b"\xE0") +EOF + +${XGETTEXT} --no-location -o xg-py-5-b1.tmp --from-code=iso-8859-2 xg-py-5-b1.py 2>xg-py-5-b1.err +test $? = 0 || { cat xg-py-5-b1.err; Exit 1; } +func_filter_POT_Creation_Date xg-py-5-b1.tmp xg-py-5-b1.pot + +${DIFF} xg-py-5-b.ok xg-py-5-b1.pot || Exit 1 + +# With the encoding set to ISO-8859-2 inside the source file: + +cat <<\EOF > xg-py-5-b2.py +# -*- coding: iso-8859-2 -*- +_(b"\xE0") +EOF + +${XGETTEXT} --no-location -o xg-py-5-b2.tmp xg-py-5-b2.py 2>xg-py-5-b2.err +test $? 
= 0 || { cat xg-py-5-b2.err; Exit 1; } +func_filter_POT_Creation_Date xg-py-5-b2.tmp xg-py-5-b2.pot + +${DIFF} xg-py-5-b.ok xg-py-5-b2.pot || Exit 1 + +# With the encoding being UTF-8 by default: + +cat <<\EOF > xg-py-5-b3.py +_(b"\xE0") +EOF + +LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -o xg-py-5-b3.tmp xg-py-5-b3.py 2>xg-py-5-b3.err result=$? -cat xg-py-5.err +cat xg-py-5-b3.err test $result = 1 || Exit 1 -grep 'is not UTF-8 encoded' xg-py-5.err >/dev/null || Exit 1 +grep 'is not UTF-8 encoded' xg-py-5-b3.err >/dev/null || Exit 1 + +# With the encoding set to ISO-8859-1 on the command line: + +cat <<\EOF > xg-py-5-u1.py +_("\xE0") +EOF + +${XGETTEXT} --no-location -o xg-py-5-u1.tmp --from-code=iso-8859-2 xg-py-5-u1.py 2>xg-py-5-u1.err +test $? = 0 || { cat xg-py-5-u1.err; Exit 1; } +func_filter_POT_Creation_Date xg-py-5-u1.tmp xg-py-5-u1.pot + +${DIFF} xg-py-5-u.ok xg-py-5-u1.pot || Exit 1 + +# With the encoding set to ISO-8859-1 inside the source file: + +cat <<\EOF > xg-py-5-u2.py +# -*- coding: iso-8859-1 -*- +_("\xE0") +EOF + +${XGETTEXT} --no-location -o xg-py-5-u2.tmp xg-py-5-u2.py 2>xg-py-5-u2.err +test $? = 0 || { cat xg-py-5-u2.err; Exit 1; } +func_filter_POT_Creation_Date xg-py-5-u2.tmp xg-py-5-u2.pot + +${DIFF} xg-py-5-u.ok xg-py-5-u2.pot || Exit 1 + +# With the encoding being UTF-8 by default: + +cat <<\EOF > xg-py-5-u3.py +_("\xE0") +EOF + +${XGETTEXT} --no-location -o xg-py-5-u3.tmp xg-py-5-u3.py 2>xg-py-5-u3.err +test $? = 0 || { cat xg-py-5-u3.err; Exit 1; } +func_filter_POT_Creation_Date xg-py-5-u3.tmp xg-py-5-u3.pot + +${DIFF} xg-py-5-u.ok xg-py-5-u3.pot || Exit 1 exit 0