/* The Python syntax is defined in the Python Reference Manual
/usr/share/doc/packages/python/html/ref/index.html.
- See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
- Python-2.0/Objects/unicodeobject.c.
+ See also Python-3.7.17/Parser/tokenizer.c, Python-3.7.17/Python/compile.c,
+ Python-3.7.17/Objects/bytesobject.c, Python-3.7.17/Objects/unicodeobject.c.
For the f-strings, refer to https://peps.python.org/pep-0498/
and https://docs.python.org/3/reference/lexical_analysis.html#literals . */
}
-/* There are two different input syntaxes for strings, "abc" and r"abc",
- and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
+/* There are two different input syntaxes for byte strings, b"abc" and br"abc",
+ and two different input syntaxes for Unicode strings, u"abc" and ur"abc";
+ the 'u' may be omitted.
Which escape sequences are understood, i.e. what is interpreted specially
after backslash?
- "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
- r"abc"
+ b"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
+ br"abc"
u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
ur"abc" \unnnn
The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
phase2_ungetc (c);
}
*backslash_counter = 0;
+ /* <https://docs.python.org/3.12/reference/lexical_analysis.html#escape-sequences>
+ says: "In a bytes literal, hexadecimal and octal escapes denote
+ the byte with the given value.
+ In a string literal, these escapes denote a Unicode
+ character with the given value." */
if (interpret_unicode)
return UNICODE (n);
else
{
int n = (n1 << 4) + n2;
*backslash_counter = 0;
+ /* <https://docs.python.org/3.12/reference/lexical_analysis.html#escape-sequences>
+ says:
+ "In a bytes literal, hexadecimal and octal escapes denote
+ the byte with the given value.
+ In a string literal, these escapes denote a Unicode
+ character with the given value." */
if (interpret_unicode)
return UNICODE (n);
else
}
}
- if (interpret_unicode)
+ if (interpret_ansic && interpret_unicode)
{
if (c == 'u')
{
return UNICODE (n);
}
- if (interpret_ansic)
+ if (c == 'U')
{
- if (c == 'U')
+ unsigned char buf[8];
+ unsigned int n = 0;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ {
+ int c1 = phase2_getc ();
+
+ if (c1 >= '0' && c1 <= '9')
+ n = (n << 4) + (c1 - '0');
+ else if (c1 >= 'A' && c1 <= 'F')
+ n = (n << 4) + (c1 - 'A' + 10);
+ else if (c1 >= 'a' && c1 <= 'f')
+ n = (n << 4) + (c1 - 'a' + 10);
+ else
+ {
+ phase2_ungetc (c1);
+ while (--i >= 0)
+ phase2_ungetc (buf[i]);
+ phase2_ungetc (c);
+ ++*backslash_counter;
+ return UNICODE ('\\');
+ }
+
+ buf[i] = c1;
+ }
+ if (n < 0x110000)
+ {
+ *backslash_counter = 0;
+ return UNICODE (n);
+ }
+
+ if_error (IF_SEVERITY_WARNING,
+ logical_file_name, line_number, (size_t)(-1), false,
+ _("invalid Unicode character"));
+
+ while (--i >= 0)
+ phase2_ungetc (buf[i]);
+ phase2_ungetc (c);
+ ++*backslash_counter;
+ return UNICODE ('\\');
+ }
+
+ if (c == 'N')
+ {
+ int c1 = phase2_getc ();
+ if (c1 == '{')
{
- unsigned char buf[8];
- unsigned int n = 0;
+ unsigned char buf[UNINAME_MAX + 1];
int i;
+ unsigned int n;
- for (i = 0; i < 8; i++)
+ for (i = 0; i < UNINAME_MAX; i++)
{
- int c1 = phase2_getc ();
-
- if (c1 >= '0' && c1 <= '9')
- n = (n << 4) + (c1 - '0');
- else if (c1 >= 'A' && c1 <= 'F')
- n = (n << 4) + (c1 - 'A' + 10);
- else if (c1 >= 'a' && c1 <= 'f')
- n = (n << 4) + (c1 - 'a' + 10);
- else
+ int c2 = phase2_getc ();
+ if (!(c2 >= ' ' && c2 <= '~'))
{
- phase2_ungetc (c1);
+ phase2_ungetc (c2);
while (--i >= 0)
phase2_ungetc (buf[i]);
+ phase2_ungetc (c1);
phase2_ungetc (c);
++*backslash_counter;
return UNICODE ('\\');
}
-
- buf[i] = c1;
+ if (c2 == '}')
+ break;
+ buf[i] = c2;
}
- if (n < 0x110000)
+ buf[i] = '\0';
+
+ n = unicode_name_character ((char *) buf);
+ if (n != UNINAME_INVALID)
{
*backslash_counter = 0;
return UNICODE (n);
}
- if_error (IF_SEVERITY_WARNING,
- logical_file_name, line_number, (size_t)(-1), false,
- _("invalid Unicode character"));
-
+ phase2_ungetc ('}');
while (--i >= 0)
phase2_ungetc (buf[i]);
- phase2_ungetc (c);
- ++*backslash_counter;
- return UNICODE ('\\');
- }
-
- if (c == 'N')
- {
- int c1 = phase2_getc ();
- if (c1 == '{')
- {
- unsigned char buf[UNINAME_MAX + 1];
- int i;
- unsigned int n;
-
- for (i = 0; i < UNINAME_MAX; i++)
- {
- int c2 = phase2_getc ();
- if (!(c2 >= ' ' && c2 <= '~'))
- {
- phase2_ungetc (c2);
- while (--i >= 0)
- phase2_ungetc (buf[i]);
- phase2_ungetc (c1);
- phase2_ungetc (c);
- ++*backslash_counter;
- return UNICODE ('\\');
- }
- if (c2 == '}')
- break;
- buf[i] = c2;
- }
- buf[i] = '\0';
-
- n = unicode_name_character ((char *) buf);
- if (n != UNINAME_INVALID)
- {
- *backslash_counter = 0;
- return UNICODE (n);
- }
-
- phase2_ungetc ('}');
- while (--i >= 0)
- phase2_ungetc (buf[i]);
- }
- phase2_ungetc (c1);
- phase2_ungetc (c);
- ++*backslash_counter;
- return UNICODE ('\\');
}
+ phase2_ungetc (c1);
+ phase2_ungetc (c);
+ ++*backslash_counter;
+ return UNICODE ('\\');
}
}
}
}
FALLTHROUGH;
- case 'A': case 'B': case 'C': case 'D': case 'E':
+ case 'A': case 'C': case 'D': case 'E':
case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
case 'M': case 'N': case 'O': case 'P': case 'Q':
case 'S': case 'T': case 'V': case 'W': case 'X':
case 'Y': case 'Z':
case '_':
- case 'a': case 'b': case 'c': case 'd': case 'e':
+ case 'a': case 'c': case 'd': case 'e':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
case 'm': case 'n': case 'o': case 'p': case 'q':
case 's': case 't': case 'v': case 'w': case 'x':
bool triple;
unsigned int backslash_counter;
- case 'F': case 'f':
+ case 'B': case 'b':
{
int c1 = phase2_getc ();
if (c1 == '"' || c1 == '\'')
quote_char = c1;
interpret_ansic = true;
interpret_unicode = false;
- f_string = true;
+ f_string = false;
goto string;
}
if (c1 == 'R' || c1 == 'r')
quote_char = c2;
interpret_ansic = false;
interpret_unicode = false;
- f_string = true;
+ f_string = false;
goto string;
}
phase2_ungetc (c2);
goto symbol;
}
- case 'R': case 'r':
+ case 'F': case 'f':
{
int c1 = phase2_getc ();
if (c1 == '"' || c1 == '\'')
{
quote_char = c1;
- interpret_ansic = false;
- interpret_unicode = false;
- f_string = false;
+ interpret_ansic = true;
+ interpret_unicode = true;
+ f_string = true;
goto string;
}
- if (c1 == 'F' || c1 == 'f')
+ if (c1 == 'R' || c1 == 'r')
{
int c2 = phase2_getc ();
if (c2 == '"' || c2 == '\'')
{
quote_char = c2;
interpret_ansic = false;
- interpret_unicode = false;
+ interpret_unicode = true;
f_string = true;
goto string;
}
goto symbol;
}
- case 'U': case 'u':
+ case 'R': case 'r':
{
int c1 = phase2_getc ();
if (c1 == '"' || c1 == '\'')
{
quote_char = c1;
- interpret_ansic = true;
+ interpret_ansic = false;
interpret_unicode = true;
f_string = false;
goto string;
}
- if (c1 == 'R' || c1 == 'r')
+ if (c1 == 'F' || c1 == 'f')
{
int c2 = phase2_getc ();
if (c2 == '"' || c2 == '\'')
quote_char = c2;
interpret_ansic = false;
interpret_unicode = true;
- f_string = false;
+ f_string = true;
goto string;
}
phase2_ungetc (c2);
goto symbol;
}
+ case 'U': case 'u':
+ {
+ int c1 = phase2_getc ();
+ if (c1 == '"' || c1 == '\'')
+ {
+ quote_char = c1;
+ interpret_ansic = true;
+ interpret_unicode = true;
+ f_string = false;
+ goto string;
+ }
+ phase2_ungetc (c1);
+ goto symbol;
+ }
+
case '"': case '\'':
quote_char = c;
interpret_ansic = true;
- interpret_unicode = false;
+ interpret_unicode = true;
f_string = false;
string:
triple = false;
for (;;)
{
int uc = phase7_getuc (quote_char, triple, interpret_ansic,
- false, true,
+ true, true,
&backslash_counter);
if (uc == P7_EOF || uc == P7_STRING_END)
cat <<\EOF > xg-py-1.py
# interpret_ansic = true, interpret_unicode = false, f_string = false
-_("abc\
+_(b"abc\
\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}");
# interpret_ansic = false, interpret_unicode = false, f_string = false
-_(r"abc\
+_(br"abc\
\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}");
# interpret_ansic = true, interpret_unicode = true, f_string = false
-_(u"abc\
+_("abc\
\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}");
# interpret_ansic = false, interpret_unicode = true, f_string = false
-_(ur"abc\
+_(r"abc\
\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}");
-# interpret_ansic = true, interpret_unicode = false, f_string = true
+# interpret_ansic = true, interpret_unicode = true, f_string = false
+_(u"abc\
+\\def\'ghi\"jkl\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123\N{LATIN SMALL LETTER Z}");
+
+# interpret_ansic = true, interpret_unicode = true, f_string = true
_(f"abc\
\\def\'ghi\"jkl{{m\{{n\\{{o\\\{{p\\\\{{q\\\\\{{r}}s\}}t\\}}u\\\}}v\\\\}}w\\\\\}}x\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123");
-# interpret_ansic = false, interpret_unicode = false, f_string = true
+# interpret_ansic = false, interpret_unicode = true, f_string = true
_(fr"Abc\
\\def\'ghi\"jkl{{m\{{n\\{{o\\\{{p\\\\{{q\\\\\{{r}}s\}}t\\}}u\\\}}v\\\\}}w\\\\\}}x\a\b\f\n\r\t\v x\040x\x7ey\u0142\U00010123");
msgstr ""
#. interpret_ansic = false, interpret_unicode = false, f_string = false
+#. interpret_ansic = false, interpret_unicode = true, f_string = false
msgid ""
"abc\\\n"
"\\\\def\\'ghi\\\"jkl\\a\\b\\f\\n\\r\\t\\v "
"\r\t\v x x~ył𐄣z"
msgstr ""
-#. interpret_ansic = false, interpret_unicode = true, f_string = false
-msgid ""
-"abc\\\n"
-"\\\\def\\'ghi\\\"jkl\\a\\b\\f\\n\\r\\t\\v x\\040x\\x7eył\\U00010123\\N{LATIN "
-"SMALL LETTER Z}"
-msgstr ""
-
-#. interpret_ansic = true, interpret_unicode = false, f_string = true
+#. interpret_ansic = true, interpret_unicode = true, f_string = true
msgid ""
"abc\\def'ghi\"jkl{m\\{n\\{o\\\\{p\\\\{q\\\\\\{r}s\\}t\\}u\\\\}v\\\\}w\\\\\\}"
"x\a\b\f\n"
-"\r\t\v x x~y\\u0142\\U00010123"
+"\r\t\v x x~ył𐄣"
msgstr ""
-#. interpret_ansic = false, interpret_unicode = false, f_string = true
+#. interpret_ansic = false, interpret_unicode = true, f_string = true
msgid ""
"Abc\\\n"
"\\\\def\\'ghi\\\"jkl{m\\{n\\\\{o\\\\\\{p\\\\\\\\{q\\\\\\\\\\{r}s\\}t\\\\}u\\"
#! /bin/sh
. "${srcdir=.}/init.sh"; path_prepend_ . ../src
-# Test Python support: strings with hexadecimal escape sequences that are
-# invalid UTF-8.
+# Test Python support: bytes and strings with hexadecimal escape sequences that
+# are invalid UTF-8.
-cat <<\EOF > xg-py-5.py
-_("\xE0")
+cat <<\EOF > xg-py-5-b.ok
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"Report-Msgid-Bugs-To: \n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"Language: \n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+msgid "ŕ"
+msgstr ""
+EOF
+
+cat <<\EOF > xg-py-5-u.ok
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"Report-Msgid-Bugs-To: \n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"Language: \n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+msgid "à"
+msgstr ""
EOF
: ${XGETTEXT=xgettext}
-LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -d xg-py-5.tmp xg-py-5.py 2>xg-py-5.err
+: ${DIFF=diff}
+
+# With the encoding set to ISO-8859-2 on the command line:
+
+cat <<\EOF > xg-py-5-b1.py
+_(b"\xE0")
+EOF
+
+${XGETTEXT} --no-location -o xg-py-5-b1.tmp --from-code=iso-8859-2 xg-py-5-b1.py 2>xg-py-5-b1.err
+test $? = 0 || { cat xg-py-5-b1.err; Exit 1; }
+func_filter_POT_Creation_Date xg-py-5-b1.tmp xg-py-5-b1.pot
+
+${DIFF} xg-py-5-b.ok xg-py-5-b1.pot || Exit 1
+
+# With the encoding set to ISO-8859-2 inside the source file:
+
+cat <<\EOF > xg-py-5-b2.py
+# -*- coding: iso-8859-2 -*-
+_(b"\xE0")
+EOF
+
+${XGETTEXT} --no-location -o xg-py-5-b2.tmp xg-py-5-b2.py 2>xg-py-5-b2.err
+test $? = 0 || { cat xg-py-5-b2.err; Exit 1; }
+func_filter_POT_Creation_Date xg-py-5-b2.tmp xg-py-5-b2.pot
+
+${DIFF} xg-py-5-b.ok xg-py-5-b2.pot || Exit 1
+
+# With the encoding being UTF-8 by default:
+
+cat <<\EOF > xg-py-5-b3.py
+_(b"\xE0")
+EOF
+
+LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -o xg-py-5-b3.tmp xg-py-5-b3.py 2>xg-py-5-b3.err
result=$?
-cat xg-py-5.err
+cat xg-py-5-b3.err
test $result = 1 || Exit 1
-grep 'is not UTF-8 encoded' xg-py-5.err >/dev/null || Exit 1
+grep 'is not UTF-8 encoded' xg-py-5-b3.err >/dev/null || Exit 1
+
+# With the encoding set to ISO-8859-2 on the command line:
+
+cat <<\EOF > xg-py-5-u1.py
+_("\xE0")
+EOF
+
+${XGETTEXT} --no-location -o xg-py-5-u1.tmp --from-code=iso-8859-2 xg-py-5-u1.py 2>xg-py-5-u1.err
+test $? = 0 || { cat xg-py-5-u1.err; Exit 1; }
+func_filter_POT_Creation_Date xg-py-5-u1.tmp xg-py-5-u1.pot
+
+${DIFF} xg-py-5-u.ok xg-py-5-u1.pot || Exit 1
+
+# With the encoding set to ISO-8859-1 inside the source file:
+
+cat <<\EOF > xg-py-5-u2.py
+# -*- coding: iso-8859-1 -*-
+_("\xE0")
+EOF
+
+${XGETTEXT} --no-location -o xg-py-5-u2.tmp xg-py-5-u2.py 2>xg-py-5-u2.err
+test $? = 0 || { cat xg-py-5-u2.err; Exit 1; }
+func_filter_POT_Creation_Date xg-py-5-u2.tmp xg-py-5-u2.pot
+
+${DIFF} xg-py-5-u.ok xg-py-5-u2.pot || Exit 1
+
+# With the encoding being UTF-8 by default:
+
+cat <<\EOF > xg-py-5-u3.py
+_("\xE0")
+EOF
+
+${XGETTEXT} --no-location -o xg-py-5-u3.tmp xg-py-5-u3.py 2>xg-py-5-u3.err
+test $? = 0 || { cat xg-py-5-u3.err; Exit 1; }
+func_filter_POT_Creation_Date xg-py-5-u3.tmp xg-py-5-u3.pot
+
+${DIFF} xg-py-5-u.ok xg-py-5-u3.pot || Exit 1
exit 0