From: Bruno Haible Date: Sun, 25 Aug 2019 21:37:44 +0000 (+0200) Subject: xgettext: Assume that Python source files are in UTF-8 by default. X-Git-Tag: v0.20.2~42 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b844085eaad14b22fa432ef3936bb31978d01bb2;p=thirdparty%2Fgettext.git xgettext: Assume that Python source files are in UTF-8 by default. Reported by ilias iliadis. * gettext-tools/src/xg-encoding.h (xgettext_global_source_encoding): Allow a NULL value. * gettext-tools/src/xg-encoding.c (xgettext_global_source_encoding): Likewise. * gettext-tools/src/xgettext.c (main): Initialize xgettext_global_source_encoding with NULL, not "ASCII". Handle a NULL xgettext_global_source_encoding. (extract_from_file): Use ASCII as default for xgettext_global_source_encoding. * gettext-tools/src/x-javascript.c (extract_javascript): Likewise. * gettext-tools/src/x-python.c (phase2_getc): Signal an error when encountering an invalid or incomplete UTF-8 character. (set_current_file_source_encoding): In the error message, use xgettext_current_file_source_encoding instead of xgettext_global_source_encoding. (extract_python): Use UTF-8 as default for xgettext_global_source_encoding. * gettext-tools/tests/xgettext-python-3: Verify that if the source file has no magic coding comment but is UTF-8 encoded, xgettext succeeds. * NEWS: Mention the change. --- diff --git a/NEWS b/NEWS index 32050e96f..311225c00 100644 --- a/NEWS +++ b/NEWS @@ -8,6 +8,9 @@ Version 0.20.2 - April 2020 o xgettext now recognizes 'gettext' program invocations with the '-e' option, such as gettext -e 'some\nstring\n' + - Python: + xgettext now assumes a Python source file is in UTF-8 encoding by default, + as stated in PEP 3120. - Desktop Entry: The value of the 'Icon' property is no longer extracted into the POT file by xgettext. The documentation explains how to localize icons. 
diff --git a/gettext-tools/src/x-javascript.c b/gettext-tools/src/x-javascript.c index 7ac5b2472..020ba988d 100644 --- a/gettext-tools/src/x-javascript.c +++ b/gettext-tools/src/x-javascript.c @@ -1719,7 +1719,9 @@ extract_javascript (FILE *f, xml_element_depth = 0; inside_embedded_js_in_xml = false; - xgettext_current_file_source_encoding = xgettext_global_source_encoding; + xgettext_current_file_source_encoding = + (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding : + po_charset_ascii); #if HAVE_ICONV xgettext_current_file_source_iconv = xgettext_global_source_iconv; #endif diff --git a/gettext-tools/src/x-python.c b/gettext-tools/src/x-python.c index f6116f551..94e9930d6 100644 --- a/gettext-tools/src/x-python.c +++ b/gettext-tools/src/x-python.c @@ -321,13 +321,7 @@ as specified in https://www.python.org/peps/pep-0263.html.\n"))); if (errno == EILSEQ) { /* An invalid multibyte sequence was encountered. */ - multiline_error (xstrdup (""), - xasprintf (_("\ -%s:%d: Invalid multibyte sequence.\n\ -Please specify the correct source encoding through --from-code or through a\n\ -comment as specified in https://www.python.org/peps/pep-0263.html.\n"), - real_file_name, line_number)); - exit (EXIT_FAILURE); + goto invalid; } else if (errno == EINVAL) { @@ -350,25 +344,9 @@ comment as specified in https://www.python.org/peps/pep-0263.html.\n"), /* Read one more byte and retry iconv. 
*/ c = phase1_getc (); if (c == EOF) - { - multiline_error (xstrdup (""), - xasprintf (_("\ -%s:%d: Incomplete multibyte sequence at end of file.\n\ -Please specify the correct source encoding through --from-code or through a\n\ -comment as specified in https://www.python.org/peps/pep-0263.html.\n"), - real_file_name, line_number)); - exit (EXIT_FAILURE); - } + goto incomplete_at_eof; if (c == '\n') - { - multiline_error (xstrdup (""), - xasprintf (_("\ -%s:%d: Incomplete multibyte sequence at end of line.\n\ -Please specify the correct source encoding through --from-code or through a\n\ -comment as specified in https://www.python.org/peps/pep-0263.html.\n"), - real_file_name, line_number - 1)); - exit (EXIT_FAILURE); - } + goto incomplete_at_eol; buf[bufcount++] = (unsigned char) c; } else @@ -394,13 +372,7 @@ comment as specified in https://www.python.org/peps/pep-0263.html.\n"), { /* scratchbuf contains an out-of-range Unicode character (> 0x10ffff). */ - multiline_error (xstrdup (""), - xasprintf (_("\ -%s:%d: Invalid multibyte sequence.\n\ -Please specify the source encoding through --from-code or through a comment\n\ -as specified in https://www.python.org/peps/pep-0263.html.\n"), - real_file_name, line_number)); - exit (EXIT_FAILURE); + goto invalid; } return uc; } @@ -414,76 +386,129 @@ as specified in https://www.python.org/peps/pep-0263.html.\n"), } else { - /* Read an UTF-8 encoded character. */ - unsigned char buf[6]; - unsigned int count; + /* Read an UTF-8 encoded character. + Reject invalid input, like u8_mbtouc does. 
*/ int c; ucs4_t uc; c = phase1_getc (); if (c == EOF) return UEOF; - buf[0] = c; - count = 1; - - if (buf[0] >= 0xc0) - { - c = phase1_getc (); - if (c == EOF) - return UEOF; - buf[1] = c; - count = 2; - } - - if (buf[0] >= 0xe0 - && ((buf[1] ^ 0x80) < 0x40)) + if (c < 0x80) { - c = phase1_getc (); - if (c == EOF) - return UEOF; - buf[2] = c; - count = 3; + uc = c; } - - if (buf[0] >= 0xf0 - && ((buf[1] ^ 0x80) < 0x40) - && ((buf[2] ^ 0x80) < 0x40)) + else if (c < 0xc2) + goto invalid; + else if (c < 0xe0) { - c = phase1_getc (); - if (c == EOF) - return UEOF; - buf[3] = c; - count = 4; + int c1 = phase1_getc (); + if (c1 == EOF) + goto incomplete_at_eof; + if (c1 == '\n') + goto incomplete_at_eol; + if ((c1 ^ 0x80) < 0x40) + uc = ((unsigned int) (c & 0x1f) << 6) + | (unsigned int) (c1 ^ 0x80); + else + goto invalid; } - - if (buf[0] >= 0xf8 - && ((buf[1] ^ 0x80) < 0x40) - && ((buf[2] ^ 0x80) < 0x40) - && ((buf[3] ^ 0x80) < 0x40)) + else if (c < 0xf0) { - c = phase1_getc (); - if (c == EOF) - return UEOF; - buf[4] = c; - count = 5; + int c1 = phase1_getc (); + if (c1 == EOF) + goto incomplete_at_eof; + if (c1 == '\n') + goto incomplete_at_eol; + if ((c1 ^ 0x80) < 0x40 + && (c >= 0xe1 || c1 >= 0xa0) + && (c != 0xed || c1 < 0xa0)) + { + int c2 = phase1_getc (); + if (c2 == EOF) + goto incomplete_at_eof; + if (c2 == '\n') + goto incomplete_at_eol; + if ((c2 ^ 0x80) < 0x40) + uc = ((unsigned int) (c & 0x0f) << 12) + | ((unsigned int) (c1 ^ 0x80) << 6) + | (unsigned int) (c2 ^ 0x80); + else + goto invalid; + } + else + goto invalid; } - - if (buf[0] >= 0xfc - && ((buf[1] ^ 0x80) < 0x40) - && ((buf[2] ^ 0x80) < 0x40) - && ((buf[3] ^ 0x80) < 0x40) - && ((buf[4] ^ 0x80) < 0x40)) + else if (c < 0xf8) { - c = phase1_getc (); - if (c == EOF) - return UEOF; - buf[5] = c; - count = 6; + int c1 = phase1_getc (); + if (c1 == EOF) + goto incomplete_at_eof; + if (c1 == '\n') + goto incomplete_at_eol; + if ((c1 ^ 0x80) < 0x40 + && (c >= 0xf1 || c1 >= 0x90) + && (c < 0xf4 || (c == 
0xf4 && c1 < 0x90))) + { + int c2 = phase1_getc (); + if (c2 == EOF) + goto incomplete_at_eof; + if (c2 == '\n') + goto incomplete_at_eol; + if ((c2 ^ 0x80) < 0x40) + { + int c3 = phase1_getc (); + if (c3 == EOF) + goto incomplete_at_eof; + if (c3 == '\n') + goto incomplete_at_eol; + if ((c3 ^ 0x80) < 0x40) + uc = ((unsigned int) (c & 0x07) << 18) + | ((unsigned int) (c1 ^ 0x80) << 12) + | ((unsigned int) (c2 ^ 0x80) << 6) + | (unsigned int) (c3 ^ 0x80); + else + goto invalid; + } + else + goto invalid; + } + else + goto invalid; } + else + goto invalid; - u8_mbtouc (&uc, buf, count); return uc; } + + invalid: + /* An invalid multibyte sequence was encountered. */ + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Invalid multibyte sequence.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in https://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + + incomplete_at_eof: + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Incomplete multibyte sequence at end of file.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in https://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number)); + exit (EXIT_FAILURE); + + incomplete_at_eol: + multiline_error (xstrdup (""), + xasprintf (_("\ +%s:%d: Incomplete multibyte sequence at end of line.\n\ +Please specify the correct source encoding through --from-code or through a\n\ +comment as specified in https://www.python.org/peps/pep-0263.html.\n"), + real_file_name, line_number - 1)); + exit (EXIT_FAILURE); } /* Supports max (9, UNINAME_MAX + 3) pushback characters. */ @@ -603,7 +628,7 @@ set_current_file_source_encoding (const char *canon_encoding) #else error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). 
This version was built without iconv()."), - xgettext_global_source_encoding, po_charset_utf8, + xgettext_current_file_source_encoding, po_charset_utf8, basename (program_name)); #endif } @@ -1672,7 +1697,11 @@ extract_python (FILE *f, last_comment_line = -1; last_non_comment_line = -1; - xgettext_current_file_source_encoding = xgettext_global_source_encoding; + /* For Python, the default source file encoding is UTF-8. This is specified + in PEP 3120. */ + xgettext_current_file_source_encoding = + (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding : + po_charset_utf8); #if HAVE_ICONV xgettext_current_file_source_iconv = xgettext_global_source_iconv; #endif diff --git a/gettext-tools/src/xg-encoding.c b/gettext-tools/src/xg-encoding.c index 63f30eeeb..11793368e 100644 --- a/gettext-tools/src/xg-encoding.c +++ b/gettext-tools/src/xg-encoding.c @@ -35,7 +35,9 @@ #define _(str) gettext (str) -/* Canonicalized encoding name for all input files. */ +/* Canonicalized encoding name for all input files. + It can be NULL when the --from-code option has not been specified. In this + case, the default (ASCII or UTF-8) depends on the programming language. */ const char *xgettext_global_source_encoding; #if HAVE_ICONV diff --git a/gettext-tools/src/xg-encoding.h b/gettext-tools/src/xg-encoding.h index b2b571cb1..9ef2da76f 100644 --- a/gettext-tools/src/xg-encoding.h +++ b/gettext-tools/src/xg-encoding.h @@ -1,5 +1,5 @@ /* Keeping track of the encoding of strings to be extracted. - Copyright (C) 2001-2018 Free Software Foundation, Inc. + Copyright (C) 2001-2019 Free Software Foundation, Inc. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -49,7 +49,9 @@ extern char *non_ascii_error_message (lexical_context_ty lcontext, size_t line_number); -/* Canonicalized encoding name for all input files. */ +/* Canonicalized encoding name for all input files. 
+ It can be NULL when the --from-code option has not been specified. In this + case, the default (ASCII or UTF-8) depends on the programming language. */ extern const char *xgettext_global_source_encoding; #if HAVE_ICONV diff --git a/gettext-tools/src/xgettext.c b/gettext-tools/src/xgettext.c index a6bceddcf..16a7538cb 100644 --- a/gettext-tools/src/xgettext.c +++ b/gettext-tools/src/xgettext.c @@ -347,7 +347,7 @@ main (int argc, char *argv[]) /* Set initial value of variables. */ default_domain = MESSAGE_DOMAIN_DEFAULT; - xgettext_global_source_encoding = po_charset_ascii; + xgettext_global_source_encoding = NULL; init_flag_table_c (); init_flag_table_objc (); init_flag_table_gcc_internal (); @@ -768,7 +768,8 @@ xgettext cannot work without keywords to look for")); /* Allocate converter from xgettext_global_source_encoding to UTF-8 (except from ASCII or UTF-8, when this conversion is a no-op). */ - if (xgettext_global_source_encoding != po_charset_ascii + if (xgettext_global_source_encoding != NULL + && xgettext_global_source_encoding != po_charset_ascii && xgettext_global_source_encoding != po_charset_utf8) { #if HAVE_ICONV @@ -965,7 +966,8 @@ xgettext cannot work without keywords to look for")); /* Free the allocated converter. */ #if HAVE_ICONV - if (xgettext_global_source_encoding != po_charset_ascii + if (xgettext_global_source_encoding != NULL + && xgettext_global_source_encoding != po_charset_ascii && xgettext_global_source_encoding != po_charset_utf8) iconv_close (xgettext_global_source_iconv); #endif @@ -1764,7 +1766,9 @@ extract_from_file (const char *file_name, extractor_ty extractor, /* Set the default for the source file encoding. May be overridden by the extractor function. */ - xgettext_current_source_encoding = xgettext_global_source_encoding; + xgettext_current_source_encoding = + (xgettext_global_source_encoding != NULL ? 
xgettext_global_source_encoding : + po_charset_ascii); #if HAVE_ICONV xgettext_current_source_iconv = xgettext_global_source_iconv; #endif diff --git a/gettext-tools/tests/xgettext-python-3 b/gettext-tools/tests/xgettext-python-3 index ca0926dc8..fa19f1d90 100755 --- a/gettext-tools/tests/xgettext-python-3 +++ b/gettext-tools/tests/xgettext-python-3 @@ -19,6 +19,12 @@ cat <<\EOF > xg-py-3b.py print gettext.gettext("ÆüËܸì"); EOF +cat <<\EOF > xg-py-3u.py +#!/usr/bin/env python +# TRANSLATORS: François Pinard is a hero. +print gettext.gettext("日本語"); +EOF + cat <<\EOF > xg-py-3.ok # SOME DESCRIPTIVE TITLE. # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER @@ -68,6 +74,17 @@ cat xg-py-3b.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-py-3b. ${DIFF} xg-py-3.ok xg-py-3b.pot || Exit 1 +# Verify that if the source file has no magic coding comment but is UTF-8 +# encoded, xgettext succeeds. (PEP 3120) + +${XGETTEXT} --add-comments=TRANSLATORS: --no-location \ + -o xg-py-3u.tmp xg-py-3u.py || Exit 1 +# Don't simplify this to "grep ... < xg-py-3u.tmp", otherwise OpenBSD 4.0 grep +# only outputs "Binary file (standard input) matches". +cat xg-py-3u.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-py-3u.pot + +${DIFF} xg-py-3.ok xg-py-3u.pot || Exit 1 + # Verify that if the source file has a magic coding comment and a --from-code # option is given, the magic coding comment takes precedence over it.