xgettext: Assume that Python source files are in UTF-8 by default.

author Bruno Haible <bruno@clisp.org>
Sun, 25 Aug 2019 21:37:44 +0000 (23:37 +0200)
committer Bruno Haible <bruno@clisp.org>
Mon, 13 Apr 2020 11:13:15 +0000 (13:13 +0200)
diff --git a/NEWS b/NEWS

index 32050e96f11792040ff1eac90fcfb4cf62a9ee21..311225c00a8a6ea69d301e6d21f6708934728457 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -8,6 +8,9 @@ Version 0.20.2 - April 2020
      o xgettext now recognizes 'gettext' program invocations with the '-e'
        option, such as
          gettext -e 'some\nstring\n'
+  - Python:
+    xgettext now assumes a Python source file is in UTF-8 encoding by default,
+    as stated in PEP 3120.
    - Desktop Entry:
      The value of the 'Icon' property is no longer extracted into the POT file
      by xgettext.  The documentation explains how to localize icons.
diff --git a/gettext-tools/src/x-javascript.c b/gettext-tools/src/x-javascript.c

index 7ac5b24725f479e6d33d18e7cc2db5a189ebe844..020ba988d9b79404271aa7056245ce3bf656a50f 100644 (file)
--- a/gettext-tools/src/x-javascript.c
+++ b/gettext-tools/src/x-javascript.c
@@ -1719,7 +1719,9 @@ extract_javascript (FILE *f,
    xml_element_depth = 0;
    inside_embedded_js_in_xml = false;
  
-  xgettext_current_file_source_encoding = xgettext_global_source_encoding;
+  xgettext_current_file_source_encoding =
+    (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
+     po_charset_ascii);
  #if HAVE_ICONV
    xgettext_current_file_source_iconv = xgettext_global_source_iconv;
  #endif
diff --git a/gettext-tools/src/x-python.c b/gettext-tools/src/x-python.c

index f6116f5513771d00b29fde4e2a4f2826381118b0..94e9930d6d60261a65424d033e93c4de01e03d6d 100644 (file)
--- a/gettext-tools/src/x-python.c
+++ b/gettext-tools/src/x-python.c
@@ -321,13 +321,7 @@ as specified in https://www.python.org/peps/pep-0263.html.\n")));
                if (errno == EILSEQ)
                  {
                    /* An invalid multibyte sequence was encountered.  */
-                  multiline_error (xstrdup (""),
-                                   xasprintf (_("\
-%s:%d: Invalid multibyte sequence.\n\
-Please specify the correct source encoding through --from-code or through a\n\
-comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
-                                   real_file_name, line_number));
-                  exit (EXIT_FAILURE);
+                  goto invalid;
                  }
                else if (errno == EINVAL)
                  {
@@ -350,25 +344,9 @@ comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                    /* Read one more byte and retry iconv.  */
                    c = phase1_getc ();
                    if (c == EOF)
-                    {
-                      multiline_error (xstrdup (""),
-                                       xasprintf (_("\
-%s:%d: Incomplete multibyte sequence at end of file.\n\
-Please specify the correct source encoding through --from-code or through a\n\
-comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
-                                       real_file_name, line_number));
-                      exit (EXIT_FAILURE);
-                    }
+                    goto incomplete_at_eof;
                    if (c == '\n')
-                    {
-                      multiline_error (xstrdup (""),
-                                       xasprintf (_("\
-%s:%d: Incomplete multibyte sequence at end of line.\n\
-Please specify the correct source encoding through --from-code or through a\n\
-comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
-                                       real_file_name, line_number - 1));
-                      exit (EXIT_FAILURE);
-                    }
+                    goto incomplete_at_eol;
                    buf[bufcount++] = (unsigned char) c;
                  }
                else
@@ -394,13 +372,7 @@ comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                  {
                    /* scratchbuf contains an out-of-range Unicode character
                       (> 0x10ffff).  */
-                  multiline_error (xstrdup (""),
-                                   xasprintf (_("\
-%s:%d: Invalid multibyte sequence.\n\
-Please specify the source encoding through --from-code or through a comment\n\
-as specified in https://www.python.org/peps/pep-0263.html.\n"),
-                                   real_file_name, line_number));
-                  exit (EXIT_FAILURE);
+                  goto invalid;
                  }
                return uc;
              }
@@ -414,76 +386,129 @@ as specified in https://www.python.org/peps/pep-0263.html.\n"),
      }
    else
      {
-      /* Read an UTF-8 encoded character.  */
-      unsigned char buf[6];
-      unsigned int count;
+      /* Read an UTF-8 encoded character.
+         Reject invalid input, like u8_mbtouc does.  */
        int c;
        ucs4_t uc;
  
        c = phase1_getc ();
        if (c == EOF)
          return UEOF;
-      buf[0] = c;
-      count = 1;
-
-      if (buf[0] >= 0xc0)
-        {
-          c = phase1_getc ();
-          if (c == EOF)
-            return UEOF;
-          buf[1] = c;
-          count = 2;
-        }
-
-      if (buf[0] >= 0xe0
-          && ((buf[1] ^ 0x80) < 0x40))
+      if (c < 0x80)
          {
-          c = phase1_getc ();
-          if (c == EOF)
-            return UEOF;
-          buf[2] = c;
-          count = 3;
+          uc = c;
          }
-
-      if (buf[0] >= 0xf0
-          && ((buf[1] ^ 0x80) < 0x40)
-          && ((buf[2] ^ 0x80) < 0x40))
+      else if (c < 0xc2)
+        goto invalid;
+      else if (c < 0xe0)
          {
-          c = phase1_getc ();
-          if (c == EOF)
-            return UEOF;
-          buf[3] = c;
-          count = 4;
+          int c1 = phase1_getc ();
+          if (c1 == EOF)
+            goto incomplete_at_eof;
+          if (c1 == '\n')
+            goto incomplete_at_eol;
+          if ((c1 ^ 0x80) < 0x40)
+            uc = ((unsigned int) (c & 0x1f) << 6)
+                 | (unsigned int) (c1 ^ 0x80);
+          else
+            goto invalid;
          }
-
-      if (buf[0] >= 0xf8
-          && ((buf[1] ^ 0x80) < 0x40)
-          && ((buf[2] ^ 0x80) < 0x40)
-          && ((buf[3] ^ 0x80) < 0x40))
+      else if (c < 0xf0)
          {
-          c = phase1_getc ();
-          if (c == EOF)
-            return UEOF;
-          buf[4] = c;
-          count = 5;
+          int c1 = phase1_getc ();
+          if (c1 == EOF)
+            goto incomplete_at_eof;
+          if (c1 == '\n')
+            goto incomplete_at_eol;
+          if ((c1 ^ 0x80) < 0x40
+              && (c >= 0xe1 || c1 >= 0xa0)
+              && (c != 0xed || c1 < 0xa0))
+            {
+              int c2 = phase1_getc ();
+              if (c2 == EOF)
+                goto incomplete_at_eof;
+              if (c2 == '\n')
+                goto incomplete_at_eol;
+              if ((c2 ^ 0x80) < 0x40)
+                uc = ((unsigned int) (c & 0x0f) << 12)
+                     | ((unsigned int) (c1 ^ 0x80) << 6)
+                     | (unsigned int) (c2 ^ 0x80);
+              else
+                goto invalid;
+            }
+          else
+            goto invalid;
          }
-
-      if (buf[0] >= 0xfc
-          && ((buf[1] ^ 0x80) < 0x40)
-          && ((buf[2] ^ 0x80) < 0x40)
-          && ((buf[3] ^ 0x80) < 0x40)
-          && ((buf[4] ^ 0x80) < 0x40))
+      else if (c < 0xf8)
          {
-          c = phase1_getc ();
-          if (c == EOF)
-            return UEOF;
-          buf[5] = c;
-          count = 6;
+          int c1 = phase1_getc ();
+          if (c1 == EOF)
+            goto incomplete_at_eof;
+          if (c1 == '\n')
+            goto incomplete_at_eol;
+          if ((c1 ^ 0x80) < 0x40
+              && (c >= 0xf1 || c1 >= 0x90)
+              && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
+            {
+              int c2 = phase1_getc ();
+              if (c2 == EOF)
+                goto incomplete_at_eof;
+              if (c2 == '\n')
+                goto incomplete_at_eol;
+              if ((c2 ^ 0x80) < 0x40)
+                {
+                  int c3 = phase1_getc ();
+                  if (c3 == EOF)
+                    goto incomplete_at_eof;
+                  if (c3 == '\n')
+                    goto incomplete_at_eol;
+                  if ((c3 ^ 0x80) < 0x40)
+                    uc = ((unsigned int) (c & 0x07) << 18)
+                         | ((unsigned int) (c1 ^ 0x80) << 12)
+                         | ((unsigned int) (c2 ^ 0x80) << 6)
+                         | (unsigned int) (c3 ^ 0x80);
+                  else
+                    goto invalid;
+                }
+              else
+                goto invalid;
+            }
+          else
+            goto invalid;
          }
+      else
+        goto invalid;
  
-      u8_mbtouc (&uc, buf, count);
        return uc;
      }
+
+ invalid:
+  /* An invalid multibyte sequence was encountered.  */
+  multiline_error (xstrdup (""),
+                   xasprintf (_("\
+%s:%d: Invalid multibyte sequence.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
+                   real_file_name, line_number));
+  exit (EXIT_FAILURE);
+
+ incomplete_at_eof:
+  multiline_error (xstrdup (""),
+                   xasprintf (_("\
+%s:%d: Incomplete multibyte sequence at end of file.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
+                   real_file_name, line_number));
+  exit (EXIT_FAILURE);
+
+ incomplete_at_eol:
+  multiline_error (xstrdup (""),
+                   xasprintf (_("\
+%s:%d: Incomplete multibyte sequence at end of line.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
+                   real_file_name, line_number - 1));
+  exit (EXIT_FAILURE);
  }
  
  /* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
@@ -603,7 +628,7 @@ set_current_file_source_encoding (const char *canon_encoding)
  #else
        error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
                       _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). This version was built without iconv()."),
-                     xgettext_global_source_encoding, po_charset_utf8,
+                     xgettext_current_file_source_encoding, po_charset_utf8,
                       basename (program_name));
  #endif
      }
@@ -1672,7 +1697,11 @@ extract_python (FILE *f,
    last_comment_line = -1;
    last_non_comment_line = -1;
  
-  xgettext_current_file_source_encoding = xgettext_global_source_encoding;
+  /* For Python, the default source file encoding is UTF-8.  This is specified
+     in PEP 3120.  */
+  xgettext_current_file_source_encoding =
+   (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
+    po_charset_utf8);
  #if HAVE_ICONV
    xgettext_current_file_source_iconv = xgettext_global_source_iconv;
  #endif
diff --git a/gettext-tools/src/xg-encoding.c b/gettext-tools/src/xg-encoding.c

index 63f30eeebc37cca7996476adbb1aa06d0ffec576..11793368e3f0e92cf3428dea35ace530fd26b171 100644 (file)
--- a/gettext-tools/src/xg-encoding.c
+++ b/gettext-tools/src/xg-encoding.c
@@ -35,7 +35,9 @@
  #define _(str) gettext (str)
  
  
-/* Canonicalized encoding name for all input files.  */
+/* Canonicalized encoding name for all input files.
+   It can be NULL when the --from-code option has not been specified.  In this
+   case, the default (ASCII or UTF-8) depends on the programming language.  */
  const char *xgettext_global_source_encoding;
  
  #if HAVE_ICONV
diff --git a/gettext-tools/src/xg-encoding.h b/gettext-tools/src/xg-encoding.h

index b2b571cb1e52e1fc9a97ebba35241e576c78eb5f..9ef2da76f4dd4b0a0da34d50f6b54c6b3ffa7b0e 100644 (file)
--- a/gettext-tools/src/xg-encoding.h
+++ b/gettext-tools/src/xg-encoding.h
@@ -1,5 +1,5 @@
  /* Keeping track of the encoding of strings to be extracted.
-   Copyright (C) 2001-2018 Free Software Foundation, Inc.
+   Copyright (C) 2001-2019 Free Software Foundation, Inc.
  
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -49,7 +49,9 @@ extern char *non_ascii_error_message (lexical_context_ty lcontext,
                                        size_t line_number);
  
  
-/* Canonicalized encoding name for all input files.  */
+/* Canonicalized encoding name for all input files.
+   It can be NULL when the --from-code option has not been specified.  In this
+   case, the default (ASCII or UTF-8) depends on the programming language.  */
  extern const char *xgettext_global_source_encoding;
  
  #if HAVE_ICONV
diff --git a/gettext-tools/src/xgettext.c b/gettext-tools/src/xgettext.c

index a6bceddcf32706a022e009c7a3af1c8909da9ab6..16a7538cb81e82f27c0a711d7cadc391cd3ea553 100644 (file)
--- a/gettext-tools/src/xgettext.c
+++ b/gettext-tools/src/xgettext.c
@@ -347,7 +347,7 @@ main (int argc, char *argv[])
  
    /* Set initial value of variables.  */
    default_domain = MESSAGE_DOMAIN_DEFAULT;
-  xgettext_global_source_encoding = po_charset_ascii;
+  xgettext_global_source_encoding = NULL;
    init_flag_table_c ();
    init_flag_table_objc ();
    init_flag_table_gcc_internal ();
@@ -768,7 +768,8 @@ xgettext cannot work without keywords to look for"));
  
    /* Allocate converter from xgettext_global_source_encoding to UTF-8 (except
       from ASCII or UTF-8, when this conversion is a no-op).  */
-  if (xgettext_global_source_encoding != po_charset_ascii
+  if (xgettext_global_source_encoding != NULL
+      && xgettext_global_source_encoding != po_charset_ascii
        && xgettext_global_source_encoding != po_charset_utf8)
      {
  #if HAVE_ICONV
@@ -965,7 +966,8 @@ xgettext cannot work without keywords to look for"));
  
    /* Free the allocated converter.  */
  #if HAVE_ICONV
-  if (xgettext_global_source_encoding != po_charset_ascii
+  if (xgettext_global_source_encoding != NULL
+      && xgettext_global_source_encoding != po_charset_ascii
        && xgettext_global_source_encoding != po_charset_utf8)
      iconv_close (xgettext_global_source_iconv);
  #endif
@@ -1764,7 +1766,9 @@ extract_from_file (const char *file_name, extractor_ty extractor,
  
    /* Set the default for the source file encoding.  May be overridden by
       the extractor function.  */
-  xgettext_current_source_encoding = xgettext_global_source_encoding;
+  xgettext_current_source_encoding =
+    (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
+     po_charset_ascii);
  #if HAVE_ICONV
    xgettext_current_source_iconv = xgettext_global_source_iconv;
  #endif
diff --git a/gettext-tools/tests/xgettext-python-3 b/gettext-tools/tests/xgettext-python-3

index ca0926dc822817754c7e4a0bd4d595e9b65847c3..fa19f1d9085c7c292c1bd832addf22a73670f22d 100755 (executable)
--- a/gettext-tools/tests/xgettext-python-3
+++ b/gettext-tools/tests/xgettext-python-3
@@ -19,6 +19,12 @@ cat <<\EOF > xg-py-3b.py
  print gettext.gettext("ÆüËÜ¸ì");
  EOF
  
+cat <<\EOF > xg-py-3u.py
+#!/usr/bin/env python
+# TRANSLATORS: François Pinard is a hero.
+print gettext.gettext("日本語");
+EOF
+
  cat <<\EOF > xg-py-3.ok
  # SOME DESCRIPTIVE TITLE.
  # Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
@@ -68,6 +74,17 @@ cat xg-py-3b.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-py-3b.
  
  ${DIFF} xg-py-3.ok xg-py-3b.pot || Exit 1
  
+# Verify that if the source file has no magic coding comment but is UTF-8
+# encoded, xgettext succeeds. (PEP 3120)
+
+${XGETTEXT} --add-comments=TRANSLATORS: --no-location \
+  -o xg-py-3u.tmp xg-py-3u.py || Exit 1
+# Don't simplify this to "grep ... < xg-py-3u.tmp", otherwise OpenBSD 4.0 grep
+# only outputs "Binary file (standard input) matches".
+cat xg-py-3u.tmp | grep -v 'POT-Creation-Date' | LC_ALL=C tr -d '\r' > xg-py-3u.pot
+
+${DIFF} xg-py-3.ok xg-py-3u.pot || Exit 1
+
  # Verify that if the source file has a magic coding comment and a --from-code
  # option is given, the magic coding comment takes precedence over it.
author	Bruno Haible <bruno@clisp.org>
	Sun, 25 Aug 2019 21:37:44 +0000 (23:37 +0200)
committer	Bruno Haible <bruno@clisp.org>
	Mon, 13 Apr 2020 11:13:15 +0000 (13:13 +0200)
NEWS		patch | blob | blame | history
gettext-tools/src/x-javascript.c		patch | blob | blame | history
gettext-tools/src/x-python.c		patch | blob | blame | history
gettext-tools/src/xg-encoding.c		patch | blob | blame | history
gettext-tools/src/xg-encoding.h		patch | blob | blame | history
gettext-tools/src/xgettext.c		patch | blob | blame | history
gettext-tools/tests/xgettext-python-3		patch | blob | blame | history