Support for Python source encodings (PEP 0263).

author Bruno Haible <bruno@clisp.org>

Mon, 26 Sep 2005 09:21:16 +0000 (09:21 +0000)

committer Bruno Haible <bruno@clisp.org>

Tue, 23 Jun 2009 10:12:51 +0000 (12:12 +0200)
author Bruno Haible <bruno@clisp.org>
Mon, 26 Sep 2005 09:21:16 +0000 (09:21 +0000)
committer Bruno Haible <bruno@clisp.org>
Tue, 23 Jun 2009 10:12:51 +0000 (12:12 +0200)
diff --git a/NEWS b/NEWS

index 440e998723b92dae358d3909752810b97f1faee1..c5602db8d5733d13878d4e5103fe5cb139c94387 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,11 @@
  
+* Programming languages support:
+
+  - Python:
+    xgettext now recognizes the source encoding from a "coding:" comment
+    among the first two lines.  The default encoding is now ASCII, no longer
+    ISO-8859-1.
+
  * libgettextpo library:
    - The error handler type passed to po_file_read(), po_file_write(),
      po_message_check_format() has changed.
diff --git a/gettext-tools/doc/ChangeLog b/gettext-tools/doc/ChangeLog

index db6879f1a0f6de86c7fa6687cd007e56d2adb429..8bcff820a82f0870b3be8be5b9e356373632a0af 100644 (file)
--- a/gettext-tools/doc/ChangeLog
+++ b/gettext-tools/doc/ChangeLog
@@ -1,3 +1,8 @@
+2005-09-25  Bruno Haible  <bruno@clisp.org>
+
+       * xgettext.texi (--from-code): Python input is no longer always in
+       ISO-8859-1.
+
  2005-09-20  Bruno Haible  <bruno@clisp.org>
  
         * xgettext.texi (--from-code): Python input is always assumed to be
diff --git a/gettext-tools/doc/xgettext.texi b/gettext-tools/doc/xgettext.texi

index 9e6a1371ef0c7e0381aa5af3e4d697de355fe248..61ad09d502d290840539250ff85a55452b4cbcda 100644 (file)
--- a/gettext-tools/doc/xgettext.texi
+++ b/gettext-tools/doc/xgettext.texi
@@ -93,9 +93,8 @@ extension.
  @opindex --from-code@r{, @code{xgettext} option}
  Specifies the encoding of the input files.  This option is needed only
  if some untranslated message strings or their corresponding comments
-contain non-ASCII characters.  Note that Python input files are always
-assumed to be in ISO-8859-1, regardless of this option.  And Tcl and Glade
-input files are always assumed to be in UTF-8, regardless of this option.
+contain non-ASCII characters.  Note that Tcl and Glade input files are
+always assumed to be in UTF-8, regardless of this option.
  
  @end table
  
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog

index bc859f6cb6812e0ec40ac7cd8575361004f64408..eb744dfda3122f10afde5c38a47d275880f80432 100644 (file)
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,48 @@
+2005-09-25  Bruno Haible  <bruno@clisp.org>
+
+       Support for Python source encodings (PEP 0263).
+       * x-python.c: Include progname.h, basename.h, xerror.h, strstr.h,
+       c-ctype.h, utf8-ucs4.h.
+       (phase1_pushback): Reduce size.
+       (UEOF): New macro.
+       (phase2_pushback, phase2_pushback_length): New variables.
+       (phase2_getc, phase2_ungetc): New functions.
+       (struct unicode_string_buffer): New structure type.
+       (init_unicode_string_buffer, unicode_string_buffer_append_unicode_grow,
+       unicode_string_buffer_append_unicode, unicode_string_buffer_result,
+       free_unicode_string_buffer): New functions.
+       (comment_buffer): New variable.
+       (buffer, bufmax, buflen): Remove variables.
+       (comment_start, comment_add, comment_line_end): Rewritten.
+       (comment_at_start): New function.
+       (xgettext_current_file_source_encoding): New variable.
+       (xgettext_current_file_source_iconv): New variable.
+       (set_current_file_source_encoding, try_to_extract_coding): New
+       functions.
+       (continuation_or_nonblank_line): New variable.
+       (phase3_getc): Renamed from phase2_getc. Use phase2_getc instead of
+       phase1_getc. Return a Unicode character. Call try_to_extract_coding
+       when seeing a comment among the first two lines.
+       (phase3_ungetc): Renamed from phase2_ungetc. Use phase2_ungetc instead
+       of phase1_ungetc.
+       (UNICODE, IS_UNICODE, UNICODE_VALUE): New macros.
+       (struct mixed_string_buffer): New structure type.
+       (init_mixed_string_buffer, mixed_string_buffer_append_byte,
+       mixed_string_buffer_append_unicode_grow,
+       mixed_string_buffer_append_unicode,
+       mixed_string_buffer_flush_utf16_surr,
+       mixed_string_buffer_flush_curr_buffer, mixed_string_buffer_append,
+       mixed_string_buffer_result, free_mixed_string_buffer): New functions.
+       (phase7_getuc): Use phase2_getc instead of phase1_getc. Return a
+       Unicode character except for \ooo and \xnn.
+       (phase5_get): Operate on the level of Unicode characters instead of
+       at the level of bytes. Use a mixed_string_buffer to accumulate a
+       string literal.
+       (extract_parenthesized): Set xgettext_current_source_encoding to UTF-8
+       while passing UTF-8 strings to the xgettext main code.
+       (extract_python): Initialize xgettext_current_file_source_encoding and
+       xgettext_current_source_encoding.
+
  2005-09-25  Bruno Haible  <bruno@clisp.org>
  
         * x-csharp.c (phase2_getc): Fix mis-use of iconv() when the source
diff --git a/gettext-tools/src/x-python.c b/gettext-tools/src/x-python.c

index 768a7c2442dc68557359af7e7548c92682235eae..21c686a6275bcbafb689bd3058f4d3d827b54e11 100644 (file)
--- a/gettext-tools/src/x-python.c
+++ b/gettext-tools/src/x-python.c
@@ -1,5 +1,5 @@
  /* xgettext Python backend.
-   Copyright (C) 2002-2003 Free Software Foundation, Inc.
+   Copyright (C) 2002-2003, 2005 Free Software Foundation, Inc.
  
     This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
  
@@ -33,11 +33,17 @@
  #include "x-python.h"
  #include "error.h"
  #include "error-progname.h"
+#include "progname.h"
+#include "basename.h"
+#include "xerror.h"
  #include "xalloc.h"
  #include "exit.h"
+#include "strstr.h"
+#include "c-ctype.h"
  #include "po-charset.h"
  #include "uniname.h"
  #include "utf16-ucs4.h"
+#include "utf8-ucs4.h"
  #include "ucs4-utf8.h"
  #include "gettext.h"
  
@@ -148,11 +154,14 @@ static int line_number;
  static FILE *fp;
  
  
-/* 1. line_number handling.  Also allow a lookahead.  */
+/* 1. line_number handling.  */
  
-static unsigned char phase1_pushback[max (9, UNINAME_MAX + 3)];
+/* Maximum used, roughly a safer MB_LEN_MAX.  */
+#define MAX_PHASE1_PUSHBACK 16
+static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
  static int phase1_pushback_length;
  
+/* Read the next single byte from the input file.  */
  static int
  phase1_getc ()
  {
@@ -174,12 +183,12 @@ phase1_getc ()
      }
  
    if (c == '\n')
-    line_number++;
+    ++line_number;
  
    return c;
  }
  
-/* Supports max (9, UNINAME_MAX + 3) characters of pushback.  */
+/* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
  static void
  phase1_ungetc (int c)
  {
@@ -195,110 +204,770 @@ phase1_ungetc (int c)
  }
  
  
-/* Accumulating comments.  */
+/* Phase 2: Conversion to Unicode.
+   This is done early because PEP 0263 specifies that conversion to Unicode
+   conceptually occurs before tokenization.  A test case where it matters
+   is with encodings like BIG5: when a double-byte character ending in 0x5C
+   is followed by '\' or 'u0021', the tokenizer must not treat the second
+   half of the double-byte character as a backslash.  */
  
-static char *buffer;
-static size_t bufmax;
-static size_t buflen;
+/* End-of-file indicator for functions returning a UCS-4 character.
+   The value is parenthesized so that the expansion stays a single operand
+   wherever the macro is used, like the definition of P7_EOF.  */
+#define UEOF (-1)
  
-static inline void
-comment_start ()
+static int phase2_pushback[max (9, UNINAME_MAX + 3)];
+static int phase2_pushback_length;
+
+/* Read the next Unicode UCS-4 character from the input file.  */
+static int
+phase2_getc ()
  {
-  buflen = 0;
+  if (phase2_pushback_length)
+    return phase2_pushback[--phase2_pushback_length];
+
+  if (xgettext_current_source_encoding == po_charset_ascii)
+    {
+      int c = phase1_getc ();
+      if (c == EOF)
+       return UEOF;
+      if (!c_isascii (c))
+       {
+         char buffer[21];
+         sprintf (buffer, ":%ld", (long) line_number);
+         multiline_error (xstrdup (""),
+                          xasprintf (_("\
+Non-ASCII string at %s%s.\n\
+Please specify the source encoding through --from-code or through a comment\n\
+as specified in http://www.python.org/peps/pep-0263.html.\n"),
+                          real_file_name, buffer));
+         exit (EXIT_FAILURE);
+       }
+      return c;
+    }
+  else if (xgettext_current_source_encoding != po_charset_utf8)
+    {
+#if HAVE_ICONV
+      /* Use iconv on an increasing number of bytes.  Read only as many bytes
+        through phase1_getc as needed.  This is needed to give reasonable
+        interactive behaviour when fp is connected to an interactive tty.  */
+      unsigned char buf[MAX_PHASE1_PUSHBACK];
+      size_t bufcount;
+      int c = phase1_getc ();
+      if (c == EOF)
+       return UEOF;
+      buf[0] = (unsigned char) c;
+      bufcount = 1;
+
+      for (;;)
+       {
+         unsigned char scratchbuf[6];
+         const char *inptr = (const char *) &buf[0];
+         size_t insize = bufcount;
+         char *outptr = (char *) &scratchbuf[0];
+         size_t outsize = sizeof (scratchbuf);
+
+         size_t res = iconv (xgettext_current_source_iconv,
+                             (ICONV_CONST char **) &inptr, &insize,
+                             &outptr, &outsize);
+         /* We expect that a character has been produced if and only if
+            some input bytes have been consumed.  */
+         if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
+           abort ();
+         if (outsize == sizeof (scratchbuf))
+           {
+             /* No character has been produced.  Must be an error.  */
+             if (res != (size_t)(-1))
+               abort ();
+
+             if (errno == EILSEQ)
+               {
+                 /* An invalid multibyte sequence was encountered.  */
+                 multiline_error (xstrdup (""),
+                                  xasprintf (_("\
+%s:%d: Invalid multibyte sequence.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
+                                  real_file_name, line_number));
+                 exit (EXIT_FAILURE);
+               }
+             else if (errno == EINVAL)
+               {
+                 /* An incomplete multibyte character.  */
+                 int c;
+
+                 if (bufcount == MAX_PHASE1_PUSHBACK)
+                   {
+                     /* An overlong incomplete multibyte sequence was
+                        encountered.  */
+                     multiline_error (xstrdup (""),
+                                      xasprintf (_("\
+%s:%d: Long incomplete multibyte sequence.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
+                                      real_file_name, line_number));
+                     exit (EXIT_FAILURE);
+                   }
+
+                 /* Read one more byte and retry iconv.  */
+                 c = phase1_getc ();
+                 if (c == EOF)
+                   {
+                     multiline_error (xstrdup (""),
+                                      xasprintf (_("\
+%s:%d: Incomplete multibyte sequence at end of file.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
+                                      real_file_name, line_number));
+                     exit (EXIT_FAILURE);
+                   }
+                 if (c == '\n')
+                   {
+                     multiline_error (xstrdup (""),
+                                      xasprintf (_("\
+%s:%d: Incomplete multibyte sequence at end of line.\n\
+Please specify the correct source encoding through --from-code or through a\n\
+comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
+                                      real_file_name, line_number - 1));
+                     exit (EXIT_FAILURE);
+                   }
+                 buf[bufcount++] = (unsigned char) c;
+               }
+             else
+               error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
+                      real_file_name, line_number);
+           }
+         else
+           {
+             size_t outbytes = sizeof (scratchbuf) - outsize;
+             size_t bytes = bufcount - insize;
+             unsigned int uc;
+
+             /* We expect that one character has been produced.  */
+             if (bytes == 0)
+               abort ();
+             if (outbytes == 0)
+               abort ();
+             /* Push back the unused bytes.  */
+             while (insize > 0)
+               phase1_ungetc (buf[--insize]);
+             /* Convert the character from UTF-8 to UCS-4.  */
+             if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
+               {
+                 /* scratchbuf contains an out-of-range Unicode character
+                    (> 0x10ffff).  */
+                 multiline_error (xstrdup (""),
+                                  xasprintf (_("\
+%s:%d: Invalid multibyte sequence.\n\
+Please specify the source encoding through --from-code or through a comment\n\
+as specified in http://www.python.org/peps/pep-0263.html.\n"),
+                                  real_file_name, line_number));
+                 exit (EXIT_FAILURE);
+               }
+             return uc;
+           }
+       }
+#else
+      /* If we don't have iconv(), the only supported values for
+        xgettext_global_source_encoding and thus also for
+        xgettext_current_source_encoding are ASCII and UTF-8.  */
+      abort ();
+#endif
+    }
+  else
+    {
+      /* Read an UTF-8 encoded character.  */
+      unsigned char buf[6];
+      unsigned int count;
+      int c;
+      unsigned int uc;
+
+      c = phase1_getc ();
+      if (c == EOF)
+       return UEOF;
+      buf[0] = c;
+      count = 1;
+
+      if (buf[0] >= 0xc0)
+       {
+         c = phase1_getc ();
+         if (c == EOF)
+           return UEOF;
+         buf[1] = c;
+         count = 2;
+       }
+
+      if (buf[0] >= 0xe0
+         && ((buf[1] ^ 0x80) < 0x40))
+       {
+         c = phase1_getc ();
+         if (c == EOF)
+           return UEOF;
+         buf[2] = c;
+         count = 3;
+       }
+
+      if (buf[0] >= 0xf0
+         && ((buf[1] ^ 0x80) < 0x40)
+         && ((buf[2] ^ 0x80) < 0x40))
+       {
+         c = phase1_getc ();
+         if (c == EOF)
+           return UEOF;
+         buf[3] = c;
+         count = 4;
+       }
+
+      if (buf[0] >= 0xf8
+         && ((buf[1] ^ 0x80) < 0x40)
+         && ((buf[2] ^ 0x80) < 0x40)
+         && ((buf[3] ^ 0x80) < 0x40))
+       {
+         c = phase1_getc ();
+         if (c == EOF)
+           return UEOF;
+         buf[4] = c;
+         count = 5;
+       }
+
+      if (buf[0] >= 0xfc
+         && ((buf[1] ^ 0x80) < 0x40)
+         && ((buf[2] ^ 0x80) < 0x40)
+         && ((buf[3] ^ 0x80) < 0x40)
+         && ((buf[4] ^ 0x80) < 0x40))
+       {
+         c = phase1_getc ();
+         if (c == EOF)
+           return UEOF;
+         buf[5] = c;
+         count = 6;
+       }
+
+      u8_mbtouc (&uc, buf, count);
+      return uc;
+    }
  }
  
-static inline void
-comment_add (int c)
+/* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
+static void
+phase2_ungetc (int c)
  {
-  /* We assume the program source is in ISO-8859-1 (for consistency with
-     Python's \ooo and \xnn syntax inside strings), but we produce a POT
-     file in UTF-8 encoding.  */
-  size_t len = ((unsigned char) c < 0x80 ? 1 : 2);
-  if (buflen + len > bufmax)
+  if (c != UEOF)
      {
-      bufmax = 2 * bufmax + 10;
-      buffer = xrealloc (buffer, bufmax);
+      if (phase2_pushback_length == SIZEOF (phase2_pushback))
+       abort ();
+      phase2_pushback[phase2_pushback_length++] = c;
      }
-  if ((unsigned char) c < 0x80)
-    buffer[buflen++] = c;
-  else
+}
+
+
+/* ========================= Accumulating strings.  ======================== */
+
+/* A string buffer type that allows appending Unicode characters.
+   Returns the entire string in UTF-8 encoding.  */
+
+struct unicode_string_buffer
+{
+  /* The part of the string that has already been converted to UTF-8.
+     utf8_buflen bytes are in use out of utf8_allocated bytes of storage;
+     the storage is obtained through xrealloc and is NULL after
+     init_unicode_string_buffer.  */
+  char *utf8_buffer;
+  size_t utf8_buflen;
+  size_t utf8_allocated;
+};
+
+/* Put BP into the empty state: no storage allocated, no bytes in use.  */
+static inline void
+init_unicode_string_buffer (struct unicode_string_buffer *bp)
+{
+  bp->utf8_allocated = 0;
+  bp->utf8_buflen = 0;
+  bp->utf8_buffer = NULL;
+}
+
+/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
+static inline void
+unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
+                                          size_t count)
+{
+  if (bp->utf8_buflen + count > bp->utf8_allocated)
      {
-      buffer[buflen++] = 0xc0 | ((unsigned char) c >> 6);
-      buffer[buflen++] = 0x80 | ((unsigned char) c & 0x3f);
+      size_t new_allocated = 2 * bp->utf8_allocated + 10;
+      if (new_allocated < bp->utf8_buflen + count)
+       new_allocated = bp->utf8_buflen + count;
+      bp->utf8_allocated = new_allocated;
+      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
      }
  }
  
+/* Auxiliary function: Encode the Unicode character UC as UTF-8 and append
+   the resulting bytes to bp->utf8.
+   uc must be < 0x110000; the program aborts if the encoding fails.  */
+static inline void
+unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
+                                     unsigned int uc)
+{
+  unsigned char encoded[6];
+  int nbytes;
+
+  nbytes = u8_uctomb (encoded, uc, 6);
+  if (nbytes < 0)
+    {
+      /* An out-of-range uc is a bug in the caller.  */
+      abort ();
+    }
+
+  /* Make room first, then copy the bytes behind the part in use.  */
+  unicode_string_buffer_append_unicode_grow (bp, nbytes);
+  memcpy (&bp->utf8_buffer[bp->utf8_buflen], encoded, nbytes);
+  bp->utf8_buflen += nbytes;
+}
+
+/* Return the accumulated UTF-8 text of BP as a NUL-terminated string.
+   The terminator is stored just past the bytes in use and is not counted
+   in bp->utf8_buflen.  The returned pointer is BP's own storage.  */
+static char *
+unicode_string_buffer_result (struct unicode_string_buffer *bp)
+{
+  char *text;
+
+  /* Reserve one byte for the terminator.  This may reallocate the buffer,
+     so fetch the pointer only afterwards.  */
+  unicode_string_buffer_append_unicode_grow (bp, 1);
+  text = bp->utf8_buffer;
+  text[bp->utf8_buflen] = '\0';
+  return text;
+}
+
+/* Free the memory pointed to by a 'struct unicode_string_buffer'.
+   Only the UTF-8 storage is released; the structure itself is not freed
+   and its fields are not reset.  */
+static inline void
+free_unicode_string_buffer (struct unicode_string_buffer *bp)
+{
+  free (bp->utf8_buffer);
+}
+
+
+/* ======================== Accumulating comments.  ======================== */
+
+
+/* Accumulating a single comment line.  */
+
+static struct unicode_string_buffer comment_buffer;
+
+/* Begin accumulating a new comment line: discard the text collected so far
+   by resetting the used length.  The storage already allocated for
+   comment_buffer is kept and reused.  */
+static inline void
+comment_start ()
+{
+  comment_buffer.utf8_buflen = 0;
+}
+
+/* Return true if nothing has been added to comment_buffer since the last
+   comment_start, i.e. the current comment line is still empty.  */
+static inline bool
+comment_at_start ()
+{
+  return (comment_buffer.utf8_buflen == 0);
+}
+
  static inline void
+comment_add (int c)
+{
+  unicode_string_buffer_append_unicode (&comment_buffer, c);
+}
+
+static inline const char *
  comment_line_end ()
  {
+  char *buffer = unicode_string_buffer_result (&comment_buffer);
+  size_t buflen = strlen (buffer);
+
    while (buflen >= 1
          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
      --buflen;
-  if (buflen >= bufmax)
-    {
-      bufmax = 2 * bufmax + 10;
-      buffer = xrealloc (buffer, bufmax);
-    }
    buffer[buflen] = '\0';
    savable_comment_add (buffer);
+  return buffer;
  }
  
+
  /* These are for tracking whether comments count as immediately before
     keyword.  */
  static int last_comment_line;
  static int last_non_comment_line;
  
  
-/* 2. Outside strings, replace backslash-newline with nothing and a comment
-      with nothing.  */
+/* ======================== Recognizing comments.  ======================== */
+
+
+/* Recognizing the "coding" comment.
+   As specified in PEP 0263, it takes the form
+     "coding" (":"|"=") {" " or tab}* {alphanumeric or "-" or "_" or "."}+
+   and is located in a comment in a line that
+     - is either the first or second line,
+     - is not a continuation line,
+     - contains no other tokens except this comment.  */
+
+/* Canonicalized encoding name for the current input file.  */
+static const char *xgettext_current_file_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
+   ASCII or UTF-8, when this conversion is a no-op).  */
+static iconv_t xgettext_current_file_source_iconv;
+#endif
+
+/* Make CANON_ENCODING the source encoding of the current input file.
+   CANON_ENCODING must be a canonicalized encoding name: it is compared
+   with po_charset_ascii and po_charset_utf8 by pointer.  For any other
+   encoding a converter to UTF-8 is opened with iconv_open; if that is not
+   possible, the problem is reported through error_at_line with status
+   EXIT_FAILURE.  Finally the per-file setting is copied into
+   xgettext_current_source_encoding (and, when iconv is available, into
+   xgettext_current_source_iconv).  */
+static inline void
+set_current_file_source_encoding (const char *canon_encoding)
+{
+  xgettext_current_file_source_encoding = canon_encoding;
+
+  if (xgettext_current_file_source_encoding != po_charset_ascii
+      && xgettext_current_file_source_encoding != po_charset_utf8)
+    {
+#if HAVE_ICONV
+      iconv_t cd;
+
+      /* Avoid glibc-2.1 bug with EUC-KR.  */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+      if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
+       cd = (iconv_t)(-1);
+      else
+# endif
+      cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
+      if (cd == (iconv_t)(-1))
+       error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
+and iconv() does not support this conversion."),
+              xgettext_current_file_source_encoding, po_charset_utf8,
+              basename (program_name));
+      xgettext_current_file_source_iconv = cd;
+#else
+      /* Name the encoding that was requested for this file, not the global
+        --from-code setting: it is the per-file encoding that cannot be
+        converted.  */
+      error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
+This version was built without iconv()."),
+            xgettext_current_file_source_encoding, po_charset_utf8,
+            basename (program_name));
+#endif
+    }
+
+  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
+#if HAVE_ICONV
+  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
+#endif
+}
+
+/* Look for a PEP 0263 encoding declaration in COMMENT, the text of a
+   comment line, and activate the encoding it names.
+   A declaration is the word "coding", immediately followed by ':' or '=',
+   optional blanks, and a non-empty name made of ASCII alphanumerics, '-',
+   '_' and '.'.  Every occurrence of "coding" is tried in turn, so that an
+   earlier occurrence that is not a declaration (for instance inside
+   "Decoding") does not hide a declaration later in the same comment.  The
+   first occurrence that forms a declaration wins.
+   If the name is not known to po_charset_canonicalize, a non-fatal
+   diagnostic is issued and ASCII is used instead.  */
+static inline void
+try_to_extract_coding (const char *comment)
+{
+  const char *p;
+
+  for (p = strstr (comment, "coding"); p != NULL; p = strstr (p, "coding"))
+    {
+      const char *encoding_start;
+      const char *encoding_end;
+
+      /* Step over the word itself; the next search resumes from here.  */
+      p += 6;
+      if (!(*p == ':' || *p == '='))
+       continue;
+      p++;
+      while (*p == ' ' || *p == '\t')
+       p++;
+
+      encoding_start = p;
+      while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
+       p++;
+      encoding_end = p;
+
+      if (encoding_end > encoding_start)
+       {
+         /* Extract the encoding string.  */
+         size_t encoding_len = encoding_end - encoding_start;
+         char *encoding = (char *) xmalloc (encoding_len + 1);
+         const char *canon_encoding;
+
+         memcpy (encoding, encoding_start, encoding_len);
+         encoding[encoding_len] = '\0';
+
+         /* Canonicalize it.  */
+         canon_encoding = po_charset_canonicalize (encoding);
+         if (canon_encoding == NULL)
+           {
+             error_at_line (0, 0,
+                            logical_file_name, line_number - 1, _("\
+Unknown encoding \"%s\". Proceeding with ASCII instead."),
+                            encoding);
+             canon_encoding = po_charset_ascii;
+           }
+
+         /* Activate it.  */
+         set_current_file_source_encoding (canon_encoding);
+
+         free (encoding);
+         return;
+       }
+    }
+}
+
+/* Tracking whether the current line is a continuation line or contains a
+   non-blank character.  */
+static bool continuation_or_nonblank_line = false;
+
+
+/* Phase 3: Outside strings, replace backslash-newline with nothing and a
+   comment with nothing.  */
  
  static int
-phase2_getc ()
+phase3_getc ()
  {
    int c;
  
    for (;;)
      {
-      c = phase1_getc ();
+      c = phase2_getc ();
        if (c == '\\')
         {
-         c = phase1_getc ();
+         c = phase2_getc ();
           if (c != '\n')
             {
-             phase1_ungetc (c);
+             phase2_ungetc (c);
               /* This shouldn't happen usually, because "A backslash is
                  illegal elsewhere on a line outside a string literal."  */
               return '\\';
             }
           /* Eat backslash-newline.  */
+         continuation_or_nonblank_line = true;
         }
        else if (c == '#')
         {
           /* Eat a comment.  */
+         const char *comment;
+
           last_comment_line = line_number;
           comment_start ();
           for (;;)
             {
-             c = phase1_getc ();
-             if (c == EOF || c == '\n')
+             c = phase2_getc ();
+             if (c == UEOF || c == '\n')
                 break;
               /* We skip all leading white space, but not EOLs.  */
-             if (!(buflen == 0 && (c == ' ' || c == '\t')))
+             if (!(comment_at_start () && (c == ' ' || c == '\t')))
                 comment_add (c);
             }
-         comment_line_end ();
+         comment = comment_line_end ();
+         if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
+           try_to_extract_coding (comment);
+         continuation_or_nonblank_line = false;
           return c;
         }
        else
-       return c;
+       {
+         if (c == '\n')
+           continuation_or_nonblank_line = false;
+         else if (!(c == ' ' || c == '\t' || c == '\f'))
+           continuation_or_nonblank_line = true;
+         return c;
+       }
      }
  }
  
  /* Supports only one pushback character.  */
  static void
-phase2_ungetc (int c)
+phase3_ungetc (int c)
+{
+  phase2_ungetc (c);
+}
+
+
+/* ========================= Accumulating strings.  ======================== */
+
+/* Return value of phase7_getuc when EOF is reached.  */
+#define P7_EOF (-1)
+#define P7_STRING_END (-2)
+
+/* Convert an UTF-16 or UTF-32 code point to a return value that can be
+   distinguished from a single-byte return value.  */
+#define UNICODE(code) (0x100 + (code))
+
+/* Test a return value of phase7_getuc whether it designates an UTF-16 or
+   UTF-32 code point.  */
+#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
+
+/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
+   IS_UNICODE.  */
+#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
+
+/* A string buffer type that allows appending bytes (in the
+   xgettext_current_source_encoding) or Unicode characters.
+   Returns the entire string in UTF-8 encoding.  */
+
+struct mixed_string_buffer
+{
+  /* The part of the string that has already been converted to UTF-8.
+     utf8_buflen bytes are in use out of utf8_allocated bytes of storage.  */
+  char *utf8_buffer;
+  size_t utf8_buflen;
+  size_t utf8_allocated;
+  /* The first half of an UTF-16 surrogate character.
+     0 means that no first half is pending.  */
+  unsigned short utf16_surr;
+  /* The part of the string that is still in the source encoding.
+     curr_buflen bytes are in use out of curr_allocated bytes of storage.  */
+  char *curr_buffer;
+  size_t curr_buflen;
+  size_t curr_allocated;
+};
+
+/* Put BP into the empty state: neither of the two sub-buffers has storage
+   or contents, and no surrogate half is pending.  */
+static inline void
+init_mixed_string_buffer (struct mixed_string_buffer *bp)
+{
+  /* Bytes still in the source encoding.  */
+  bp->curr_allocated = 0;
+  bp->curr_buflen = 0;
+  bp->curr_buffer = NULL;
+  /* Pending first half of a surrogate pair.  */
+  bp->utf16_surr = 0;
+  /* Text already converted to UTF-8.  */
+  bp->utf8_allocated = 0;
+  bp->utf8_buflen = 0;
+  bp->utf8_buffer = NULL;
+}
+
+/* Auxiliary function: Store the byte C at the end of bp->curr, enlarging
+   the storage when it is full.  */
+static inline void
+mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c)
+{
+  if (bp->curr_buflen == bp->curr_allocated)
+    {
+      size_t new_allocated = 2 * bp->curr_allocated + 10;
+
+      bp->curr_buffer = xrealloc (bp->curr_buffer, new_allocated);
+      bp->curr_allocated = new_allocated;
+    }
+  bp->curr_buffer[bp->curr_buflen] = c;
+  bp->curr_buflen++;
+}
+
+/* Auxiliary function: Make sure that COUNT more bytes fit into bp->utf8.
+   The storage grows to 2 * old size + 10 bytes, or to exactly the needed
+   size if that is larger.  */
+static inline void
+mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count)
+{
+  size_t needed = bp->utf8_buflen + count;
+  size_t new_allocated;
+
+  if (needed <= bp->utf8_allocated)
+    return;
+
+  new_allocated = 2 * bp->utf8_allocated + 10;
+  if (new_allocated < needed)
+    new_allocated = needed;
+  bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
+  bp->utf8_allocated = new_allocated;
+}
+
+/* Auxiliary function: Encode the Unicode character UC as UTF-8 and append
+   the resulting bytes to bp->utf8.
+   uc must be < 0x110000; the program aborts if the encoding fails.  */
+static inline void
+mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, unsigned int uc)
+{
+  unsigned char encoded[6];
+  int nbytes;
+
+  nbytes = u8_uctomb (encoded, uc, 6);
+  if (nbytes < 0)
+    {
+      /* An out-of-range uc is a bug in the caller.  */
+      abort ();
+    }
+
+  /* Make room first, then copy the bytes behind the part in use.  */
+  mixed_string_buffer_append_unicode_grow (bp, nbytes);
+  memcpy (&bp->utf8_buffer[bp->utf8_buflen], encoded, nbytes);
+  bp->utf8_buflen += nbytes;
+}
+
+/* Auxiliary function: Resolve a pending first surrogate half, if any.
+   A half surrogate on its own is invalid, so U+FFFD is appended to
+   bp->utf8_buffer in its place and the pending half is cleared.  */
+static inline void
+mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
+{
+  if (bp->utf16_surr == 0)
+    /* Nothing pending.  */
+    return;
+
+  mixed_string_buffer_append_unicode (bp, 0xfffd);
+  bp->utf16_surr = 0;
+}
+
+/* Auxiliary function: Convert the bytes collected in bp->curr_buffer to
+   UTF-8, append them to bp->utf8_buffer and empty bp->curr_buffer.
+   LINENO is passed on to from_current_source_encoding together with
+   logical_file_name.  Does nothing if no bytes are pending.  */
+static inline void
+mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno)
+{
+  char *converted;
+  size_t converted_len;
+
+  if (bp->curr_buflen == 0)
+    return;
+
+  /* The converter expects a NUL-terminated string.  */
+  mixed_string_buffer_append_byte (bp, '\0');
+
+  /* Convert from the source encoding to UTF-8.  */
+  converted = from_current_source_encoding (bp->curr_buffer,
+                                           logical_file_name, lineno);
+  converted_len = strlen (converted);
+
+  /* Append the converted text to bp->utf8_buffer.  */
+  mixed_string_buffer_append_unicode_grow (bp, converted_len);
+  memcpy (&bp->utf8_buffer[bp->utf8_buflen], converted, converted_len);
+  bp->utf8_buflen += converted_len;
+
+  /* The converter may hand back its argument unchanged; free the result
+     only when it is a different string.  */
+  if (converted != bp->curr_buffer)
+    free (converted);
+  bp->curr_buflen = 0;
+}
+
+/* Append a character or Unicode character to a 'struct mixed_string_buffer'.
+   C is either a single byte in the source encoding (not IS_UNICODE) or an
+   UTF-16 or UTF-32 code wrapped with UNICODE().
+   The two halves of an UTF-16 surrogate pair arrive in separate calls and
+   are combined into one character here.  A half that does not get its
+   partner, whether first or second, is replaced by U+FFFD.  */
+static void
+mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
+{
+  if (IS_UNICODE (c))
+    {
+      /* Append a Unicode character.  */
+
+      /* Switch from multibyte character mode to Unicode character mode.  */
+      mixed_string_buffer_flush_curr_buffer (bp, line_number);
+
+      /* Test whether this character and the previous one form a Unicode
+        surrogate character pair.  */
+      if (bp->utf16_surr != 0
+         && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
+       {
+         unsigned short utf16buf[2];
+         unsigned int uc;
+
+         utf16buf[0] = bp->utf16_surr;
+         utf16buf[1] = UNICODE_VALUE (c);
+         if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
+           abort ();
+
+         mixed_string_buffer_append_unicode (bp, uc);
+         bp->utf16_surr = 0;
+       }
+      else
+       {
+         /* A pending first half did not get its second half.  */
+         mixed_string_buffer_flush_utf16_surr (bp);
+
+         if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
+           /* First half of a pair: hold it back until the next call.  */
+           bp->utf16_surr = UNICODE_VALUE (c);
+         else if (c >= UNICODE (0xdc00) && c < UNICODE (0xe000))
+           /* A second half without a preceding first half is invalid,
+              therefore use U+FFFD instead.  It must not be handed to
+              mixed_string_buffer_append_unicode, because a surrogate is
+              not a character that can be encoded in UTF-8.  */
+           mixed_string_buffer_append_unicode (bp, 0xfffd);
+         else
+           mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c));
+       }
+    }
+  else
+    {
+      /* Append a single byte.  */
+
+      /* Switch from Unicode character mode to multibyte character mode.  */
+      mixed_string_buffer_flush_utf16_surr (bp);
+
+      /* When a newline is seen, convert the accumulated multibyte sequence.
+        This ensures a correct line number in the error message in case of
+        a conversion error.  The "- 1" is to account for the newline.  */
+      if (c == '\n')
+       mixed_string_buffer_flush_curr_buffer (bp, line_number - 1);
+
+      mixed_string_buffer_append_byte (bp, (unsigned char) c);
+    }
+}
+
+/* Return the whole contents of BP as a NUL-terminated UTF-8 string.
+   Anything still pending (a surrogate half, bytes in the source encoding)
+   is converted first.  The terminator is not counted in bp->utf8_buflen.
+   The returned pointer is BP's own storage.  */
+static char *
+mixed_string_buffer_result (struct mixed_string_buffer *bp)
+{
+  char *text;
+
+  /* Bring everything into bp->utf8_buffer.  */
+  mixed_string_buffer_flush_utf16_surr (bp);
+  mixed_string_buffer_flush_curr_buffer (bp, line_number);
+
+  /* Reserve one byte for the terminator.  This may reallocate the buffer,
+     so fetch the pointer only afterwards.  */
+  mixed_string_buffer_append_unicode_grow (bp, 1);
+  text = bp->utf8_buffer;
+  text[bp->utf8_buflen] = '\0';
+  return text;
+}
+
+/* Free the memory pointed to by a 'struct mixed_string_buffer'.  */
+static inline void
+free_mixed_string_buffer (struct mixed_string_buffer *bp)
  {
-  phase1_ungetc (c);
+  free (bp->utf8_buffer);
+  free (bp->curr_buffer);
  }
  
  
@@ -336,11 +1005,8 @@ struct token_ty
      u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
      ur"abc"                                           \unnnn
     The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
-   \unnnn items.  The \ooo and \xnn values are ISO-8859-1 values: u"\xff" and
-   u"\u00ff" are the same.  */
-
-#define P7_EOF (-1)
-#define P7_STRING_END (-2)
+   \unnnn items.  The \ooo and \xnn values are in the current source encoding.
+ */
  
  static int
  phase7_getuc (int quote_char,
@@ -351,26 +1017,26 @@ phase7_getuc (int quote_char,
  
    for (;;)
      {
-      /* Use phase 1, because phase 2 elides comments.  */
-      c = phase1_getc ();
+      /* Use phase 2, because phase 3 elides comments.  */
+      c = phase2_getc ();
  
-      if (c == EOF)
+      if (c == UEOF)
         return P7_EOF;
  
        if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
         {
           if (triple)
             {
-             int c1 = phase1_getc ();
+             int c1 = phase2_getc ();
               if (c1 == quote_char)
                 {
-                 int c2 = phase1_getc ();
+                 int c2 = phase2_getc ();
                   if (c2 == quote_char)
                     return P7_STRING_END;
-                 phase1_ungetc (c2);
+                 phase2_ungetc (c2);
                 }
-             phase1_ungetc (c1);
-             return c;
+             phase2_ungetc (c1);
+             return UNICODE (c);
             }
           else
             return P7_STRING_END;
@@ -381,7 +1047,7 @@ phase7_getuc (int quote_char,
           if (triple)
             {
               *backslash_counter = 0;
-             return '\n';
+             return UNICODE ('\n');
             }
           /* In r"..." and ur"..." strings, newline is only allowed
              immediately after an odd number of backslashes (although the
@@ -389,9 +1055,9 @@ phase7_getuc (int quote_char,
           if (!(interpret_ansic || (*backslash_counter & 1) == 0))
             {
               *backslash_counter = 0;
-             return '\n';
+             return UNICODE ('\n');
             }
-         phase1_ungetc (c);
+         phase2_ungetc (c);
           error_with_progname = false;
           error (0, 0, _("%s:%d: warning: unterminated string"),
                  logical_file_name, line_number);
@@ -402,7 +1068,7 @@ phase7_getuc (int quote_char,
        if (c != '\\')
         {
           *backslash_counter = 0;
-         return c;
+         return UNICODE (c);
         }
  
        /* Backslash handling.  */
@@ -410,15 +1076,15 @@ phase7_getuc (int quote_char,
        if (!interpret_ansic && !interpret_unicode)
         {
           ++*backslash_counter;
-         return '\\';
+         return UNICODE ('\\');
         }
  
        /* Dispatch according to the character following the backslash.  */
-      c = phase1_getc ();
-      if (c == EOF)
+      c = phase2_getc ();
+      if (c == UEOF)
         {
           ++*backslash_counter;
-         return '\\';
+         return UNICODE ('\\');
         }
  
        if (interpret_ansic)
@@ -428,60 +1094,60 @@ phase7_getuc (int quote_char,
             continue;
           case '\\':
             ++*backslash_counter;
-           return c;
+           return UNICODE (c);
           case '\'': case '"':
             *backslash_counter = 0;
-           return c;
+           return UNICODE (c);
           case 'a':
             *backslash_counter = 0;
-           return '\a';
+           return UNICODE ('\a');
           case 'b':
             *backslash_counter = 0;
-           return '\b';
+           return UNICODE ('\b');
           case 'f':
             *backslash_counter = 0;
-           return '\f';
+           return UNICODE ('\f');
           case 'n':
             *backslash_counter = 0;
-           return '\n';
+           return UNICODE ('\n');
           case 'r':
             *backslash_counter = 0;
-           return '\r';
+           return UNICODE ('\r');
           case 't':
             *backslash_counter = 0;
-           return '\t';
+           return UNICODE ('\t');
           case 'v':
             *backslash_counter = 0;
-           return '\v';
+           return UNICODE ('\v');
           case '0': case '1': case '2': case '3': case '4':
           case '5': case '6': case '7':
             {
               int n = c - '0';
  
-             c = phase1_getc ();
-             if (c != EOF)
+             c = phase2_getc ();
+             if (c != UEOF)
                 {
                   if (c >= '0' && c <= '7')
                     {
                       n = (n << 3) + (c - '0');
-                     c = phase1_getc ();
-                     if (c != EOF)
+                     c = phase2_getc ();
+                     if (c != UEOF)
                         {
                           if (c >= '0' && c <= '7')
                             n = (n << 3) + (c - '0');
                           else
-                           phase1_ungetc (c);
+                           phase2_ungetc (c);
                         }
                     }
                   else
-                   phase1_ungetc (c);
+                   phase2_ungetc (c);
                 }
               *backslash_counter = 0;
               return (unsigned char) n;
             }
           case 'x':
             {
-             int c1 = phase1_getc ();
+             int c1 = phase2_getc ();
               int n1;
  
               if (c1 >= '0' && c1 <= '9')
@@ -495,7 +1161,7 @@ phase7_getuc (int quote_char,
  
               if (n1 >= 0)
                 {
-                 int c2 = phase1_getc ();
+                 int c2 = phase2_getc ();
                   int n2;
  
                   if (c2 >= '0' && c2 <= '9')
@@ -513,12 +1179,12 @@ phase7_getuc (int quote_char,
                       return (unsigned char) ((n1 << 4) + n2);
                     }
  
-                 phase1_ungetc (c2);
+                 phase2_ungetc (c2);
                 }
-             phase1_ungetc (c1);
-             phase1_ungetc (c);
+             phase2_ungetc (c1);
+             phase2_ungetc (c);
               ++*backslash_counter;
-             return '\\';
+             return UNICODE ('\\');
             }
           }
  
@@ -532,7 +1198,7 @@ phase7_getuc (int quote_char,
  
               for (i = 0; i < 4; i++)
                 {
-                 int c1 = phase1_getc ();
+                 int c1 = phase2_getc ();
  
                   if (c1 >= '0' && c1 <= '9')
                     n = (n << 4) + (c1 - '0');
@@ -542,18 +1208,18 @@ phase7_getuc (int quote_char,
                     n = (n << 4) + (c1 - 'a' + 10);
                   else
                     {
-                     phase1_ungetc (c1);
+                     phase2_ungetc (c1);
                       while (--i >= 0)
-                       phase1_ungetc (buf[i]);
-                     phase1_ungetc (c);
+                       phase2_ungetc (buf[i]);
+                     phase2_ungetc (c);
                       ++*backslash_counter;
-                     return '\\';
+                     return UNICODE ('\\');
                     }
  
                   buf[i] = c1;
                 }
               *backslash_counter = 0;
-             return n;
+             return UNICODE (n);
             }
  
           if (interpret_ansic)
@@ -566,7 +1232,7 @@ phase7_getuc (int quote_char,
  
                   for (i = 0; i < 8; i++)
                     {
-                     int c1 = phase1_getc ();
+                     int c1 = phase2_getc ();
  
                       if (c1 >= '0' && c1 <= '9')
                         n = (n << 4) + (c1 - '0');
@@ -576,12 +1242,12 @@ phase7_getuc (int quote_char,
                         n = (n << 4) + (c1 - 'a' + 10);
                       else
                         {
-                         phase1_ungetc (c1);
+                         phase2_ungetc (c1);
                           while (--i >= 0)
-                           phase1_ungetc (buf[i]);
-                         phase1_ungetc (c);
+                           phase2_ungetc (buf[i]);
+                         phase2_ungetc (c);
                           ++*backslash_counter;
-                         return '\\';
+                         return UNICODE ('\\');
                         }
  
                       buf[i] = c1;
@@ -589,7 +1255,7 @@ phase7_getuc (int quote_char,
                   if (n < 0x110000)
                     {
                       *backslash_counter = 0;
-                     return n;
+                     return UNICODE (n);
                     }
  
                   error_with_progname = false;
@@ -598,15 +1264,15 @@ phase7_getuc (int quote_char,
                   error_with_progname = true;
  
                   while (--i >= 0)
-                   phase1_ungetc (buf[i]);
-                 phase1_ungetc (c);
+                   phase2_ungetc (buf[i]);
+                 phase2_ungetc (c);
                   ++*backslash_counter;
-                 return '\\';
+                 return UNICODE ('\\');
                 }
  
               if (c == 'N')
                 {
-                 int c1 = phase1_getc ();
+                 int c1 = phase2_getc ();
                   if (c1 == '{')
                     {
                       unsigned char buf[UNINAME_MAX + 1];
@@ -615,16 +1281,16 @@ phase7_getuc (int quote_char,
  
                       for (i = 0; i < UNINAME_MAX; i++)
                         {
-                         int c2 = phase1_getc ();
+                         int c2 = phase2_getc ();
                           if (!(c2 >= ' ' && c2 <= '~'))
                             {
-                             phase1_ungetc (c2);
+                             phase2_ungetc (c2);
                               while (--i >= 0)
-                               phase1_ungetc (buf[i]);
-                             phase1_ungetc (c1);
-                             phase1_ungetc (c);
+                               phase2_ungetc (buf[i]);
+                             phase2_ungetc (c1);
+                             phase2_ungetc (c);
                               ++*backslash_counter;
-                             return '\\';
+                             return UNICODE ('\\');
                             }
                           if (c2 == '}')
                             break;
@@ -636,24 +1302,24 @@ phase7_getuc (int quote_char,
                       if (n != UNINAME_INVALID)
                         {
                           *backslash_counter = 0;
-                         return n;
+                         return UNICODE (n);
                         }
  
-                     phase1_ungetc ('}');
+                     phase2_ungetc ('}');
                       while (--i >= 0)
-                       phase1_ungetc (buf[i]);
+                       phase2_ungetc (buf[i]);
                     }
-                 phase1_ungetc (c1);
-                 phase1_ungetc (c);
+                 phase2_ungetc (c1);
+                 phase2_ungetc (c);
                   ++*backslash_counter;
-                 return '\\';
+                 return UNICODE ('\\');
                 }
             }
         }
  
-      phase1_ungetc (c);
+      phase2_ungetc (c);
        ++*backslash_counter;
-      return '\\';
+      return UNICODE ('\\');
      }
  }
  
@@ -681,11 +1347,11 @@ phase5_get (token_ty *tp)
    for (;;)
      {
        tp->line_number = line_number;
-      c = phase2_getc ();
+      c = phase3_getc ();
  
        switch (c)
         {
-       case EOF:
+       case UEOF:
           tp->type = token_type_eof;
           return;
  
@@ -712,8 +1378,8 @@ phase5_get (token_ty *tp)
         {
         case '.':
           {
-           int c1 = phase2_getc ();
-           phase2_ungetc (c1);
+           int c1 = phase3_getc ();
+           phase3_ungetc (c1);
             if (!(c1 >= '0' && c1 <= '9'))
               {
  
@@ -751,7 +1417,7 @@ phase5_get (token_ty *tp)
                     buffer = xrealloc (buffer, bufmax);
                   }
                 buffer[bufpos++] = c;
-               c = phase2_getc ();
+               c = phase3_getc ();
                 switch (c)
                   {
                   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
@@ -769,7 +1435,7 @@ phase5_get (token_ty *tp)
                   case '5': case '6': case '7': case '8': case '9':
                     continue;
                   default:
-                   phase2_ungetc (c);
+                   phase3_ungetc (c);
                     break;
                   }
                 break;
@@ -787,9 +1453,7 @@ phase5_get (token_ty *tp)
  
         /* Strings.  */
           {
-           static unsigned short *buffer;
-           static int bufmax;
-           int bufpos;
+           struct mixed_string_buffer literal;
             int quote_char;
             bool interpret_ansic;
             bool interpret_unicode;
@@ -798,7 +1462,7 @@ phase5_get (token_ty *tp)
  
             case 'R': case 'r':
               {
-               int c1 = phase1_getc ();
+               int c1 = phase2_getc ();
                 if (c1 == '"' || c1 == '\'')
                   {
                     quote_char = c1;
@@ -806,13 +1470,13 @@ phase5_get (token_ty *tp)
                     interpret_unicode = false;
                     goto string;
                   }
-               phase1_ungetc (c1);
+               phase2_ungetc (c1);
                 goto symbol;
               }
  
             case 'U': case 'u':
               {
-               int c1 = phase1_getc ();
+               int c1 = phase2_getc ();
                 if (c1 == '"' || c1 == '\'')
                   {
                     quote_char = c1;
@@ -822,7 +1486,7 @@ phase5_get (token_ty *tp)
                   }
                 if (c1 == 'R' || c1 == 'r')
                   {
-                   int c2 = phase1_getc ();
+                   int c2 = phase2_getc ();
                     if (c2 == '"' || c2 == '\'')
                       {
                         quote_char = c2;
@@ -830,9 +1494,9 @@ phase5_get (token_ty *tp)
                         interpret_unicode = true;
                         goto string;
                       }
-                   phase1_ungetc (c2);
+                   phase2_ungetc (c2);
                   }
-               phase1_ungetc (c1);
+               phase2_ungetc (c1);
                 goto symbol;
               }
  
@@ -843,75 +1507,40 @@ phase5_get (token_ty *tp)
             string:
               triple = false;
               {
-               int c1 = phase1_getc ();
+               int c1 = phase2_getc ();
                 if (c1 == quote_char)
                   {
-                   int c2 = phase1_getc ();
+                   int c2 = phase2_getc ();
                     if (c2 == quote_char)
                       triple = true;
                     else
                       {
-                       phase1_ungetc (c2);
-                       phase1_ungetc (c1);
+                       phase2_ungetc (c2);
+                       phase2_ungetc (c1);
                       }
                   }
                 else
-                 phase1_ungetc (c1);
+                 phase2_ungetc (c1);
               }
               backslash_counter = 0;
-             /* Start accumulating the string.  We store the string in
-                UTF-16 before converting it to UTF-8.  Why not converting
-                every character directly to UTF-8? Because a string can
-                contain surrogates like u"\uD800\uDF00", and we must
-                combine them to a single UTF-8 character.  */
-             bufpos = 0;
+             /* Start accumulating the string.  */
+             init_mixed_string_buffer (&literal);
               for (;;)
                 {
                   int uc = phase7_getuc (quote_char, triple, interpret_ansic,
                                          interpret_unicode, &backslash_counter);
-                 unsigned int len;
  
                   if (uc == P7_EOF || uc == P7_STRING_END)
                     break;
  
-                 assert (uc >= 0 && uc < 0x110000);
-                 len = (uc < 0x10000 ? 1 : 2);
-                 if (bufpos + len > bufmax)
-                   {
-                     bufmax = 2 * bufmax + 10;
-                     buffer =
-                       xrealloc (buffer, bufmax * sizeof (unsigned short));
-                   }
-                 if (uc < 0x10000)
-                   buffer[bufpos++] = uc;
-                 else
-                   {
-                     buffer[bufpos++] = 0xd800 + ((uc - 0x10000) >> 10);
-                     buffer[bufpos++] = 0xdc00 + ((uc - 0x10000) & 0x3ff);
-                   }
-               }
-             /* Now convert from UTF-16 to UTF-8.  */
-             {
-               int pos;
-               unsigned char *utf8_string;
-               unsigned char *q;
-
-               /* Each UTF-16 word needs 3 bytes at worst.  */
-               utf8_string = (unsigned char *) xmalloc (3 * bufpos + 1);
-               for (pos = 0, q = utf8_string; pos < bufpos; )
-                 {
-                   unsigned int uc;
-                   int n;
+                 if (IS_UNICODE (uc))
+                   assert (UNICODE_VALUE (uc) >= 0
+                           && UNICODE_VALUE (uc) < 0x110000);
  
-                   pos += u16_mbtouc (&uc, buffer + pos, bufpos - pos);
-                   n = u8_uctomb (q, uc, 6);
-                   assert (n > 0);
-                   q += n;
-                 }
-               *q = '\0';
-               assert (q - utf8_string <= 3 * bufpos);
-               tp->string = (char *) utf8_string;
-             }
+                 mixed_string_buffer_append (&literal, uc);
+               }
+             tp->string = xstrdup (mixed_string_buffer_result (&literal));
+             free_mixed_string_buffer (&literal);
               tp->comment = add_reference (savable_comment);
               tp->type = token_type_string;
               return;
@@ -1124,9 +1753,11 @@ extract_parenthesized (message_list_ty *mlp,
  
             if (extract_all)
               {
+               xgettext_current_source_encoding = po_charset_utf8;
                 savable_comment_to_xgettext_comment (token.comment);
                 remember_a_message (mlp, token.string, inner_context, &pos);
                 savable_comment_reset ();
+               xgettext_current_source_encoding = xgettext_current_file_source_encoding;
               }
             else
               {
@@ -1137,18 +1768,22 @@ extract_parenthesized (message_list_ty *mlp,
                         /* Seen an msgid.  */
                         message_ty *mp;
  
+                       xgettext_current_source_encoding = po_charset_utf8;
                         savable_comment_to_xgettext_comment (token.comment);
                         mp = remember_a_message (mlp, token.string,
                                                 inner_context, &pos);
                         savable_comment_reset ();
+                       xgettext_current_source_encoding = xgettext_current_file_source_encoding;
                         if (plural_commas > 0)
                           plural_mp = mp;
                       }
                     else
                       {
                         /* Seen an msgid_plural.  */
+                       xgettext_current_source_encoding = po_charset_utf8;
                         remember_a_message_plural (plural_mp, token.string,
                                                    inner_context, &pos);
+                       xgettext_current_source_encoding = xgettext_current_file_source_encoding;
                         plural_mp = NULL;
                       }
                   }
@@ -1184,9 +1819,6 @@ extract_python (FILE *f,
  {
    message_list_ty *mlp = mdlp->item[0]->messages;
  
-  /* We convert our strings to UTF-8 encoding.  */
-  xgettext_current_source_encoding = po_charset_utf8;
-
    fp = f;
    real_file_name = real_filename;
    logical_file_name = xstrdup (logical_filename);
@@ -1195,6 +1827,18 @@ extract_python (FILE *f,
    last_comment_line = -1;
    last_non_comment_line = -1;
  
+  xgettext_current_file_source_encoding = xgettext_global_source_encoding;
+#if HAVE_ICONV
+  xgettext_current_file_source_iconv = xgettext_global_source_iconv;
+#endif
+
+  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
+#if HAVE_ICONV
+  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
+#endif
+
+  continuation_or_nonblank_line = false;
+
    open_pbb = 0;
  
    flag_context_list_table = flag_table;
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog

index aeb2d1cc61f92e1d65c124e2a68d2f85146790e4..96470b5749ed532a4feff185bd06441c7b74adff 100644 (file)
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,8 @@
+2005-09-25  Bruno Haible  <bruno@clisp.org>
+
+       * xgettext-python-3: New file.
+       * Makefile.am (TESTS): Add it.
+
  2005-09-17  Bruno Haible  <bruno@clisp.org>
  
         * msgfmt-7: Update after slight change in error message.
diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am

index b1fee6a9a12b745842eb0dcbfdd713e3927c7d10..6ac70a22d97222f57b95e47adc9a2be2cd6937df 100644 (file)
--- a/gettext-tools/tests/Makefile.am
+++ b/gettext-tools/tests/Makefile.am
@@ -76,7 +76,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \
         xgettext-php-1 \
         xgettext-po-1 \
         xgettext-properties-1 \
-       xgettext-python-1 xgettext-python-2 \
+       xgettext-python-1 xgettext-python-2 xgettext-python-3 \
         xgettext-scheme-1 \
         xgettext-sh-1 xgettext-sh-2 xgettext-sh-3 xgettext-sh-4 \
         xgettext-smalltalk-1 \
author	Bruno Haible <bruno@clisp.org>
	Mon, 26 Sep 2005 09:21:16 +0000 (09:21 +0000)
committer	Bruno Haible <bruno@clisp.org>
	Tue, 23 Jun 2009 10:12:51 +0000 (12:12 +0200)
NEWS		patch \| blob \| blame \| history
gettext-tools/doc/ChangeLog		patch \| blob \| blame \| history
gettext-tools/doc/xgettext.texi		patch \| blob \| blame \| history
gettext-tools/src/ChangeLog		patch \| blob \| blame \| history
gettext-tools/src/x-python.c		patch \| blob \| blame \| history
gettext-tools/tests/ChangeLog		patch \| blob \| blame \| history
gettext-tools/tests/Makefile.am		patch \| blob \| blame \| history