Support non-ASCII msgids and UTF-8 encoded POT files.

author Bruno Haible <bruno@clisp.org>
Tue, 5 Nov 2002 12:21:13 +0000 (12:21 +0000)

committer Bruno Haible <bruno@clisp.org>
Tue, 23 Jun 2009 10:08:50 +0000 (12:08 +0200)

diff --git a/NEWS b/NEWS

index 948f34f1794297444b3725b98c3063d41e3546b9..f62f0109acbe0e74ee8466e0f083c66bd3d420ad 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -6,6 +6,10 @@ Version 0.11.6 - October 2002
    strings in C++. This is needed for proper internationalization of C++
    programs.
  
+* xgettext now supports msgid strings in encodings other than ASCII.
+  xgettext has a new option --from-code that specifies the encoding of the
+  source files. The resulting POT files are UTF-8 encoded.
+
  * Compatibility with automake-1.7.
  \f
  Version 0.11.5 - August 2002
diff --git a/doc/ChangeLog b/doc/ChangeLog

index a506f7f2fdace40d9632c8ddece3b7fc0b32ec9a..8cfe1f79da391c41efde3ee3d5c4170502483412 100644 (file)
--- a/doc/ChangeLog
+++ b/doc/ChangeLog
@@ -1,3 +1,7 @@
+2002-11-05  Bruno Haible  <bruno@clisp.org>
+
+       * xgettext.texi: Document --from-code option.
+
  2002-10-30  Bruno Haible  <bruno@clisp.org>
  
         * gettext.texi (C): Refer to node Top of autosprintf.info. Needed to
diff --git a/doc/xgettext.texi b/doc/xgettext.texi

index 7385d504e902329037d3bc8c1d1a0d14ac430554..4c0ba4c178d2a9a2f9d7a6f2100d8def5909f2e4 100644 (file)
--- a/doc/xgettext.texi
+++ b/doc/xgettext.texi
@@ -84,6 +84,20 @@ This is a shorthand for @code{--language=C++}.
  By default the language is guessed depending on the input file name
  extension.
  
+@subsection Input file interpretation
+
+@table @samp
+@item --from-code=@var{name}
+@opindex --from-code@r{, @code{xgettext} option}
+Specifies the encoding of the input files.  This option is needed only
+if some untranslated message strings or their corresponding comments
+contain non-ASCII characters.  Note that Python, Tcl, and Glade input
+files are always assumed to be in UTF-8, regardless of this option.
+
+@end table
+
+By default the input files are assumed to be in ASCII.
+
  @subsection Operation mode
  
  @table @samp
diff --git a/src/ChangeLog b/src/ChangeLog

index 21a8cd69a16210b6b8bb418a0b8693d30e0de28f..9c872b5c6b8521a2669624b5809132290b89499b 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,43 @@
+2002-11-05  Bruno Haible  <bruno@clisp.org>
+
+       Allow non-ASCII msgids in POT files.
+       * po-charset.h (po_charset_utf8): New declaration.
+       * po-charset.c (utf8, po_charset_utf8): New variables.
+       (po_charset_canonicalize): Use po_charset_utf8.
+       * msgl-iconv.h: Include iconv.h.
+       (convert_string): New declaration.
+       * msgl-iconv.c (convert_string): Export function.
+       (convert_msgid): New function.
+       (iconv_message_list): Call it.
+       * xgettext.h: Include iconv.h.
+       (xgettext_global_source_encoding, xgettext_global_source_iconv,
+       xgettext_current_source_encoding, xgettext_current_source_iconv): New
+       declarations.
+       * xgettext.c (xgettext_global_source_encoding,
+       xgettext_global_source_iconv, xgettext_current_source_encoding,
+       xgettext_current_source_iconv): New variables.
+       (long_options): New option --from-code.
+       (main): Initialize xgettext_global_source_encoding. Handle option
+       --from-code. Initialize and destroy xgettext_global_source_iconv.
+       (usage): Document option --from-code.
+       (extract_from_file): Set xgettext_current_source_encoding and
+       xgettext_current_source_iconv.
+       (CONVERT_STRING): New macro.
+       (remember_a_message, remember_a_message_plural): Call CONVERT_STRING.
+       (finalize_header): Set the charset in the header here.
+       * x-glade.c (do_extract_glade): Set xgettext_current_source_encoding.
+       Don't set the result's header charset; this is now done in xgettext.c.
+       * x-python.c (extract_python): Likewise.
+       * x-tcl.c (extract_tcl): Likewise.
+       * write-po.c (message_print, message_print_obsolete): Don't warn about
+       non-ASCII msgids if the file's encoding is UTF-8.
+       * msginit.c (content_type): Add header argument. Use charset UTF-8
+       if that was already the POT file's encoding.
+       (fields): Update.
+       * msgmerge.c (merge): If the POT file was in UTF-8, convert the
+       definitions to UTF-8.
+       * msgcmp.c (compare): Likewise.
+
  2002-11-01  Bruno Haible  <bruno@clisp.org>
  
         * msgcmp.c: Include read-po.h.
diff --git a/src/msgcmp.c b/src/msgcmp.c

index 615acf0a643120b0f0491c4802d6a3f360564cbe..0919f5283a5b3a657e3501b77aeb5f1413ac08a5 100644 (file)
--- a/src/msgcmp.c
+++ b/src/msgcmp.c
@@ -33,9 +33,12 @@
  #include "basename.h"
  #include "message.h"
  #include "exit.h"
-#include "gettext.h"
  #include "read-po.h"
  #include "po.h"
+#include "msgl-iconv.h"
+#include "strstr.h"
+#include "strcase.h"
+#include "gettext.h"
  
  #define _(str) gettext (str)
  
@@ -301,6 +304,40 @@ compare (fn1, fn2)
       the xgettext program.  */
    ref = remove_obsoletes (read_po_file (fn2));
  
+  /* The references file can be either in ASCII or in UTF-8.  If it is
+     in UTF-8, we have to convert the definitions to UTF-8 as well.  */
+  {
+    bool was_utf8 = false;
+    for (k = 0; k < ref->nitems; k++)
+      {
+       message_list_ty *mlp = ref->item[k]->messages;
+
+       for (j = 0; j < mlp->nitems; j++)
+         if (mlp->item[j]->msgid[0] == '\0' /* && !mlp->item[j]->obsolete */)
+           {
+             const char *header = mlp->item[j]->msgstr; 
+
+             if (header != NULL)
+               {
+                 const char *charsetstr = strstr (header, "charset=");
+
+                 if (charsetstr != NULL)
+                   {
+                     size_t len;
+
+                     charsetstr += strlen ("charset=");
+                     len = strcspn (charsetstr, " \t\n");
+                     if (len == strlen ("UTF-8")
+                         && strncasecmp (charsetstr, "UTF-8", len) == 0)
+                       was_utf8 = true;
+                   }
+               }
+           }
+       }
+    if (was_utf8)
+      def = iconv_msgdomain_list (def, "UTF-8", fn1);
+  }
+
    empty_list = message_list_alloc (false);
  
    /* Every entry in the xgettext generated file must be matched by a
diff --git a/src/msginit.c b/src/msginit.c

index d1c4536fd4d11f56b84761e430c829107b8dc9ec..d443c2ad0976473d8c40fd439011bf12d815226b 100644 (file)
--- a/src/msginit.c
+++ b/src/msginit.c
@@ -69,6 +69,8 @@
  #include "progname.h"
  #include "basename.h"
  #include "strpbrk.h"
+#include "strstr.h"
+#include "strcase.h"
  #include "message.h"
  #include "read-po.h"
  #include "write-po.h"
@@ -145,7 +147,7 @@ static const char *last_translator PARAMS ((void));
  static const char *language_team_address PARAMS ((void));
  static const char *language_team PARAMS ((void));
  static const char *mime_version PARAMS ((void));
-static const char *content_type PARAMS ((void));
+static const char *content_type PARAMS ((const char *header));
  static const char *content_transfer_encoding PARAMS ((void));
  static const char *plural_forms PARAMS ((void));
  static char *get_field PARAMS ((const char *header, const char *field));
@@ -1212,9 +1214,30 @@ mime_version ()
  
  /* Construct the value for the Content-Type field.  */
  static const char *
-content_type ()
+content_type (header)
+     const char *header;
  {
-  return xasprintf ("text/plain; charset=%s", canonical_locale_charset ());
+  bool was_utf8;
+  const char *old_field;
+
+  /* If the POT file contains charset=UTF-8, it means that the POT file
+     contains non-ASCII characters, and we keep the UTF-8 encoding.
+     Otherwise, when the POT file is plain ASCII, we use the locale's
+     encoding.  */
+  was_utf8 = false;
+  old_field = get_field (header, "Content-Type");
+  if (old_field != NULL)
+    {
+      const char *charsetstr = strstr (old_field, "charset=");
+
+      if (charsetstr != NULL)
+       {
+         charsetstr += strlen ("charset=");
+         was_utf8 = (strcasecmp (charsetstr, "UTF-8") == 0);
+       }
+    }
+  return xasprintf ("text/plain; charset=%s",
+                   was_utf8 ? "UTF-8" : canonical_locale_charset ());
  }
  
  
@@ -1259,7 +1282,7 @@ fields[] =
      { "Last-Translator", last_translator, NULL },
      { "Language-Team", language_team, NULL },
      { "MIME-Version", mime_version, NULL },
-    { "Content-Type", content_type, NULL },
+    { "Content-Type", NULL, content_type },
      { "Content-Transfer-Encoding", content_transfer_encoding, NULL },
      { "Plural-Forms", plural_forms, NULL }
    };
diff --git a/src/msgl-iconv.c b/src/msgl-iconv.c

index 6620cfdd2ba2ed02b691c0c04e5ce6d2a7e0d5a0..61633b5e1393d486cabb651ce9a75f2eef14212b 100644 (file)
--- a/src/msgl-iconv.c
+++ b/src/msgl-iconv.c
@@ -54,8 +54,8 @@
  static int iconv_string PARAMS ((iconv_t cd,
                                  const char *start, const char *end,
                                  char **resultp, size_t *lengthp));
-static const char *convert_string PARAMS ((iconv_t cd, const char *string));
  static void convert_string_list PARAMS ((iconv_t cd, string_list_ty *slp));
+static void convert_msgid PARAMS ((iconv_t cd, message_ty *mp));
  static void convert_msgstr PARAMS ((iconv_t cd, message_ty *mp));
  #endif
  
@@ -184,7 +184,7 @@ iconv_string (cd, start, end, resultp, lengthp)
  #undef tmpbufsize
  }
  
-static const char *
+char *
  convert_string (cd, string)
       iconv_t cd;
       const char *string;
@@ -216,6 +216,16 @@ convert_string_list (cd, slp)
        slp->item[i] = convert_string (cd, slp->item[i]);
  }
  
+static void
+convert_msgid (cd, mp)
+     iconv_t cd;
+     message_ty *mp;
+{
+  mp->msgid = convert_string (cd, mp->msgid);
+  if (mp->msgid_plural != NULL)
+    mp->msgid_plural = convert_string (cd, mp->msgid_plural);
+}
+
  static void
  convert_msgstr (cd, mp)
       iconv_t cd;
@@ -377,6 +387,7 @@ and iconv() does not support this conversion."),
  
           convert_string_list (cd, mp->comment);
           convert_string_list (cd, mp->comment_dot);
+         convert_msgid (cd, mp);
           convert_msgstr (cd, mp);
         }
  
diff --git a/src/msgl-iconv.h b/src/msgl-iconv.h

index 45cd5def32398d51414de3f885e9edb241726bff..976427ada8607b115001d2e9d254311b8aa2f0b8 100644 (file)
--- a/src/msgl-iconv.h
+++ b/src/msgl-iconv.h
@@ -19,8 +19,17 @@
  #ifndef _MSGL_ICONV_H
  #define _MSGL_ICONV_H
  
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
  #include "message.h"
  
+#if HAVE_ICONV
+/* Converts the STRING through the conversion descriptor CD.  */
+extern char *convert_string PARAMS ((iconv_t cd, const char *string));
+#endif
+
  /* Converts the message list MLP to the (already canonicalized) encoding
     CANON_TO_CODE.  The (already canonicalized) encoding before conversion
     can be passed as CANON_FROM_CODE; if NULL is passed instead, the
diff --git a/src/msgmerge.c b/src/msgmerge.c

index dc71eb33bf0b5ac7a295990888a5913b967086e1..adae8060ef818bd1d90c5c3c3d27fb5a45901abe 100644 (file)
--- a/src/msgmerge.c
+++ b/src/msgmerge.c
@@ -44,6 +44,7 @@
  #include "stpcpy.h"
  #include "stpncpy.h"
  #include "po.h"
+#include "msgl-iconv.h"
  #include "msgl-equal.h"
  #include "plural-exp.h"
  #include "backupfile.h"
@@ -1017,6 +1018,40 @@ merge (fn1, fn2, defp)
         message_list_prepend (ref->item[k]->messages, refheader);
        }
  
+  /* The references file can be either in ASCII or in UTF-8.  If it is
+     in UTF-8, we have to convert the definitions to UTF-8 as well.  */
+  {
+    bool was_utf8 = false;
+    for (k = 0; k < ref->nitems; k++)
+      {
+       message_list_ty *mlp = ref->item[k]->messages;
+
+       for (j = 0; j < mlp->nitems; j++)
+         if (mlp->item[j]->msgid[0] == '\0' && !mlp->item[j]->obsolete)
+           {
+             const char *header = mlp->item[j]->msgstr; 
+
+             if (header != NULL)
+               {
+                 const char *charsetstr = strstr (header, "charset=");
+
+                 if (charsetstr != NULL)
+                   {
+                     size_t len;
+
+                     charsetstr += strlen ("charset=");
+                     len = strcspn (charsetstr, " \t\n");
+                     if (len == strlen ("UTF-8")
+                         && strncasecmp (charsetstr, "UTF-8", len) == 0)
+                       was_utf8 = true;
+                   }
+               }
+           }
+       }
+    if (was_utf8)
+      def = iconv_msgdomain_list (def, "UTF-8", fn1);
+  }
+
    result = msgdomain_list_alloc (false);
    processed = 0;
  
diff --git a/src/po-charset.c b/src/po-charset.c

index b75971664c498810e5d8c6a1402724534d2ab802..f3d73686d86add27f5b05b3509ed5df3133dbfab 100644 (file)
--- a/src/po-charset.c
+++ b/src/po-charset.c
@@ -44,6 +44,11 @@ static const char ascii[] = "ASCII";
  /* The canonicalized encoding name for ASCII.  */
  const char *po_charset_ascii = ascii;
  
+static const char utf8[] = "UTF-8";
+
+/* The canonicalized encoding name for UTF-8.  */
+const char *po_charset_utf8 = utf8;
+
  /* Canonicalize an encoding name.  */
  const char *
  po_charset_canonicalize (charset)
@@ -96,7 +101,7 @@ po_charset_canonicalize (charset)
      "TIS-620",
      "VISCII",
      "GEORGIAN-PS",
-    "UTF-8"
+    utf8
    };
    size_t i;
  
diff --git a/src/po-charset.h b/src/po-charset.h

index 216ab330a9c9be6526e4fa0e21ff5d4aa008fe0f..dfc7f6f05b8ea5e26f37db831c93bd724fb0e573 100644 (file)
--- a/src/po-charset.h
+++ b/src/po-charset.h
@@ -33,6 +33,9 @@ extern const char *po_charset_canonicalize PARAMS ((const char *charset));
  /* The canonicalized encoding name for ASCII.  */
  extern const char *po_charset_ascii;
  
+/* The canonicalized encoding name for UTF-8.  */
+extern const char *po_charset_utf8;
+
  /* Test for ASCII compatibility.  */
  extern bool po_charset_ascii_compatible PARAMS ((const char *canon_charset));
  
diff --git a/src/write-po.c b/src/write-po.c

index 6feb002254b43d190c0c0a66f429540bdc487c1c..5fec377563c89ee4c65e082734ff05e23ddcb92b 100644 (file)
--- a/src/write-po.c
+++ b/src/write-po.c
@@ -778,7 +778,8 @@ message_print (mp, fp, charset, blank_line, debug)
    /* Print each of the message components.  Wrap them nicely so they
       are as readable as possible.  If there is no recorded msgstr for
       this domain, emit an empty string.  */
-  if (!is_ascii_string (mp->msgid))
+  if (!is_ascii_string (mp->msgid)
+      && po_charset_canonicalize (charset) != po_charset_utf8)
      multiline_warning (xasprintf (_("warning: ")),
                        xasprintf (_("\
  The following msgid contains non-ASCII characters.\n\
@@ -872,7 +873,8 @@ message_print_obsolete (mp, fp, charset, blank_line)
  
    /* Print each of the message components.  Wrap them nicely so they
       are as readable as possible.  */
-  if (!is_ascii_string (mp->msgid))
+  if (!is_ascii_string (mp->msgid)
+      && po_charset_canonicalize (charset) != po_charset_utf8)
      multiline_warning (xasprintf (_("warning: ")),
                        xasprintf (_("\
  The following msgid contains non-ASCII characters.\n\
diff --git a/src/x-glade.c b/src/x-glade.c

index 715c0599da94a18802aa21e4088740e75ee867ed..369e07759136fd534963995b9ebf631e107278ee 100644 (file)
--- a/src/x-glade.c
+++ b/src/x-glade.c
@@ -372,6 +372,9 @@ do_extract_glade (fp, real_filename, logical_filename, mdlp)
  {
    mlp = mdlp->item[0]->messages;
  
+  /* expat feeds us strings in UTF-8 encoding.  */
+  xgettext_current_source_encoding = po_charset_utf8;
+
    logical_file_name = xstrdup (logical_filename);
  
    init_keywords ();
@@ -413,14 +416,6 @@ error while reading \"%s\""), real_filename);
  
    XML_ParserFree (parser);
  
-  /* expat feeds us strings in UTF-8 encoding.  If not all the strings
-     were plain ASCII, set the charset in the header to UTF-8.  */
-  if (!is_ascii_message_list (mlp))
-    {
-      const char *canon_utf_8 = po_charset_canonicalize ("UTF-8");
-      iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL);
-    }
-
    /* Close scanner.  */
    logical_file_name = NULL;
    parser = NULL;
diff --git a/src/x-python.c b/src/x-python.c

index 823e852380a94fce1d127fcbd9a3086c61fa6f34..161a10306941b01f203b67baa403a08c52ebe7f1 100644 (file)
--- a/src/x-python.c
+++ b/src/x-python.c
@@ -1159,6 +1159,9 @@ extract_python (f, real_filename, logical_filename, mdlp)
  {
    message_list_ty *mlp = mdlp->item[0]->messages;
  
+  /* We convert our strings to UTF-8 encoding.  */
+  xgettext_current_source_encoding = po_charset_utf8;
+
    fp = f;
    real_file_name = real_filename;
    logical_file_name = xstrdup (logical_filename);
@@ -1176,14 +1179,6 @@ extract_python (f, real_filename, logical_filename, mdlp)
    while (!extract_parenthesized (mlp, -1, 0))
      ;
  
-  /* We converted our strings to UTF-8 encoding.  If not all the strings
-     were plain ASCII, set the charset in the header to UTF-8.  */
-  if (!is_ascii_message_list (mlp))
-    {
-      const char *canon_utf_8 = po_charset_canonicalize ("UTF-8");
-      iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL);
-    }
-
    fp = NULL;
    real_file_name = NULL;
    logical_file_name = NULL;
diff --git a/src/x-tcl.c b/src/x-tcl.c

index a942796de5e4dfcb433a4a388ddcaa079714c4fa..b7446328702a8d5c900adb664dc4a55a0f00427a 100644 (file)
--- a/src/x-tcl.c
+++ b/src/x-tcl.c
@@ -1002,6 +1002,9 @@ extract_tcl (f, real_filename, logical_filename, mdlp)
  {
    mlp = mdlp->item[0]->messages;
  
+  /* We convert our strings to UTF-8 encoding.  */
+  xgettext_current_source_encoding = po_charset_utf8;
+
    fp = f;
    real_file_name = real_filename;
    logical_file_name = xstrdup (logical_filename);
@@ -1018,14 +1021,6 @@ extract_tcl (f, real_filename, logical_filename, mdlp)
    /* Eat tokens until eof is seen.  */
    read_command_list ('\0');
  
-  /* We converted our strings to UTF-8 encoding.  If not all the strings
-     were plain ASCII, set the charset in the header to UTF-8.  */
-  if (!is_ascii_message_list (mlp))
-    {
-      const char *canon_utf_8 = po_charset_canonicalize ("UTF-8");
-      iconv_message_list (mlp, canon_utf_8, canon_utf_8, NULL);
-    }
-
    fp = NULL;
    real_file_name = NULL;
    logical_file_name = NULL;
diff --git a/src/xgettext.c b/src/xgettext.c

index 4224c2e95b62b8ef7678f76ab30fed175c9248eb..52f67477b97d5f18ced14a42f6a2577db67a8ce8 100644 (file)
--- a/src/xgettext.c
+++ b/src/xgettext.c
@@ -48,6 +48,9 @@
  #include "stpcpy.h"
  #include "po.h"
  #include "message.h"
+#include "po-charset.h"
+#include "msgl-iconv.h"
+#include "msgl-ascii.h"
  #include "po-time.h"
  #include "write-po.h"
  #include "format.h"
@@ -111,6 +114,24 @@ static char *output_dir;
  /* If nonzero omit header with information about this run.  */
  int xgettext_omit_header;
  
+/* Canonicalized encoding name for all input files.  */
+const char *xgettext_global_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_global_source_encoding to UTF-8 (except from
+   ASCII or UTF-8, when this conversion is a no-op).  */
+iconv_t xgettext_global_source_iconv;
+#endif
+
+/* Canonicalized encoding name for the current input file.  */
+const char *xgettext_current_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_current_source_encoding to UTF-8 (except from
+   ASCII or UTF-8, when this conversion is a no-op).  */
+iconv_t xgettext_current_source_iconv;
+#endif
+
  /* Long options.  */
  static const struct option long_options[] =
  {
@@ -127,6 +148,7 @@ static const struct option long_options[] =
    { "files-from", required_argument, NULL, 'f' },
    { "force-po", no_argument, &force_po, 1 },
    { "foreign-user", no_argument, NULL, CHAR_MAX + 2 },
+  { "from-code", required_argument, NULL, CHAR_MAX + 3 },
    { "help", no_argument, NULL, 'h' },
    { "indent", no_argument, NULL, 'i' },
    { "join-existing", no_argument, NULL, 'j' },
@@ -137,7 +159,7 @@ static const struct option long_options[] =
    { "msgstr-suffix", optional_argument, NULL, 'M' },
    { "no-escape", no_argument, NULL, 'e' },
    { "no-location", no_argument, &line_comment, 0 },
-  { "no-wrap", no_argument, NULL, CHAR_MAX + 3 },
+  { "no-wrap", no_argument, NULL, CHAR_MAX + 4 },
    { "omit-header", no_argument, &xgettext_omit_header, 1 },
    { "output", required_argument, NULL, 'o' },
    { "output-dir", required_argument, NULL, 'p' },
@@ -220,6 +242,7 @@ main (argc, argv)
  
    /* Set initial value of variables.  */
    default_domain = MESSAGE_DOMAIN_DEFAULT;
+  xgettext_global_source_encoding = po_charset_ascii;
  
    while ((optchar = getopt_long (argc, argv,
                                  "ac::Cd:D:eEf:Fhijk::l:L:m::M::no:p:sTVw:x:",
@@ -366,7 +389,12 @@ main (argc, argv)
        case CHAR_MAX + 2:       /* --foreign-user */
         copyright_holder = "";
         break;
-      case CHAR_MAX + 3:       /* --no-wrap */
+      case CHAR_MAX + 3:       /* --from-code */
+       xgettext_global_source_encoding = po_charset_canonicalize (optarg);
+       if (xgettext_global_source_encoding == NULL)
+         xgettext_global_source_encoding = po_charset_ascii;
+       break;
+      case CHAR_MAX + 4:       /* --no-wrap */
         message_page_width_ignore ();
         break;
        default:
@@ -453,6 +481,37 @@ xgettext cannot work without keywords to look for"));
    for (cnt = optind; cnt < argc; ++cnt)
      string_list_append_unique (file_list, argv[cnt]);
  
+  /* Allocate converter from xgettext_global_source_encoding to UTF-8 (except
+     from ASCII or UTF-8, when this conversion is a no-op).  */
+  if (xgettext_global_source_encoding != po_charset_ascii
+      && xgettext_global_source_encoding != po_charset_utf8)
+    {
+#if HAVE_ICONV
+      iconv_t cd;
+
+      /* Avoid glibc-2.1 bug with EUC-KR.  */
+# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
+      if (strcmp (xgettext_global_source_encoding, "EUC-KR") == 0)
+       cd = (iconv_t)(-1);
+      else
+# endif
+      cd = iconv_open (po_charset_utf8, xgettext_global_source_encoding);
+      if (cd == (iconv_t)(-1))
+       error (EXIT_FAILURE, 0, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
+and iconv() does not support this conversion."),
+              xgettext_global_source_encoding, po_charset_utf8,
+              basename (program_name));
+      xgettext_global_source_iconv = cd;
+#else
+      error (EXIT_FAILURE, 0, _("\
+Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
+This version was built without iconv()."),
+            xgettext_global_source_encoding, po_charset_utf8,
+            basename (program_name));
+#endif
+    }
+
    /* Allocate a message list to remember all the messages.  */
    mdlp = msgdomain_list_alloc (true);
  
@@ -519,6 +578,13 @@ warning: file `%s' extension `%s' is unknown; will try C"), fname, extension);
    if (!xgettext_omit_header)
      finalize_header (mdlp);
  
+  /* Free the allocated converter.  */
+#if HAVE_ICONV
+  if (xgettext_global_source_encoding != po_charset_ascii
+      && xgettext_global_source_encoding != po_charset_utf8)
+    iconv_close (xgettext_global_source_iconv);
+#endif
+
    /* Sorting the list of messages.  */
    if (sort_by_filepos)
      msgdomain_list_sort_by_filepos (mdlp);
@@ -585,6 +651,14 @@ Choice of input file language:\n\
                                     YCP, Tcl, PHP, RST, Glade)\n\
    -C, --c++                      shorthand for --language=C++\n\
  By default the language is guessed depending on the input file name extension.\n\
+"));
+      printf ("\n");
+      /* xgettext: no-wrap */
+      printf (_("\
+Input file interpretation:\n\
+      --from-code=NAME           encoding of input files\n\
+                                   (except for Python, Tcl, Glade)\n\
+By default the input files are assumed to be in ASCII.\n\
  "));
        printf ("\n");
        /* xgettext: no-wrap */
@@ -890,6 +964,13 @@ extract_from_file (file_name, extractor, mdlp)
    char *real_file_name;
    FILE *fp = xgettext_open (file_name, &logical_file_name, &real_file_name);
  
+  /* Set the default for the source file encoding.  May be overridden by
+     the extractor function.  */
+  xgettext_current_source_encoding = xgettext_global_source_encoding;
+#if HAVE_ICONV
+  xgettext_current_source_iconv = xgettext_global_source_iconv;
+#endif
+
    extractor (fp, real_file_name, logical_file_name, mdlp);
  
    if (fp != stdin)
@@ -905,6 +986,36 @@ extract_from_file (file_name, extractor, mdlp)
  static struct formatstring_parser *current_formatstring_parser;
  
  
+/* Convert the given string from xgettext_current_source_encoding to
+   the output file encoding (i.e. ASCII or UTF-8).  */
+#define CONVERT_STRING(string) \
+  if (xgettext_current_source_encoding == po_charset_ascii)            \
+    {                                                                  \
+      if (!is_ascii_string (string))                                   \
+       {                                                               \
+         char buffer[21];                                              \
+         if (pos->line_number == (size_t)(-1))                         \
+           buffer[0] = '\0';                                           \
+         else                                                          \
+           sprintf (buffer, ":%ld", (long) pos->line_number);          \
+         error (EXIT_FAILURE, 0, _("Non-ASCII string at %s%s.\nPlease specify the source encoding through --from-code."), \
+                pos->file_name, buffer);                               \
+       }                                                               \
+    }                                                                  \
+  else if (xgettext_current_source_encoding != po_charset_utf8)                \
+    {                                                                  \
+      string = convert_string (xgettext_current_source_iconv, string); \
+    }
+
+#if !HAVE_ICONV
+/* If we don't have iconv(), the only supported values for
+   xgettext_global_source_encoding and thus also for
+   xgettext_current_source_encoding are ASCII and UTF-8.
+   convert_string() should not be called in this case.  */
+#define convert_string(cd,string) (abort (), (string))
+#endif
+
+
  message_ty *
  remember_a_message (mlp, string, pos)
       message_list_ty *mlp;
@@ -934,6 +1045,8 @@ remember_a_message (mlp, string, pos)
      is_format[i] = undecided;
    do_wrap = undecided;
  
+  CONVERT_STRING (msgid);
+
    if (msgid[0] == '\0' && !xgettext_omit_header)
      {
        char buffer[21];
@@ -999,6 +1112,8 @@ meta information, not the empty string.\n")));
           if (s == NULL)
             break;
  
+         CONVERT_STRING (s);
+
           /* To reduce the possibility of unwanted matches be do a two
              step match: the line must contain `xgettext:' and one of
              the possible format description strings.  */
@@ -1102,6 +1217,8 @@ remember_a_message_plural (mp, string, pos)
  
    msgid_plural = string;
  
+  CONVERT_STRING (msgid_plural);
+
    /* See if the message is already a plural message.  */
    if (mp->msgid_plural == NULL)
      {
@@ -1205,53 +1322,78 @@ finalize_header (mdlp)
  {
    /* If the generated PO file has plural forms, add a Plural-Forms template
       to the constructed header.  */
-  bool has_plural;
-  size_t i, j;
+  {
+    bool has_plural;
+    size_t i, j;
  
-  has_plural = false;
-  for (i = 0; i < mdlp->nitems; i++)
-    {
-      message_list_ty *mlp = mdlp->item[i]->messages;
+    has_plural = false;
+    for (i = 0; i < mdlp->nitems; i++)
+      {
+       message_list_ty *mlp = mdlp->item[i]->messages;
  
-      for (j = 0; j < mlp->nitems; j++)
-       {
-         message_ty *mp = mlp->item[j];
+       for (j = 0; j < mlp->nitems; j++)
+         {
+           message_ty *mp = mlp->item[j];
  
-         if (mp->msgid_plural != NULL)
-           {
-             has_plural = true;
-             break;
-           }
-       }
-      if (has_plural)
-       break;
-    }
+           if (mp->msgid_plural != NULL)
+             {
+               has_plural = true;
+               break;
+             }
+         }
+       if (has_plural)
+         break;
+      }
  
-  if (has_plural)
-    {
-      message_ty *header = message_list_search (mdlp->item[0]->messages, "");
-      if (header != NULL
-         && strstr (header->msgstr, "Plural-Forms:") == NULL)
-       {
-         size_t insertpos = strlen (header->msgstr);
-         const char *suffix;
-         size_t suffix_len;
-         char *new_msgstr;
-
-         suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n";
-         if (insertpos == 0 || header->msgstr[insertpos-1] == '\n')
-           suffix++;
-         suffix_len = strlen (suffix);
-         new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len);
-         memcpy (new_msgstr, header->msgstr, insertpos);
-         memcpy (new_msgstr + insertpos, suffix, suffix_len);
-         memcpy (new_msgstr + insertpos + suffix_len,
-                 header->msgstr + insertpos,
-                 header->msgstr_len - insertpos);
-         header->msgstr = new_msgstr;
-         header->msgstr_len = header->msgstr_len + suffix_len;
-       }
-    }
+    if (has_plural)
+      {
+       message_ty *header = message_list_search (mdlp->item[0]->messages, "");
+       if (header != NULL
+           && strstr (header->msgstr, "Plural-Forms:") == NULL)
+         {
+           size_t insertpos = strlen (header->msgstr);
+           const char *suffix;
+           size_t suffix_len;
+           char *new_msgstr;
+
+           suffix = "\nPlural-Forms: nplurals=INTEGER; plural=EXPRESSION;\n";
+           if (insertpos == 0 || header->msgstr[insertpos-1] == '\n')
+             suffix++;
+           suffix_len = strlen (suffix);
+           new_msgstr = (char *) xmalloc (header->msgstr_len + suffix_len);
+           memcpy (new_msgstr, header->msgstr, insertpos);
+           memcpy (new_msgstr + insertpos, suffix, suffix_len);
+           memcpy (new_msgstr + insertpos + suffix_len,
+                   header->msgstr + insertpos,
+                   header->msgstr_len - insertpos);
+           header->msgstr = new_msgstr;
+           header->msgstr_len = header->msgstr_len + suffix_len;
+         }
+      }
+  }
+
+  /* If not all the strings were plain ASCII, set the charset in the header
+     to UTF-8.  All messages have already been converted to UTF-8 in
+     remember_a_message and remember_a_message_plural.  */
+  {
+    bool has_nonascii = false;
+    size_t i;
+
+    for (i = 0; i < mdlp->nitems; i++)
+      {
+       message_list_ty *mlp = mdlp->item[i]->messages;
+
+       if (!is_ascii_message_list (mlp))
+         has_nonascii = true;
+      }
+
+    if (has_nonascii)
+      {
+       message_list_ty *mlp = mdlp->item[0]->messages;
+
+       iconv_message_list (mlp, po_charset_utf8, po_charset_utf8, NULL);
+      }
+  }
  }
  
  
diff --git a/src/xgettext.h b/src/xgettext.h

index 6356449fa6937ba462681938eab976a230dda92a..ad36fbd6c5192d285c65a0a1730344d97f5768f1 100644 (file)
--- a/src/xgettext.h
+++ b/src/xgettext.h
@@ -21,6 +21,11 @@
  #define _XGETTEXT_H
  
  #include <stddef.h>
+
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
+
  #include "message.h"
  #include "pos.h"
  
@@ -37,6 +42,24 @@ extern bool substring_match;
  extern void split_keywordspec PARAMS ((const char *spec, const char **endp,
                                        int *argnum1p, int *argnum2p));
  
+/* Canonicalized encoding name for all input files.  */
+extern const char *xgettext_global_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_global_source_encoding to UTF-8 (except when that
+   encoding is ASCII or UTF-8, in which case the conversion is a no-op).  */
+extern iconv_t xgettext_global_source_iconv;
+#endif
+
+/* Canonicalized encoding name for the current input file.  */
+extern const char *xgettext_current_source_encoding;
+
+#if HAVE_ICONV
+/* Converter from xgettext_current_source_encoding to UTF-8 (except when that
+   encoding is ASCII or UTF-8, in which case the conversion is a no-op).  */
+extern iconv_t xgettext_current_source_iconv;
+#endif
+
  /* List of messages whose msgids must not be extracted, or NULL.
     Used by remember_a_message().  */
  extern message_list_ty *exclude;
diff --git a/tests/ChangeLog b/tests/ChangeLog

index 031d109bee9d574942769bb29bfee761a98597e8..5f76d0f2c8f5341d6b077bf6b6a2592e55e5a3e3 100644 (file)
--- a/tests/ChangeLog
+++ b/tests/ChangeLog
@@ -1,3 +1,9 @@
+2002-11-05  Bruno Haible  <bruno@clisp.org>
+
+       * xgettext-23: New file.
+       * msgmerge-21: New file.
+       * Makefile.am (TESTS): Add them.
+
  2002-11-01  Bruno Haible  <bruno@clisp.org>
  
         * format-awk-1, format-awk-2, format-c-1, format-c-2, format-elisp-1,
diff --git a/tests/Makefile.am b/tests/Makefile.am

index 649ec7ec6b2364cce4848f31d420940615de1ee1..73dfdf899d2997354814e24c540e345b3de10c20 100644 (file)
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -39,13 +39,14 @@ TESTS = gettext-1 gettext-2 \
         msgmerge-1 msgmerge-2 msgmerge-3 msgmerge-4 msgmerge-5 msgmerge-6 \
         msgmerge-7 msgmerge-8 msgmerge-9 msgmerge-10 msgmerge-11 msgmerge-12 \
         msgmerge-13 msgmerge-14 msgmerge-15 msgmerge-16 msgmerge-17 \
-       msgmerge-18 msgmerge-19 msgmerge-20 \
+       msgmerge-18 msgmerge-19 msgmerge-20 msgmerge-21 \
         msgunfmt-1 msgunfmt-2 msgunfmt-3 \
         msguniq-1 msguniq-2 msguniq-3 \
         xgettext-1 xgettext-2 xgettext-3 xgettext-4 xgettext-5 xgettext-6 \
         xgettext-7 xgettext-8 xgettext-9 xgettext-10 xgettext-11 xgettext-12 \
         xgettext-13 xgettext-14 xgettext-15 xgettext-16 xgettext-17 \
         xgettext-18 xgettext-19 xgettext-20 xgettext-21 xgettext-22 \
+       xgettext-23 \
         format-awk-1 format-awk-2 \
         format-c-1 format-c-2 format-c-3 format-c-4 \
         format-elisp-1 format-elisp-2 \
diff --git a/tests/msgmerge-21 b/tests/msgmerge-21

new file mode 100755 (executable)

index 0000000..11e52dc
--- /dev/null
+++ b/tests/msgmerge-21
@@ -0,0 +1,99 @@
+#! /bin/sh
+
+# Test merging of a ref.pot in UTF-8 encoding against a def.po in a legacy
+# encoding (KOI8-R here); the def.po was produced from an older version of
+# ref.pot that was still in ASCII encoding.
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles mm-test21-ru.po"
+cat <<\EOF > mm-test21-ru.po
+# Russian messages for CLISP
+# Copyright (C) 1998 Free Software Foundation, Inc.
+# Eduard Haritonov <hed@iis.nsk.su>, 1998.
+# Arseny Slobodjuck <ampy@ich.dvo.ru>, 2002.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: GNU elvis 1.7\n"
+"POT-Creation-Date: 2002-11-01 01:22+0100\n"
+"PO-Revision-Date: 2002-11-01 01:23+0100\n"
+"Last-Translator: Arseny Slobodjuck <ampy@ich.dvo.ru>\n"
+"Language-Team: Russian <ru@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=koi8-r\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#: arith.c:9
+msgid "Division durch Null"
+msgstr "ÄÅÌÅÎÉÅ ÎÁ ÎÕÌØ"
+EOF
+
+tmpfiles="$tmpfiles mm-test21.pot"
+cat <<\EOF > mm-test21.pot
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"POT-Creation-Date: 2002-11-01 01:22+0100\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#: arith.c:10
+msgid "Division durch Null"
+msgstr ""
+
+#: arith.c:15
+msgid "Überlauf"
+msgstr ""
+EOF
+
+tmpfiles="$tmpfiles mm-test21.out"
+: ${MSGMERGE=msgmerge}
+${MSGMERGE} -q mm-test21-ru.po mm-test21.pot -o mm-test21.out
+test $? = 0 || { rm -fr $tmpfiles; exit 1; }
+
+tmpfiles="$tmpfiles mm-test21.ok"
+cat <<\EOF > mm-test21.ok
+# Russian messages for CLISP
+# Copyright (C) 1998 Free Software Foundation, Inc.
+# Eduard Haritonov <hed@iis.nsk.su>, 1998.
+# Arseny Slobodjuck <ampy@ich.dvo.ru>, 2002.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: GNU elvis 1.7\n"
+"POT-Creation-Date: 2002-11-01 01:22+0100\n"
+"PO-Revision-Date: 2002-11-01 01:23+0100\n"
+"Last-Translator: Arseny Slobodjuck <ampy@ich.dvo.ru>\n"
+"Language-Team: Russian <ru@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#: arith.c:10
+msgid "Division durch Null"
+msgstr "деление на нуль"
+
+#: arith.c:15
+msgid "Überlauf"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} mm-test21.ok mm-test21.out
+result=$?
+
+rm -fr $tmpfiles
+
+exit $result
diff --git a/tests/xgettext-23 b/tests/xgettext-23

new file mode 100755 (executable)

index 0000000..5928b6c
--- /dev/null
+++ b/tests/xgettext-23
@@ -0,0 +1,60 @@
+#! /bin/sh
+
+# Test extraction of non-ASCII msgids, which requires --from-code.
+
+tmpfiles=""
+trap 'rm -fr $tmpfiles' 1 2 3 15
+
+tmpfiles="$tmpfiles xg-test23.c"
+cat <<EOF > xg-test23.c
+void foo (int option)
+{
+  printf (_("%s: neznámý pøepínaè -- %c\n"), option);
+  printf (_("%s: pøepínaè vy¾aduje argument -- %c\n"), option);
+}
+EOF
+
+tmpfiles="$tmpfiles xg-test23.po"
+: ${XGETTEXT=xgettext}
+${XGETTEXT} --no-location -k_ -o xg-test23.po xg-test23.c 2>/dev/null
+test $? = 1 || { rm -fr $tmpfiles; exit 1; }
+${XGETTEXT} --no-location -k_ --from-code=iso-8859-2 -o xg-test23.po xg-test23.c
+test $? = 0 || { rm -fr $tmpfiles; exit 1; }
+
+tmpfiles="$tmpfiles xg-test23.pot"
+sed -e '/POT-Creation-Date/d' < xg-test23.po > xg-test23.pot
+
+tmpfiles="$tmpfiles xg-test23.ok"
+cat <<EOF > xg-test23.ok
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=UTF-8\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#, c-format
+msgid "%s: neznámý přepínač -- %c\n"
+msgstr ""
+
+#, c-format
+msgid "%s: přepínač vyžaduje argument -- %c\n"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} xg-test23.ok xg-test23.pot
+result=$?
+
+rm -fr $tmpfiles
+
+exit $result
author	Bruno Haible <bruno@clisp.org>
	Tue, 5 Nov 2002 12:21:13 +0000 (12:21 +0000)
committer	Bruno Haible <bruno@clisp.org>
	Tue, 23 Jun 2009 10:08:50 +0000 (12:08 +0200)
NEWS		patch \| blob \| blame \| history
doc/ChangeLog		patch \| blob \| blame \| history
doc/xgettext.texi		patch \| blob \| blame \| history
src/ChangeLog		patch \| blob \| blame \| history
src/msgcmp.c		patch \| blob \| blame \| history
src/msginit.c		patch \| blob \| blame \| history
src/msgl-iconv.c		patch \| blob \| blame \| history
src/msgl-iconv.h		patch \| blob \| blame \| history
src/msgmerge.c		patch \| blob \| blame \| history
src/po-charset.c		patch \| blob \| blame \| history
src/po-charset.h		patch \| blob \| blame \| history
src/write-po.c		patch \| blob \| blame \| history
src/x-glade.c		patch \| blob \| blame \| history
src/x-python.c		patch \| blob \| blame \| history
src/x-tcl.c		patch \| blob \| blame \| history
src/xgettext.c		patch \| blob \| blame \| history
src/xgettext.h		patch \| blob \| blame \| history
tests/ChangeLog		patch \| blob \| blame \| history
tests/Makefile.am		patch \| blob \| blame \| history
tests/msgmerge-21	[new file with mode: 0755]	patch \| blob
tests/xgettext-23	[new file with mode: 0755]	patch \| blob