Fix parsing of strings in CJK encodings.

author Bruno Haible <bruno@clisp.org>

Wed, 7 Mar 2001 14:38:03 +0000 (14:38 +0000)

committer Bruno Haible <bruno@clisp.org>

Wed, 7 Mar 2001 14:38:03 +0000 (14:38 +0000)
author Bruno Haible <bruno@clisp.org>
Wed, 7 Mar 2001 14:38:03 +0000 (14:38 +0000)
committer Bruno Haible <bruno@clisp.org>
Wed, 7 Mar 2001 14:38:03 +0000 (14:38 +0000)
diff --git a/src/ChangeLog b/src/ChangeLog

index ca1a4cf942863f4174579adb490ad1274d3d8803..cf4062be84f58c4bef13de020a786f169e55b605 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,25 @@
+2001-03-03  Bruno Haible  <haible@clisp.cons.org>
+
+       Fix parsing of strings in CJK encodings.
+       * po.h (PO_BASE_TY): New field next_is_fuzzy.
+       * po-lex.h: Include iconv.h.
+       (po_lex_charset, po_lex_iconv): New declarations.
+       * po.c (SIZEOF): New macro.
+       (po_alloc): Initialize next_is_fuzzy.
+       (po_callback_message): Add check of charset in header entry.
+       Set po_lex_charset and po_lex_iconv.
+       After calling po_directive_message, reset next_is_fuzzy.
+       (po_callback_comment): Set next_is_fuzzy.
+       * msgfmt.c (format_directive_message): Remove check of charset in
+       header entry, now done in po.c.
+       * po-lex.c (po_lex_charset, po_lex_iconv): New variables.
+       (lex_open): Initialize them.
+       (lex_close): Reset them.
+       (po_gram_lex): While parsing a string, use 'po_lex_iconv' to avoid
+       treating the second byte of a multi-byte character as an ASCII
+       character.
+       * Makefile.am (msgcmp_LDADD, msgfmt_LDADD): New variables.
+
  2001-03-03  Bruno Haible  <haible@clisp.cons.org>
  
         * write-po.h: New file, pieces of message.h.
diff --git a/src/Makefile.am b/src/Makefile.am

index eab2f5360ccea2776470ea5887ada44b657438d6..2284f19d4a14dc32eb8dd6eda95ee0f6ad5cc3d7 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -51,7 +51,11 @@ po.c str-list.c xget-lex.c xgettext.c dir-list.c write-po.c
  msgcomm_SOURCES = msgcomm.c message.c po-gram-gen.y po-hash-gen.y po-lex.c \
  open-po.c po.c str-list.c dir-list.c write-po.c
  
-# Link dependencies. write-po.c pulls in linebreak.c which may need -liconv.
+# Link dependencies.
+# po-lex.c and po.c may need -liconv.
+# write-po.c pulls in linebreak.c which may need -liconv.
+msgcmp_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
+msgfmt_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
  msgmerge_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
  msgunfmt_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
  xgettext_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
diff --git a/src/msgfmt.c b/src/msgfmt.c

index 00bab404b0f6b72efaf24d23cd093073d9c2af37..4ed68aade3be28db5231a41480c6af35c365824f 100644 (file)
--- a/src/msgfmt.c
+++ b/src/msgfmt.c
@@ -608,87 +608,6 @@ some header fields still have the initial default value"));
             error (0, 0, _("field `%s' still has initial default value"),
                    required_fields[initial]);
         }
-
-      /* Verify the validity of CHARSET.  Even if not in verbose mode,
-        because the consequences are not harmless.  */
-      {
-       const char *charsetstr = strstr (msgstr_string, "charset=");
-
-       if (charsetstr != NULL)
-         {
-           /* The list of charsets supported by glibc's iconv() and by
-              the portable iconv() across platforms.  Taken from
-              intl/config.charset.  */
-           static const char *standard_charsets[] =
-           {
-             "ASCII", "ANSI_X3.4-1968", "US-ASCII",
-             "ISO-8859-1", "ISO_8859-1",
-             "ISO-8859-2", "ISO_8859-2",
-             "ISO-8859-3", "ISO_8859-3",
-             "ISO-8859-4", "ISO_8859-4",
-             "ISO-8859-5", "ISO_8859-5",
-             "ISO-8859-6", "ISO_8859-6",
-             "ISO-8859-7", "ISO_8859-7",
-             "ISO-8859-8", "ISO_8859-8",
-             "ISO-8859-9", "ISO_8859-9",
-             "ISO-8859-13", "ISO_8859-13",
-             "ISO-8859-15", "ISO_8859-15",
-             "KOI8-R",
-             "KOI8-U",
-             "CP850",
-             "CP866",
-             "CP874",
-             "CP932",
-             "CP949",
-             "CP950",
-             "CP1250",
-             "CP1251",
-             "CP1252",
-             "CP1253",
-             "CP1254",
-             "CP1255",
-             "CP1256",
-             "CP1257",
-             "GB2312",
-             "EUC-JP",
-             "EUC-KR",
-             "EUC-TW",
-             "BIG5",
-             "BIG5HKSCS",
-             "GBK",
-             "GB18030",
-             "SJIS",
-             "JOHAB",
-             "TIS-620",
-             "VISCII",
-             "UTF-8"
-           };
-           size_t len;
-           char *charset;
-           size_t i;
-
-           charsetstr += strlen ("charset=");
-           len = strcspn (charsetstr, " \t\n");
-           charset = (char *) alloca (len + 1);
-           memcpy (charset, charsetstr, len);
-           charset[len] = '\0';
-
-           for (i = 0; i < SIZEOF (standard_charsets); i++)
-             if (strcasecmp (charset, standard_charsets[i]) == 0)
-               break;
-           if (i == SIZEOF (standard_charsets))
-             error (0, 0, _("\
-%s: warning: charset \"%s\" is not a portable encoding name\n\
-%*s  warning: charset conversion might not work"),
-                    gram_pos.file_name, charset,
-                    strlen (gram_pos.file_name), "");
-         }
-       else
-         error (0, 0, _("\
-%s: warning: charset missing in header\n\
-%*s  warning: charset conversion will not work"),
-                gram_pos.file_name, strlen (gram_pos.file_name), "");
-      }
      }
    else
      /* We don't count the header entry in the statistic so place the
diff --git a/src/po-lex.c b/src/po-lex.c

index 1ba8264d3d72e03c801e1f9c8052edc543357a3a..cf43fe7521377cd2ecdc58f4742e99830846e5ec 100644 (file)
--- a/src/po-lex.c
+++ b/src/po-lex.c
@@ -59,6 +59,10 @@
  static FILE *fp;
  lex_pos_ty gram_pos;
  unsigned int gram_max_allowed_errors = 20;
+const char *po_lex_charset;
+#if HAVE_ICONV
+iconv_t po_lex_iconv;
+#endif
  static int pass_comments = 0;
  static int pass_obsolete_entries = 0;
  
@@ -81,6 +85,10 @@ lex_open (fname)
            _("error while opening \"%s\" for reading"), fname);
  
    gram_pos.line_number = 1;
+  po_lex_charset = NULL;
+#if HAVE_ICONV
+  po_lex_iconv = (iconv_t)(-1);
+#endif
  }
  
  
@@ -97,6 +105,14 @@ lex_close ()
    gram_pos.file_name = 0;
    gram_pos.line_number = 0;
    error_message_count = 0;
+  po_lex_charset = NULL;
+#if HAVE_ICONV
+  if (po_lex_iconv != (iconv_t)(-1))
+    {
+      iconv_close (po_lex_iconv);
+      po_lex_iconv = (iconv_t)(-1);
+    }
+#endif
  }
  
  
@@ -427,38 +443,85 @@ po_gram_lex ()
           break;
  
         case '"':
-         bufpos = 0;
-         while (1)
-           {
-             if (bufpos >= bufmax)
-               {
-                 bufmax += 100;
-                 buf = xrealloc (buf, bufmax);
-               }
-             c = lex_getc ();
-             if (c == '\n')
-               {
-                 po_gram_error (_("end-of-line within string"));
-                 break;
-               }
-             if (c == EOF)
-               {
-                 po_gram_error (_("end-of-file within string"));
-                 break;
-               }
-             if (c == '"')
-               break;
-
-             if (c == '\\')
-               c = control_sequence ();
+         /* Accumulate a string.  */
+         {
+#if HAVE_ICONV
+           size_t bufmbpos = 0;
+#endif
  
-             buf[bufpos++] = c;
-           }
-         buf[bufpos] = 0;
+           bufpos = 0;
+           while (1)
+             {
+               if (bufpos >= bufmax)
+                 {
+                   bufmax += 100;
+                   buf = xrealloc (buf, bufmax);
+                 }
+               c = lex_getc ();
+               if (c == EOF)
+                 {
+                   po_gram_error (_("end-of-file within string"));
+                   break;
+                 }
+               if (c == '\n')
+                 {
+                   po_gram_error (_("end-of-line within string"));
+                   break;
+                 }
+#if HAVE_ICONV
+               /* Interpret c only if it is the first byte of a multi-byte
+                  character.  Don't interpret it as ASCII when it is the
+                  second byte.  This is needed for the BIG5, BIG5HKSCS, GBK,
+                  GB18030, SJIS, JOHAB encodings.  */
+               if (po_lex_iconv == (iconv_t)(-1) || bufmbpos == bufpos)
+#endif
+                 {
+                   if (c == '"')
+                     break;
+
+                   if (c == '\\')
+                     {
+                       buf[bufpos++] = control_sequence ();
+#if HAVE_ICONV
+                       bufmbpos++;
+#endif
+                       continue;
+                     }
+                 }
+
+               /* Add c to the accumulator.  */
+               buf[bufpos++] = c;
+#if HAVE_ICONV
+               if (po_lex_iconv != (iconv_t)(-1))
+                 {
+                   /* If c terminates a multibyte character, set
+                      bufmbpos = bufpos.  Otherwise keep bufmbpos
+                      pointing at the start of the multibyte character.  */
+                   char scratchbuf[64];
+                   const char *inptr = &buf[bufmbpos];
+                   size_t insize = bufpos - bufmbpos;
+                   char *outptr = &scratchbuf[0];
+                   size_t outsize = sizeof (scratchbuf);
+                   if (iconv (po_lex_iconv,
+                              (ICONV_CONST char **) &inptr, &insize,
+                              &outptr, &outsize)
+                       == (size_t)(-1)
+                       && errno == EILSEQ)
+                     {
+                       po_gram_error (_("invalid multibyte sequence"));
+                       bufmbpos = bufpos;
+                     }
+                   else
+                     bufmbpos = inptr - buf;
+                 }
+#endif
+             }
+           buf[bufpos] = 0;
  
-         /* FIXME: Treatment of embedded \000 chars is incorrect.  */
-         po_gram_lval.string = xstrdup (buf);
-         return STRING;
+           /* FIXME: Treatment of embedded \000 chars is incorrect.  */
+           po_gram_lval.string = xstrdup (buf);
+           return STRING;
+         }
  
         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
diff --git a/src/po-lex.h b/src/po-lex.h

index f9cede0d37965b160ba117900fd7b7fa607c00f8..4beb7bc640d7cd1d3887047bb6ec9a5a982b813a 100644 (file)
--- a/src/po-lex.h
+++ b/src/po-lex.h
@@ -21,6 +21,9 @@
  #define _PO_LEX_H
  
  #include <sys/types.h>
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
  #include "error.h"
  #include "pos.h"
  
@@ -36,6 +39,14 @@ extern lex_pos_ty gram_pos;
     terminate.  Cf. error_message_count, declared in <error.h>.  */
  extern unsigned int gram_max_allowed_errors;
  
+/* The PO file's encoding, as specified in the header entry.  */
+extern const char *po_lex_charset;
+
+#if HAVE_ICONV
+/* Converter from the PO file's encoding to UTF-8.  */
+extern iconv_t po_lex_iconv;
+#endif
+
  
  /* Open the PO file FNAME and prepare its lexical analysis.  */
  extern void lex_open PARAMS ((const char *__fname));
diff --git a/src/po.c b/src/po.c

index 0e1ab8653bfdf37dca2d5a461dcec7041e4b2fd1..b87fcd3c627d1ebee4c626107cd762c353f8e12c 100644 (file)
--- a/src/po.c
+++ b/src/po.c
@@ -29,6 +29,11 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
  #include "po.h"
  #include "po-hash.h"
  #include "system.h"
+#include "libgettext.h"
+
+#define _(str) gettext (str)
+
+#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
  
  /* Prototypes for local functions.  */
  static void po_parse_brief PARAMS ((po_ty *__pop));
@@ -59,6 +64,7 @@ po_alloc (pomp)
  
    pop = xmalloc (pomp->size);
    pop->method = pomp;
+  pop->next_is_fuzzy = 0;
    if (pomp->constructor)
      pomp->constructor (pop);
    return pop;
@@ -163,8 +169,141 @@ po_callback_message (msgid, msgid_pos, msgid_plural,
       lex_pos_ty *msgstr_pos;
  {
    /* assert(callback_arg); */
+
+  /* Test for header entry.  */
+  if (msgid[0] == '\0' && !callback_arg->next_is_fuzzy)
+    {
+      /* Verify the validity of CHARSET.  It is necessary
+        1. for the correct treatment of multibyte characters containing
+           0x5C bytes in the PO lexer,
+        2. so that at run time, gettext() can call iconv() to convert
+           msgstr.  */
+      const char *charsetstr = strstr (msgstr, "charset=");
+
+      if (charsetstr != NULL)
+       {
+         /* The list of charsets supported by glibc's iconv() and by
+            the portable iconv() across platforms.  Taken from
+            intl/config.charset.  */
+         static const char *standard_charsets[] =
+         {
+           "ASCII", "ANSI_X3.4-1968", "US-ASCII",
+           "ISO-8859-1", "ISO_8859-1",
+           "ISO-8859-2", "ISO_8859-2",
+           "ISO-8859-3", "ISO_8859-3",
+           "ISO-8859-4", "ISO_8859-4",
+           "ISO-8859-5", "ISO_8859-5",
+           "ISO-8859-6", "ISO_8859-6",
+           "ISO-8859-7", "ISO_8859-7",
+           "ISO-8859-8", "ISO_8859-8",
+           "ISO-8859-9", "ISO_8859-9",
+           "ISO-8859-13", "ISO_8859-13",
+           "ISO-8859-15", "ISO_8859-15",
+           "KOI8-R",
+           "KOI8-U",
+           "CP850",
+           "CP866",
+           "CP874",
+           "CP932",
+           "CP949",
+           "CP950",
+           "CP1250",
+           "CP1251",
+           "CP1252",
+           "CP1253",
+           "CP1254",
+           "CP1255",
+           "CP1256",
+           "CP1257",
+           "GB2312",
+           "EUC-JP",
+           "EUC-KR",
+           "EUC-TW",
+           "BIG5",
+           "BIG5HKSCS",
+           "GBK",
+           "GB18030",
+           "SJIS",
+           "JOHAB",
+           "TIS-620",
+           "VISCII",
+           "UTF-8"
+         };
+         size_t len;
+         char *charset;
+         size_t i;
+
+         charsetstr += strlen ("charset=");
+         len = strcspn (charsetstr, " \t\n");
+         charset = (char *) alloca (len + 1);
+         memcpy (charset, charsetstr, len);
+         charset[len] = '\0';
+
+         for (i = 0; i < SIZEOF (standard_charsets); i++)
+           if (strcasecmp (charset, standard_charsets[i]) == 0)
+             break;
+         if (i == SIZEOF (standard_charsets))
+           {
+             error (0, 0, _("\
+%s: warning: charset \"%s\" is not a portable encoding name\n\
+%*s  warning: charset conversion might not work"),
+                    gram_pos.file_name, charset,
+                    strlen (gram_pos.file_name), "");
+             --error_message_count;
+           }
+         else
+           {
+             po_lex_charset = standard_charsets[i];
+#if HAVE_ICONV
+             if (po_lex_iconv != (iconv_t)(-1))
+               iconv_close (po_lex_iconv);
+             po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
+             if (po_lex_iconv == (iconv_t)(-1))
+               {
+                 /* For CJK encodings which have double-byte characters
+                    ending in 0x5C, the string parser is likely to be
+                    confused if it can't see the character boundaries.  */
+                 const char *note =
+                   (strcmp (po_lex_charset, "BIG5") == 0
+                    || strcmp (po_lex_charset, "BIG5HKSCS") == 0
+                    || strcmp (po_lex_charset, "GBK") == 0
+                    || strcmp (po_lex_charset, "GB18030") == 0
+                    || strcmp (po_lex_charset, "SJIS") == 0
+                    || strcmp (po_lex_charset, "JOHAB") == 0
+                    ? _(", expect parse errors")
+                    : "");
+
+# if _LIBICONV_VERSION
+                 error (0, 0, _("\
+%s: warning: charset \"%s\" is not supported by iconv%s"),
+                        gram_pos.file_name, po_lex_charset, note);
+# else
+                 error (0, 0, _("\
+%s: warning: charset \"%s\" is not supported by iconv%s\n\
+%*s  warning: consider installing libiconv and then reinstalling GNU gettext"),
+                        gram_pos.file_name, po_lex_charset, note,
+                        strlen (gram_pos.file_name), "");
+# endif
+                 --error_message_count;
+               }
+#endif
+           }
+       }
+      else
+       {
+         error (0, 0, _("\
+%s: warning: charset missing in header\n\
+%*s  warning: charset conversion will not work"),
+                gram_pos.file_name, strlen (gram_pos.file_name), "");
+         --error_message_count;
+       }
+    }
+
    po_directive_message (callback_arg, msgid, msgid_pos, msgid_plural,
                         msgstr, msgstr_len, msgstr_pos);
+
+  /* Prepare for next message.  */
+  callback_arg->next_is_fuzzy = 0;
  }
  
  
@@ -220,8 +359,12 @@ po_callback_comment (s)
         po_comment (callback_arg, s + 1);
      }
    else if (*s == ',' || *s == '!')
-    /* Get all entries in the special comment line.  */
-    po_comment_special (callback_arg, s + 1);
+    {
+      /* Get all entries in the special comment line.  */
+      if (strstr (s + 1, "fuzzy") != NULL)
+       callback_arg->next_is_fuzzy = 1;
+      po_comment_special (callback_arg, s + 1);
+    }
    else
      {
        /* It looks like a plain vanilla comment, but Solaris-style file
diff --git a/src/po.h b/src/po.h

index 0b77f8e85bdb170838a3531e5cc5f2de9c6b498c..e46e1db2b962ff46f9f15a8b3c24da826645607d 100644 (file)
--- a/src/po.h
+++ b/src/po.h
@@ -106,7 +106,8 @@ struct po_method_ty
     etc.  */
  
  #define PO_BASE_TY \
-  po_method_ty *method;
+  po_method_ty *method; \
+  int next_is_fuzzy;
  
  typedef struct po_ty po_ty;
  struct po_ty
author	Bruno Haible <bruno@clisp.org>
	Wed, 7 Mar 2001 14:38:03 +0000 (14:38 +0000)
committer	Bruno Haible <bruno@clisp.org>
	Wed, 7 Mar 2001 14:38:03 +0000 (14:38 +0000)
src/ChangeLog		patch | blob | blame | history
src/Makefile.am		patch | blob | blame | history
src/msgfmt.c		patch | blob | blame | history
src/po-lex.c		patch | blob | blame | history
src/po-lex.h		patch | blob | blame | history
src/po.c		patch | blob | blame | history
src/po.h		patch | blob | blame | history