From: Bruno Haible <bruno@clisp.org>
Date: Wed, 7 Mar 2001 14:38:03 +0000 (+0000)
Subject: Fix parsing of strings in CJK encodings.
X-Git-Tag: v0.10.36~124
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c265adb81a4bc541803a9b5e745c3cf136de539b;p=thirdparty%2Fgettext.git

Fix parsing of strings in CJK encodings.
---

diff --git a/src/ChangeLog b/src/ChangeLog
index ca1a4cf94..cf4062be8 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,25 @@
+2001-03-03  Bruno Haible  <haible@clisp.cons.org>
+
+	Fix parsing of strings in CJK encodings.
+	* po.h (PO_BASE_TY): New field next_is_fuzzy.
+	* po-lex.h: Include iconv.h.
+	(po_lex_charset, po_lex_iconv): New declarations.
+	* po.c (SIZEOF): New macro.
+	(po_alloc): Initialize next_is_fuzzy.
+	(po_callback_message): Add check of charset in header entry.
+	Set po_lex_charset and po_lex_iconv.
+	After calling po_directive_message, reset next_is_fuzzy.
+	(po_callback_comment): Set next_is_fuzzy.
+	* msgfmt.c (format_directive_message): Remove check of charset in
+	header entry, now done in po.c.
+	* po-lex.c (po_lex_charset, po_lex_iconv): New variables.
+	(lex_open): Initialize them.
+	(lex_close): Reset them.
+	(po_gram_lex): While parsing a string, use 'po_lex_iconv' to avoid
+	treating the second byte of a multi-byte character as an ASCII
+	character.
+	* Makefile.am (msgcmp_LDADD, msgfmt_LDADD): New variables.
+
 2001-03-03  Bruno Haible  <haible@clisp.cons.org>
 
 	* write-po.h: New file, pieces of message.h.
diff --git a/src/Makefile.am b/src/Makefile.am
index eab2f5360..2284f19d4 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -51,7 +51,11 @@ po.c str-list.c xget-lex.c xgettext.c dir-list.c write-po.c
 msgcomm_SOURCES = msgcomm.c message.c po-gram-gen.y po-hash-gen.y po-lex.c \
 open-po.c po.c str-list.c dir-list.c write-po.c
 
-# Link dependencies. write-po.c pulls in linebreak.c which may need -liconv.
+# Link dependencies.
+# po-lex.c and po.c may need -liconv.
+# write-po.c pulls in linebreak.c which may need -liconv.
+msgcmp_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
+msgfmt_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
 msgmerge_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
 msgunfmt_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
 xgettext_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
diff --git a/src/msgfmt.c b/src/msgfmt.c
index 00bab404b..4ed68aade 100644
--- a/src/msgfmt.c
+++ b/src/msgfmt.c
@@ -608,87 +608,6 @@ some header fields still have the initial default value"));
 	    error (0, 0, _("field `%s' still has initial default value"),
 		   required_fields[initial]);
 	}
-
-      /* Verify the validity of CHARSET.  Even if not in verbose mode,
-	 because the consequences are not harmless.  */
-      {
-	const char *charsetstr = strstr (msgstr_string, "charset=");
-
-	if (charsetstr != NULL)
-	  {
-	    /* The list of charsets supported by glibc's iconv() and by
-	       the portable iconv() across platforms.  Taken from
-	       intl/config.charset.  */
-	    static const char *standard_charsets[] =
-	    {
-	      "ASCII", "ANSI_X3.4-1968", "US-ASCII",
-	      "ISO-8859-1", "ISO_8859-1",
-	      "ISO-8859-2", "ISO_8859-2",
-	      "ISO-8859-3", "ISO_8859-3",
-	      "ISO-8859-4", "ISO_8859-4",
-	      "ISO-8859-5", "ISO_8859-5",
-	      "ISO-8859-6", "ISO_8859-6",
-	      "ISO-8859-7", "ISO_8859-7",
-	      "ISO-8859-8", "ISO_8859-8",
-	      "ISO-8859-9", "ISO_8859-9",
-	      "ISO-8859-13", "ISO_8859-13",
-	      "ISO-8859-15", "ISO_8859-15",
-	      "KOI8-R",
-	      "KOI8-U",
-	      "CP850",
-	      "CP866",
-	      "CP874",
-	      "CP932",
-	      "CP949",
-	      "CP950",
-	      "CP1250",
-	      "CP1251",
-	      "CP1252",
-	      "CP1253",
-	      "CP1254",
-	      "CP1255",
-	      "CP1256",
-	      "CP1257",
-	      "GB2312",
-	      "EUC-JP",
-	      "EUC-KR",
-	      "EUC-TW",
-	      "BIG5",
-	      "BIG5HKSCS",
-	      "GBK",
-	      "GB18030",
-	      "SJIS",
-	      "JOHAB",
-	      "TIS-620",
-	      "VISCII",
-	      "UTF-8"
-	    };
-	    size_t len;
-	    char *charset;
-	    size_t i;
-
-	    charsetstr += strlen ("charset=");
-	    len = strcspn (charsetstr, " \t\n");
-	    charset = (char *) alloca (len + 1);
-	    memcpy (charset, charsetstr, len);
-	    charset[len] = '\0';
-
-	    for (i = 0; i < SIZEOF (standard_charsets); i++)
-	      if (strcasecmp (charset, standard_charsets[i]) == 0)
-		break;
-	    if (i == SIZEOF (standard_charsets))
-	      error (0, 0, _("\
-%s: warning: charset \"%s\" is not a portable encoding name\n\
-%*s  warning: charset conversion might not work"),
-		     gram_pos.file_name, charset,
-		     strlen (gram_pos.file_name), "");
-	  }
-	else
-	  error (0, 0, _("\
-%s: warning: charset missing in header\n\
-%*s  warning: charset conversion will not work"),
-		 gram_pos.file_name, strlen (gram_pos.file_name), "");
-      }
     }
   else
     /* We don't count the header entry in the statistic so place the
diff --git a/src/po-lex.c b/src/po-lex.c
index 1ba8264d3..cf43fe752 100644
--- a/src/po-lex.c
+++ b/src/po-lex.c
@@ -59,6 +59,10 @@
 static FILE *fp;
 lex_pos_ty gram_pos;
 unsigned int gram_max_allowed_errors = 20;
+const char *po_lex_charset;
+#if HAVE_ICONV
+iconv_t po_lex_iconv;
+#endif
 static int pass_comments = 0;
 static int pass_obsolete_entries = 0;
 
@@ -81,6 +85,10 @@ lex_open (fname)
 	   _("error while opening \"%s\" for reading"), fname);
 
   gram_pos.line_number = 1;
+  po_lex_charset = NULL;
+#if HAVE_ICONV
+  po_lex_iconv = (iconv_t)(-1);
+#endif
 }
 
 
@@ -97,6 +105,14 @@ lex_close ()
   gram_pos.file_name = 0;
   gram_pos.line_number = 0;
   error_message_count = 0;
+  po_lex_charset = NULL;
+#if HAVE_ICONV
+  if (po_lex_iconv != (iconv_t)(-1))
+    {
+      iconv_close (po_lex_iconv);
+      po_lex_iconv = (iconv_t)(-1);
+    }
+#endif
 }
 
 
@@ -427,38 +443,85 @@ po_gram_lex ()
 	  break;
 
 	case '"':
-	  bufpos = 0;
-	  while (1)
-	    {
-	      if (bufpos >= bufmax)
-		{
-		  bufmax += 100;
-		  buf = xrealloc (buf, bufmax);
-		}
-	      c = lex_getc ();
-	      if (c == '\n')
-		{
-		  po_gram_error (_("end-of-line within string"));
-		  break;
-		}
-	      if (c == EOF)
-		{
-		  po_gram_error (_("end-of-file within string"));
-		  break;
-		}
-	      if (c == '"')
-		break;
-
-	      if (c == '\\')
-		c = control_sequence ();
+	  /* Accumulate a string.  */
+	  {
+#if HAVE_ICONV
+	    size_t bufmbpos = 0;
+#endif
 
-	      buf[bufpos++] = c;
-	    }
-	  buf[bufpos] = 0;
+	    bufpos = 0;
+	    while (1)
+	      {
+		if (bufpos >= bufmax)
+		  {
+		    bufmax += 100;
+		    buf = xrealloc (buf, bufmax);
+		  }
+		c = lex_getc ();
+		if (c == EOF)
+		  {
+		    po_gram_error (_("end-of-file within string"));
+		    break;
+		  }
+		if (c == '\n')
+		  {
+		    po_gram_error (_("end-of-line within string"));
+		    break;
+		  }
+#if HAVE_ICONV
+		/* Interpret c only if it is the first byte of a multi-byte
+		   character.  Don't interpret it as ASCII when it is the
+		   second byte.  This is needed for the BIG5, BIG5HKSCS, GBK,
+		   GB18030, SJIS, JOHAB encodings.  */
+		if (po_lex_iconv == (iconv_t)(-1) || bufmbpos == bufpos)
+#endif
+		  {
+		    if (c == '"')
+		      break;
+
+		    if (c == '\\')
+		      {
+			buf[bufpos++] = control_sequence ();
+#if HAVE_ICONV
+			bufmbpos++;
+#endif
+			continue;
+		      }
+		  }
+
+		/* Add c to the accumulator.  */
+		buf[bufpos++] = c;
+#if HAVE_ICONV
+		if (po_lex_iconv != (iconv_t)(-1))
+		  {
+		    /* If c terminates a multibyte character, set
+		       bufmbpos = bufpos.  Otherwise keep bufmbpos
+		       pointing at the start of the multibyte character.  */
+		    char scratchbuf[64];
+		    const char *inptr = &buf[bufmbpos];
+		    size_t insize = bufpos - bufmbpos;
+		    char *outptr = &scratchbuf[0];
+		    size_t outsize = sizeof (scratchbuf);
+		    if (iconv (po_lex_iconv,
+			       (ICONV_CONST char **) &inptr, &insize,
+			       &outptr, &outsize)
+			== (size_t)(-1)
+			&& errno == EILSEQ)
+		      {
+			po_gram_error (_("invalid multibyte sequence"));
+			bufmbpos = bufpos;
+		      }
+		    else
+		      bufmbpos = inptr - buf;
+		  }
+#endif
+	      }
+	    buf[bufpos] = 0;
 
-	  /* FIXME: Treatment of embedded \000 chars is incorrect.  */
-	  po_gram_lval.string = xstrdup (buf);
-	  return STRING;
+	    /* FIXME: Treatment of embedded \000 chars is incorrect.  */
+	    po_gram_lval.string = xstrdup (buf);
+	    return STRING;
+	  }
 
 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
 	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
diff --git a/src/po-lex.h b/src/po-lex.h
index f9cede0d3..4beb7bc64 100644
--- a/src/po-lex.h
+++ b/src/po-lex.h
@@ -21,6 +21,9 @@
 #define _PO_LEX_H
 
 #include <sys/types.h>
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
 #include "error.h"
 #include "pos.h"
 
@@ -36,6 +39,14 @@ extern lex_pos_ty gram_pos;
    terminate.  Cf. error_message_count, declared in <error.h>.  */
 extern unsigned int gram_max_allowed_errors;
 
+/* The PO file's encoding, as specified in the header entry.  */
+extern const char *po_lex_charset;
+
+#if HAVE_ICONV
+/* Converter from the PO file's encoding to UTF-8.  */
+extern iconv_t po_lex_iconv;
+#endif
+
 
 /* Open the PO file FNAME and prepare its lexical analysis.  */
 extern void lex_open PARAMS ((const char *__fname));
diff --git a/src/po.c b/src/po.c
index 0e1ab8653..b87fcd3c6 100644
--- a/src/po.c
+++ b/src/po.c
@@ -29,6 +29,11 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
 #include "po.h"
 #include "po-hash.h"
 #include "system.h"
+#include "libgettext.h"
+
+#define _(str) gettext (str)
+
+#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
 
 /* Prototypes for local functions.  */
 static void po_parse_brief PARAMS ((po_ty *__pop));
@@ -59,6 +64,7 @@ po_alloc (pomp)
 
   pop = xmalloc (pomp->size);
   pop->method = pomp;
+  pop->next_is_fuzzy = 0;
   if (pomp->constructor)
     pomp->constructor (pop);
   return pop;
@@ -163,8 +169,141 @@ po_callback_message (msgid, msgid_pos, msgid_plural,
      lex_pos_ty *msgstr_pos;
 {
   /* assert(callback_arg); */
+
+  /* Test for header entry: empty msgid that is not marked fuzzy.  */
+  if (msgid[0] == '\0' && !callback_arg->next_is_fuzzy)
+    {
+      /* Verify the validity of CHARSET.  It is necessary
+	 1. for the correct treatment of multibyte characters containing
+	    0x5C bytes in the PO lexer,
+	 2. so that at run time, gettext() can call iconv() to convert
+	    msgstr.  */
+      const char *charsetstr = strstr (msgstr, "charset=");
+
+      if (charsetstr != NULL)
+	{
+	  /* The list of charsets supported by glibc's iconv() and by
+	     the portable iconv() across platforms.  Taken from
+	     intl/config.charset.  */
+	  static const char *standard_charsets[] =
+	  {
+	    "ASCII", "ANSI_X3.4-1968", "US-ASCII",
+	    "ISO-8859-1", "ISO_8859-1",
+	    "ISO-8859-2", "ISO_8859-2",
+	    "ISO-8859-3", "ISO_8859-3",
+	    "ISO-8859-4", "ISO_8859-4",
+	    "ISO-8859-5", "ISO_8859-5",
+	    "ISO-8859-6", "ISO_8859-6",
+	    "ISO-8859-7", "ISO_8859-7",
+	    "ISO-8859-8", "ISO_8859-8",
+	    "ISO-8859-9", "ISO_8859-9",
+	    "ISO-8859-13", "ISO_8859-13",
+	    "ISO-8859-15", "ISO_8859-15",
+	    "KOI8-R",
+	    "KOI8-U",
+	    "CP850",
+	    "CP866",
+	    "CP874",
+	    "CP932",
+	    "CP949",
+	    "CP950",
+	    "CP1250",
+	    "CP1251",
+	    "CP1252",
+	    "CP1253",
+	    "CP1254",
+	    "CP1255",
+	    "CP1256",
+	    "CP1257",
+	    "GB2312",
+	    "EUC-JP",
+	    "EUC-KR",
+	    "EUC-TW",
+	    "BIG5",
+	    "BIG5HKSCS",
+	    "GBK",
+	    "GB18030",
+	    "SJIS",
+	    "JOHAB",
+	    "TIS-620",
+	    "VISCII",
+	    "UTF-8"
+	  };
+	  size_t len;
+	  char *charset;
+	  size_t i;
+
+	  charsetstr += strlen ("charset=");
+	  len = strcspn (charsetstr, " \t\n");
+	  charset = (char *) alloca (len + 1);
+	  memcpy (charset, charsetstr, len);
+	  charset[len] = '\0';
+
+	  for (i = 0; i < SIZEOF (standard_charsets); i++)
+	    if (strcasecmp (charset, standard_charsets[i]) == 0)
+	      break;
+	  if (i == SIZEOF (standard_charsets))
+	    {
+	      error (0, 0, _("\
+%s: warning: charset \"%s\" is not a portable encoding name\n\
+%*s  warning: charset conversion might not work"),
+		     gram_pos.file_name, charset,
+		     (int) strlen (gram_pos.file_name), "");
+	      --error_message_count;
+	    }
+	  else
+	    {
+	      po_lex_charset = standard_charsets[i];
+#if HAVE_ICONV
+	      if (po_lex_iconv != (iconv_t)(-1))
+		iconv_close (po_lex_iconv);
+	      po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
+	      if (po_lex_iconv == (iconv_t)(-1))
+		{
+		  /* For CJK encodings which have double-byte characters
+		     ending in 0x5C, the string parser is likely to be
+		     confused if it can't see the character boundaries.  */
+		  const char *note =
+		    (strcmp (po_lex_charset, "BIG5") == 0
+		     || strcmp (po_lex_charset, "BIG5HKSCS") == 0
+		     || strcmp (po_lex_charset, "GBK") == 0
+		     || strcmp (po_lex_charset, "GB18030") == 0
+		     || strcmp (po_lex_charset, "SJIS") == 0
+		     || strcmp (po_lex_charset, "JOHAB") == 0
+		     ? _(", expect parse errors")
+		     : "");
+
+# if _LIBICONV_VERSION
+		  error (0, 0, _("\
+%s: warning: charset \"%s\" is not supported by iconv%s"),
+			 gram_pos.file_name, po_lex_charset, note);
+# else
+		  error (0, 0, _("\
+%s: warning: charset \"%s\" is not supported by iconv%s\n\
+%*s  warning: consider installing libiconv and then reinstalling GNU gettext"),
+			 gram_pos.file_name, po_lex_charset, note,
+			 (int) strlen (gram_pos.file_name), "");
+# endif
+		  --error_message_count;
+		}
+#endif
+	    }
+	}
+      else
+	{
+	  error (0, 0, _("\
+%s: warning: charset missing in header\n\
+%*s  warning: charset conversion will not work"),
+		 gram_pos.file_name, (int) strlen (gram_pos.file_name), "");
+	  --error_message_count;
+	}
+    }
+
   po_directive_message (callback_arg, msgid, msgid_pos, msgid_plural,
 			msgstr, msgstr_len, msgstr_pos);
+
+  /* Prepare for next message.  */
+  callback_arg->next_is_fuzzy = 0;
 }
 
 
@@ -220,8 +359,12 @@ po_callback_comment (s)
 	po_comment (callback_arg, s + 1);
     }
   else if (*s == ',' || *s == '!')
-    /* Get all entries in the special comment line.  */
-    po_comment_special (callback_arg, s + 1);
+    {
+      /* Get all entries in the special comment line.  */
+      if (strstr (s + 1, "fuzzy") != NULL)
+	callback_arg->next_is_fuzzy = 1;
+      po_comment_special (callback_arg, s + 1);
+    }
   else
     {
       /* It looks like a plain vanilla comment, but Solaris-style file
diff --git a/src/po.h b/src/po.h
index 0b77f8e85..e46e1db2b 100644
--- a/src/po.h
+++ b/src/po.h
@@ -106,7 +106,8 @@ struct po_method_ty
    etc.  */
 
 #define PO_BASE_TY \
-  po_method_ty *method;
+  po_method_ty *method; \
+  int next_is_fuzzy;
 
 typedef struct po_ty po_ty;
 struct po_ty