From: Bruno Haible <bruno@clisp.org>
Date: Mon, 7 Jan 2002 17:51:04 +0000 (+0000)
Subject: Make "msgmerge --update" work better on CJK files even if iconv() is not
X-Git-Tag: v0.11~105
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=dbf56656d6d922e0d91ae387840b05ed65c80abe;p=thirdparty%2Fgettext.git

Make "msgmerge --update" work better on CJK files even if iconv() is not
available.
---

diff --git a/src/ChangeLog b/src/ChangeLog
index 2fa84e52f..f9d5dff8a 100644
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,23 @@
+2002-01-05  Bruno Haible  <bruno@clisp.org>
+
+	Make "msgmerge --update" work better on CJK files even if iconv() is
+	not available.
+	* po-charset.h (po_is_charset_weird): New declaration.
+	(po_is_charset_weird_cjk): Likewise.
+	(po_lex_weird_cjk): New variable declaration.
+	* po-charset.c (po_is_charset_weird): New function, extracted from
+	po_lex_charset_set.
+	(po_is_charset_weird_cjk): New function.
+	(po_lex_weird_cjk): New variable.
+	(po_lex_charset_init): Initialize po_lex_weird_cjk.
+	(po_lex_charset_set): Call po_is_charset_weird and
+	po_is_charset_weird_cjk. Set po_lex_weird_cjk.
+	(po_lex_charset_close): Reset po_lex_weird_cjk.
+	* po-lex.c (mbfile_getc): If po_lex_weird_cjk is set, possibly return
+	a double byte instead of a single byte.
+	* write-po.c (wrap): Call po_is_charset_weird_cjk. If it returns true,
+	group double bytes where possible.
+
 2002-01-05  Bruno Haible  <bruno@clisp.org>
 
 	* gettext.c: TESTS version is now separate.
diff --git a/src/po-charset.c b/src/po-charset.c
index 4e25d6ed9..357fb5776 100644
--- a/src/po-charset.c
+++ b/src/po-charset.c
@@ -1,5 +1,5 @@
 /* Charset handling while reading PO files.
-   Copyright (C) 2001 Free Software Foundation, Inc.
+   Copyright (C) 2001-2002 Free Software Foundation, Inc.
    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
 
    This program is free software; you can redistribute it and/or modify
@@ -117,6 +117,52 @@ po_charset_ascii_compatible (canon_charset)
     return true;
 }
 
+/* Test for a weird encoding, i.e. an encoding which has double-byte
+   characters ending in 0x5C.  */
+bool po_is_charset_weird (canon_charset)
+     const char *canon_charset;
+{
+  static const char *weird_charsets[] =
+  {
+    "BIG5",
+    "BIG5-HKSCS",
+    "GBK",
+    "GB18030",
+    "SHIFT_JIS",
+    "JOHAB"
+  };
+  size_t i;
+
+  for (i = 0; i < SIZEOF (weird_charsets); i++)
+    if (strcmp (canon_charset, weird_charsets[i]) == 0)
+      return true;
+  return false;
+}
+
+/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
+   An encoding has CJK structure if every valid character stream is composed
+   of single bytes in the range 0x{00..7F} and of byte pairs in the range
+   0x{80..FF}{30..FF}.  */
+bool po_is_charset_weird_cjk (canon_charset)
+     const char *canon_charset;
+{
+  static const char *weird_cjk_charsets[] =
+  {			/* single bytes   double bytes       */
+    "BIG5",		/* 0x{00..7F},    0x{A1..F9}{40..FE} */
+    "BIG5-HKSCS",	/* 0x{00..7F},    0x{88..FE}{40..FE} */
+    "GBK",		/* 0x{00..7F},    0x{81..FE}{40..FE} */
+    "GB18030",		/* 0x{00..7F},    0x{81..FE}{30..FE} */
+    "SHIFT_JIS",	/* 0x{00..7F},    0x{81..F9}{40..FC} */
+    "JOHAB"		/* 0x{00..7F},    0x{84..F9}{31..FE} */
+  };
+  size_t i;
+
+  for (i = 0; i < SIZEOF (weird_cjk_charsets); i++)
+    if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
+      return true;
+  return false;
+}
+
 
 /* The PO file's encoding, as specified in the header entry.  */
 const char *po_lex_charset;
@@ -125,6 +171,9 @@ const char *po_lex_charset;
 /* Converter from the PO file's encoding to UTF-8.  */
 iconv_t po_lex_iconv;
 #endif
+/* If no converter is available, some information about the structure of the
+   PO file's encoding.  */
+bool po_lex_weird_cjk;
 
 void
 po_lex_charset_init ()
@@ -133,6 +182,7 @@ po_lex_charset_init ()
 #if HAVE_ICONV
   po_lex_iconv = (iconv_t)(-1);
 #endif
+  po_lex_weird_cjk = false;
 }
 
 void
@@ -177,19 +227,6 @@ Message conversion to user's charset might not work.\n"),
 	}
       else
 	{
-	  /* The list of encodings in standard_charsets which have
-	     double-byte characters ending in 0x5C.  For these encodings,
-	     the string parser is likely to be confused if it can't see
-	     the character boundaries.  */
-	  static const char *weird_charsets[] =
-	  {
-	    "BIG5",
-	    "BIG5-HKSCS",
-	    "GBK",
-	    "GB18030",
-	    "SHIFT_JIS",
-	    "JOHAB"
-	  };
 	  const char *envval;
 
 	  po_lex_charset = canon_charset;
@@ -212,6 +249,7 @@ Message conversion to user's charset might not work.\n"),
 #if HAVE_ICONV
 	      po_lex_iconv = (iconv_t)(-1);
 #endif
+	      po_lex_weird_cjk = false;
 	    }
 	  else
 	    {
@@ -226,13 +264,15 @@ Message conversion to user's charset might not work.\n"),
 	      po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
 	      if (po_lex_iconv == (iconv_t)(-1))
 		{
-		  size_t i;
 		  const char *note;
 
-		  for (i = 0; i < SIZEOF (weird_charsets); i++)
-		    if (strcmp (po_lex_charset, weird_charsets[i]) == 0)
-		      break;
-		  if (i < SIZEOF (weird_charsets))
+		  /* Test for a charset which has double-byte characters
+		     ending in 0x5C.  For these encodings, the string parser
+		     is likely to be confused if it can't see the character
+		     boundaries.  */
+		  po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+		  if (po_is_charset_weird (po_lex_charset)
+		      && !po_lex_weird_cjk)
 		    note = _("Continuing anyway, expect parse errors.");
 		  else
 		    note = _("Continuing anyway.");
@@ -255,12 +295,12 @@ would fix this problem.\n")));
 		  multiline_warning (NULL, xasprintf (_("%s\n"), note));
 		}
 #else
-	      size_t i;
-
-	      for (i = 0; i < SIZEOF (weird_charsets); i++)
-		if (strcmp (po_lex_charset, weird_charsets[i]) == 0)
-		  break;
-	      if (i < SIZEOF (weird_charsets))
+	      /* Test for a charset which has double-byte characters
+		 ending in 0x5C.  For these encodings, the string parser
+		 is likely to be confused if it can't see the character
+		 boundaries.  */
+	      po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+	      if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
 		{
 		  const char *note =
 		    _("Continuing anyway, expect parse errors.");
@@ -309,4 +349,5 @@ po_lex_charset_close ()
       po_lex_iconv = (iconv_t)(-1);
     }
 #endif
+  po_lex_weird_cjk = false;
 }
diff --git a/src/po-charset.h b/src/po-charset.h
index 5922c9439..216ab330a 100644
--- a/src/po-charset.h
+++ b/src/po-charset.h
@@ -1,5 +1,5 @@
 /* Charset handling while reading PO files.
-   Copyright (C) 2001 Free Software Foundation, Inc.
+   Copyright (C) 2001-2002 Free Software Foundation, Inc.
    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
 
    This program is free software; you can redistribute it and/or modify
@@ -36,6 +36,16 @@ extern const char *po_charset_ascii;
 /* Test for ASCII compatibility.  */
 extern bool po_charset_ascii_compatible PARAMS ((const char *canon_charset));
 
+/* Test for a weird encoding, i.e. an encoding which has double-byte
+   characters ending in 0x5C.  */
+extern bool po_is_charset_weird PARAMS ((const char *canon_charset));
+
+/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
+   An encoding has CJK structure if every valid character stream is composed
+   of single bytes in the range 0x{00..7F} and of byte pairs in the range
+   0x{80..FF}{30..FF}.  */
+extern bool po_is_charset_weird_cjk PARAMS ((const char *canon_charset));
+
 
 /* The PO file's encoding, as specified in the header entry.  */
 extern const char *po_lex_charset;
@@ -44,6 +54,9 @@ extern const char *po_lex_charset;
 /* Converter from the PO file's encoding to UTF-8.  */
 extern iconv_t po_lex_iconv;
 #endif
+/* If no converter is available, some information about the structure of the
+   PO file's encoding.  */
+extern bool po_lex_weird_cjk;
 
 /* Initialize the PO file's encoding.  */
 extern void po_lex_charset_init PARAMS ((void));
diff --git a/src/po-lex.c b/src/po-lex.c
index 98a6eca3f..76ba7e166 100644
--- a/src/po-lex.c
+++ b/src/po-lex.c
@@ -1,5 +1,5 @@
 /* GNU gettext - internationalization aids
-   Copyright (C) 1995-1999, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1995-1999, 2000-2002 Free Software Foundation, Inc.
 
    This file was written by Peter Miller <millerp@canb.auug.org.au>.
    Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
@@ -611,8 +611,32 @@ incomplete multibyte sequence at end of line"));
   else
 #endif
     {
-      /* Return a single byte.  */
-      bytes = 1;
+      if (po_lex_weird_cjk
+	  /* Special handling of encodings with CJK structure.  */
+	  && (unsigned char) mbf->buf[0] >= 0x80)
+	{
+	  if (mbf->bufcount == 1)
+	    {
+	      /* Read one more byte.  */
+	      int c = getc (mbf->fp);
+	      if (c != EOF)
+		{
+		  mbf->buf[1] = (unsigned char) c;
+		  mbf->bufcount++;
+		}
+	    }
+	  if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
+	    /* Return a double byte.  */
+	    bytes = 2;
+	  else
+	    /* Return a single byte.  */
+	    bytes = 1;
+	}
+      else
+	{
+	  /* Return a single byte.  */
+	  bytes = 1;
+	}
 #if HAVE_ICONV
       mbc->uc_valid = false;
 #endif
diff --git a/src/write-po.c b/src/write-po.c
index 7f30e3092..6bb6f59b5 100644
--- a/src/write-po.c
+++ b/src/write-po.c
@@ -1,5 +1,5 @@
 /* GNU gettext - internationalization aids
-   Copyright (C) 1995-1998, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1995-1998, 2000-2002 Free Software Foundation, Inc.
 
    This file was written by Peter Miller <millerp@canb.auug.org.au>
 
@@ -240,6 +240,7 @@ wrap (fp, line_prefix, name, value, do_wrap, charset)
   const char *envval;
   iconv_t conv;
 #endif
+  bool weird_cjk;
 
 #if HAVE_ICONV
   /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35 don't know
@@ -261,7 +262,12 @@ wrap (fp, line_prefix, name, value, do_wrap, charset)
 # endif
     /* Use iconv() to parse multibyte characters.  */
     conv = iconv_open ("UTF-8", charset);
+
+  if (conv != (iconv_t)(-1))
+    weird_cjk = false;
+  else
 #endif
+    weird_cjk = po_is_charset_weird_cjk (po_charset_canonicalize (charset));
 
   /* Loop over the '\n' delimited portions of value.  */
   s = value;
@@ -341,7 +347,19 @@ wrap (fp, line_prefix, name, value, do_wrap, charset)
 		}
 	      else
 #endif
-		portion_len += 1;
+		{
+		  if (weird_cjk
+		      /* Special handling of encodings with CJK structure.  */
+		      && ep + 2 <= es
+		      && (unsigned char) ep[0] >= 0x80
+		      && (unsigned char) ep[1] >= 0x30)
+		    {
+		      portion_len += 2;
+		      ep += 1;
+		    }
+		  else
+		    portion_len += 1;
+		}
 	    }
 	}
       portion = (char *) xmalloc (portion_len);
@@ -434,8 +452,22 @@ internationalized messages should not contain the `\\%c' escape sequence"),
 	      else
 #endif
 		{
-		  *pp++ = c;
-		  op++;
+		  if (weird_cjk
+		      /* Special handling of encodings with CJK structure.  */
+		      && ep + 2 <= es
+		      && (unsigned char) c >= 0x80
+		      && (unsigned char) ep[1] >= 0x30)
+		    {
+		      *pp++ = c;
+		      ep += 1;
+		      *pp++ = *ep;
+		      op += 2;
+		    }
+		  else
+		    {
+		      *pp++ = c;
+		      op++;
+		    }
 		}
 	    }
 	}