Make "msgmerge --update" work better on CJK files even if iconv() is not available.

author Bruno Haible <bruno@clisp.org>

Mon, 7 Jan 2002 17:51:04 +0000 (17:51 +0000)

committer Bruno Haible <bruno@clisp.org>

Sun, 21 Jun 2009 22:36:38 +0000 (00:36 +0200)
author Bruno Haible <bruno@clisp.org>
Mon, 7 Jan 2002 17:51:04 +0000 (17:51 +0000)
committer Bruno Haible <bruno@clisp.org>
Sun, 21 Jun 2009 22:36:38 +0000 (00:36 +0200)
diff --git a/src/ChangeLog b/src/ChangeLog

index 2fa84e52f685614f7cc999e3ae91e5f57dd3b179..f9d5dff8a73849f1432b4965d5a72b2ed398a1f9 100644 (file)
--- a/src/ChangeLog
+++ b/src/ChangeLog
@@ -1,3 +1,23 @@
+2002-01-05  Bruno Haible  <bruno@clisp.org>
+
+       Make "msgmerge --update" work better on CJK files even if iconv() is
+       not available.
+       * po-charset.h (po_is_charset_weird): New declaration.
+       (po_is_charset_weird_cjk): Likewise.
+       (po_lex_weird_cjk): New variable declaration.
+       * po-charset.c (po_is_charset_weird): New function, extracted from
+       po_lex_charset_set.
+       (po_is_charset_weird_cjk): New function.
+       (po_lex_weird_cjk): New variable.
+       (po_lex_charset_init): Initialize po_lex_weird_cjk.
+       (po_lex_charset_set): Call po_is_charset_weird and
+       po_is_charset_weird_cjk. Set po_lex_weird_cjk.
+       (po_lex_charset_close): Reset po_lex_weird_cjk.
+       * po-lex.c (mbfile_getc): If po_lex_weird_cjk is set, possibly return
+       a double byte instead of single byte.
+       * write-po.c (wrap): Call po_is_charset_weird_cjk. If it returns true,
+       group double bytes where possible.
+
  2002-01-05  Bruno Haible  <bruno@clisp.org>
  
         * gettext.c: TESTS version is now separate.
diff --git a/src/po-charset.c b/src/po-charset.c

index 4e25d6ed926339b301c156fdbd05cefc21cfd97c..357fb5776394a875ad4c308174c150af1eed36a7 100644 (file)
--- a/src/po-charset.c
+++ b/src/po-charset.c
@@ -1,5 +1,5 @@
  /* Charset handling while reading PO files.
-   Copyright (C) 2001 Free Software Foundation, Inc.
+   Copyright (C) 2001-2002 Free Software Foundation, Inc.
     Written by Bruno Haible <haible@clisp.cons.org>, 2001.
  
     This program is free software; you can redistribute it and/or modify
@@ -117,6 +117,52 @@ po_charset_ascii_compatible (canon_charset)
      return true;
  }
  
+/* Test for a weird encoding, i.e. an encoding which has double-byte
+   characters ending in 0x5C.  */
+bool po_is_charset_weird (canon_charset)
+     const char *canon_charset;
+{
+  static const char *weird_charsets[] =
+  {
+    "BIG5",
+    "BIG5-HKSCS",
+    "GBK",
+    "GB18030",
+    "SHIFT_JIS",
+    "JOHAB"
+  };
+  size_t i;
+
+  for (i = 0; i < SIZEOF (weird_charsets); i++)
+    if (strcmp (canon_charset, weird_charsets[i]) == 0)
+      return true;
+  return false;
+}
+
+/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
+   An encoding has CJK structure if every valid character stream is composed
+   of single bytes in the range 0x{00..7F} and of byte pairs in the range
+   0x{80..FF}{30..FF}.  */
+bool po_is_charset_weird_cjk (canon_charset)
+     const char *canon_charset;
+{
+  static const char *weird_cjk_charsets[] =
+  {                    /* single bytes   double bytes       */
+    "BIG5",            /* 0x{00..7F},    0x{A1..F9}{40..FE} */
+    "BIG5-HKSCS",      /* 0x{00..7F},    0x{88..FE}{40..FE} */
+    "GBK",             /* 0x{00..7F},    0x{81..FE}{40..FE} */
+    "GB18030",         /* 0x{00..7F},    0x{81..FE}{30..FE} */
+    "SHIFT_JIS",       /* 0x{00..7F},    0x{81..F9}{40..FC} */
+    "JOHAB"            /* 0x{00..7F},    0x{84..F9}{31..FE} */
+  };
+  size_t i;
+
+  for (i = 0; i < SIZEOF (weird_cjk_charsets); i++)
+    if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
+      return true;
+  return false;
+}
+
  
  /* The PO file's encoding, as specified in the header entry.  */
  const char *po_lex_charset;
@@ -125,6 +171,9 @@ const char *po_lex_charset;
  /* Converter from the PO file's encoding to UTF-8.  */
  iconv_t po_lex_iconv;
  #endif
+/* If no converter is available, some information about the structure of the
+   PO file's encoding.  */
+bool po_lex_weird_cjk;
  
  void
  po_lex_charset_init ()
@@ -133,6 +182,7 @@ po_lex_charset_init ()
  #if HAVE_ICONV
    po_lex_iconv = (iconv_t)(-1);
  #endif
+  po_lex_weird_cjk = false;
  }
  
  void
@@ -177,19 +227,6 @@ Message conversion to user's charset might not work.\n"),
         }
        else
         {
-         /* The list of encodings in standard_charsets which have
-            double-byte characters ending in 0x5C.  For these encodings,
-            the string parser is likely to be confused if it can't see
-            the character boundaries.  */
-         static const char *weird_charsets[] =
-         {
-           "BIG5",
-           "BIG5-HKSCS",
-           "GBK",
-           "GB18030",
-           "SHIFT_JIS",
-           "JOHAB"
-         };
           const char *envval;
  
           po_lex_charset = canon_charset;
@@ -212,6 +249,7 @@ Message conversion to user's charset might not work.\n"),
  #if HAVE_ICONV
               po_lex_iconv = (iconv_t)(-1);
  #endif
+             po_lex_weird_cjk = false;
             }
           else
             {
@@ -226,13 +264,15 @@ Message conversion to user's charset might not work.\n"),
               po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
               if (po_lex_iconv == (iconv_t)(-1))
                 {
-                 size_t i;
                   const char *note;
  
-                 for (i = 0; i < SIZEOF (weird_charsets); i++)
-                   if (strcmp (po_lex_charset, weird_charsets[i]) == 0)
-                     break;
-                 if (i < SIZEOF (weird_charsets))
+                 /* Test for a charset which has double-byte characters
+                    ending in 0x5C.  For these encodings, the string parser
+                    is likely to be confused if it can't see the character
+                    boundaries.  */
+                 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+                 if (po_is_charset_weird (po_lex_charset)
+                     && !po_lex_weird_cjk)
                     note = _("Continuing anyway, expect parse errors.");
                   else
                     note = _("Continuing anyway.");
@@ -255,12 +295,12 @@ would fix this problem.\n")));
                   multiline_warning (NULL, xasprintf (_("%s\n"), note));
                 }
  #else
-             size_t i;
-
-             for (i = 0; i < SIZEOF (weird_charsets); i++)
-               if (strcmp (po_lex_charset, weird_charsets[i]) == 0)
-                 break;
-             if (i < SIZEOF (weird_charsets))
+             /* Test for a charset which has double-byte characters
+                ending in 0x5C.  For these encodings, the string parser
+                is likely to be confused if it can't see the character
+                boundaries.  */
+             po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+             if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
                 {
                   const char *note =
                     _("Continuing anyway, expect parse errors.");
@@ -309,4 +349,5 @@ po_lex_charset_close ()
        po_lex_iconv = (iconv_t)(-1);
      }
  #endif
+  po_lex_weird_cjk = false;
  }
diff --git a/src/po-charset.h b/src/po-charset.h

index 5922c94391c91378cb5ad81ea65344627ac7cd4d..216ab330a9c9be6526e4fa0e21ff5d4aa008fe0f 100644 (file)
--- a/src/po-charset.h
+++ b/src/po-charset.h
@@ -1,5 +1,5 @@
  /* Charset handling while reading PO files.
-   Copyright (C) 2001 Free Software Foundation, Inc.
+   Copyright (C) 2001-2002 Free Software Foundation, Inc.
     Written by Bruno Haible <haible@clisp.cons.org>, 2001.
  
     This program is free software; you can redistribute it and/or modify
@@ -36,6 +36,16 @@ extern const char *po_charset_ascii;
  /* Test for ASCII compatibility.  */
  extern bool po_charset_ascii_compatible PARAMS ((const char *canon_charset));
  
+/* Test for a weird encoding, i.e. an encoding which has double-byte
+   characters ending in 0x5C.  */
+extern bool po_is_charset_weird PARAMS ((const char *canon_charset));
+
+/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
+   An encoding has CJK structure if every valid character stream is composed
+   of single bytes in the range 0x{00..7F} and of byte pairs in the range
+   0x{80..FF}{30..FF}.  */
+extern bool po_is_charset_weird_cjk PARAMS ((const char *canon_charset));
+
  
  /* The PO file's encoding, as specified in the header entry.  */
  extern const char *po_lex_charset;
@@ -44,6 +54,9 @@ extern const char *po_lex_charset;
  /* Converter from the PO file's encoding to UTF-8.  */
  extern iconv_t po_lex_iconv;
  #endif
+/* If no converter is available, some information about the structure of the
+   PO file's encoding.  */
+extern bool po_lex_weird_cjk;
  
  /* Initialize the PO file's encoding.  */
  extern void po_lex_charset_init PARAMS ((void));
diff --git a/src/po-lex.c b/src/po-lex.c

index 98a6eca3f8924de6a714783ccf4ebada482367b4..76ba7e166f10e7ed49632479cc96025dd3d7101d 100644 (file)
--- a/src/po-lex.c
+++ b/src/po-lex.c
@@ -1,5 +1,5 @@
  /* GNU gettext - internationalization aids
-   Copyright (C) 1995-1999, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1995-1999, 2000-2002 Free Software Foundation, Inc.
  
     This file was written by Peter Miller <millerp@canb.auug.org.au>.
     Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
@@ -611,8 +611,32 @@ incomplete multibyte sequence at end of line"));
    else
  #endif
      {
-      /* Return a single byte.  */
-      bytes = 1;
+      if (po_lex_weird_cjk
+         /* Special handling of encodings with CJK structure.  */
+         && (unsigned char) mbf->buf[0] >= 0x80)
+       {
+         if (mbf->bufcount == 1)
+           {
+             /* Read one more byte.  */
+             int c = getc (mbf->fp);
+             if (c != EOF)
+               {
+                 mbf->buf[1] = (unsigned char) c;
+                 mbf->bufcount++;
+               }
+           }
+         if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
+           /* Return a double byte.  */
+           bytes = 2;
+         else
+           /* Return a single byte.  */
+           bytes = 1;
+       }
+      else
+       {
+         /* Return a single byte.  */
+         bytes = 1;
+       }
  #if HAVE_ICONV
        mbc->uc_valid = false;
  #endif
diff --git a/src/write-po.c b/src/write-po.c

index 7f30e30927fec3f90235cbfc8e8625c7c875cfc9..6bb6f59b5c390dbf67c05462d03afe2b3da9ceaa 100644 (file)
--- a/src/write-po.c
+++ b/src/write-po.c
@@ -1,5 +1,5 @@
  /* GNU gettext - internationalization aids
-   Copyright (C) 1995-1998, 2000, 2001 Free Software Foundation, Inc.
+   Copyright (C) 1995-1998, 2000-2002 Free Software Foundation, Inc.
  
     This file was written by Peter Miller <millerp@canb.auug.org.au>
  
@@ -240,6 +240,7 @@ wrap (fp, line_prefix, name, value, do_wrap, charset)
    const char *envval;
    iconv_t conv;
  #endif
+  bool weird_cjk;
  
  #if HAVE_ICONV
    /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35 don't know
@@ -261,7 +262,12 @@ wrap (fp, line_prefix, name, value, do_wrap, charset)
  # endif
      /* Use iconv() to parse multibyte characters.  */
      conv = iconv_open ("UTF-8", charset);
+
+  if (conv != (iconv_t)(-1))
+    weird_cjk = false;
+  else
  #endif
+    weird_cjk = po_is_charset_weird_cjk (po_charset_canonicalize (charset));
  
    /* Loop over the '\n' delimited portions of value.  */
    s = value;
@@ -341,7 +347,19 @@ wrap (fp, line_prefix, name, value, do_wrap, charset)
                 }
               else
  #endif
-               portion_len += 1;
+               {
+                 if (weird_cjk
+                     /* Special handling of encodings with CJK structure.  */
+                     && ep + 2 <= es
+                     && (unsigned char) ep[0] >= 0x80
+                     && (unsigned char) ep[1] >= 0x30)
+                   {
+                     portion_len += 2;
+                     ep += 1;
+                   }
+                 else
+                   portion_len += 1;
+               }
             }
         }
        portion = (char *) xmalloc (portion_len);
@@ -434,8 +452,22 @@ internationalized messages should not contain the `\\%c' escape sequence"),
               else
  #endif
                 {
-                 *pp++ = c;
-                 op++;
+                 if (weird_cjk
+                     /* Special handling of encodings with CJK structure.  */
+                     && ep + 2 <= es
+                     && (unsigned char) c >= 0x80
+                     && (unsigned char) ep[1] >= 0x30)
+                   {
+                     *pp++ = c;
+                     ep += 1;
+                     *pp++ = *ep;
+                     op += 2;
+                   }
+                 else
+                   {
+                     *pp++ = c;
+                     op++;
+                   }
                 }
             }
         }
author	Bruno Haible <bruno@clisp.org>
	Mon, 7 Jan 2002 17:51:04 +0000 (17:51 +0000)
committer	Bruno Haible <bruno@clisp.org>
	Sun, 21 Jun 2009 22:36:38 +0000 (00:36 +0200)
src/ChangeLog		patch \| blob \| blame \| history
src/po-charset.c		patch \| blob \| blame \| history
src/po-charset.h		patch \| blob \| blame \| history
src/po-lex.c		patch \| blob \| blame \| history
src/write-po.c		patch \| blob \| blame \| history