available.
+2002-01-05 Bruno Haible <bruno@clisp.org>
+
+ Make "msgmerge --update" work better on CJK files even if iconv() is
+ not available.
+ * po-charset.h (po_is_charset_weird): New declaration.
+ (po_is_charset_weird_cjk): Likewise.
+ (po_lex_weird_cjk): New variable declaration.
+ * po-charset.c (po_is_charset_weird): New function, extracted from
+ po_lex_charset_set.
+ (po_is_charset_weird_cjk): New function.
+ (po_lex_weird_cjk): New variable.
+ (po_lex_charset_init): Initialize po_lex_weird_cjk.
+ (po_lex_charset_set): Call po_is_charset_weird and
+ po_is_charset_weird_cjk. Set po_lex_weird_cjk.
+ (po_lex_charset_close): Reset po_lex_weird_cjk.
+ * po-lex.c (mbfile_getc): If po_lex_weird_cjk is set, possibly return
+ a double byte instead of a single byte.
+ * write-po.c (wrap): Call po_is_charset_weird_cjk. If it returns true,
+ group double bytes where possible.
+
2002-01-05 Bruno Haible <bruno@clisp.org>
* gettext.c: TESTS version is now separate.
/* Charset handling while reading PO files.
- Copyright (C) 2001 Free Software Foundation, Inc.
+ Copyright (C) 2001-2002 Free Software Foundation, Inc.
Written by Bruno Haible <haible@clisp.cons.org>, 2001.
This program is free software; you can redistribute it and/or modify
return true;
}
+/* Test for a weird encoding, i.e. an encoding which has double-byte
+ characters ending in 0x5C.
+ 0x5C is the ASCII backslash, so for these encodings a string parser
+ that cannot see the character boundaries is likely to be confused.
+ CANON_CHARSET is compared with strcmp against a fixed table of six
+ names, so it must match one of the table's spellings exactly
+ (presumably the canonicalized charset name - confirm against callers).
+ Return true if CANON_CHARSET is in the table, false otherwise. */
+bool po_is_charset_weird (canon_charset)
+ const char *canon_charset;
+{
+ static const char *weird_charsets[] =
+ {
+ "BIG5",
+ "BIG5-HKSCS",
+ "GBK",
+ "GB18030",
+ "SHIFT_JIS",
+ "JOHAB"
+ };
+ size_t i;
+
+ for (i = 0; i < SIZEOF (weird_charsets); i++)
+ if (strcmp (canon_charset, weird_charsets[i]) == 0)
+ return true;
+ return false;
+}
+
+/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
+ An encoding has CJK structure if every valid character stream is composed
+ of single bytes in the range 0x{00..7F} and of byte pairs in the range
+ 0x{80..FF}{30..FF}.
+ The comment next to each table entry below gives that encoding's
+ single-byte and double-byte ranges; each lies within the ranges above.
+ At present the table lists the same six names as the table in
+ po_is_charset_weird.
+ CANON_CHARSET is compared with strcmp, so it must match a table entry
+ exactly. Return true on a match, false otherwise. */
+bool po_is_charset_weird_cjk (canon_charset)
+ const char *canon_charset;
+{
+ static const char *weird_cjk_charsets[] =
+ { /* single bytes double bytes */
+ "BIG5", /* 0x{00..7F}, 0x{A1..F9}{40..FE} */
+ "BIG5-HKSCS", /* 0x{00..7F}, 0x{88..FE}{40..FE} */
+ "GBK", /* 0x{00..7F}, 0x{81..FE}{40..FE} */
+ "GB18030", /* 0x{00..7F}, 0x{81..FE}{30..FE} */
+ "SHIFT_JIS", /* 0x{00..7F}, 0x{81..F9}{40..FC} */
+ "JOHAB" /* 0x{00..7F}, 0x{84..F9}{31..FE} */
+ };
+ size_t i;
+
+ for (i = 0; i < SIZEOF (weird_cjk_charsets); i++)
+ if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
+ return true;
+ return false;
+}
+
/* The PO file's encoding, as specified in the header entry. */
const char *po_lex_charset;
/* Converter from the PO file's encoding to UTF-8. */
iconv_t po_lex_iconv;
#endif
+/* If no converter is available, some information about the structure of the
+ PO file's encoding. */
+bool po_lex_weird_cjk;
void
po_lex_charset_init ()
#if HAVE_ICONV
po_lex_iconv = (iconv_t)(-1);
#endif
+ po_lex_weird_cjk = false;
}
void
}
else
{
- /* The list of encodings in standard_charsets which have
- double-byte characters ending in 0x5C. For these encodings,
- the string parser is likely to be confused if it can't see
- the character boundaries. */
- static const char *weird_charsets[] =
- {
- "BIG5",
- "BIG5-HKSCS",
- "GBK",
- "GB18030",
- "SHIFT_JIS",
- "JOHAB"
- };
const char *envval;
po_lex_charset = canon_charset;
#if HAVE_ICONV
po_lex_iconv = (iconv_t)(-1);
#endif
+ po_lex_weird_cjk = false;
}
else
{
po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
if (po_lex_iconv == (iconv_t)(-1))
{
- size_t i;
const char *note;
- for (i = 0; i < SIZEOF (weird_charsets); i++)
- if (strcmp (po_lex_charset, weird_charsets[i]) == 0)
- break;
- if (i < SIZEOF (weird_charsets))
+ /* Test for a charset which has double-byte characters
+ ending in 0x5C. For these encodings, the string parser
+ is likely to be confused if it can't see the character
+ boundaries. */
+ po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+ if (po_is_charset_weird (po_lex_charset)
+ && !po_lex_weird_cjk)
note = _("Continuing anyway, expect parse errors.");
else
note = _("Continuing anyway.");
multiline_warning (NULL, xasprintf (_("%s\n"), note));
}
#else
- size_t i;
-
- for (i = 0; i < SIZEOF (weird_charsets); i++)
- if (strcmp (po_lex_charset, weird_charsets[i]) == 0)
- break;
- if (i < SIZEOF (weird_charsets))
+ /* Test for a charset which has double-byte characters
+ ending in 0x5C. For these encodings, the string parser
+ is likely to be confused if it can't see the character
+ boundaries. */
+ po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
+ if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
{
const char *note =
_("Continuing anyway, expect parse errors.");
po_lex_iconv = (iconv_t)(-1);
}
#endif
+ po_lex_weird_cjk = false;
}
/* Charset handling while reading PO files.
- Copyright (C) 2001 Free Software Foundation, Inc.
+ Copyright (C) 2001-2002 Free Software Foundation, Inc.
Written by Bruno Haible <haible@clisp.cons.org>, 2001.
This program is free software; you can redistribute it and/or modify
/* Test for ASCII compatibility. */
extern bool po_charset_ascii_compatible PARAMS ((const char *canon_charset));
+/* Test for a weird encoding, i.e. an encoding which has double-byte
+ characters ending in 0x5C. */
+extern bool po_is_charset_weird PARAMS ((const char *canon_charset));
+
+/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
+ An encoding has CJK structure if every valid character stream is composed
+ of single bytes in the range 0x{00..7F} and of byte pairs in the range
+ 0x{80..FF}{30..FF}. */
+extern bool po_is_charset_weird_cjk PARAMS ((const char *canon_charset));
+
/* The PO file's encoding, as specified in the header entry. */
extern const char *po_lex_charset;
/* Converter from the PO file's encoding to UTF-8. */
extern iconv_t po_lex_iconv;
#endif
+/* If no converter is available, some information about the structure of the
+ PO file's encoding. */
+extern bool po_lex_weird_cjk;
/* Initialize the PO file's encoding. */
extern void po_lex_charset_init PARAMS ((void));
/* GNU gettext - internationalization aids
- Copyright (C) 1995-1999, 2000, 2001 Free Software Foundation, Inc.
+ Copyright (C) 1995-1999, 2000-2002 Free Software Foundation, Inc.
This file was written by Peter Miller <millerp@canb.auug.org.au>.
Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
else
#endif
{
- /* Return a single byte. */
- bytes = 1;
+ if (po_lex_weird_cjk
+ /* Special handling of encodings with CJK structure. */
+ && (unsigned char) mbf->buf[0] >= 0x80)
+ {
+ if (mbf->bufcount == 1)
+ {
+ /* Read one more byte. */
+ int c = getc (mbf->fp);
+ if (c != EOF)
+ {
+ mbf->buf[1] = (unsigned char) c;
+ mbf->bufcount++;
+ }
+ }
+ if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
+ /* Return a double byte. */
+ bytes = 2;
+ else
+ /* Return a single byte. */
+ bytes = 1;
+ }
+ else
+ {
+ /* Return a single byte. */
+ bytes = 1;
+ }
#if HAVE_ICONV
mbc->uc_valid = false;
#endif
/* GNU gettext - internationalization aids
- Copyright (C) 1995-1998, 2000, 2001 Free Software Foundation, Inc.
+ Copyright (C) 1995-1998, 2000-2002 Free Software Foundation, Inc.
This file was written by Peter Miller <millerp@canb.auug.org.au>
const char *envval;
iconv_t conv;
#endif
+ bool weird_cjk;
#if HAVE_ICONV
/* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35 don't know
# endif
/* Use iconv() to parse multibyte characters. */
conv = iconv_open ("UTF-8", charset);
+
+ if (conv != (iconv_t)(-1))
+ weird_cjk = false;
+ else
#endif
+ weird_cjk = po_is_charset_weird_cjk (po_charset_canonicalize (charset));
/* Loop over the '\n' delimited portions of value. */
s = value;
}
else
#endif
- portion_len += 1;
+ {
+ if (weird_cjk
+ /* Special handling of encodings with CJK structure. */
+ && ep + 2 <= es
+ && (unsigned char) ep[0] >= 0x80
+ && (unsigned char) ep[1] >= 0x30)
+ {
+ portion_len += 2;
+ ep += 1;
+ }
+ else
+ portion_len += 1;
+ }
}
}
portion = (char *) xmalloc (portion_len);
else
#endif
{
- *pp++ = c;
- op++;
+ if (weird_cjk
+ /* Special handling of encodings with CJK structure. */
+ && ep + 2 <= es
+ && (unsigned char) c >= 0x80
+ && (unsigned char) ep[1] >= 0x30)
+ {
+ *pp++ = c;
+ ep += 1;
+ *pp++ = *ep;
+ op += 2;
+ }
+ else
+ {
+ *pp++ = c;
+ op++;
+ }
}
}
}