From: Bruno Haible Date: Mon, 7 Jan 2002 17:51:04 +0000 (+0000) Subject: Make "msgmerge --update" work better on CJK files even if iconv() is not X-Git-Tag: v0.11~105 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=dbf56656d6d922e0d91ae387840b05ed65c80abe;p=thirdparty%2Fgettext.git Make "msgmerge --update" work better on CJK files even if iconv() is not available. --- diff --git a/src/ChangeLog b/src/ChangeLog index 2fa84e52f..f9d5dff8a 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,23 @@ +2002-01-05 Bruno Haible + + Make "msgmerge --update" work better on CJK files even if iconv() is + not available. + * po-charset.h (po_is_charset_weird): New declaration. + (po_is_charset_weird_cjk): Likewise. + (po_lex_weird_cjk): New variable declaration. + * po-charset.c (po_is_charset_weird): New function, extracted from + po_lex_charset_set. + (po_is_charset_weird_cjk): New function. + (po_lex_weird_cjk): New variable. + (po_lex_charset_init): Initialize po_lex_weird_cjk. + (po_lex_charset_set): Call po_is_charset_weird and + po_is_charset_weird_cjk. Set po_lex_weird_cjk. + (po_lex_charset_close): Reset po_lex_weird_cjk. + * po-lex.c (mbfile_getc): If po_lex_weird_cjk is set, possibly return + a double byte instead of single byte. + * write-po.c (wrap): Call po_is_charset_weird_cjk. If it returns true, + group double bytes where possible. + 2002-01-05 Bruno Haible * gettext.c: TESTS version is now separate. diff --git a/src/po-charset.c b/src/po-charset.c index 4e25d6ed9..357fb5776 100644 --- a/src/po-charset.c +++ b/src/po-charset.c @@ -1,5 +1,5 @@ /* Charset handling while reading PO files. - Copyright (C) 2001 Free Software Foundation, Inc. + Copyright (C) 2001-2002 Free Software Foundation, Inc. Written by Bruno Haible , 2001. This program is free software; you can redistribute it and/or modify @@ -117,6 +117,52 @@ po_charset_ascii_compatible (canon_charset) return true; } +/* Test for a weird encoding, i.e. 
an encoding which has double-byte + characters ending in 0x5C. */ +bool po_is_charset_weird (canon_charset) + const char *canon_charset; +{ + static const char *weird_charsets[] = + { + "BIG5", + "BIG5-HKSCS", + "GBK", + "GB18030", + "SHIFT_JIS", + "JOHAB" + }; + size_t i; + + for (i = 0; i < SIZEOF (weird_charsets); i++) + if (strcmp (canon_charset, weird_charsets[i]) == 0) + return true; + return false; +} + +/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure. + An encoding has CJK structure if every valid character stream is composed + of single bytes in the range 0x{00..7F} and of byte pairs in the range + 0x{80..FF}{30..FF}. */ +bool po_is_charset_weird_cjk (canon_charset) + const char *canon_charset; +{ + static const char *weird_cjk_charsets[] = + { /* single bytes double bytes */ + "BIG5", /* 0x{00..7F}, 0x{A1..F9}{40..FE} */ + "BIG5-HKSCS", /* 0x{00..7F}, 0x{88..FE}{40..FE} */ + "GBK", /* 0x{00..7F}, 0x{81..FE}{40..FE} */ + "GB18030", /* 0x{00..7F}, 0x{81..FE}{30..FE} */ + "SHIFT_JIS", /* 0x{00..7F}, 0x{81..F9}{40..FC} */ + "JOHAB" /* 0x{00..7F}, 0x{84..F9}{31..FE} */ + }; + size_t i; + + for (i = 0; i < SIZEOF (weird_cjk_charsets); i++) + if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0) + return true; + return false; +} + /* The PO file's encoding, as specified in the header entry. */ const char *po_lex_charset; @@ -125,6 +171,9 @@ const char *po_lex_charset; /* Converter from the PO file's encoding to UTF-8. */ iconv_t po_lex_iconv; #endif +/* If no converter is available, some information about the structure of the + PO file's encoding. */ +bool po_lex_weird_cjk; void po_lex_charset_init () @@ -133,6 +182,7 @@ po_lex_charset_init () #if HAVE_ICONV po_lex_iconv = (iconv_t)(-1); #endif + po_lex_weird_cjk = false; } void @@ -177,19 +227,6 @@ Message conversion to user's charset might not work.\n"), } else { - /* The list of encodings in standard_charsets which have - double-byte characters ending in 0x5C. 
For these encodings, - the string parser is likely to be confused if it can't see - the character boundaries. */ - static const char *weird_charsets[] = - { - "BIG5", - "BIG5-HKSCS", - "GBK", - "GB18030", - "SHIFT_JIS", - "JOHAB" - }; const char *envval; po_lex_charset = canon_charset; @@ -212,6 +249,7 @@ Message conversion to user's charset might not work.\n"), #if HAVE_ICONV po_lex_iconv = (iconv_t)(-1); #endif + po_lex_weird_cjk = false; } else { @@ -226,13 +264,15 @@ Message conversion to user's charset might not work.\n"), po_lex_iconv = iconv_open ("UTF-8", po_lex_charset); if (po_lex_iconv == (iconv_t)(-1)) { - size_t i; const char *note; - for (i = 0; i < SIZEOF (weird_charsets); i++) - if (strcmp (po_lex_charset, weird_charsets[i]) == 0) - break; - if (i < SIZEOF (weird_charsets)) + /* Test for a charset which has double-byte characters + ending in 0x5C. For these encodings, the string parser + is likely to be confused if it can't see the character + boundaries. */ + po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset); + if (po_is_charset_weird (po_lex_charset) + && !po_lex_weird_cjk) note = _("Continuing anyway, expect parse errors."); else note = _("Continuing anyway."); @@ -255,12 +295,12 @@ would fix this problem.\n"))); multiline_warning (NULL, xasprintf (_("%s\n"), note)); } #else - size_t i; - - for (i = 0; i < SIZEOF (weird_charsets); i++) - if (strcmp (po_lex_charset, weird_charsets[i]) == 0) - break; - if (i < SIZEOF (weird_charsets)) + /* Test for a charset which has double-byte characters + ending in 0x5C. For these encodings, the string parser + is likely to be confused if it can't see the character + boundaries. 
*/ + po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset); + if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk) { const char *note = _("Continuing anyway, expect parse errors."); @@ -309,4 +349,5 @@ po_lex_charset_close () po_lex_iconv = (iconv_t)(-1); } #endif + po_lex_weird_cjk = false; } diff --git a/src/po-charset.h b/src/po-charset.h index 5922c9439..216ab330a 100644 --- a/src/po-charset.h +++ b/src/po-charset.h @@ -1,5 +1,5 @@ /* Charset handling while reading PO files. - Copyright (C) 2001 Free Software Foundation, Inc. + Copyright (C) 2001-2002 Free Software Foundation, Inc. Written by Bruno Haible , 2001. This program is free software; you can redistribute it and/or modify @@ -36,6 +36,16 @@ extern const char *po_charset_ascii; /* Test for ASCII compatibility. */ extern bool po_charset_ascii_compatible PARAMS ((const char *canon_charset)); +/* Test for a weird encoding, i.e. an encoding which has double-byte + characters ending in 0x5C. */ +extern bool po_is_charset_weird PARAMS ((const char *canon_charset)); + +/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure. + An encoding has CJK structure if every valid character stream is composed + of single bytes in the range 0x{00..7F} and of byte pairs in the range + 0x{80..FF}{30..FF}. */ +extern bool po_is_charset_weird_cjk PARAMS ((const char *canon_charset)); + /* The PO file's encoding, as specified in the header entry. */ extern const char *po_lex_charset; @@ -44,6 +54,9 @@ extern const char *po_lex_charset; /* Converter from the PO file's encoding to UTF-8. */ extern iconv_t po_lex_iconv; #endif +/* If no converter is available, some information about the structure of the + PO file's encoding. */ +extern bool po_lex_weird_cjk; /* Initialize the PO file's encoding. 
*/ extern void po_lex_charset_init PARAMS ((void)); diff --git a/src/po-lex.c b/src/po-lex.c index 98a6eca3f..76ba7e166 100644 --- a/src/po-lex.c +++ b/src/po-lex.c @@ -1,5 +1,5 @@ /* GNU gettext - internationalization aids - Copyright (C) 1995-1999, 2000, 2001 Free Software Foundation, Inc. + Copyright (C) 1995-1999, 2000-2002 Free Software Foundation, Inc. This file was written by Peter Miller . Multibyte character handling by Bruno Haible . @@ -611,8 +611,32 @@ incomplete multibyte sequence at end of line")); else #endif { - /* Return a single byte. */ - bytes = 1; + if (po_lex_weird_cjk + /* Special handling of encodings with CJK structure. */ + && (unsigned char) mbf->buf[0] >= 0x80) + { + if (mbf->bufcount == 1) + { + /* Read one more byte. */ + int c = getc (mbf->fp); + if (c != EOF) + { + mbf->buf[1] = (unsigned char) c; + mbf->bufcount++; + } + } + if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30) + /* Return a double byte. */ + bytes = 2; + else + /* Return a single byte. */ + bytes = 1; + } + else + { + /* Return a single byte. */ + bytes = 1; + } #if HAVE_ICONV mbc->uc_valid = false; #endif diff --git a/src/write-po.c b/src/write-po.c index 7f30e3092..6bb6f59b5 100644 --- a/src/write-po.c +++ b/src/write-po.c @@ -1,5 +1,5 @@ /* GNU gettext - internationalization aids - Copyright (C) 1995-1998, 2000, 2001 Free Software Foundation, Inc. + Copyright (C) 1995-1998, 2000-2002 Free Software Foundation, Inc. This file was written by Peter Miller @@ -240,6 +240,7 @@ wrap (fp, line_prefix, name, value, do_wrap, charset) const char *envval; iconv_t conv; #endif + bool weird_cjk; #if HAVE_ICONV /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35 don't know @@ -261,7 +262,12 @@ wrap (fp, line_prefix, name, value, do_wrap, charset) # endif /* Use iconv() to parse multibyte characters. 
*/ conv = iconv_open ("UTF-8", charset); + + if (conv != (iconv_t)(-1)) + weird_cjk = false; + else #endif + weird_cjk = po_is_charset_weird_cjk (po_charset_canonicalize (charset)); /* Loop over the '\n' delimited portions of value. */ s = value; @@ -341,7 +347,19 @@ wrap (fp, line_prefix, name, value, do_wrap, charset) } else #endif - portion_len += 1; + { + if (weird_cjk + /* Special handling of encodings with CJK structure. */ + && ep + 2 <= es + && (unsigned char) ep[0] >= 0x80 + && (unsigned char) ep[1] >= 0x30) + { + portion_len += 2; + ep += 1; + } + else + portion_len += 1; + } } } portion = (char *) xmalloc (portion_len); @@ -434,8 +452,22 @@ internationalized messages should not contain the `\\%c' escape sequence"), else #endif { - *pp++ = c; - op++; + if (weird_cjk + /* Special handling of encodings with CJK structure. */ + && ep + 2 <= es + && (unsigned char) c >= 0x80 + && (unsigned char) ep[1] >= 0x30) + { + *pp++ = c; + ep += 1; + *pp++ = *ep; + op += 2; + } + else + { + *pp++ = c; + op++; + } } } }