From: Bruno Haible Date: Wed, 7 Mar 2001 14:38:03 +0000 (+0000) Subject: Fix parsing of strings in CJK encodings. X-Git-Tag: v0.10.36~124 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c265adb81a4bc541803a9b5e745c3cf136de539b;p=thirdparty%2Fgettext.git Fix parsing of strings in CJK encodings. --- diff --git a/src/ChangeLog b/src/ChangeLog index ca1a4cf94..cf4062be8 100644 --- a/src/ChangeLog +++ b/src/ChangeLog @@ -1,3 +1,25 @@ +2001-03-03 Bruno Haible + + Fix parsing of strings in CJK encodings. + * po.h (PO_BASE_TY): New field next_is_fuzzy. + * po-lex.h: Include iconv.h. + (po_lex_charset, po_lex_iconv): New declarations. + * po.c (SIZEOF): New macro. + (po_alloc): Initialize next_is_fuzzy. + (po_callback_message): Add check of charset in header entry. + Set po_lex_charset and po_lex_iconv. + After calling po_directive_message, reset next_is_fuzzy. + (po_callback_comment): Set next_is_fuzzy. + * msgfmt.c (format_directive_message): Remove check of charset in + header entry, now done in po.c. + * po-lex.c (po_lex_charset, po_lex_iconv): New variables. + (lex_open): Initialize them. + (lex_close): Reset them. + (po_gram_lex): While parsing a string, use 'po_lex_iconv' to avoid + treating the second byte of a multi-byte character as an ASCII + character. + * Makefile.am (msgcmp_LDADD, msgfmt_LDADD): New variables. + 2001-03-03 Bruno Haible * write-po.h: New file, pieces of message.h. diff --git a/src/Makefile.am b/src/Makefile.am index eab2f5360..2284f19d4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -51,7 +51,11 @@ po.c str-list.c xget-lex.c xgettext.c dir-list.c write-po.c msgcomm_SOURCES = msgcomm.c message.c po-gram-gen.y po-hash-gen.y po-lex.c \ open-po.c po.c str-list.c dir-list.c write-po.c -# Link dependencies. write-po.c pulls in linebreak.c which may need -liconv. +# Link dependencies. +# po-lex.c and po.c may need -liconv. +# write-po.c pulls in linebreak.c which may need -liconv. 
+msgcmp_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@ +msgfmt_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@ msgmerge_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@ msgunfmt_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@ xgettext_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@ diff --git a/src/msgfmt.c b/src/msgfmt.c index 00bab404b..4ed68aade 100644 --- a/src/msgfmt.c +++ b/src/msgfmt.c @@ -608,87 +608,6 @@ some header fields still have the initial default value")); error (0, 0, _("field `%s' still has initial default value"), required_fields[initial]); } - - /* Verify the validity of CHARSET. Even if not in verbose mode, - because the consequences are not harmless. */ - { - const char *charsetstr = strstr (msgstr_string, "charset="); - - if (charsetstr != NULL) - { - /* The list of charsets supported by glibc's iconv() and by - the portable iconv() across platforms. Taken from - intl/config.charset. */ - static const char *standard_charsets[] = - { - "ASCII", "ANSI_X3.4-1968", "US-ASCII", - "ISO-8859-1", "ISO_8859-1", - "ISO-8859-2", "ISO_8859-2", - "ISO-8859-3", "ISO_8859-3", - "ISO-8859-4", "ISO_8859-4", - "ISO-8859-5", "ISO_8859-5", - "ISO-8859-6", "ISO_8859-6", - "ISO-8859-7", "ISO_8859-7", - "ISO-8859-8", "ISO_8859-8", - "ISO-8859-9", "ISO_8859-9", - "ISO-8859-13", "ISO_8859-13", - "ISO-8859-15", "ISO_8859-15", - "KOI8-R", - "KOI8-U", - "CP850", - "CP866", - "CP874", - "CP932", - "CP949", - "CP950", - "CP1250", - "CP1251", - "CP1252", - "CP1253", - "CP1254", - "CP1255", - "CP1256", - "CP1257", - "GB2312", - "EUC-JP", - "EUC-KR", - "EUC-TW", - "BIG5", - "BIG5HKSCS", - "GBK", - "GB18030", - "SJIS", - "JOHAB", - "TIS-620", - "VISCII", - "UTF-8" - }; - size_t len; - char *charset; - size_t i; - - charsetstr += strlen ("charset="); - len = strcspn (charsetstr, " \t\n"); - charset = (char *) alloca (len + 1); - memcpy (charset, charsetstr, len); - charset[len] = '\0'; - - for (i = 0; i < SIZEOF (standard_charsets); i++) - if (strcasecmp (charset, 
standard_charsets[i]) == 0) - break; - if (i == SIZEOF (standard_charsets)) - error (0, 0, _("\ -%s: warning: charset \"%s\" is not a portable encoding name\n\ -%*s warning: charset conversion might not work"), - gram_pos.file_name, charset, - strlen (gram_pos.file_name), ""); - } - else - error (0, 0, _("\ -%s: warning: charset missing in header\n\ -%*s warning: charset conversion will not work"), - gram_pos.file_name, strlen (gram_pos.file_name), ""); - } } else /* We don't count the header entry in the statistic so place the diff --git a/src/po-lex.c b/src/po-lex.c index 1ba8264d3..cf43fe752 100644 --- a/src/po-lex.c +++ b/src/po-lex.c @@ -59,6 +59,10 @@ static FILE *fp; lex_pos_ty gram_pos; unsigned int gram_max_allowed_errors = 20; +const char *po_lex_charset; +#if HAVE_ICONV +iconv_t po_lex_iconv; +#endif static int pass_comments = 0; static int pass_obsolete_entries = 0; @@ -81,6 +85,10 @@ lex_open (fname) _("error while opening \"%s\" for reading"), fname); gram_pos.line_number = 1; + po_lex_charset = NULL; +#if HAVE_ICONV + po_lex_iconv = (iconv_t)(-1); +#endif } @@ -97,6 +105,14 @@ lex_close () gram_pos.file_name = 0; gram_pos.line_number = 0; error_message_count = 0; + po_lex_charset = NULL; +#if HAVE_ICONV + if (po_lex_iconv != (iconv_t)(-1)) + { + iconv_close (po_lex_iconv); + po_lex_iconv = (iconv_t)(-1); + } +#endif } @@ -427,38 +443,85 @@ po_gram_lex () break; case '"': - bufpos = 0; - while (1) - { - if (bufpos >= bufmax) - { - bufmax += 100; - buf = xrealloc (buf, bufmax); - } - c = lex_getc (); - if (c == '\n') - { - po_gram_error (_("end-of-line within string")); - break; - } - if (c == EOF) - { - po_gram_error (_("end-of-file within string")); - break; - } - if (c == '"') - break; - - if (c == '\\') - c = control_sequence (); + /* Accumulate a string. 
*/ + { +#if HAVE_ICONV + size_t bufmbpos = 0; +#endif - buf[bufpos++] = c; - } - buf[bufpos] = 0; + bufpos = 0; + while (1) + { + if (bufpos >= bufmax) + { + bufmax += 100; + buf = xrealloc (buf, bufmax); + } + c = lex_getc (); + if (c == EOF) + { + po_gram_error (_("end-of-file within string")); + break; + } + if (c == '\n') + { + po_gram_error (_("end-of-line within string")); + break; + } +#if HAVE_ICONV + /* Interpret c only if it is the first byte of a multi-byte + character. Don't interpret it as ASCII when it is the + second byte. This is needed for the BIG5, BIG5HKSCS, GBK, + GB18030, SJIS, JOHAB encodings. */ + if (po_lex_iconv == (iconv_t)(-1) || bufmbpos == bufpos) +#endif + { + if (c == '"') + break; + + if (c == '\\') + { + buf[bufpos++] = control_sequence (); +#if HAVE_ICONV + bufmbpos++; +#endif + continue; + } + } + + /* Add c to the accumulator. */ + buf[bufpos++] = c; +#if HAVE_ICONV + if (po_lex_iconv != (iconv_t)(-1)) + { + /* If c terminates a multibyte character, set + bufmbpos = bufpos. Otherwise keep bufmbpos + pointing at the start of the multibyte character. */ + char scratchbuf[64]; + const char *inptr = &buf[bufmbpos]; + size_t insize = bufpos - bufmbpos; + char *outptr = &scratchbuf[0]; + size_t outsize = sizeof (scratchbuf); + if (iconv (po_lex_iconv, + (ICONV_CONST char **) &inptr, &insize, + &outptr, &outsize) + == (size_t)(-1) + && errno == EILSEQ) + { + po_gram_error (_("invalid multibyte sequence")); + bufmbpos = bufpos; + } + else + bufmbpos = inptr - buf; + } +#endif + } + buf[bufpos] = 0; - /* FIXME: Treatment of embedded \000 chars is incorrect. */ - po_gram_lval.string = xstrdup (buf); - return STRING; + /* FIXME: Treatment of embedded \000 chars is incorrect. 
*/ + po_gram_lval.string = xstrdup (buf); + return STRING; + } case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': diff --git a/src/po-lex.h b/src/po-lex.h index f9cede0d3..4beb7bc64 100644 --- a/src/po-lex.h +++ b/src/po-lex.h @@ -21,6 +21,9 @@ #define _PO_LEX_H #include <sys/types.h> +#if HAVE_ICONV +#include <iconv.h> +#endif #include "error.h" #include "pos.h" @@ -36,6 +39,14 @@ extern lex_pos_ty gram_pos; terminate. Cf. error_message_count, declared in <error.h>. */ extern unsigned int gram_max_allowed_errors; +/* The PO file's encoding, as specified in the header entry. */ +extern const char *po_lex_charset; + +#if HAVE_ICONV +/* Converter from the PO file's encoding to UTF-8. */ +extern iconv_t po_lex_iconv; +#endif + /* Open the PO file FNAME and prepare its lexical analysis. */ extern void lex_open PARAMS ((const char *__fname)); diff --git a/src/po.c b/src/po.c index 0e1ab8653..b87fcd3c6 100644 --- a/src/po.c +++ b/src/po.c @@ -29,6 +29,11 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "po.h" #include "po-hash.h" #include "system.h" +#include "libgettext.h" + +#define _(str) gettext (str) + +#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) /* Prototypes for local functions. */ static void po_parse_brief PARAMS ((po_ty *__pop)); @@ -59,6 +64,7 @@ po_alloc (pomp) pop = xmalloc (pomp->size); pop->method = pomp; + pop->next_is_fuzzy = 0; if (pomp->constructor) pomp->constructor (pop); return pop; @@ -163,8 +169,141 @@ po_callback_message (msgid, msgid_pos, msgid_plural, lex_pos_ty *msgstr_pos; { /* assert(callback_arg); */ + + /* Test for header entry. */ + if (msgid[0] == '\0' && !callback_arg->next_is_fuzzy) + { + /* Verify the validity of CHARSET. It is necessary + 1. for the correct treatment of multibyte characters containing + 0x5C bytes in the PO lexer, + 2. so that at run time, gettext() can call iconv() to convert + msgstr. 
*/ + const char *charsetstr = strstr (msgstr, "charset="); + + if (charsetstr != NULL) + { + /* The list of charsets supported by glibc's iconv() and by + the portable iconv() across platforms. Taken from + intl/config.charset. */ + static const char *standard_charsets[] = + { + "ASCII", "ANSI_X3.4-1968", "US-ASCII", + "ISO-8859-1", "ISO_8859-1", + "ISO-8859-2", "ISO_8859-2", + "ISO-8859-3", "ISO_8859-3", + "ISO-8859-4", "ISO_8859-4", + "ISO-8859-5", "ISO_8859-5", + "ISO-8859-6", "ISO_8859-6", + "ISO-8859-7", "ISO_8859-7", + "ISO-8859-8", "ISO_8859-8", + "ISO-8859-9", "ISO_8859-9", + "ISO-8859-13", "ISO_8859-13", + "ISO-8859-15", "ISO_8859-15", + "KOI8-R", + "KOI8-U", + "CP850", + "CP866", + "CP874", + "CP932", + "CP949", + "CP950", + "CP1250", + "CP1251", + "CP1252", + "CP1253", + "CP1254", + "CP1255", + "CP1256", + "CP1257", + "GB2312", + "EUC-JP", + "EUC-KR", + "EUC-TW", + "BIG5", + "BIG5HKSCS", + "GBK", + "GB18030", + "SJIS", + "JOHAB", + "TIS-620", + "VISCII", + "UTF-8" + }; + size_t len; + char *charset; + size_t i; + + charsetstr += strlen ("charset="); + len = strcspn (charsetstr, " \t\n"); + charset = (char *) alloca (len + 1); + memcpy (charset, charsetstr, len); + charset[len] = '\0'; + + for (i = 0; i < SIZEOF (standard_charsets); i++) + if (strcasecmp (charset, standard_charsets[i]) == 0) + break; + if (i == SIZEOF (standard_charsets)) + { + error (0, 0, _("\ +%s: warning: charset \"%s\" is not a portable encoding name\n\ +%*s warning: charset conversion might not work"), + gram_pos.file_name, charset, + strlen (gram_pos.file_name), ""); + --error_message_count; + } + else + { + po_lex_charset = standard_charsets[i]; +#if HAVE_ICONV + if (po_lex_iconv != (iconv_t)(-1)) + iconv_close (po_lex_iconv); + po_lex_iconv = iconv_open ("UTF-8", po_lex_charset); + if (po_lex_iconv == (iconv_t)(-1)) + { + /* For CJK encodings which have double-byte characters + ending in 0x5C, the string parser is likely to be + confused if it can't see the character boundaries. 
*/ + const char *note = + (strcmp (po_lex_charset, "BIG5") == 0 + || strcmp (po_lex_charset, "BIG5HKSCS") == 0 + || strcmp (po_lex_charset, "GBK") == 0 + || strcmp (po_lex_charset, "GB18030") == 0 + || strcmp (po_lex_charset, "SJIS") == 0 + || strcmp (po_lex_charset, "JOHAB") == 0 + ? _(", expect parse errors") + : ""); + +# if _LIBICONV_VERSION + error (0, 0, _("\ +%s: warning: charset \"%s\" is not supported by iconv%s"), + gram_pos.file_name, po_lex_charset, note); +# else + error (0, 0, _("\ +%s: warning: charset \"%s\" is not supported by iconv%s\n\ +%*s warning: consider installing libiconv and then reinstalling GNU gettext"), + gram_pos.file_name, po_lex_charset, note, + strlen (gram_pos.file_name), ""); +# endif + --error_message_count; + } +#endif + } + } + else + { + error (0, 0, _("\ +%s: warning: charset missing in header\n\ +%*s warning: charset conversion will not work"), + gram_pos.file_name, strlen (gram_pos.file_name), ""); + --error_message_count; + } + } + po_directive_message (callback_arg, msgid, msgid_pos, msgid_plural, msgstr, msgstr_len, msgstr_pos); + + /* Prepare for next message. */ + callback_arg->next_is_fuzzy = 0; } @@ -220,8 +359,12 @@ po_callback_comment (s) po_comment (callback_arg, s + 1); } else if (*s == ',' || *s == '!') - /* Get all entries in the special comment line. */ - po_comment_special (callback_arg, s + 1); + { + /* Get all entries in the special comment line. */ + if (strstr (s + 1, "fuzzy") != NULL) + callback_arg->next_is_fuzzy = 1; + po_comment_special (callback_arg, s + 1); + } else { /* It looks like a plain vanilla comment, but Solaris-style file diff --git a/src/po.h b/src/po.h index 0b77f8e85..e46e1db2b 100644 --- a/src/po.h +++ b/src/po.h @@ -106,7 +106,8 @@ struct po_method_ty etc. */ #define PO_BASE_TY \ - po_method_ty *method; + po_method_ty *method; \ + int next_is_fuzzy; typedef struct po_ty po_ty; struct po_ty