+2001-03-03 Bruno Haible <haible@clisp.cons.org>
+
+ Fix parsing of strings in CJK encodings.
+ * po.h (PO_BASE_TY): New field next_is_fuzzy.
+ * po-lex.h: Include iconv.h.
+ (po_lex_charset, po_lex_iconv): New declarations.
+ * po.c (SIZEOF): New macro.
+ (po_alloc): Initialize next_is_fuzzy.
+ (po_callback_message): Add check of charset in header entry.
+ Set po_lex_charset and po_lex_iconv.
+ After calling po_directive_message, reset next_is_fuzzy.
+ (po_callback_comment): Set next_is_fuzzy.
+ * msgfmt.c (format_directive_message): Remove check of charset in
+ header entry, now done in po.c.
+ * po-lex.c (po_lex_charset, po_lex_iconv): New variables.
+ (lex_open): Initialize them.
+ (lex_close): Reset them.
+ (po_gram_lex): While parsing a string, use 'po_lex_iconv' to avoid
+ treating the second byte of a multi-byte character as an ASCII
+ character.
+ * Makefile.am (msgcmp_LDADD, msgfmt_LDADD): New variables.
+
2001-03-03 Bruno Haible <haible@clisp.cons.org>
* write-po.h: New file, pieces of message.h.
msgcomm_SOURCES = msgcomm.c message.c po-gram-gen.y po-hash-gen.y po-lex.c \
open-po.c po.c str-list.c dir-list.c write-po.c
-# Link dependencies. write-po.c pulls in linebreak.c which may need -liconv.
+# Link dependencies.
+# po-lex.c and po.c may need -liconv.
+# write-po.c pulls in linebreak.c which may need -liconv.
+msgcmp_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
+msgfmt_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
msgmerge_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
msgunfmt_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
xgettext_LDADD = ../lib/libnlsut.a @INTLLIBS@ @LIBICONV@
error (0, 0, _("field `%s' still has initial default value"),
required_fields[initial]);
}
-
- /* Verify the validity of CHARSET. Even if not in verbose mode,
- because the consequences are not harmless. */
- {
- const char *charsetstr = strstr (msgstr_string, "charset=");
-
- if (charsetstr != NULL)
- {
- /* The list of charsets supported by glibc's iconv() and by
- the portable iconv() across platforms. Taken from
- intl/config.charset. */
- static const char *standard_charsets[] =
- {
- "ASCII", "ANSI_X3.4-1968", "US-ASCII",
- "ISO-8859-1", "ISO_8859-1",
- "ISO-8859-2", "ISO_8859-2",
- "ISO-8859-3", "ISO_8859-3",
- "ISO-8859-4", "ISO_8859-4",
- "ISO-8859-5", "ISO_8859-5",
- "ISO-8859-6", "ISO_8859-6",
- "ISO-8859-7", "ISO_8859-7",
- "ISO-8859-8", "ISO_8859-8",
- "ISO-8859-9", "ISO_8859-9",
- "ISO-8859-13", "ISO_8859-13",
- "ISO-8859-15", "ISO_8859-15",
- "KOI8-R",
- "KOI8-U",
- "CP850",
- "CP866",
- "CP874",
- "CP932",
- "CP949",
- "CP950",
- "CP1250",
- "CP1251",
- "CP1252",
- "CP1253",
- "CP1254",
- "CP1255",
- "CP1256",
- "CP1257",
- "GB2312",
- "EUC-JP",
- "EUC-KR",
- "EUC-TW",
- "BIG5",
- "BIG5HKSCS",
- "GBK",
- "GB18030",
- "SJIS",
- "JOHAB",
- "TIS-620",
- "VISCII",
- "UTF-8"
- };
- size_t len;
- char *charset;
- size_t i;
-
- charsetstr += strlen ("charset=");
- len = strcspn (charsetstr, " \t\n");
- charset = (char *) alloca (len + 1);
- memcpy (charset, charsetstr, len);
- charset[len] = '\0';
-
- for (i = 0; i < SIZEOF (standard_charsets); i++)
- if (strcasecmp (charset, standard_charsets[i]) == 0)
- break;
- if (i == SIZEOF (standard_charsets))
- error (0, 0, _("\
-%s: warning: charset \"%s\" is not a portable encoding name\n\
-%*s warning: charset conversion might not work"),
- gram_pos.file_name, charset,
- strlen (gram_pos.file_name), "");
- }
- else
- error (0, 0, _("\
-%s: warning: charset missing in header\n\
-%*s warning: charset conversion will not work"),
- gram_pos.file_name, strlen (gram_pos.file_name), "");
- }
}
else
/* We don't count the header entry in the statistic so place the
static FILE *fp;
lex_pos_ty gram_pos;
unsigned int gram_max_allowed_errors = 20;
+const char *po_lex_charset;
+#if HAVE_ICONV
+iconv_t po_lex_iconv;
+#endif
static int pass_comments = 0;
static int pass_obsolete_entries = 0;
_("error while opening \"%s\" for reading"), fname);
gram_pos.line_number = 1;
+ po_lex_charset = NULL;
+#if HAVE_ICONV
+ po_lex_iconv = (iconv_t)(-1);
+#endif
}
gram_pos.file_name = 0;
gram_pos.line_number = 0;
error_message_count = 0;
+ po_lex_charset = NULL;
+#if HAVE_ICONV
+ if (po_lex_iconv != (iconv_t)(-1))
+ {
+ iconv_close (po_lex_iconv);
+ po_lex_iconv = (iconv_t)(-1);
+ }
+#endif
}
break;
case '"':
- bufpos = 0;
- while (1)
- {
- if (bufpos >= bufmax)
- {
- bufmax += 100;
- buf = xrealloc (buf, bufmax);
- }
- c = lex_getc ();
- if (c == '\n')
- {
- po_gram_error (_("end-of-line within string"));
- break;
- }
- if (c == EOF)
- {
- po_gram_error (_("end-of-file within string"));
- break;
- }
- if (c == '"')
- break;
-
- if (c == '\\')
- c = control_sequence ();
+ /* Accumulate a string. */
+ {
+#if HAVE_ICONV
+ size_t bufmbpos = 0;
+#endif
- buf[bufpos++] = c;
- }
- buf[bufpos] = 0;
+ bufpos = 0;
+ while (1)
+ {
+ if (bufpos >= bufmax)
+ {
+ bufmax += 100;
+ buf = xrealloc (buf, bufmax);
+ }
+ c = lex_getc ();
+ if (c == EOF)
+ {
+ po_gram_error (_("end-of-file within string"));
+ break;
+ }
+ if (c == '\n')
+ {
+ po_gram_error (_("end-of-line within string"));
+ break;
+ }
+#if HAVE_ICONV
+ /* Interpret c only if it is the first byte of a multi-byte
+ character. Don't interpret it as ASCII when it is a
+ trailing byte. This is needed for the BIG5, BIG5HKSCS, GBK,
+ GB18030, SJIS, JOHAB encodings. */
+ if (po_lex_iconv == (iconv_t)(-1) || bufmbpos == bufpos)
+#endif
+ {
+ if (c == '"')
+ break;
+
+ if (c == '\\')
+ {
+ buf[bufpos++] = control_sequence ();
+#if HAVE_ICONV
+ bufmbpos++;
+#endif
+ continue;
+ }
+ }
+
+ /* Add c to the accumulator. */
+ buf[bufpos++] = c;
+#if HAVE_ICONV
+ if (po_lex_iconv != (iconv_t)(-1))
+ {
+ /* If c terminates a multibyte character, set
+ bufmbpos = bufpos. Otherwise keep bufmbpos
+ pointing at the start of the multibyte character. */
+ char scratchbuf[64];
+ const char *inptr = &buf[bufmbpos];
+ size_t insize = bufpos - bufmbpos;
+ char *outptr = &scratchbuf[0];
+ size_t outsize = sizeof (scratchbuf);
+ if (iconv (po_lex_iconv,
+ (ICONV_CONST char **) &inptr, &insize,
+ &outptr, &outsize)
+ == (size_t)(-1)
+ && errno == EILSEQ)
+ {
+ po_gram_error (_("invalid multibyte sequence"));
+ bufmbpos = bufpos;
+ }
+ else
+ bufmbpos = inptr - buf;
+ }
+#endif
+ }
+ buf[bufpos] = 0;
- /* FIXME: Treatment of embedded \000 chars is incorrect. */
- po_gram_lval.string = xstrdup (buf);
- return STRING;
+ /* FIXME: Treatment of embedded \000 chars is incorrect. */
+ po_gram_lval.string = xstrdup (buf);
+ return STRING;
+ }
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
#define _PO_LEX_H
#include <sys/types.h>
+#if HAVE_ICONV
+#include <iconv.h>
+#endif
#include "error.h"
#include "pos.h"
terminate. Cf. error_message_count, declared in <error.h>. */
extern unsigned int gram_max_allowed_errors;
+/* The PO file's encoding, as specified in the header entry. */
+extern const char *po_lex_charset;
+
+#if HAVE_ICONV
+/* Converter from the PO file's encoding to UTF-8, or (iconv_t)(-1) if none. */
+extern iconv_t po_lex_iconv;
+#endif
+
/* Open the PO file FNAME and prepare its lexical analysis. */
extern void lex_open PARAMS ((const char *__fname));
#include "po.h"
#include "po-hash.h"
#include "system.h"
+#include "libgettext.h"
+
+#define _(str) gettext (str)
+
+#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
/* Prototypes for local functions. */
static void po_parse_brief PARAMS ((po_ty *__pop));
pop = xmalloc (pomp->size);
pop->method = pomp;
+ pop->next_is_fuzzy = 0;
if (pomp->constructor)
pomp->constructor (pop);
return pop;
lex_pos_ty *msgstr_pos;
{
/* assert(callback_arg); */
+
+ /* Test for header entry. */
+ if (msgid[0] == '\0' && !callback_arg->next_is_fuzzy)
+ {
+ /* Verify the validity of CHARSET. It is necessary
+ 1. for the correct treatment of multibyte characters containing
+ 0x5C bytes in the PO lexer,
+ 2. so that at run time, gettext() can call iconv() to convert
+ msgstr. */
+ const char *charsetstr = strstr (msgstr, "charset=");
+
+ if (charsetstr != NULL)
+ {
+ /* The list of charsets supported by glibc's iconv() and by
+ the portable iconv() across platforms. Taken from
+ intl/config.charset. */
+ static const char *standard_charsets[] =
+ {
+ "ASCII", "ANSI_X3.4-1968", "US-ASCII",
+ "ISO-8859-1", "ISO_8859-1",
+ "ISO-8859-2", "ISO_8859-2",
+ "ISO-8859-3", "ISO_8859-3",
+ "ISO-8859-4", "ISO_8859-4",
+ "ISO-8859-5", "ISO_8859-5",
+ "ISO-8859-6", "ISO_8859-6",
+ "ISO-8859-7", "ISO_8859-7",
+ "ISO-8859-8", "ISO_8859-8",
+ "ISO-8859-9", "ISO_8859-9",
+ "ISO-8859-13", "ISO_8859-13",
+ "ISO-8859-15", "ISO_8859-15",
+ "KOI8-R",
+ "KOI8-U",
+ "CP850",
+ "CP866",
+ "CP874",
+ "CP932",
+ "CP949",
+ "CP950",
+ "CP1250",
+ "CP1251",
+ "CP1252",
+ "CP1253",
+ "CP1254",
+ "CP1255",
+ "CP1256",
+ "CP1257",
+ "GB2312",
+ "EUC-JP",
+ "EUC-KR",
+ "EUC-TW",
+ "BIG5",
+ "BIG5HKSCS",
+ "GBK",
+ "GB18030",
+ "SJIS",
+ "JOHAB",
+ "TIS-620",
+ "VISCII",
+ "UTF-8"
+ };
+ size_t len;
+ char *charset;
+ size_t i;
+
+ charsetstr += strlen ("charset=");
+ len = strcspn (charsetstr, " \t\n");
+ charset = (char *) alloca (len + 1);
+ memcpy (charset, charsetstr, len);
+ charset[len] = '\0';
+
+ for (i = 0; i < SIZEOF (standard_charsets); i++)
+ if (strcasecmp (charset, standard_charsets[i]) == 0)
+ break;
+ if (i == SIZEOF (standard_charsets))
+ {
+ error (0, 0, _("\
+%s: warning: charset \"%s\" is not a portable encoding name\n\
+%*s warning: charset conversion might not work"),
+ gram_pos.file_name, charset,
+ strlen (gram_pos.file_name), "");
+ --error_message_count;
+ }
+ else
+ {
+ po_lex_charset = standard_charsets[i];
+#if HAVE_ICONV
+ if (po_lex_iconv != (iconv_t)(-1))
+ iconv_close (po_lex_iconv);
+ po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
+ if (po_lex_iconv == (iconv_t)(-1))
+ {
+ /* For CJK encodings which have double-byte characters
+ ending in 0x5C, the string parser is likely to be
+ confused if it can't see the character boundaries. */
+ const char *note =
+ (strcmp (po_lex_charset, "BIG5") == 0
+ || strcmp (po_lex_charset, "BIG5HKSCS") == 0
+ || strcmp (po_lex_charset, "GBK") == 0
+ || strcmp (po_lex_charset, "GB18030") == 0
+ || strcmp (po_lex_charset, "SJIS") == 0
+ || strcmp (po_lex_charset, "JOHAB") == 0
+ ? _(", expect parse errors")
+ : "");
+
+# if _LIBICONV_VERSION
+ error (0, 0, _("\
+%s: warning: charset \"%s\" is not supported by iconv%s"),
+ gram_pos.file_name, po_lex_charset, note);
+# else
+ error (0, 0, _("\
+%s: warning: charset \"%s\" is not supported by iconv%s\n\
+%*s warning: consider installing libiconv and then reinstalling GNU gettext"),
+ gram_pos.file_name, po_lex_charset, note,
+ strlen (gram_pos.file_name), "");
+# endif
+ --error_message_count;
+ }
+#endif
+ }
+ }
+ else
+ {
+ error (0, 0, _("\
+%s: warning: charset missing in header\n\
+%*s warning: charset conversion will not work"),
+ gram_pos.file_name, strlen (gram_pos.file_name), "");
+ --error_message_count;
+ }
+ }
+
po_directive_message (callback_arg, msgid, msgid_pos, msgid_plural,
msgstr, msgstr_len, msgstr_pos);
+
+ /* Prepare for next message. */
+ callback_arg->next_is_fuzzy = 0;
}
po_comment (callback_arg, s + 1);
}
else if (*s == ',' || *s == '!')
- /* Get all entries in the special comment line. */
- po_comment_special (callback_arg, s + 1);
+ {
+ /* Get all entries in the special comment line. */
+ if (strstr (s + 1, "fuzzy") != NULL)
+ callback_arg->next_is_fuzzy = 1;
+ po_comment_special (callback_arg, s + 1);
+ }
else
{
/* It looks like a plain vanilla comment, but Solaris-style file
etc. */
#define PO_BASE_TY \
- po_method_ty *method;
+ po_method_ty *method; \
+ int next_is_fuzzy;
typedef struct po_ty po_ty;
struct po_ty