From: Bruno Haible Date: Fri, 27 Jun 2003 12:35:05 +0000 (+0000) Subject: Change the Perl backend to convert extracted strings and comments to UTF-8 X-Git-Tag: v0.13~402 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=9a22897a00674d16b378b6aca00710e64b3da770;p=thirdparty%2Fgettext.git Change the Perl backend to convert extracted strings and comments to UTF-8 on the fly. --- diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index 2026c962f..8a4e029d3 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,20 @@ +2003-06-27 Bruno Haible + + * xgettext.h (from_current_source_encoding): New declaration. + * xgettext.c (from_current_source_encoding): New function. + (CONVERT_STRING): Use it. + * x-perl.c: Include po-charset.h. + (get_here_document): Convert each line to UTF-8. + (phase2_getc): Convert each comment to UTF-8. Tell xgettext_comment_add + to not convert it. + (extract_quotelike_pass1_utf8): New function. + (extract_quotelike): Use extract_quotelike_pass1_utf8. + (extract_triple_quotelike): Likewise. + (extract_variable): Tell remember_a_message to not convert the string. + (interpolate_keywords): Likewise. + (extract_balanced): Tell remember_a_message, remember_a_message_plural + to not convert the string. + 2003-06-23 Guido Flohr * x-perl.c (extract_quotelike_pass3): Fix handling of doubled diff --git a/gettext-tools/src/x-perl.c b/gettext-tools/src/x-perl.c index e11923b43..145344bdd 100644 --- a/gettext-tools/src/x-perl.c +++ b/gettext-tools/src/x-perl.c @@ -34,6 +34,7 @@ #include "progname.h" #include "xmalloc.h" #include "exit.h" +#include "po-charset.h" #include "ucs4-utf8.h" #include "uniname.h" #include "gettext.h" @@ -203,7 +204,9 @@ phase1_ungetc (int c) } } -/* Read a here document and return its contents. */ +/* Read a here document and return its contents. + The delimiter is an UTF-8 encoded string; the resulting string is UTF-8 + encoded as well. 
*/ static char * get_here_document (const char *delimiter) @@ -228,6 +231,7 @@ get_here_document (const char *delimiter) for (;;) { int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp); + char *my_line_utf8; bool chomp; if (read_bytes < 0) @@ -251,6 +255,21 @@ get_here_document (const char *delimiter) ++here_eaten; + /* Convert to UTF-8. */ + my_line_utf8 = + from_current_source_encoding (my_linebuf, logical_file_name, + line_number + here_eaten); + if (my_line_utf8 != my_linebuf) + { + if (strlen (my_line_utf8) >= my_linebuf_size) + { + my_linebuf_size = strlen (my_line_utf8) + 1; + my_linebuf = xrealloc (my_linebuf, my_linebuf_size); + } + strcpy (my_linebuf, my_line_utf8); + free (my_line_utf8); + } + /* Undosify. This is important for catching the end of <string); break; @@ -634,6 +663,19 @@ extract_quotelike_pass1 (int delim) } } +/* Like extract_quotelike_pass1, but return the complete string in UTF-8 + encoding. */ +static char * +extract_quotelike_pass1_utf8 (int delim) +{ + char *string = extract_quotelike_pass1 (delim); + char *utf8_string = + from_current_source_encoding (string, logical_file_name, line_number); + if (utf8_string != string) + free (string); + return utf8_string; +} + /* ========= Reading of tokens and commands. Extracting strings. 
========= */ @@ -718,7 +760,7 @@ extract_oct (const char *string, size_t len, unsigned int *result) static void extract_quotelike (token_ty *tp, int delim) { - char *string = extract_quotelike_pass1 (delim); + char *string = extract_quotelike_pass1_utf8 (delim); size_t len = strlen (string); tp->type = token_type_string; @@ -742,7 +784,7 @@ extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim, tp->type = token_type_regex_op; - string = extract_quotelike_pass1 (delim); + string = extract_quotelike_pass1_utf8 (delim); if (interpolate) interpolate_keywords (mlp, string, line_number); free (string); @@ -759,7 +801,7 @@ extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim, delim = phase2_getc (); } } - string = extract_quotelike_pass1 (delim); + string = extract_quotelike_pass1_utf8 (delim); if (interpolate) interpolate_keywords (mlp, string, line_number); free (string); @@ -1358,7 +1400,9 @@ extract_variable (message_list_ty *mlp, token_ty *tp, int first) lex_pos_ty pos; pos.line_number = line_number; pos.file_name = logical_file_name; + xgettext_current_source_encoding = po_charset_utf8; remember_a_message (mlp, xstrdup (t1->string), &pos); + xgettext_current_source_encoding = xgettext_global_source_encoding; free_token (t2); free_token (t1); } @@ -1442,7 +1486,7 @@ extract_variable (message_list_ty *mlp, token_ty *tp, int first) /* Actually a simplified version of extract_variable(). It searches for variables inside a double-quoted string that may interpolate to - some keyword hash (reference). */ + some keyword hash (reference). The string is UTF-8 encoded. */ static void interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) { @@ -1659,6 +1703,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) extract_quotelike_pass3 (&token, EXIT_FAILURE); /* The string can only shrink with interpolation (because we ignore \Q). 
*/ + if (!(strlen (token.string) <= bufpos)) + abort (); strcpy (buffer, token.string); free (token.string); state = wait_rbrace; @@ -1735,7 +1781,9 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno) buffer[bufpos] = '\0'; token.string = xstrdup (buffer); extract_quotelike_pass3 (&token, EXIT_FAILURE); + xgettext_current_source_encoding = po_charset_utf8; remember_a_message (mlp, token.string, &pos); + xgettext_current_source_encoding = xgettext_global_source_encoding; /* FALLTHROUGH */ default: state = initial; @@ -2300,7 +2348,6 @@ token_stack_dump (token_stack_ty *stack) case token_type_named_op: case token_type_string: case token_type_symbol: - case token_type_keyword_symbol: case token_type_variable: fprintf (stderr, " string: %s\n", token->string); break; @@ -2683,26 +2730,29 @@ extract_balanced (message_list_ty *mlp, int arg_sg, int arg_pl, int state, if (extract_all) { lex_pos_ty pos; + char *string; pos.file_name = logical_file_name; pos.line_number = tp->line_number; - remember_a_message (mlp, collect_message (mlp, tp, - EXIT_SUCCESS), - &pos); + string = collect_message (mlp, tp, EXIT_SUCCESS); + xgettext_current_source_encoding = po_charset_utf8; + remember_a_message (mlp, string, &pos); + xgettext_current_source_encoding = xgettext_global_source_encoding; } else if (state) { lex_pos_ty pos; + char *string; pos.file_name = logical_file_name; pos.line_number = tp->line_number; if (arg_count == arg_sg) { - plural_mp = - remember_a_message (mlp, collect_message (mlp, tp, - EXIT_FAILURE), - &pos); + string = collect_message (mlp, tp, EXIT_FAILURE); + xgettext_current_source_encoding = po_charset_utf8; + plural_mp = remember_a_message (mlp, string, &pos); + xgettext_current_source_encoding = xgettext_global_source_encoding; arg_sg = -1; } else if (arg_count == arg_pl && plural_mp == NULL) @@ -2714,10 +2764,10 @@ extract_balanced (message_list_ty *mlp, int arg_sg, int arg_pl, int state, } else if (arg_count == arg_pl) { - 
remember_a_message_plural (plural_mp, - collect_message (mlp, tp, - EXIT_FAILURE), - &pos); + string = collect_message (mlp, tp, EXIT_FAILURE); + xgettext_current_source_encoding = po_charset_utf8; + remember_a_message_plural (plural_mp, string, &pos); + xgettext_current_source_encoding = xgettext_global_source_encoding; arg_pl = -1; } } diff --git a/gettext-tools/src/xgettext.c b/gettext-tools/src/xgettext.c index c72ca0751..81d085d94 100644 --- a/gettext-tools/src/xgettext.c +++ b/gettext-tools/src/xgettext.c @@ -1009,27 +1009,6 @@ static struct formatstring_parser *current_formatstring_parser1; static struct formatstring_parser *current_formatstring_parser2; -/* Convert the given string from xgettext_current_source_encoding to - the output file encoding (i.e. ASCII or UTF-8). */ -#define CONVERT_STRING(string) \ - if (xgettext_current_source_encoding == po_charset_ascii) \ - { \ - if (!is_ascii_string (string)) \ - { \ - char buffer[21]; \ - if (pos->line_number == (size_t)(-1)) \ - buffer[0] = '\0'; \ - else \ - sprintf (buffer, ":%ld", (long) pos->line_number); \ - error (EXIT_FAILURE, 0, _("Non-ASCII string at %s%s.\nPlease specify the source encoding through --from-code."), \ - pos->file_name, buffer); \ - } \ - } \ - else if (xgettext_current_source_encoding != po_charset_utf8) \ - { \ - string = convert_string (xgettext_current_source_iconv, string); \ - } - #if !HAVE_ICONV /* If we don't have iconv(), the only supported values for xgettext_global_source_encoding and thus also for @@ -1038,6 +1017,40 @@ static struct formatstring_parser *current_formatstring_parser2; #define convert_string(cd,string) (abort (), (string)) #endif +/* Convert the given string from xgettext_current_source_encoding to + the output file encoding (i.e. ASCII or UTF-8). + The resulting string is either the argument string, or freshly allocated. + The file_name and line_number are only used for error message purposes. 
*/ +char * +from_current_source_encoding (const char *string, + const char *file_name, size_t line_number) +{ + if (xgettext_current_source_encoding == po_charset_ascii) + { + if (!is_ascii_string (string)) + { + char buffer[21]; + + if (line_number == (size_t)(-1)) + buffer[0] = '\0'; + else + sprintf (buffer, ":%ld", (long) line_number); + error (EXIT_FAILURE, 0, _("\ +Non-ASCII string at %s%s.\n\ +Please specify the source encoding through --from-code."), + file_name, buffer); + } + } + else if (xgettext_current_source_encoding != po_charset_utf8) + string = convert_string (xgettext_current_source_iconv, string); + + return (char *) string; +} + +#define CONVERT_STRING(string) \ + string = from_current_source_encoding (string, pos->file_name, \ + pos->line_number); + message_ty * remember_a_message (message_list_ty *mlp, char *string, lex_pos_ty *pos) @@ -1135,7 +1148,7 @@ meta information, not the empty string.\n"))); CONVERT_STRING (s); - /* To reduce the possibility of unwanted matches be do a two + /* To reduce the possibility of unwanted matches we do a two step match: the line must contain `xgettext:' and one of the possible format description strings. */ if ((t = strstr (s, "xgettext:")) != NULL) diff --git a/gettext-tools/src/xgettext.h b/gettext-tools/src/xgettext.h index a37c54429..8086054d5 100644 --- a/gettext-tools/src/xgettext.h +++ b/gettext-tools/src/xgettext.h @@ -1,5 +1,5 @@ /* xgettext common functions. - Copyright (C) 2001-2002 Free Software Foundation, Inc. + Copyright (C) 2001-2003 Free Software Foundation, Inc. Written by Peter Miller and Bruno Haible , 2001. @@ -60,6 +60,14 @@ extern const char *xgettext_current_source_encoding; extern iconv_t xgettext_current_source_iconv; #endif +/* Convert the given string from xgettext_current_source_encoding to + the output file encoding (i.e. ASCII or UTF-8). + The resulting string is either the argument string, or freshly allocated. 
+ The file_name and line_number are only used for error message purposes. */ +extern char *from_current_source_encoding (const char *string, + const char *file_name, + size_t line_number); + /* List of messages whose msgids must not be extracted, or NULL. Used by remember_a_message(). */ extern message_list_ty *exclude; diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog index d68e150d5..cd470fc8c 100644 --- a/gettext-tools/tests/ChangeLog +++ b/gettext-tools/tests/ChangeLog @@ -1,3 +1,8 @@ +2003-06-27 Bruno Haible + + * xgettext-27: Also test Unicode character names in here documents. + * xgettext-28: Likewise. + 2003-06-24 Bruno Haible * xgettext-27: New file. diff --git a/gettext-tools/tests/xgettext-27 b/gettext-tools/tests/xgettext-27 index ca60da55a..2284a1cb3 100755 --- a/gettext-tools/tests/xgettext-27 +++ b/gettext-tools/tests/xgettext-27 @@ -9,6 +9,9 @@ tmpfiles="$tmpfiles xg-test27.pl" cat <<\EOF > xg-test27.pl use charnames ':full'; printf "%s\n", gettext "Böse Bübchen - wo sind sie blo\N{LATIN SMALL LETTER SHARP S}?"; +print gettext < xg-test28.pl use charnames ':full'; printf "%s\n", gettext "Böse Bübchen - wo sind sie blo\N{LATIN SMALL LETTER SHARP S}?"; +print gettext <