From: Daiki Ueno Date: Wed, 7 May 2014 09:51:14 +0000 (+0900) Subject: c: Interpret string literals lazily X-Git-Tag: v0.19~44 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8e319a8bc7e535c5e2b0475d46aecdec0bf89dbe;p=thirdparty%2Fgettext.git c: Interpret string literals lazily * x-c.c (P7_EOF, P7_STRING_END, P7_QUOTES, P7_QUOTE, P7_NEWLINE) (UNICODE, IS_UNICODE, UNICODE_VALUE): Remove. (phase7_get): Remove. (phase7_ungetc): Remove. (phase5_get): Use 'phase3_get' directly to extract string literals; use 'arglist_parser_remember_literal' instead of 'arglist_parser_remember'. (literalstring_parse): New function. (literalstring_c): New variable. (extract_parenthesized): Remove the 'xgettext_current_source_encoding' setting to prevent encoding conversion around 'arglist_parser_done'. * x-c.h (SCANNERS_C): Register 'literalstring_c' as a literalstring_parser. (literalstring_c): New variable declaration. --- diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index 26a4b151a..510a38d16 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,22 @@ +2014-05-09 Daiki Ueno + + c: Interpret string literals lazily + * x-c.c (P7_EOF, P7_STRING_END, P7_QUOTES, P7_QUOTE, P7_NEWLINE) + (UNICODE, IS_UNICODE, UNICODE_VALUE): Remove. + (phase7_get): Remove. + (phase7_ungetc): Remove. + (phase5_get): Use 'phase3_get' directly to extract string + literals; use 'arglist_parser_remember_literal' instead of + 'arglist_parser_remember'. + (literalstring_parse): New function. + (literalstring_c): New variable. + (extract_parenthesized): Remove the + 'xgettext_current_source_encoding' setting to prevent encoding + conversion around 'arglist_parser_done'. + * x-c.h (SCANNERS_C): Register 'literalstring_c' as a + literalstring_parser. + (literalstring_c): New variable declaration. 
+ 2014-05-09 Daiki Ueno xgettext: Provide a way to interpret string literals lazily diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c index 9050433b5..2ae2f3c9e 100644 --- a/gettext-tools/src/x-c.c +++ b/gettext-tools/src/x-c.c @@ -860,228 +860,226 @@ struct token_ty }; -/* 7. Replace escape sequences within character strings with their - single character equivalents. This is called from phase 5, because - we don't have to worry about the #include argument. There are - pathological cases which could bite us (like the DOS directory - separator), but just pretend it can't happen. */ - -/* Return value of phase7_getc when EOF is reached. */ -#define P7_EOF (-1) -#define P7_STRING_END (-2) - -/* Replace escape sequences within character strings with their single - character equivalents. */ -#define P7_QUOTES (-3) -#define P7_QUOTE (-4) -#define P7_NEWLINE (-5) - -/* Convert an UTF-16 or UTF-32 code point to a return value that can be - distinguished from a single-byte return value. */ -#define UNICODE(code) (0x100 + (code)) - -/* Test a return value of phase7_getuc whether it designates an UTF-16 or - UTF-32 code point. */ -#define IS_UNICODE(p7_result) ((p7_result) >= 0x100) - -/* Extract the UTF-16 or UTF-32 code of a return value that satisfies - IS_UNICODE. */ -#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100) +/* Free the memory pointed to by a 'struct token_ty'. */ +static inline void +free_token (token_ty *tp) +{ + if (tp->type == token_type_name || tp->type == token_type_string_literal) + free (tp->string); + if (tp->type == token_type_string_literal + || tp->type == token_type_objc_special) + drop_reference (tp->comment); +} -static int -phase7_getc () +static char * +literalstring_parse (const char *string, lex_pos_ty *pos, + enum literalstring_escape_type type) { - int c, n, j; + struct mixed_string_buffer *bp; + const char *p; - /* Use phase 3, because phase 4 elides comments. */ - c = phase3_getc (); + /* Start accumulating the string. 
*/ + bp = mixed_string_buffer_alloc (lc_string, + logical_file_name, + line_number); - /* Return a magic newline indicator, so that we can distinguish - between the user requesting a newline in the string (e.g. using - "\n" or "\012") from the user failing to terminate the string or - character constant. The ANSI C standard says: 3.1.3.4 Character - Constants contain "any character except single quote, backslash or - newline; or an escape sequence" and 3.1.4 String Literals contain - "any character except double quote, backslash or newline; or an - escape sequence". - - Most compilers give a fatal error in this case, however gcc is - stupidly silent, even though this is a very common typo. OK, so - "gcc --pedantic" will tell me, but that gripes about too much other - stuff. Could I have a "gcc -Wnewline-in-string" option, or - better yet a "gcc -fno-newline-in-string" option, please? Gcc is - also inconsistent between string literals and character constants: - you may not embed newlines in character constants; try it, you get - a useful diagnostic. --PMiller */ - if (c == '\n') - return P7_NEWLINE; - - if (c == '"') - return P7_QUOTES; - if (c == '\'') - return P7_QUOTE; - if (c != '\\') - return c; - c = phase3_getc (); - switch (c) + for (p = string; *p != '\0'; p++) { - default: - /* Unknown escape sequences really should be an error, but just - ignore them, and let the real compiler complain. */ - phase3_ungetc (c); - return '\\'; - - case '"': - case '\'': - case '?': - case '\\': - return c; + int c; - case 'a': - return '\a'; - case 'b': - return '\b'; - - /* The \e escape is preculiar to gcc, and assumes an ASCII - character set (or superset). We don't provide support for it - here. 
*/ - - case 'f': - return '\f'; - case 'n': - return '\n'; - case 'r': - return '\r'; - case 't': - return '\t'; - case 'v': - return '\v'; - - case 'x': - c = phase3_getc (); - switch (c) + if (*p != '\\') { - default: - phase3_ungetc (c); - phase3_ungetc ('x'); - return '\\'; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - break; - } - n = 0; - for (;;) - { - switch (c) - { - default: - phase3_ungetc (c); - return n; - - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - n = n * 16 + c - '0'; - break; - - case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': - n = n * 16 + 10 + c - 'A'; - break; - - case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': - n = n * 16 + 10 + c - 'a'; - break; - } - c = phase3_getc (); + mixed_string_buffer_append_char (bp, *p); + continue; } - return n; - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - n = 0; - for (j = 0; j < 3; ++j) + if (!(type & LET_ANSI_C) && !(type & LET_UNICODE)) { - n = n * 8 + c - '0'; - c = phase3_getc (); - switch (c) - { - default: - break; - - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - continue; - } - break; + mixed_string_buffer_append_char (bp, '\\'); + continue; } - phase3_ungetc (c); - return n; - case 'U': case 'u': - { - unsigned char buf[8]; + c = *++p; - n = 0; - for (j = 0; j < (c == 'u' ? 
4 : 8); j++) + if (type & LET_ANSI_C) + switch (c) { - int c1 = phase3_getc (); - - if (c1 >= '0' && c1 <= '9') - n = (n << 4) + (c1 - '0'); - else if (c1 >= 'A' && c1 <= 'F') - n = (n << 4) + (c1 - 'A' + 10); - else if (c1 >= 'a' && c1 <= 'f') - n = (n << 4) + (c1 - 'a' + 10); - else + case '"': + case '\'': + case '?': + case '\\': + mixed_string_buffer_append_char (bp, c); + continue; + + case 'a': + mixed_string_buffer_append_char (bp, '\a'); + continue; + case 'b': + mixed_string_buffer_append_char (bp, '\b'); + continue; + + /* The \e escape is preculiar to gcc, and assumes an ASCII + character set (or superset). We don't provide support for it + here. */ + + case 'f': + mixed_string_buffer_append_char (bp, '\f'); + continue; + case 'n': + mixed_string_buffer_append_char (bp, '\n'); + continue; + case 'r': + mixed_string_buffer_append_char (bp, '\r'); + continue; + case 't': + mixed_string_buffer_append_char (bp, '\t'); + continue; + case 'v': + mixed_string_buffer_append_char (bp, '\v'); + continue; + + case 'x': + c = *++p; + switch (c) { - phase3_ungetc (c1); - while (--j >= 0) - phase3_ungetc (buf[j]); - phase3_ungetc (c); - return '\\'; + default: + mixed_string_buffer_append_char (bp, '\\'); + mixed_string_buffer_append_char (bp, 'x'); + mixed_string_buffer_append_char (bp, c); + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + { + int n; + + for (n = 0; ; ++p) + { + switch (*p) + { + default: + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + n = n * 16 + *p - '0'; + continue; + + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': + n = n * 16 + 10 + *p - 'A'; + continue; + + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': + n = n * 16 + 10 + *p - 'a'; + continue; + } + break; 
+ } + + mixed_string_buffer_append_char (bp, n); + --p; + } + break; } + continue; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + { + int n, j; + + for (n = 0, j = 0; j < 3; ++j) + { + n = n * 8 + c - '0'; + switch (*++p) + { + default: + break; + + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + continue; + } + break; + } - buf[j] = c1; + mixed_string_buffer_append_char (bp, n); + --p; + } + continue; } - if (n < 0x110000) - return UNICODE (n); + if (type & LET_UNICODE) + switch (c) + { + case 'U': case 'u': + { + unsigned char buf[8]; + int length = c == 'u' ? 4 : 8; + int n, j; - error_with_progname = false; - error (0, 0, _("%s:%d: warning: invalid Unicode character"), - logical_file_name, line_number); - error_with_progname = true; + for (n = 0, j = 0; j < length; j++) + { + int c1 = *++p; + + if (c1 >= '0' && c1 <= '9') + n = (n << 4) + (c1 - '0'); + else if (c1 >= 'A' && c1 <= 'F') + n = (n << 4) + (c1 - 'A' + 10); + else if (c1 >= 'a' && c1 <= 'f') + n = (n << 4) + (c1 - 'a' + 10); + else + break; + + buf[j] = c1; + } - while (--j >= 0) - phase3_ungetc (buf[j]); - phase3_ungetc (c); - return '\\'; - } - } -} + if (j == length) + { + if (n < 0x110000) + mixed_string_buffer_append_unicode (bp, n); + else + { + error_with_progname = false; + error_at_line (0, 0, + pos->file_name, pos->line_number, + _("\ +warning: invalid Unicode character")); + error_with_progname = true; + } + } + else + { + int i; + mixed_string_buffer_append_char (bp, '\\'); + mixed_string_buffer_append_char (bp, c); -static void -phase7_ungetc (int c) -{ - phase3_ungetc (c); -} + for (i = 0; i < j; i++) + mixed_string_buffer_append_char (bp, buf[i]); + --p; + } + } + continue; + } -/* Free the memory pointed to by a 'struct token_ty'. 
*/ -static inline void -free_token (token_ty *tp) -{ - if (tp->type == token_type_name || tp->type == token_type_string_literal) - free (tp->string); - if (tp->type == token_type_string_literal - || tp->type == token_type_objc_special) - drop_reference (tp->comment); + mixed_string_buffer_append_char (bp, c); + } + + return mixed_string_buffer_done (bp); } +struct literalstring_parser literalstring_c = + { + literalstring_parse + }; + /* 5. Parse each resulting logical line as preprocessing tokens and white space. Preprocessing tokens and C tokens don't always match. */ @@ -1097,6 +1095,7 @@ phase5_get (token_ty *tp) static int bufmax; int bufpos; int c; + int last_was_backslash; if (phase5_pushback_length) { @@ -1276,19 +1275,30 @@ phase5_get (token_ty *tp) but ignoring it has no effect unless one of the keywords is "L". Just pretend it won't happen. Also, we don't need to remember the character constant. */ + last_was_backslash = false; for (;;) { - c = phase7_getc (); - if (c == P7_NEWLINE) + c = phase3_getc (); + if (last_was_backslash) { + last_was_backslash = false; + continue; + } + switch (c) + { + case '\\': + last_was_backslash = true; + continue; + case '\n': error_with_progname = false; error (0, 0, _("%s:%d: warning: unterminated character constant"), logical_file_name, line_number - 1); error_with_progname = true; - phase7_ungetc ('\n'); + phase3_ungetc ('\n'); + break; + case EOF: case '\'': break; } - if (c == EOF || c == P7_QUOTE) break; } tp->type = token_type_character_constant; @@ -1296,49 +1306,55 @@ phase5_get (token_ty *tp) case '"': { - struct mixed_string_buffer *bp; - - /* Start accumulating the string. */ - bp = mixed_string_buffer_alloc (lc_string, - logical_file_name, - line_number); - /* We could worry about the 'L' before wide string constants, but since gettext's argument is not a wide character string, let the compiler complain about the argument not matching the prototype. Just pretend it won't happen. 
*/ + last_was_backslash = false; + bufpos = 0; for (;;) { - c = phase7_getc (); - - /* Keep line_number in sync. */ - bp->line_number = line_number; - - if (c == P7_NEWLINE) + c = phase3_getc (); + if (last_was_backslash) + { + last_was_backslash = false; + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + continue; + } + switch (c) { + case '\\': + last_was_backslash = true; + /* FALLTHROUGH */ + default: + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + continue; + + case '\n': error_with_progname = false; error (0, 0, _("%s:%d: warning: unterminated string literal"), logical_file_name, line_number - 1); error_with_progname = true; - phase7_ungetc ('\n'); + phase3_ungetc ('\n'); + break; + case EOF: case '"': break; } - if (c == EOF || c == P7_QUOTES) - break; - if (c == P7_QUOTE) - c = '\''; - if (IS_UNICODE (c)) - { - assert (UNICODE_VALUE (c) >= 0 - && UNICODE_VALUE (c) < 0x110000); - mixed_string_buffer_append_unicode (bp, - UNICODE_VALUE (c)); - } - else - mixed_string_buffer_append_char (bp, c); + break; } + buffer[bufpos] = 0; tp->type = token_type_string_literal; - tp->string = mixed_string_buffer_done (bp); + tp->string = xstrdup (buffer); tp->comment = add_reference (savable_comment); return; } @@ -1914,10 +1930,7 @@ extract_parenthesized (message_list_ty *mlp, arglist_parser_alloc (mlp, state ? 
next_shapes : NULL))) { - xgettext_current_source_encoding = po_charset_utf8; arglist_parser_done (argparser, arg); - xgettext_current_source_encoding = - xgettext_global_source_encoding; return true; } next_context_iter = null_context_list_iterator; @@ -1926,9 +1939,7 @@ extract_parenthesized (message_list_ty *mlp, continue; case xgettext_token_type_rparen: - xgettext_current_source_encoding = po_charset_utf8; arglist_parser_done (argparser, arg); - xgettext_current_source_encoding = xgettext_global_source_encoding; return false; case xgettext_token_type_comma: @@ -1962,16 +1973,41 @@ extract_parenthesized (message_list_ty *mlp, continue; case xgettext_token_type_string_literal: - xgettext_current_source_encoding = po_charset_utf8; if (extract_all) - remember_a_message (mlp, NULL, token.string, inner_context, - &token.pos, NULL, token.comment); + { + char *string; + refcounted_string_list_ty *comment; + const char *encoding; + + string = literalstring_parse (token.string, &token.pos, + LET_ANSI_C | LET_UNICODE); + free (token.string); + token.string = string; + + if (token.comment != NULL) + { + comment = savable_comment_convert_encoding (token.comment, + &token.pos); + drop_reference (token.comment); + token.comment = comment; + } + + /* token.string and token.comment are already converted + to UTF-8. Prevent further conversion in + remember_a_message. 
*/ + encoding = xgettext_current_source_encoding; + xgettext_current_source_encoding = po_charset_utf8; + remember_a_message (mlp, NULL, token.string, inner_context, + &token.pos, NULL, token.comment); + xgettext_current_source_encoding = encoding; + } else - arglist_parser_remember (argparser, arg, token.string, - inner_context, - token.pos.file_name, token.pos.line_number, - token.comment); - xgettext_current_source_encoding = xgettext_global_source_encoding; + arglist_parser_remember_literal (argparser, arg, token.string, + inner_context, + token.pos.file_name, + token.pos.line_number, + token.comment, + LET_ANSI_C | LET_UNICODE); drop_reference (token.comment); next_context_iter = null_context_list_iterator; selectorcall_context_iter = null_context_list_iterator; @@ -1985,9 +2021,7 @@ extract_parenthesized (message_list_ty *mlp, continue; case xgettext_token_type_eof: - xgettext_current_source_encoding = po_charset_utf8; arglist_parser_done (argparser, arg); - xgettext_current_source_encoding = xgettext_global_source_encoding; return true; default: diff --git a/gettext-tools/src/x-c.h b/gettext-tools/src/x-c.h index 28c5b9282..85a31414f 100644 --- a/gettext-tools/src/x-c.h +++ b/gettext-tools/src/x-c.h @@ -43,16 +43,20 @@ extern "C" { #define SCANNERS_C \ { "C", extract_c, \ &flag_table_c, \ - &formatstring_c, NULL, NULL }, \ + &formatstring_c, NULL, \ + &literalstring_c }, \ { "C++", extract_c, \ &flag_table_c, \ - &formatstring_c, NULL, NULL }, \ + &formatstring_c, NULL, \ + &literalstring_c }, \ { "ObjectiveC", extract_objc, \ &flag_table_objc, \ - &formatstring_c, &formatstring_objc, NULL }, \ + &formatstring_c, &formatstring_objc, \ + &literalstring_c }, \ { "GCC-source", extract_c, \ &flag_table_gcc_internal, \ - &formatstring_gcc_internal, &formatstring_gfc_internal, NULL }, \ + &formatstring_gcc_internal, &formatstring_gfc_internal, \ + &literalstring_c }, \ /* Scan a C/C++ file and add its translatable strings to mdlp. 
*/ extern void extract_c (FILE *fp, const char *real_filename, @@ -80,6 +84,9 @@ extern void init_flag_table_objc (void); extern void init_flag_table_gcc_internal (void); +extern DLL_VARIABLE struct literalstring_parser literalstring_c; + + #ifdef __cplusplus } #endif diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog index 756871c79..80d19260d 100644 --- a/gettext-tools/tests/ChangeLog +++ b/gettext-tools/tests/ChangeLog @@ -1,3 +1,9 @@ +2014-05-09 Daiki Ueno + + c: Interpret string literals lazily + * xgettext-c-19: New file. + * Makefile.am (TESTS): Add new test. + 2014-05-03 Daiki Ueno tests: Add test for prefixed comment tag diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index ba26362a0..077f32157 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -75,7 +75,7 @@ TESTS = gettext-1 gettext-2 gettext-3 gettext-4 gettext-5 gettext-6 gettext-7 \ xgettext-c-2 xgettext-c-3 xgettext-c-4 xgettext-c-5 \ xgettext-c-6 xgettext-c-7 xgettext-c-8 xgettext-c-9 xgettext-c-10 \ xgettext-c-11 xgettext-c-12 xgettext-c-13 xgettext-c-14 xgettext-c-15 \ - xgettext-c-16 xgettext-c-17 xgettext-c-18 \ + xgettext-c-16 xgettext-c-17 xgettext-c-18 xgettext-c-19 \ xgettext-csharp-1 xgettext-csharp-2 xgettext-csharp-3 \ xgettext-csharp-4 xgettext-csharp-5 xgettext-csharp-6 \ xgettext-csharp-7 \ diff --git a/gettext-tools/tests/xgettext-c-19 b/gettext-tools/tests/xgettext-c-19 new file mode 100755 index 000000000..581a20092 --- /dev/null +++ b/gettext-tools/tests/xgettext-c-19 @@ -0,0 +1,61 @@ +#! /bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test C support: mixing ANSI escapes, Unicode escapes, and bare +# multibyte characters. 
+ +cat <<\EOF > xg-c-19.in.c +/* 最初のコメント */ +"最初の文字列"; + +/* 二番目のコメント */ +gettext ("二番目の文字列"); + +/* 三番目のコメント */ +pgettext ("\u30B3\u30F3\u30C6\u30AF\u30B9\u30C8", "\xBB\xB0\xC8\xD6\xCC\xDC\xA4\xCE\xCA\xB8\xBB\xFA\xCE\xF3"); +EOF + +: ${ICONV=iconv} +iconv --from UTF-8 --to EUC-JP < xg-c-19.in.c > xg-c-19.c \ + || { echo "Skipping test: iconv does not work for EUC-JP"; exit 77; } + +: ${XGETTEXT=xgettext} +${XGETTEXT} --from-code=EUC-JP --add-comments --no-location \ + -o - xg-c-19.c | grep -v 'POT-Creation-Date' > xg-c-19.tmp.po \ + || exit 1 +LC_ALL=C tr -d '\r' < xg-c-19.tmp.po > xg-c-19.po || exit 1 + +cat <<EOF > xg-c-19.ok +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER +# This file is distributed under the same license as the PACKAGE package. +# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. +# +#, fuzzy +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"Report-Msgid-Bugs-To: \n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n" +"Language-Team: LANGUAGE <LL@li.org>\n" +"Language: \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +#. 二番目のコメント +msgid "二番目の文字列" +msgstr "" + +#. 三番目のコメント +msgctxt "コンテクスト" +msgid "三番目の文字列" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} xg-c-19.ok xg-c-19.po +result=$? + +exit $result