From: Daiki Ueno Date: Tue, 13 Jan 2015 03:09:08 +0000 (+0900) Subject: c++: Make C++11 raw string recognition stricter X-Git-Tag: v0.19.5~91 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=383b5ba48b20bc6583ee63f13741dd84e96c5002;p=thirdparty%2Fgettext.git c++: Make C++11 raw string recognition stricter Reported by Vaclav Slavik at: . * gettext-tools/src/x-c.c (struct token_ty): New field 'escape'. (struct xgettext_token_ty): New field 'escape'. (phase5_get): Recognize raw strings more strictly. Set 'escape' field of token appropriately for string literals. (extract_parenthesized): Respect 'escape' field of token. * gettext-tools/tests/xgettext-c-20: Adjust to the latest xgettext C++ scanner change; don't accept unbalanced prefix in raw string literal. --- diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog index bf50cb930..3332eab00 100644 --- a/gettext-tools/src/ChangeLog +++ b/gettext-tools/src/ChangeLog @@ -1,3 +1,14 @@ +2015-01-13 Daiki Ueno + + c++: Make C++11 raw string recognition stricter + Reported by Vaclav Slavik at: + . + * x-c.c (struct token_ty): New field 'escape'. + (struct xgettext_token_ty): New field 'escape'. + (phase5_get): Recognize raw strings more strictly. Set 'escape' + field of token appropriately for string literals. + (extract_parenthesized): Respect 'escape' field of token. + 2015-01-13 Daiki Ueno c++: Differentiate scanning logic from C diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c index 32b92d2ae..a8f07a13b 100644 --- a/gettext-tools/src/x-c.c +++ b/gettext-tools/src/x-c.c @@ -858,6 +858,7 @@ struct token_ty char *string; /* for token_type_name, token_type_string_literal */ refcounted_string_list_ty *comment; /* for token_type_string_literal, token_type_objc_special */ + enum literalstring_escape_type escape; /* for token_type_string_literal */ long number; int line_number; }; @@ -1101,6 +1102,9 @@ phase5_get (token_ty *tp) int c; int last_was_backslash; bool raw_expected; + int delimiter_left_end; + int delimiter_right_start; + int last_rparen; if (phase5_pushback_length) { @@ -1393,11 +1397,14 @@ phase5_get (token_ty *tp) let the compiler complain about the argument not matching the prototype. Just pretend it won't happen. */ last_was_backslash = false; + delimiter_left_end = -1; + delimiter_right_start = -1; + last_rparen = -1; bufpos = 0; for (;;) { c = phase3_getc (); - if (last_was_backslash) + if (last_was_backslash && !raw_expected) { last_was_backslash = false; if (bufpos >= bufmax) @@ -1414,7 +1421,14 @@ phase5_get (token_ty *tp) last_was_backslash = true; /* FALLTHROUGH */ default: - if (c == '\n' && !raw_expected) + if (raw_expected) + { + if (c == '(' && delimiter_left_end < 0) + delimiter_left_end = bufpos; + else if (c == ')' && delimiter_left_end >= 0) + last_rparen = bufpos; + } + else if (c == '\n') { error_with_progname = false; error (0, 0, @@ -1424,18 +1438,35 @@ phase5_get (token_ty *tp) phase3_ungetc ('\n'); break; } - else + if (bufpos >= bufmax) { - if (bufpos >= bufmax) + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + continue; + + case '"': + if (raw_expected && delimiter_left_end >= 0) + { + if (last_rparen < 0 + || delimiter_left_end != bufpos - (last_rparen + 1) + || strncmp (buffer, buffer + last_rparen + 1, + delimiter_left_end) != 0) { - bufmax = 2 * bufmax + 10; - buffer = xrealloc (buffer, bufmax); + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + continue; } - buffer[bufpos++] = c; - continue; + delimiter_right_start = last_rparen; } + break; - case EOF: case '"': + case EOF: break; } break; @@ -1449,13 +1480,7 @@ phase5_get (token_ty *tp) if (raw_expected) { - char *delimiter_left_end; - char *delimiter_right_start; - - if (!(delimiter_left_end = strchr (buffer, '(')) - || !(delimiter_right_start = strrchr (buffer, ')')) - || strncmp (buffer, delimiter_right_start + 1, - (delimiter_left_end - buffer)) != 0) + if (delimiter_left_end < 0 || delimiter_right_start < 0) { error_with_progname = false; error (0, 0, _("%s:%d: warning: unterminated string literal"), @@ -1464,15 +1489,17 @@ phase5_get (token_ty *tp) } else { - *delimiter_right_start = '\0'; + buffer[delimiter_right_start] = '\0'; tp->type = token_type_string_literal; - tp->string = xstrdup (delimiter_left_end + 1); + tp->string = xstrdup (&buffer[delimiter_left_end + 1]); + tp->escape = LET_NONE; tp->comment = add_reference (savable_comment); return; } } tp->type = token_type_string_literal; tp->string = xstrdup (buffer); + tp->escape = LET_ANSI_C | LET_UNICODE; tp->comment = add_reference (savable_comment); return; } @@ -1726,6 +1753,7 @@ phase8a_get (token_ty *tp) tp->string = new_string; tp->comment = add_reference (savable_comment); tp->type = token_type_string_literal; + tp->escape = LET_ANSI_C | LET_UNICODE; } } @@ -1806,7 +1834,10 @@ phase8c_unget (token_ty *tp) /* 8. Concatenate adjacent string literals to form single string literals (because we don't expand macros, there are a few things we - will miss). */ + will miss). + + FIXME: handle the case when the string literals have different + tp->escape setting. */ static void phase8_get (token_ty *tp) @@ -1862,6 +1893,9 @@ struct xgettext_token_ty xgettext_token_type_keyword, xgettext_token_type_symbol. */ char *string; + /* This field is used only for xgettext_token_type_string_literal. */ + enum literalstring_escape_type escape; + /* This field is used only for xgettext_token_type_string_literal. */ refcounted_string_list_ty *comment; @@ -1937,6 +1971,7 @@ x_c_lex (xgettext_token_ty *tp) tp->type = xgettext_token_type_string_literal; tp->string = token.string; + tp->escape = token.escape; tp->comment = token.comment; tp->pos.file_name = logical_file_name; tp->pos.line_number = token.line_number; @@ -2098,7 +2133,7 @@ extract_parenthesized (message_list_ty *mlp, const char *encoding; string = literalstring_parse (token.string, &token.pos, - LET_ANSI_C | LET_UNICODE); + token.escape); free (token.string); token.string = string; @@ -2125,7 +2160,7 @@ extract_parenthesized (message_list_ty *mlp, token.pos.file_name, token.pos.line_number, token.comment, - LET_ANSI_C | LET_UNICODE); + token.escape); drop_reference (token.comment); next_context_iter = null_context_list_iterator; selectorcall_context_iter = null_context_list_iterator; diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog index 8ab049aa5..d8616bfa7 100644 --- a/gettext-tools/tests/ChangeLog +++ b/gettext-tools/tests/ChangeLog @@ -1,3 +1,8 @@ +2015-01-13 Daiki Ueno + + * xgettext-c-20: Adjust to the latest xgettext C++ scanner change; + don't accept unbalanced prefix in raw string literal. + 2015-01-13 Daiki Ueno * xgettext-c-20: Adjust the source file name from *.c to *.cc to diff --git a/gettext-tools/tests/xgettext-c-20 b/gettext-tools/tests/xgettext-c-20 index 274096262..69bb8a20e 100755 --- a/gettext-tools/tests/xgettext-c-20 +++ b/gettext-tools/tests/xgettext-c-20 @@ -51,14 +51,6 @@ LR"aaa( This is a wide raw string )aaa"; -// Missing opening parenthesis; be tolerate and treat it as a normal string. -gettext (u8R"bar) -aaa"); - -// Missing closing parenthesis; be tolerate and treat it as a normal string. -gettext (u8R"aaa( -bar"); - // 'LR' prefixed raw string should be skipped. LR"( @@ -107,18 +99,6 @@ msgid "" "\n" "This is a raw UTF-8 string\n" msgstr "" - -#. Missing opening parenthesis; be tolerate and treat it as a normal string. -msgid "" -"bar)\n" -"aaa" -msgstr "" - -#. Missing closing parenthesis; be tolerate and treat it as a normal string. -msgid "" -"aaa(\n" -"bar" -msgstr "" EOF : ${DIFF=diff}