From: Daiki Ueno <ueno@gnu.org>
Date: Tue, 13 Jan 2015 03:09:08 +0000 (+0900)
Subject: c++: Make C++11 raw string recognition stricter
X-Git-Tag: v0.19.5~91
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=383b5ba48b20bc6583ee63f13741dd84e96c5002;p=thirdparty%2Fgettext.git

c++: Make C++11 raw string recognition stricter

Reported by Vaclav Slavik at:
<http://savannah.gnu.org/bugs/?43970>.
* gettext-tools/src/x-c.c (struct token_ty): New field 'escape'.
(struct xgettext_token_ty): New field 'escape'.
(phase5_get): Recognize raw strings more strictly.  Set 'escape'
field of token appropriately for string literals.
(extract_parenthesized): Respect 'escape' field of token.

* gettext-tools/tests/xgettext-c-20: Adjust to the latest xgettext C++
scanner change; don't accept unbalanced prefix in raw string literal.
---

diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog
index bf50cb930..3332eab00 100644
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,14 @@
+2015-01-13  Daiki Ueno  <ueno@gnu.org>
+
+	c++: Make C++11 raw string recognition stricter
+	Reported by Vaclav Slavik at:
+	<http://savannah.gnu.org/bugs/?43970>.
+	* x-c.c (struct token_ty): New field 'escape'.
+	(struct xgettext_token_ty): New field 'escape'.
+	(phase5_get): Recognize raw strings more strictly.  Set 'escape'
+	field of token appropriately for string literals.
+	(extract_parenthesized): Respect 'escape' field of token.
+
 2015-01-13  Daiki Ueno  <ueno@gnu.org>
 
 	c++: Differentiate scanning logic from C
diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c
index 32b92d2ae..a8f07a13b 100644
--- a/gettext-tools/src/x-c.c
+++ b/gettext-tools/src/x-c.c
@@ -858,6 +858,7 @@ struct token_ty
   char *string;         /* for token_type_name, token_type_string_literal */
   refcounted_string_list_ty *comment;   /* for token_type_string_literal,
                                            token_type_objc_special */
+  enum literalstring_escape_type escape; /* for token_type_string_literal */
   long number;
   int line_number;
 };
@@ -1101,6 +1102,9 @@ phase5_get (token_ty *tp)
   int c;
   int last_was_backslash;
   bool raw_expected;
+  int delimiter_left_end;
+  int delimiter_right_start;
+  int last_rparen;
 
   if (phase5_pushback_length)
     {
@@ -1393,11 +1397,14 @@ phase5_get (token_ty *tp)
            let the compiler complain about the argument not matching the
            prototype.  Just pretend it won't happen.  */
         last_was_backslash = false;
+        delimiter_left_end = -1;
+        delimiter_right_start = -1;
+        last_rparen = -1;
         bufpos = 0;
         for (;;)
           {
             c = phase3_getc ();
-            if (last_was_backslash)
+            if (last_was_backslash && !raw_expected)
               {
                 last_was_backslash = false;
                 if (bufpos >= bufmax)
@@ -1414,7 +1421,14 @@ phase5_get (token_ty *tp)
                 last_was_backslash = true;
                 /* FALLTHROUGH */
               default:
-                if (c == '\n' && !raw_expected)
+                if (raw_expected)
+                  {
+                    if (c == '(' && delimiter_left_end < 0)
+                      delimiter_left_end = bufpos;
+                    else if (c == ')' && delimiter_left_end >= 0)
+                      last_rparen = bufpos;
+                  }
+                else if (c == '\n')
                   {
                     error_with_progname = false;
                     error (0, 0,
@@ -1424,18 +1438,35 @@ phase5_get (token_ty *tp)
                     phase3_ungetc ('\n');
                     break;
                   }
-                else
+                if (bufpos >= bufmax)
                   {
-                    if (bufpos >= bufmax)
+                    bufmax = 2 * bufmax + 10;
+                    buffer = xrealloc (buffer, bufmax);
+                  }
+                buffer[bufpos++] = c;
+                continue;
+
+              case '"':
+                if (raw_expected && delimiter_left_end >= 0)
+                  {
+                    if (last_rparen < 0
+                        || delimiter_left_end != bufpos - (last_rparen + 1)
+                        || strncmp (buffer, buffer + last_rparen + 1,
+                                    delimiter_left_end) != 0)
                       {
-                        bufmax = 2 * bufmax + 10;
-                        buffer = xrealloc (buffer, bufmax);
+                        if (bufpos >= bufmax)
+                          {
+                            bufmax = 2 * bufmax + 10;
+                            buffer = xrealloc (buffer, bufmax);
+                          }
+                        buffer[bufpos++] = c;
+                        continue;
                       }
-                    buffer[bufpos++] = c;
-                    continue;
+                    delimiter_right_start = last_rparen;
                   }
+                break;
 
-              case EOF: case '"':
+              case EOF:
                 break;
               }
             break;
@@ -1449,13 +1480,7 @@ phase5_get (token_ty *tp)
 
         if (raw_expected)
           {
-            char *delimiter_left_end;
-            char *delimiter_right_start;
-
-            if (!(delimiter_left_end = strchr (buffer, '('))
-                || !(delimiter_right_start = strrchr (buffer, ')'))
-                || strncmp (buffer, delimiter_right_start + 1,
-                            (delimiter_left_end - buffer)) != 0)
+            if (delimiter_left_end < 0 || delimiter_right_start < 0)
               {
                 error_with_progname = false;
                 error (0, 0, _("%s:%d: warning: unterminated string literal"),
@@ -1464,15 +1489,17 @@ phase5_get (token_ty *tp)
               }
             else
               {
-                *delimiter_right_start = '\0';
+                buffer[delimiter_right_start] = '\0';
                 tp->type = token_type_string_literal;
-                tp->string = xstrdup (delimiter_left_end + 1);
+                tp->string = xstrdup (&buffer[delimiter_left_end + 1]);
+                tp->escape = LET_NONE;
                 tp->comment = add_reference (savable_comment);
                 return;
               }
           }
         tp->type = token_type_string_literal;
         tp->string = xstrdup (buffer);
+        tp->escape = LET_ANSI_C | LET_UNICODE;
         tp->comment = add_reference (savable_comment);
         return;
       }
@@ -1726,6 +1753,7 @@ phase8a_get (token_ty *tp)
       tp->string = new_string;
       tp->comment = add_reference (savable_comment);
       tp->type = token_type_string_literal;
+      tp->escape = LET_ANSI_C | LET_UNICODE;
     }
 }
 
@@ -1806,7 +1834,10 @@ phase8c_unget (token_ty *tp)
 
 /* 8. Concatenate adjacent string literals to form single string
    literals (because we don't expand macros, there are a few things we
-   will miss).  */
+   will miss).
+
+   FIXME: Handle the case where adjacent string literals have different
+   tp->escape settings.  */
 
 static void
 phase8_get (token_ty *tp)
@@ -1862,6 +1893,9 @@ struct xgettext_token_ty
      xgettext_token_type_keyword, xgettext_token_type_symbol.  */
   char *string;
 
+  /* This field is used only for xgettext_token_type_string_literal.  */
+  enum literalstring_escape_type escape;
+
   /* This field is used only for xgettext_token_type_string_literal.  */
   refcounted_string_list_ty *comment;
 
@@ -1937,6 +1971,7 @@ x_c_lex (xgettext_token_ty *tp)
 
           tp->type = xgettext_token_type_string_literal;
           tp->string = token.string;
+          tp->escape = token.escape;
           tp->comment = token.comment;
           tp->pos.file_name = logical_file_name;
           tp->pos.line_number = token.line_number;
@@ -2098,7 +2133,7 @@ extract_parenthesized (message_list_ty *mlp,
               const char *encoding;
 
               string = literalstring_parse (token.string, &token.pos,
-                                            LET_ANSI_C | LET_UNICODE);
+                                            token.escape);
               free (token.string);
               token.string = string;
 
@@ -2125,7 +2160,7 @@ extract_parenthesized (message_list_ty *mlp,
                                              token.pos.file_name,
                                              token.pos.line_number,
                                              token.comment,
-                                             LET_ANSI_C | LET_UNICODE);
+                                             token.escape);
           drop_reference (token.comment);
           next_context_iter = null_context_list_iterator;
           selectorcall_context_iter = null_context_list_iterator;
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog
index 8ab049aa5..d8616bfa7 100644
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,8 @@
+2015-01-13  Daiki Ueno  <ueno@gnu.org>
+
+	* xgettext-c-20: Adjust to the latest xgettext C++ scanner change;
+	don't accept unbalanced prefix in raw string literal.
+
 2015-01-13  Daiki Ueno  <ueno@gnu.org>
 
 	* xgettext-c-20: Adjust the source file name from *.c to *.cc to
diff --git a/gettext-tools/tests/xgettext-c-20 b/gettext-tools/tests/xgettext-c-20
index 274096262..69bb8a20e 100755
--- a/gettext-tools/tests/xgettext-c-20
+++ b/gettext-tools/tests/xgettext-c-20
@@ -51,14 +51,6 @@ LR"aaa(
 This is a wide raw string
 )aaa";
 
-// Missing opening parenthesis; be tolerate and treat it as a normal string.
-gettext (u8R"bar)
-aaa");
-
-// Missing closing parenthesis; be tolerate and treat it as a normal string.
-gettext (u8R"aaa(
-bar");
-
 // 'LR' prefixed raw string should be skipped.
 LR"(
 
@@ -107,18 +99,6 @@ msgid ""
 "\n"
 "This is a raw UTF-8 string\n"
 msgstr ""
-
-#. Missing opening parenthesis; be tolerate and treat it as a normal string.
-msgid ""
-"bar)\n"
-"aaa"
-msgstr ""
-
-#. Missing closing parenthesis; be tolerate and treat it as a normal string.
-msgid ""
-"aaa(\n"
-"bar"
-msgstr ""
 EOF
 
 : ${DIFF=diff}