c++: Make C++11 raw string recognition stricter

author Daiki Ueno <ueno@gnu.org>
Tue, 13 Jan 2015 03:09:08 +0000 (12:09 +0900)
committer Daiki Ueno <ueno@gnu.org>
Tue, 13 Jan 2015 03:09:08 +0000 (12:09 +0900)
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog

index bf50cb9304158a169feec0b714118d3f538b5c6d..3332eab0095e8bf7cefcb554343ebabd0ab24bf5 100644 (file)
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,14 @@
+2015-01-13  Daiki Ueno  <ueno@gnu.org>
+
+       c++: Make C++11 raw string recognition stricter
+       Reported by Vaclav Slavik at:
+       <http://savannah.gnu.org/bugs/?43970>.
+       * x-c.c (struct token_ty): New field 'escape'.
+       (struct xgettext_token_ty): New field 'escape'.
+       (phase5_get): Recognize raw strings more strictly.  Set 'escape'
+       field of token appropriately for string literals.
+       (extract_parenthesized): Respect 'escape' field of token.
+
  2015-01-13  Daiki Ueno  <ueno@gnu.org>
  
         c++: Differentiate scanning logic from C
diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c

index 32b92d2ae01656c6232b03710d919ef2791f58de..a8f07a13b8d48580fddab5b8076b1c1791c52494 100644 (file)
--- a/gettext-tools/src/x-c.c
+++ b/gettext-tools/src/x-c.c
@@ -858,6 +858,7 @@ struct token_ty
    char *string;         /* for token_type_name, token_type_string_literal */
    refcounted_string_list_ty *comment;   /* for token_type_string_literal,
                                             token_type_objc_special */
+  enum literalstring_escape_type escape; /* for token_type_string_literal */
    long number;
    int line_number;
  };
@@ -1101,6 +1102,9 @@ phase5_get (token_ty *tp)
    int c;
    int last_was_backslash;
    bool raw_expected;
+  int delimiter_left_end;
+  int delimiter_right_start;
+  int last_rparen;
  
    if (phase5_pushback_length)
      {
@@ -1393,11 +1397,14 @@ phase5_get (token_ty *tp)
             let the compiler complain about the argument not matching the
             prototype.  Just pretend it won't happen.  */
          last_was_backslash = false;
+        delimiter_left_end = -1;
+        delimiter_right_start = -1;
+        last_rparen = -1;
          bufpos = 0;
          for (;;)
            {
              c = phase3_getc ();
-            if (last_was_backslash)
+            if (last_was_backslash && !raw_expected)
                {
                  last_was_backslash = false;
                  if (bufpos >= bufmax)
@@ -1414,7 +1421,14 @@ phase5_get (token_ty *tp)
                  last_was_backslash = true;
                  /* FALLTHROUGH */
                default:
-                if (c == '\n' && !raw_expected)
+                if (raw_expected)
+                  {
+                    if (c == '(' && delimiter_left_end < 0)
+                      delimiter_left_end = bufpos;
+                    else if (c == ')' && delimiter_left_end >= 0)
+                      last_rparen = bufpos;
+                  }
+                else if (c == '\n')
                    {
                      error_with_progname = false;
                      error (0, 0,
@@ -1424,18 +1438,35 @@ phase5_get (token_ty *tp)
                      phase3_ungetc ('\n');
                      break;
                    }
-                else
+                if (bufpos >= bufmax)
                    {
-                    if (bufpos >= bufmax)
+                    bufmax = 2 * bufmax + 10;
+                    buffer = xrealloc (buffer, bufmax);
+                  }
+                buffer[bufpos++] = c;
+                continue;
+
+              case '"':
+                if (raw_expected && delimiter_left_end >= 0)
+                  {
+                    if (last_rparen < 0
+                        || delimiter_left_end != bufpos - (last_rparen + 1)
+                        || strncmp (buffer, buffer + last_rparen + 1,
+                                    delimiter_left_end) != 0)
                        {
-                        bufmax = 2 * bufmax + 10;
-                        buffer = xrealloc (buffer, bufmax);
+                        if (bufpos >= bufmax)
+                          {
+                            bufmax = 2 * bufmax + 10;
+                            buffer = xrealloc (buffer, bufmax);
+                          }
+                        buffer[bufpos++] = c;
+                        continue;
                        }
-                    buffer[bufpos++] = c;
-                    continue;
+                    delimiter_right_start = last_rparen;
                    }
+                break;
  
-              case EOF: case '"':
+              case EOF:
                  break;
                }
              break;
@@ -1449,13 +1480,7 @@ phase5_get (token_ty *tp)
  
          if (raw_expected)
            {
-            char *delimiter_left_end;
-            char *delimiter_right_start;
-
-            if (!(delimiter_left_end = strchr (buffer, '('))
-                || !(delimiter_right_start = strrchr (buffer, ')'))
-                || strncmp (buffer, delimiter_right_start + 1,
-                            (delimiter_left_end - buffer)) != 0)
+            if (delimiter_left_end < 0 || delimiter_right_start < 0)
                {
                  error_with_progname = false;
                  error (0, 0, _("%s:%d: warning: unterminated string literal"),
@@ -1464,15 +1489,17 @@ phase5_get (token_ty *tp)
                }
              else
                {
-                *delimiter_right_start = '\0';
+                buffer[delimiter_right_start] = '\0';
                  tp->type = token_type_string_literal;
-                tp->string = xstrdup (delimiter_left_end + 1);
+                tp->string = xstrdup (&buffer[delimiter_left_end + 1]);
+                tp->escape = LET_NONE;
                  tp->comment = add_reference (savable_comment);
                  return;
                }
            }
          tp->type = token_type_string_literal;
          tp->string = xstrdup (buffer);
+        tp->escape = LET_ANSI_C | LET_UNICODE;
          tp->comment = add_reference (savable_comment);
          return;
        }
@@ -1726,6 +1753,7 @@ phase8a_get (token_ty *tp)
        tp->string = new_string;
        tp->comment = add_reference (savable_comment);
        tp->type = token_type_string_literal;
+      tp->escape = LET_ANSI_C | LET_UNICODE;
      }
  }
  
@@ -1806,7 +1834,10 @@ phase8c_unget (token_ty *tp)
  
  /* 8. Concatenate adjacent string literals to form single string
     literals (because we don't expand macros, there are a few things we
-   will miss).  */
+   will miss).
+
+   FIXME: handle the case when the string literals have different
+   tp->escape setting.  */
  
  static void
  phase8_get (token_ty *tp)
@@ -1862,6 +1893,9 @@ struct xgettext_token_ty
       xgettext_token_type_keyword, xgettext_token_type_symbol.  */
    char *string;
  
+  /* This field is used only for xgettext_token_type_string_literal.  */
+  enum literalstring_escape_type escape;
+
    /* This field is used only for xgettext_token_type_string_literal.  */
    refcounted_string_list_ty *comment;
  
@@ -1937,6 +1971,7 @@ x_c_lex (xgettext_token_ty *tp)
  
            tp->type = xgettext_token_type_string_literal;
            tp->string = token.string;
+          tp->escape = token.escape;
            tp->comment = token.comment;
            tp->pos.file_name = logical_file_name;
            tp->pos.line_number = token.line_number;
@@ -2098,7 +2133,7 @@ extract_parenthesized (message_list_ty *mlp,
                const char *encoding;
  
                string = literalstring_parse (token.string, &token.pos,
-                                            LET_ANSI_C | LET_UNICODE);
+                                            token.escape);
                free (token.string);
                token.string = string;
  
@@ -2125,7 +2160,7 @@ extract_parenthesized (message_list_ty *mlp,
                                               token.pos.file_name,
                                               token.pos.line_number,
                                               token.comment,
-                                             LET_ANSI_C | LET_UNICODE);
+                                             token.escape);
            drop_reference (token.comment);
            next_context_iter = null_context_list_iterator;
            selectorcall_context_iter = null_context_list_iterator;
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog

index 8ab049aa5be309905abbc39ce87d29380e0c5b02..d8616bfa7cf940d521f259254141237e823d2e00 100644 (file)
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,8 @@
+2015-01-13  Daiki Ueno  <ueno@gnu.org>
+
+       * xgettext-c-20: Adjust to the latest xgettext C++ scanner change;
+       don't accept unbalanced prefix in raw string literal.
+
  2015-01-13  Daiki Ueno  <ueno@gnu.org>
  
         * xgettext-c-20: Adjust the source file name from *.c to *.cc to
diff --git a/gettext-tools/tests/xgettext-c-20 b/gettext-tools/tests/xgettext-c-20

index 2740962624281ce6bee36729df4bccf626104a0c..69bb8a20eab557df2dcac5f3c1601f29d706b6c0 100755 (executable)
--- a/gettext-tools/tests/xgettext-c-20
+++ b/gettext-tools/tests/xgettext-c-20
@@ -51,14 +51,6 @@ LR"aaa(
  This is a wide raw string
  )aaa";
  
-// Missing opening parenthesis; be tolerate and treat it as a normal string.
-gettext (u8R"bar)
-aaa");
-
-// Missing closing parenthesis; be tolerate and treat it as a normal string.
-gettext (u8R"aaa(
-bar");
-
  // 'LR' prefixed raw string should be skipped.
  LR"(
  
@@ -107,18 +99,6 @@ msgid ""
  "\n"
  "This is a raw UTF-8 string\n"
  msgstr ""
-
-#. Missing opening parenthesis; be tolerate and treat it as a normal string.
-msgid ""
-"bar)\n"
-"aaa"
-msgstr ""
-
-#. Missing closing parenthesis; be tolerate and treat it as a normal string.
-msgid ""
-"aaa(\n"
-"bar"
-msgstr ""
  EOF
  
  : ${DIFF=diff}
author	Daiki Ueno <ueno@gnu.org>
	Tue, 13 Jan 2015 03:09:08 +0000 (12:09 +0900)
committer	Daiki Ueno <ueno@gnu.org>
	Tue, 13 Jan 2015 03:09:08 +0000 (12:09 +0900)
gettext-tools/src/ChangeLog  [patch | blob | blame | history]
gettext-tools/src/x-c.c  [patch | blob | blame | history]
gettext-tools/tests/ChangeLog  [patch | blob | blame | history]
gettext-tools/tests/xgettext-c-20  [patch | blob | blame | history]