From: Bruno Haible <bruno@clisp.org>
Date: Sun, 4 Nov 2018 19:20:03 +0000 (+0100)
Subject: xgettext: Add support for C++11 raw string literals.
X-Git-Tag: v0.20~268
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f5a433a04e00c3c946d99d6246cdb98064a8f3e5;p=thirdparty%2Fgettext.git

xgettext: Add support for C++11 raw string literals.

* gettext-tools/src/x-c.c (phase5_get): Recognize the R"..." syntax and its variants.
* gettext-tools/tests/xgettext-c-20: Change expected error message.
---

diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c
index d8d9d0fc0..187f32998 100644
--- a/gettext-tools/src/x-c.c
+++ b/gettext-tools/src/x-c.c
@@ -1283,6 +1283,174 @@ phase5_get (token_ty *tp)
                  bind_textdomain_codeset (DOMAIN, "UTF-8") first.  */
               if (bufpos == 2 && buffer[0] == 'u' && buffer[1] == '8')
                 goto string_literal;
+              /* Recognize C++11 raw string literals.
+                 See ISO C++ 11 section 2.14.5 [lex.string].
+                 Here it is important to properly parse all cases according to
+                 the standard, otherwise our parser could get confused by
+                 double-quotes inside the raw string.
+                 Note: The programmer who passes an UTF-8 encoded string to
+                 gettext() or similar API functions will have to have called
+                 bind_textdomain_codeset (DOMAIN, "UTF-8") first.  */
+              if (cxx_extensions
+                  && (bufpos == 1
+                      || (bufpos == 2
+                          && (buffer[0] == 'u' || buffer[0] == 'U'
+                              || buffer[0] == 'L'))
+                      || (bufpos == 3 && buffer[0] == 'u' && buffer[1] == '8'))
+                  && buffer[bufpos - 1] == 'R')
+                {
+                  /* Only R and u8R raw strings can be used as gettext()
+                     arguments, for type reasons.  */
+                  const bool relevant = (bufpos != 2);
+                  int starting_line_number = line_number;
+                  bufpos = 0;
+                  /* Start the buffer with a closing parenthesis.  This makes the
+                     parsing code below simpler.  */
+                  buffer[bufpos++] = ')';
+                  /* Parse the initial delimiter.  */
+                  for (;;)
+                    {
+                      bool valid_delimiter_char;
+
+                      c = phase3_getc ();
+                      switch (c)
+                        {
+                        case 'A': case 'B': case 'C': case 'D': case 'E':
+                        case 'F': case 'G': case 'H': case 'I': case 'J':
+                        case 'K': case 'L': case 'M': case 'N': case 'O':
+                        case 'P': case 'Q': case 'R': case 'S': case 'T':
+                        case 'U': case 'V': case 'W': case 'X': case 'Y':
+                        case 'Z':
+                        case 'a': case 'b': case 'c': case 'd': case 'e':
+                        case 'f': case 'g': case 'h': case 'i': case 'j':
+                        case 'k': case 'l': case 'm': case 'n': case 'o':
+                        case 'p': case 'q': case 'r': case 's': case 't':
+                        case 'u': case 'v': case 'w': case 'x': case 'y':
+                        case 'z':
+                        case '0': case '1': case '2': case '3': case '4':
+                        case '5': case '6': case '7': case '8': case '9':
+                        case '_': case '{': case '}': case '[': case ']':
+                        case '#': case '<': case '>': case '%': case ':':
+                        case ';': case '.': case '?': case '*': case '+':
+                        case '-': case '/': case '^': case '&': case '|':
+                        case '~': case '!': case '=': case ',': case '\'':
+                          valid_delimiter_char = true;
+                          break;
+                        case '"':
+                          /* A double-quote within the delimiter! This is too
+                             weird.  We don't support this.  */
+                          error_with_progname = false;
+                          error (0, 0, _("%s:%d: warning: a double-quote in the delimiter of a raw string literal is unsupported"),
+                                 logical_file_name, starting_line_number);
+                          error_with_progname = true;
+                          /* FALLTHROUGH */
+                        default:
+                          valid_delimiter_char = false;
+                          break;
+                        }
+                      if (!valid_delimiter_char)
+                        break;
+
+                      if (bufpos >= bufmax)
+                        {
+                          bufmax = 2 * bufmax + 10;
+                          buffer = xrealloc (buffer, bufmax);
+                        }
+                      buffer[bufpos++] = c;
+                    }
+                  if (c == '(')
+                    {
+                      struct mixed_string_buffer *bp;
+                      /* The state is either 0 or
+                         N, after a ')' and N-1 bytes of the delimiter have been
+                         encountered.  */
+                      int state;
+
+                      /* Start accumulating the string.  */
+                      if (relevant)
+                        bp = mixed_string_buffer_alloc (lc_string,
+                                                        logical_file_name,
+                                                        line_number);
+                      else
+                        bp = NULL;
+                      state = 0;
+
+                      for (;;)
+                        {
+                          c = phase3_getc ();
+
+                          /* Keep line_number in sync.  */
+                          if (relevant)
+                            bp->line_number = line_number;
+
+                          if (c == EOF)
+                            break;
+
+                          /* Update the state.  */
+                          if (c == (state < bufpos ? buffer[state] : '"'))
+                            {
+                              if (state < bufpos)
+                                state++;
+                              else /* state == bufpos && c == '"' */
+                                {
+                                  /* Finished parsing the string.  */
+                                  if (relevant)
+                                    {
+                                      tp->type = token_type_string_literal;
+                                      tp->string = mixed_string_buffer_done (bp);
+                                      tp->comment = add_reference (savable_comment);
+                                    }
+                                  else
+                                    tp->type = token_type_symbol;
+                                  return;
+                                }
+                            }
+                          else
+                            {
+                              int i;
+
+                              /* None of the bytes buffer[0]...buffer[state-1]
+                                 can be ')'.  */
+                              if (relevant)
+                                for (i = 0; i < state; i++)
+                                  mixed_string_buffer_append_char (bp, buffer[i]);
+
+                              /* But c may be ')'.  */
+                              if (c == ')')
+                                state = 1;
+                              else
+                                {
+                                  if (relevant)
+                                    mixed_string_buffer_append_char (bp, c);
+                                  state = 0;
+                                }
+                            }
+                        }
+                    }
+                  if (c == EOF)
+                    {
+                      error_with_progname = false;
+                      error (0, 0, _("%s:%d: warning: unterminated raw string literal"),
+                             logical_file_name, starting_line_number);
+                      error_with_progname = true;
+                      tp->type = token_type_eof;
+                      return;
+                    }
+                  /* The error message for c == '"' was already emitted above.  */
+                  if (c != '"')
+                    {
+                      error_with_progname = false;
+                      error (0, 0, _("%s:%d: warning: invalid raw string literal syntax"),
+                             logical_file_name, starting_line_number);
+                      error_with_progname = true;
+                    }
+                  /* To get into a sane state, read up until the next double-quote,
+                     newline, or EOF.  */
+                  while (!(c == EOF || c == '"' || c == '\n'))
+                    c = phase3_getc ();
+                  tp->type = token_type_symbol;
+                  return;
+                }
               /* FALLTHROUGH */
 
             default:
diff --git a/gettext-tools/tests/xgettext-c-20 b/gettext-tools/tests/xgettext-c-20
index 2e56bcb7b..214c77e3d 100755
--- a/gettext-tools/tests/xgettext-c-20
+++ b/gettext-tools/tests/xgettext-c-20
@@ -19,7 +19,7 @@ bbb
 )ccc");
 EOF
 
-(LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --no-wrap -o - err1.cc 2>&1; exit) | grep 'unterminated string literal' || Exit 1
+(LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --no-wrap -o - err1.cc 2>&1; exit) | grep 'unterminated raw string literal' || Exit 1
 
 cat <<\EOF > xg-c-20.cc
 #define X ""