From: Bruno Haible Date: Sun, 4 Nov 2018 19:20:03 +0000 (+0100) Subject: xgettext: Add support for C++11 raw string literals. X-Git-Tag: v0.20~268 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f5a433a04e00c3c946d99d6246cdb98064a8f3e5;p=thirdparty%2Fgettext.git xgettext: Add support for C++11 raw string literals. * gettext-tools/src/x-c.c (phase5_get): Recognize the R"..." syntax and its variants. * gettext-tools/tests/xgettext-c-20: Change expected error message. --- diff --git a/gettext-tools/src/x-c.c b/gettext-tools/src/x-c.c index d8d9d0fc0..187f32998 100644 --- a/gettext-tools/src/x-c.c +++ b/gettext-tools/src/x-c.c @@ -1283,6 +1283,174 @@ phase5_get (token_ty *tp) bind_textdomain_codeset (DOMAIN, "UTF-8") first. */ if (bufpos == 2 && buffer[0] == 'u' && buffer[1] == '8') goto string_literal; + /* Recognize C++11 raw string literals. + See ISO C++ 11 section 2.14.5 [lex.string]. + Here it is important to properly parse all cases according to + the standard, otherwise our parser could get confused by + double-quotes inside the raw string. + Note: The programmer who passes an UTF-8 encoded string to + gettext() or similar API functions will have to have called + bind_textdomain_codeset (DOMAIN, "UTF-8") first. */ + if (cxx_extensions + && (bufpos == 1 + || (bufpos == 2 + && (buffer[0] == 'u' || buffer[0] == 'U' + || buffer[0] == 'L')) + || (bufpos == 3 && buffer[0] == 'u' && buffer[1] == '8')) + && buffer[bufpos - 1] == 'R') + { + /* Only R and u8R raw strings can be used as gettext() + arguments, for type reasons. */ + const bool relevant = (bufpos != 2); + int starting_line_number = line_number; + bufpos = 0; + /* Start the buffer with a closing parenthesis. This makes the + parsing code below simpler. */ + buffer[bufpos++] = ')'; + /* Parse the initial delimiter. */ + for (;;) + { + bool valid_delimiter_char; + + c = phase3_getc (); + switch (c) + { + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + case '_': case '{': case '}': case '[': case ']': + case '#': case '<': case '>': case '%': case ':': + case ';': case '.': case '?': case '*': case '+': + case '-': case '/': case '^': case '&': case '|': + case '~': case '!': case '=': case ',': case '\'': + valid_delimiter_char = true; + break; + case '"': + /* A double-quote within the delimiter! This is too + weird. We don't support this. */ + error_with_progname = false; + error (0, 0, _("%s:%d: warning: a double-quote in the delimiter of a raw string literal is unsupported"), + logical_file_name, starting_line_number); + error_with_progname = true; + /* FALLTHROUGH */ + default: + valid_delimiter_char = false; + break; + } + if (!valid_delimiter_char) + break; + + if (bufpos >= bufmax) + { + bufmax = 2 * bufmax + 10; + buffer = xrealloc (buffer, bufmax); + } + buffer[bufpos++] = c; + } + if (c == '(') + { + struct mixed_string_buffer *bp; + /* The state is either 0 or + N, after a ')' and N-1 bytes of the delimiter have been + encountered. */ + int state; + + /* Start accumulating the string. */ + if (relevant) + bp = mixed_string_buffer_alloc (lc_string, + logical_file_name, + line_number); + else + bp = NULL; + state = 0; + + for (;;) + { + c = phase3_getc (); + + /* Keep line_number in sync. */ + if (relevant) + bp->line_number = line_number; + + if (c == EOF) + break; + + /* Update the state. */ + if (c == (state < bufpos ? buffer[state] : '"')) + { + if (state < bufpos) + state++; + else /* state == bufpos && c == '"' */ + { + /* Finished parsing the string. */ + if (relevant) + { + tp->type = token_type_string_literal; + tp->string = mixed_string_buffer_done (bp); + tp->comment = add_reference (savable_comment); + } + else + tp->type = token_type_symbol; + return; + } + } + else + { + int i; + + /* None of the bytes buffer[0]...buffer[state-1] + can be ')'. */ + if (relevant) + for (i = 0; i < state; i++) + mixed_string_buffer_append_char (bp, buffer[i]); + + /* But c may be ')'. */ + if (c == ')') + state = 1; + else + { + if (relevant) + mixed_string_buffer_append_char (bp, c); + state = 0; + } + } + } + } + if (c == EOF) + { + error_with_progname = false; + error (0, 0, _("%s:%d: warning: unterminated raw string literal"), + logical_file_name, starting_line_number); + error_with_progname = true; + tp->type = token_type_eof; + return; + } + /* The error message for c == '"' was already emitted above. */ + if (c != '"') + { + error_with_progname = false; + error (0, 0, _("%s:%d: warning: invalid raw string literal syntax"), + logical_file_name, starting_line_number); + error_with_progname = true; + } + /* To get into a sane state, read up until the next double-quote, + newline, or EOF. */ + while (!(c == EOF || c == '"' || c == '\n')) + c = phase3_getc (); + tp->type = token_type_symbol; + return; + } /* FALLTHROUGH */ default: diff --git a/gettext-tools/tests/xgettext-c-20 b/gettext-tools/tests/xgettext-c-20 index 2e56bcb7b..214c77e3d 100755 --- a/gettext-tools/tests/xgettext-c-20 +++ b/gettext-tools/tests/xgettext-c-20 @@ -19,7 +19,7 @@ bbb )ccc"); EOF -(LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --no-wrap -o - err1.cc 2>&1; exit) | grep 'unterminated string literal' || Exit 1 +(LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location --no-wrap -o - err1.cc 2>&1; exit) | grep 'unterminated raw string literal' || Exit 1 cat <<\EOF > xg-c-20.cc #define X ""