From: Bruno Haible Date: Sun, 29 Sep 2019 10:58:09 +0000 (+0200) Subject: xgettext: Recognize text blocks in Java parser. X-Git-Tag: v0.21~162 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5b5a0b64e7b1febae1a2ba44c1159b0e93b0d0b3;p=thirdparty%2Fgettext.git xgettext: Recognize text blocks in Java parser. * autogen.sh (GNULIB_MODULES_TOOLS_FOR_SRC): Add unictype/syntax-java-whitespace. * gettext-tools/src/x-java.c: Include unictype.h. (strip_indent): New function. (phase5_get): Parse text blocks (Java 13 syntax). * gettext-tools/tests/xgettext-java-2: Add tests of text blocks. * gettext-tools/doc/gettext.texi (Java): Mention the text block syntax. * NEWS: Mention the change. --- diff --git a/NEWS b/NEWS index 69dba3d26..3cf56d4cd 100644 --- a/NEWS +++ b/NEWS @@ -14,8 +14,9 @@ Version 0.21 - September 2019 xgettext now assumes a Python source file is in UTF-8 encoding by default, as stated in PEP 3120. - Java: - xgettext now recognizes format strings in the Formatter syntax. They - are marked as 'java-printf-format' in POT and PO files. + o xgettext now recognizes format strings in the Formatter syntax. They + are marked as 'java-printf-format' in POT and PO files. + o xgettext now recognizes text blocks as string literals. - Desktop Entry: The value of the 'Icon' property is no longer extracted into the POT file by xgettext. The documentation explains how to localize icons. diff --git a/autogen.sh b/autogen.sh index 4de8e553b..48ce5591e 100755 --- a/autogen.sh +++ b/autogen.sh @@ -198,6 +198,7 @@ if ! 
$skip_gnulib; then sys_time trim unictype/ctype-space + unictype/syntax-java-whitespace unilbrk/ulc-width-linebreaks uniname/uniname unistd diff --git a/gettext-tools/doc/gettext.texi b/gettext-tools/doc/gettext.texi index 20316a5ea..072318ff6 100644 --- a/gettext-tools/doc/gettext.texi +++ b/gettext-tools/doc/gettext.texi @@ -10305,7 +10305,7 @@ default-jdk @code{java} @item String syntax -"abc" +"abc", """text block""" @item gettext shorthand _("abc") diff --git a/gettext-tools/src/x-java.c b/gettext-tools/src/x-java.c index 62b5429fa..8c0ee331f 100644 --- a/gettext-tools/src/x-java.c +++ b/gettext-tools/src/x-java.c @@ -44,6 +44,7 @@ #include "hash.h" #include "po-charset.h" #include "unistr.h" +#include "unictype.h" #include "gettext.h" #define _(s) gettext(s) @@ -52,8 +53,8 @@ /* The Java syntax is defined in the - Java Language Specification, Second Edition, - (available from http://java.sun.com/), + Java Language Specification + (available from https://docs.oracle.com/javase/specs/), chapter 3 "Lexical Structure". */ @@ -568,7 +569,7 @@ enum token_type_ty token_type_rbrace, /* } */ token_type_comma, /* , */ token_type_dot, /* . */ - token_type_string_literal, /* "abc" */ + token_type_string_literal, /* "abc", """text block""" */ token_type_number, /* 1.23 */ token_type_symbol, /* identifier, keyword, null */ token_type_plus, /* + */ @@ -692,6 +693,340 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter) } +/* Strip the common indentation of the non-blank lines of the given string and + remove all trailing whitespace of all lines. + Like the Java method String.stripIndent does. + MS is modified in place: the contents of its segments are compacted and the + segment lengths are reduced; no segment is added or removed. + The common indentation is the minimum, over all non-blank lines and the + last line, of the number of leading whitespace characters. Every + whitespace character counts as 1, even the TAB character. + Segments that are UTF-8 encoded (or source encoded while the source + encoding is UTF-8) are scanned with uc_is_java_whitespace; all other + segments are scanned for ASCII whitespace only. 
+ */ +static void +strip_indent (mixed_string_ty *ms) +{ + size_t nsegments = ms->nsegments; + size_t minimum_indentation = SIZE_MAX; + { + size_t curr_line_indentation = 0; + bool curr_line_blank = true; + size_t i; + + for (i = 0; i < nsegments; i++) + { + struct mixed_string_segment *segment = ms->segments[i]; + + if (segment->type == utf8_encoded + || (segment->type == source_encoded + && xgettext_current_source_encoding == po_charset_utf8)) + { + /* Consider Unicode whitespace characters. */ + size_t seglength = segment->length; + size_t j; + + for (j = 0; j < seglength; ) + { + ucs4_t uc; + int bytes = + u8_mbtouc (&uc, (const uint8_t *) &segment->contents[j], + seglength - j); + j += bytes; + if (uc == 0x000a) + { + /* Newline. */ + if (!curr_line_blank) + if (minimum_indentation > curr_line_indentation) + minimum_indentation = curr_line_indentation; + curr_line_indentation = 0; + curr_line_blank = true; + } + else if (uc_is_java_whitespace (uc)) + { + /* Whitespace character. */ + if (curr_line_blank) + /* Every whitespace character counts as 1, even the TAB + character. */ + curr_line_indentation++; + } + else + { + /* Other character. */ + curr_line_blank = false; + } + } + } + else + { + /* When the encoding is not UTF-8, consider only ASCII whitespace + characters. */ + size_t seglength = segment->length; + size_t j; + + for (j = 0; j < seglength; j++) + { + char c = segment->contents[j]; + if (c == '\n') + { + /* Newline. */ + if (!curr_line_blank) + if (minimum_indentation > curr_line_indentation) + minimum_indentation = curr_line_indentation; + curr_line_indentation = 0; + curr_line_blank = true; + } + else if (c == ' ' + || (c >= 0x09 && c <= 0x0d) + || (c >= 0x1c && c <= 0x1f)) + { + /* Whitespace character. */ + if (curr_line_blank) + /* Every whitespace character counts as 1, even the TAB + character. */ + curr_line_indentation++; + } + else + { + /* Other character. 
*/ + curr_line_blank = false; + } + } + } + } + /* The indentation of the last line matters even if it is blank. */ + if (minimum_indentation > curr_line_indentation) + minimum_indentation = curr_line_indentation; + } + + /* The same loop as above, but this time remove the leading + minimum_indentation whitespace characters and all trailing whitespace + characters from every line. + whitespace_to_remove counts the leading whitespace characters that still + have to be dropped from the current line. + The start_of_curr_line and start_of_trailing_whitespace pairs (segment + index i, byte offset j) are positions in the already compacted output: + the point where the current line begins, and the point just after the + last non-whitespace character seen on the current line. */ + { + size_t start_of_curr_line_i = 0; + size_t start_of_curr_line_j = 0; + size_t start_of_trailing_whitespace_i = 0; + size_t start_of_trailing_whitespace_j = 0; + size_t whitespace_to_remove = minimum_indentation; + size_t i; + + for (i = 0; i < nsegments; i++) + { + struct mixed_string_segment *segment = ms->segments[i]; + /* Perform a sliding copy from segment->contents[from_j] to + segment->contents[to_j]. 0 <= to_j <= from_j. */ + size_t to_j; + + if (segment->type == utf8_encoded + || (segment->type == source_encoded + && xgettext_current_source_encoding == po_charset_utf8)) + { + /* Consider Unicode whitespace characters. */ + size_t seglength = segment->length; + size_t from_j; + + for (to_j = from_j = 0; from_j < seglength; ) + { + ucs4_t uc; + int bytes = + u8_mbtouc (&uc, (const uint8_t *) &segment->contents[from_j], + seglength - from_j); + if (uc == 0x000a) + { + /* Newline. */ + if (whitespace_to_remove > 0) + { + /* It was a blank line with fewer than minimum_indentation + whitespace characters. Remove all this whitespace. */ + if (start_of_curr_line_i < i) + { + size_t k; + ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j; + for (k = start_of_curr_line_i + 1; k < i; k++) + ms->segments[k]->length = 0; + to_j = 0; + } + else + to_j = start_of_curr_line_j; + } + else + { + /* Remove the trailing whitespace characters from the + current line. 
*/ + if (start_of_trailing_whitespace_i < i) + { + size_t k; + ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j; + for (k = start_of_trailing_whitespace_i + 1; k < i; k++) + ms->segments[k]->length = 0; + to_j = 0; + } + else + to_j = start_of_trailing_whitespace_j; + } + } + if (to_j < from_j) + memmove (&segment->contents[to_j], &segment->contents[from_j], bytes); + from_j += bytes; + to_j += bytes; + if (uc == 0x000a) + { + /* Newline. */ + start_of_curr_line_i = i; + start_of_curr_line_j = to_j; + start_of_trailing_whitespace_i = i; + start_of_trailing_whitespace_j = to_j; + whitespace_to_remove = minimum_indentation; + } + else if (uc_is_java_whitespace (uc)) + { + /* Whitespace character. */ + if (whitespace_to_remove > 0 + && --whitespace_to_remove == 0) + { + /* Remove the leading minimum_indentation whitespace + characters from the current line. */ + if (start_of_curr_line_i < i) + { + size_t k; + ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j; + for (k = start_of_curr_line_i + 1; k < i; k++) + ms->segments[k]->length = 0; + to_j = 0; + } + else + to_j = start_of_curr_line_j; + } + } + else + { + /* Other character. */ + if (whitespace_to_remove > 0) + abort (); + start_of_trailing_whitespace_i = i; + start_of_trailing_whitespace_j = to_j; + } + } + } + else + { + /* When the encoding is not UTF-8, consider only ASCII whitespace + characters. */ + size_t seglength = segment->length; + size_t from_j; + + for (to_j = from_j = 0; from_j < seglength; ) + { + char c = segment->contents[from_j++]; + if (c == '\n') + { + /* Newline. */ + if (whitespace_to_remove > 0) + { + /* It was a blank line with fewer than minimum_indentation + whitespace characters. Remove all this whitespace. 
*/ + if (start_of_curr_line_i < i) + { + size_t k; + ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j; + for (k = start_of_curr_line_i + 1; k < i; k++) + ms->segments[k]->length = 0; + to_j = 0; + } + else + to_j = start_of_curr_line_j; + } + else + { + /* Remove the trailing whitespace characters from the + current line. */ + if (start_of_trailing_whitespace_i < i) + { + size_t k; + ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j; + for (k = start_of_trailing_whitespace_i + 1; k < i; k++) + ms->segments[k]->length = 0; + to_j = 0; + } + else + to_j = start_of_trailing_whitespace_j; + } + } + segment->contents[to_j++] = c; + if (c == '\n') + { + /* Newline. */ + start_of_curr_line_i = i; + start_of_curr_line_j = to_j; + start_of_trailing_whitespace_i = i; + start_of_trailing_whitespace_j = to_j; + whitespace_to_remove = minimum_indentation; + } + else if (c == ' ' + || (c >= 0x09 && c <= 0x0d) + || (c >= 0x1c && c <= 0x1f)) + { + /* Whitespace character. */ + if (whitespace_to_remove > 0 + && --whitespace_to_remove == 0) + { + /* Remove the leading minimum_indentation whitespace + characters from the current line. */ + if (start_of_curr_line_i < i) + { + size_t k; + ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j; + for (k = start_of_curr_line_i + 1; k < i; k++) + ms->segments[k]->length = 0; + to_j = 0; + } + else + to_j = start_of_curr_line_j; + } + } + else + { + /* Other character. */ + if (whitespace_to_remove > 0) + abort (); + start_of_trailing_whitespace_i = i; + start_of_trailing_whitespace_j = to_j; + } + } + } + if (i + 1 == nsegments) + { + /* Handle the last line: the end of the last segment is treated + like the end of a line. */ + if (whitespace_to_remove > 0) + { + /* It was a blank line with fewer than minimum_indentation + whitespace characters. Remove all this whitespace. 
*/ + if (start_of_curr_line_i < i) + { + size_t k; + ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j; + for (k = start_of_curr_line_i + 1; k < i; k++) + ms->segments[k]->length = 0; + to_j = 0; + } + else + to_j = start_of_curr_line_j; + } + else + { + /* Remove the trailing whitespace characters from the + current line. */ + if (start_of_trailing_whitespace_i < i) + { + size_t k; + ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j; + for (k = start_of_trailing_whitespace_i + 1; k < i; k++) + ms->segments[k]->length = 0; + to_j = 0; + } + else + to_j = start_of_trailing_whitespace_j; + } + } + segment->length = to_j; + } + } +} + + /* Combine characters into tokens. Discard whitespace. */ static token_ty phase5_pushback[3]; @@ -849,6 +1184,94 @@ phase5_get (token_ty *tp) } case '"': + { + int c2 = phase3_getc (); + if (c2 == '"') + { + int c3 = phase3_getc (); + if (c3 == '"') + { + /* Text block. Specification: + Java Language Specification, text blocks (Java 13 syntax). */ + struct mixed_string_buffer block; + unsigned int consecutive_unescaped_doublequotes; + mixed_string_ty *block_content; + + /* Parse the part up to and including the first newline. */ + for (;;) + { + int ic = phase3_getc (); + if (ic == P2_EOF) + { + error_with_progname = false; + error (0, 0, _("%s:%d: warning: unterminated text block"), + logical_file_name, line_number); + error_with_progname = true; + tp->type = token_type_other; + return; + } + if (RED (ic) == ' ' || RED (ic) == '\t' || RED (ic) == '\f') + ; + else if (RED (ic) == '\n') + break; + else + { + error_with_progname = false; + error (0, 0, _("%s:%d: warning: invalid syntax in text block"), + logical_file_name, line_number); + error_with_progname = true; + tp->type = token_type_other; + return; + } + } + + /* Parse the part after the first newline. 
*/ + mixed_string_buffer_init (&block, lc_string, + logical_file_name, line_number); + consecutive_unescaped_doublequotes = 0; + for (;;) + { + int ic = phase3_getc (); + if (RED (ic) == '"') + { + consecutive_unescaped_doublequotes++; + if (consecutive_unescaped_doublequotes == 3) + break; + } + else + { + while (consecutive_unescaped_doublequotes > 0) + { + mixed_string_buffer_append (&block, '"'); + consecutive_unescaped_doublequotes--; + } + if (ic == P2_EOF) + { + error_with_progname = false; + error (0, 0, _("%s:%d: warning: unterminated text block"), + logical_file_name, block.line_number); + error_with_progname = true; + break; + } + if (RED (ic) == '\\') + ic = do_getc_escaped (); + mixed_string_buffer_append (&block, ic); + } + } + block_content = mixed_string_buffer_result (&block); + + /* Remove the common indentation from the content. */ + strip_indent (block_content); + + tp->mixed_string = block_content; + tp->comment = add_reference (savable_comment); + tp->type = token_type_string_literal; + return; + } + phase3_ungetc (c3); + } + phase3_ungetc (c2); + } /* String literal. */ { struct mixed_string_buffer literal; diff --git a/gettext-tools/tests/xgettext-java-2 b/gettext-tools/tests/xgettext-java-2 index 5f20864fe..4fb7816ad 100755 --- a/gettext-tools/tests/xgettext-java-2 +++ b/gettext-tools/tests/xgettext-java-2 @@ -51,6 +51,27 @@ comment! */ "this is a single " /* now comes the concatenation! */ + // after + } }, "this is the second argument"); + // Text blocks are extracted. + gettext (""" + a one-liner block """); + gettext (""" + ibam forte via sacra sicut meus est mos + nescio quid meditans nugarum totus in illis + + accurrit quidam notus mihi nomine tantum + arreptaque manu "quid agis, dulcissime rerum?" 
+ """); + gettext (""" + \"""a text block in a text block\""" """); + gettext (""" + testing mixed ASCII and Unicode whitespace + \u0020 line 2 + \u0020line 3 + \u0020line 4 + and trailing whitespace + as well \u0020 + also in the last line\u0020""" + ); } } EOF @@ -163,6 +184,35 @@ msgstr "" #: xg-j-2.java:46 msgid "this is the second argument" msgstr "" + +#. Text blocks are extracted. +#: xg-j-2.java:48 +msgid "a one-liner block" +msgstr "" + +#: xg-j-2.java:50 +msgid "" +"ibam forte via sacra sicut meus est mos\n" +"nescio quid meditans nugarum totus in illis\n" +"\n" +"accurrit quidam notus mihi nomine tantum\n" +"arreptaque manu \"quid agis, dulcissime rerum?\"\n" +msgstr "" + +#: xg-j-2.java:57 +msgid "\"\"\"a text block in a text block\"\"\"" +msgstr "" + +#: xg-j-2.java:59 +msgid "" +"testing mixed ASCII and Unicode whitespace\n" +"line 2\n" +"line 3\n" +" line 4\n" +"and trailing whitespace\n" +"as well\n" +"also in the last line" +msgstr "" EOF : ${DIFF=diff}