From: Bruno Haible <bruno@clisp.org>
Date: Sun, 29 Sep 2019 10:58:09 +0000 (+0200)
Subject: xgettext: Recognize text blocks in Java parser.
X-Git-Tag: v0.21~162
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5b5a0b64e7b1febae1a2ba44c1159b0e93b0d0b3;p=thirdparty%2Fgettext.git

xgettext: Recognize text blocks in Java parser.

* autogen.sh (GNULIB_MODULES_TOOLS_FOR_SRC): Add
unictype/syntax-java-whitespace.
* gettext-tools/src/x-java.c: Include unictype.h.
(strip_indent): New function.
(phase5_get): Parse text blocks (Java 13 syntax).
* gettext-tools/tests/xgettext-java-2: Add tests of text blocks.
* gettext-tools/doc/gettext.texi (Java): Mention the text block syntax.
* NEWS: Mention the change.
---

diff --git a/NEWS b/NEWS
index 69dba3d26..3cf56d4cd 100644
--- a/NEWS
+++ b/NEWS
@@ -14,8 +14,9 @@ Version 0.21 - September 2019
     xgettext now assumes a Python source file is in UTF-8 encoding by default,
     as stated in PEP 3120.
   - Java:
-    xgettext now recognizes format strings in the Formatter syntax.  They
-    are marked as 'java-printf-format' in POT and PO files.
+    o xgettext now recognizes format strings in the Formatter syntax.  They
+      are marked as 'java-printf-format' in POT and PO files.
+    o xgettext now recognizes text blocks as string literals.
   - Desktop Entry:
     The value of the 'Icon' property is no longer extracted into the POT file
     by xgettext.  The documentation explains how to localize icons.
diff --git a/autogen.sh b/autogen.sh
index 4de8e553b..48ce5591e 100755
--- a/autogen.sh
+++ b/autogen.sh
@@ -198,6 +198,7 @@ if ! $skip_gnulib; then
     sys_time
     trim
     unictype/ctype-space
+    unictype/syntax-java-whitespace
     unilbrk/ulc-width-linebreaks
     uniname/uniname
     unistd
diff --git a/gettext-tools/doc/gettext.texi b/gettext-tools/doc/gettext.texi
index 20316a5ea..072318ff6 100644
--- a/gettext-tools/doc/gettext.texi
+++ b/gettext-tools/doc/gettext.texi
@@ -10305,7 +10305,7 @@ default-jdk
 @code{java}
 
 @item String syntax
-"abc"
+"abc", """text block"""
 
 @item gettext shorthand
 _("abc")
diff --git a/gettext-tools/src/x-java.c b/gettext-tools/src/x-java.c
index 62b5429fa..8c0ee331f 100644
--- a/gettext-tools/src/x-java.c
+++ b/gettext-tools/src/x-java.c
@@ -44,6 +44,7 @@
 #include "hash.h"
 #include "po-charset.h"
 #include "unistr.h"
+#include "unictype.h"
 #include "gettext.h"
 
 #define _(s) gettext(s)
@@ -52,8 +53,8 @@
 
 
 /* The Java syntax is defined in the
-     Java Language Specification, Second Edition,
-     (available from http://java.sun.com/),
+     Java Language Specification
+     (available from https://docs.oracle.com/javase/specs/),
      chapter 3 "Lexical Structure".  */
 
 
@@ -568,7 +569,7 @@ enum token_type_ty
   token_type_rbrace,            /* } */
   token_type_comma,             /* , */
   token_type_dot,               /* . */
-  token_type_string_literal,    /* "abc" */
+  token_type_string_literal,    /* "abc", """text block""" */
   token_type_number,            /* 1.23 */
   token_type_symbol,            /* identifier, keyword, null */
   token_type_plus,              /* + */
@@ -692,6 +693,340 @@ accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
 }
 
 
+/* Strip, in place, the common indentation of the non-blank lines of the given
+   string (the last line counts even if it is blank) and remove all trailing
+   whitespace of all lines.  Like the Java method String.stripIndent does.
+   <https://docs.oracle.com/en/java/javase/13/docs/api/java.base/java/lang/String.html#stripIndent()>  */
+static void
+strip_indent (mixed_string_ty *ms)
+{
+  size_t nsegments = ms->nsegments;
+  size_t minimum_indentation = SIZE_MAX;
+  {
+    size_t curr_line_indentation = 0;
+    bool curr_line_blank = true;
+    size_t i;
+
+    for (i = 0; i < nsegments; i++)
+      {
+        struct mixed_string_segment *segment = ms->segments[i];
+
+        if (segment->type == utf8_encoded
+            || (segment->type == source_encoded
+                && xgettext_current_source_encoding == po_charset_utf8))
+          {
+            /* Consider Unicode whitespace characters.  */
+            size_t seglength = segment->length;
+            size_t j;
+
+            for (j = 0; j < seglength; )
+              {
+                ucs4_t uc;
+                int bytes =
+                  u8_mbtouc (&uc, (const uint8_t *) &segment->contents[j],
+                             seglength - j);
+                j += bytes;
+                if (uc == 0x000a)
+                  {
+                    /* Newline.  */
+                    if (!curr_line_blank)
+                      if (minimum_indentation > curr_line_indentation)
+                        minimum_indentation = curr_line_indentation;
+                    curr_line_indentation = 0;
+                    curr_line_blank = true;
+                  }
+                else if (uc_is_java_whitespace (uc))
+                  {
+                    /* Whitespace character.  */
+                    if (curr_line_blank)
+                      /* Every whitespace character counts as 1, even the TAB
+                         character.  */
+                      curr_line_indentation++;
+                  }
+                else
+                  {
+                    /* Other character.  */
+                    curr_line_blank = false;
+                  }
+              }
+          }
+        else
+          {
+            /* When the encoding is not UTF-8, consider only ASCII whitespace
+               characters.  */
+            size_t seglength = segment->length;
+            size_t j;
+
+            for (j = 0; j < seglength; j++)
+              {
+                char c = segment->contents[j];
+                if (c == '\n')
+                  {
+                    /* Newline.  */
+                    if (!curr_line_blank)
+                      if (minimum_indentation > curr_line_indentation)
+                        minimum_indentation = curr_line_indentation;
+                    curr_line_indentation = 0;
+                    curr_line_blank = true;
+                  }
+                else if (c == ' '
+                         || (c >= 0x09 && c <= 0x0d)
+                         || (c >= 0x1c && c <= 0x1f))
+                  {
+                    /* Whitespace character.  */
+                    if (curr_line_blank)
+                      /* Every whitespace character counts as 1, even the TAB
+                         character.  */
+                      curr_line_indentation++;
+                  }
+                else
+                  {
+                    /* Other character.  */
+                    curr_line_blank = false;
+                  }
+              }
+          }
+      }
+    /* The indentation of the last line matters even if it is blank.  */
+    if (minimum_indentation > curr_line_indentation)
+      minimum_indentation = curr_line_indentation;
+  }
+
+  /* The same loop as above, but this time remove, in place, the leading
+     minimum_indentation whitespace characters and all trailing whitespace
+     characters from every line.  */
+  {
+    size_t start_of_curr_line_i = 0;
+    size_t start_of_curr_line_j = 0;
+    size_t start_of_trailing_whitespace_i = 0;
+    size_t start_of_trailing_whitespace_j = 0;
+    size_t whitespace_to_remove = minimum_indentation;
+    size_t i;
+
+    for (i = 0; i < nsegments; i++)
+      {
+        struct mixed_string_segment *segment = ms->segments[i];
+        /* Perform a sliding copy from segment->contents[from_j] to
+           segment->contents[to_j].  0 <= to_j <= from_j.  */
+        size_t to_j;
+
+        if (segment->type == utf8_encoded
+            || (segment->type == source_encoded
+                && xgettext_current_source_encoding == po_charset_utf8))
+          {
+            /* Consider Unicode whitespace characters.  */
+            size_t seglength = segment->length;
+            size_t from_j;
+
+            for (to_j = from_j = 0; from_j < seglength; )
+              {
+                ucs4_t uc;
+                int bytes =
+                  u8_mbtouc (&uc, (const uint8_t *) &segment->contents[from_j],
+                             seglength - from_j);
+                if (uc == 0x000a)
+                  {
+                    /* Newline.  */
+                    if (whitespace_to_remove > 0)
+                      {
+                        /* It was a blank line with fewer than minimum_indentation
+                           whitespace characters.  Remove all this whitespace.  */
+                        if (start_of_curr_line_i < i)
+                          {
+                            size_t k;
+                            ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
+                            for (k = start_of_curr_line_i + 1; k < i; k++)
+                              ms->segments[k]->length = 0;
+                            to_j = 0;
+                          }
+                        else
+                          to_j = start_of_curr_line_j;
+                      }
+                    else
+                      {
+                        /* Remove the trailing whitespace characters from the
+                           current line.  */
+                        if (start_of_trailing_whitespace_i < i)
+                          {
+                            size_t k;
+                            ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
+                            for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
+                              ms->segments[k]->length = 0;
+                            to_j = 0;
+                          }
+                        else
+                          to_j = start_of_trailing_whitespace_j;
+                      }
+                  }
+                if (to_j < from_j)
+                  memmove (&segment->contents[to_j], &segment->contents[from_j], bytes);
+                from_j += bytes;
+                to_j += bytes;
+                if (uc == 0x000a)
+                  {
+                    /* Newline.  */
+                    start_of_curr_line_i = i;
+                    start_of_curr_line_j = to_j;
+                    start_of_trailing_whitespace_i = i;
+                    start_of_trailing_whitespace_j = to_j;
+                    whitespace_to_remove = minimum_indentation;
+                  }
+                else if (uc_is_java_whitespace (uc))
+                  {
+                    /* Whitespace character.  */
+                    if (whitespace_to_remove > 0
+                        && --whitespace_to_remove == 0)
+                      {
+                        /* Remove the leading minimum_indentation whitespace
+                           characters from the current line.  */
+                        if (start_of_curr_line_i < i)
+                          {
+                            size_t k;
+                            ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
+                            for (k = start_of_curr_line_i + 1; k < i; k++)
+                              ms->segments[k]->length = 0;
+                            to_j = 0;
+                          }
+                        else
+                          to_j = start_of_curr_line_j;
+                      }
+                  }
+                else
+                  {
+                    /* Other character.  */
+                    if (whitespace_to_remove > 0)
+                      abort ();
+                    start_of_trailing_whitespace_i = i;
+                    start_of_trailing_whitespace_j = to_j;
+                  }
+              }
+          }
+        else
+          {
+            /* When the encoding is not UTF-8, consider only ASCII whitespace
+               characters.  */
+            size_t seglength = segment->length;
+            size_t from_j;
+
+            for (to_j = from_j = 0; from_j < seglength; )
+              {
+                char c = segment->contents[from_j++];
+                if (c == '\n')
+                  {
+                    /* Newline.  */
+                    if (whitespace_to_remove > 0)
+                      {
+                        /* It was a blank line with fewer than minimum_indentation
+                           whitespace characters.  Remove all this whitespace.  */
+                        if (start_of_curr_line_i < i)
+                          {
+                            size_t k;
+                            ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
+                            for (k = start_of_curr_line_i + 1; k < i; k++)
+                              ms->segments[k]->length = 0;
+                            to_j = 0;
+                          }
+                        else
+                          to_j = start_of_curr_line_j;
+                      }
+                    else
+                      {
+                        /* Remove the trailing whitespace characters from the
+                           current line.  */
+                        if (start_of_trailing_whitespace_i < i)
+                          {
+                            size_t k;
+                            ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
+                            for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
+                              ms->segments[k]->length = 0;
+                            to_j = 0;
+                          }
+                        else
+                          to_j = start_of_trailing_whitespace_j;
+                      }
+                  }
+                segment->contents[to_j++] = c;
+                if (c == '\n')
+                  {
+                    /* Newline.  */
+                    start_of_curr_line_i = i;
+                    start_of_curr_line_j = to_j;
+                    start_of_trailing_whitespace_i = i;
+                    start_of_trailing_whitespace_j = to_j;
+                    whitespace_to_remove = minimum_indentation;
+                  }
+                else if (c == ' '
+                         || (c >= 0x09 && c <= 0x0d)
+                         || (c >= 0x1c && c <= 0x1f))
+                  {
+                    /* Whitespace character.  */
+                    if (whitespace_to_remove > 0
+                        && --whitespace_to_remove == 0)
+                      {
+                        /* Remove the leading minimum_indentation whitespace
+                           characters from the current line.  */
+                        if (start_of_curr_line_i < i)
+                          {
+                            size_t k;
+                            ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
+                            for (k = start_of_curr_line_i + 1; k < i; k++)
+                              ms->segments[k]->length = 0;
+                            to_j = 0;
+                          }
+                        else
+                          to_j = start_of_curr_line_j;
+                      }
+                  }
+                else
+                  {
+                    /* Other character.  */
+                    if (whitespace_to_remove > 0)
+                      abort ();
+                    start_of_trailing_whitespace_i = i;
+                    start_of_trailing_whitespace_j = to_j;
+                  }
+              }
+          }
+        if (i + 1 == nsegments)
+          {
+            /* Handle the last line.  */
+            if (whitespace_to_remove > 0)
+              {
+                /* It was a blank line with fewer than minimum_indentation
+                   whitespace characters.  Remove all this whitespace.  */
+                if (start_of_curr_line_i < i)
+                  {
+                    size_t k;
+                    ms->segments[start_of_curr_line_i]->length = start_of_curr_line_j;
+                    for (k = start_of_curr_line_i + 1; k < i; k++)
+                      ms->segments[k]->length = 0;
+                    to_j = 0;
+                  }
+                else
+                  to_j = start_of_curr_line_j;
+              }
+            else
+              {
+                /* Remove the trailing whitespace characters from the
+                   current line.  */
+                if (start_of_trailing_whitespace_i < i)
+                  {
+                    size_t k;
+                    ms->segments[start_of_trailing_whitespace_i]->length = start_of_trailing_whitespace_j;
+                    for (k = start_of_trailing_whitespace_i + 1; k < i; k++)
+                      ms->segments[k]->length = 0;
+                    to_j = 0;
+                  }
+                else
+                  to_j = start_of_trailing_whitespace_j;
+              }
+          }
+        segment->length = to_j;
+      }
+  }
+}
+
+
 /* Combine characters into tokens.  Discard whitespace.  */
 
 static token_ty phase5_pushback[3];
@@ -849,6 +1184,94 @@ phase5_get (token_ty *tp)
           }
 
         case '"':
+          {
+            int c2 = phase3_getc ();
+            if (c2 == '"')
+              {
+                int c3 = phase3_getc ();
+                if (c3 == '"')
+                  {
+                    /* Text block.  Specification:
+                       <https://docs.oracle.com/javase/specs/jls/se13/preview/text-blocks.html>  */
+                    struct mixed_string_buffer block;
+                    unsigned int consecutive_unescaped_doublequotes;
+                    mixed_string_ty *block_content;
+
+                    /* Parse the part up to and including the first newline.  */
+                    for (;;)
+                      {
+                        int ic = phase3_getc ();
+                        if (ic == P2_EOF)
+                          {
+                            error_with_progname = false;
+                            error (0, 0, _("%s:%d: warning: unterminated text block"),
+                                   logical_file_name, line_number);
+                            error_with_progname = true;
+                            tp->type = token_type_other;
+                            return;
+                          }
+                        if (RED (ic) == ' ' || RED (ic) == '\t' || RED (ic) == '\f')
+                          ;
+                        else if (RED (ic) == '\n')
+                          break;
+                        else
+                          {
+                            error_with_progname = false;
+                            error (0, 0, _("%s:%d: warning: invalid syntax in text block"),
+                                   logical_file_name, line_number);
+                            error_with_progname = true;
+                            tp->type = token_type_other;
+                            return;
+                          }
+                      }
+
+                    /* Parse the part after the first newline.  */
+                    mixed_string_buffer_init (&block, lc_string,
+                                              logical_file_name, line_number);
+                    consecutive_unescaped_doublequotes = 0;
+                    for (;;)
+                      {
+                        int ic = phase3_getc ();
+                        if (RED (ic) == '"')
+                          {
+                            consecutive_unescaped_doublequotes++;
+                            if (consecutive_unescaped_doublequotes == 3)
+                              break;
+                          }
+                        else
+                          {
+                            while (consecutive_unescaped_doublequotes > 0)
+                              {
+                                mixed_string_buffer_append (&block, '"');
+                                consecutive_unescaped_doublequotes--;
+                              }
+                            if (ic == P2_EOF)
+                              {
+                                error_with_progname = false;
+                                error (0, 0, _("%s:%d: warning: unterminated text block"),
+                                       logical_file_name, block.line_number);
+                                error_with_progname = true;
+                                break;
+                              }
+                            if (RED (ic) == '\\')
+                              ic = do_getc_escaped ();
+                            mixed_string_buffer_append (&block, ic);
+                          }
+                      }
+                    block_content = mixed_string_buffer_result (&block);
+
+                    /* Remove the common indentation from the content.  */
+                    strip_indent (block_content);
+
+                    tp->mixed_string = block_content;
+                    tp->comment = add_reference (savable_comment);
+                    tp->type = token_type_string_literal;
+                    return;
+                  }
+                phase3_ungetc (c3);
+              }
+            phase3_ungetc (c2);
+          }
           /* String literal.  */
           {
             struct mixed_string_buffer literal;
diff --git a/gettext-tools/tests/xgettext-java-2 b/gettext-tools/tests/xgettext-java-2
index 5f20864fe..4fb7816ad 100755
--- a/gettext-tools/tests/xgettext-java-2
+++ b/gettext-tools/tests/xgettext-java-2
@@ -51,6 +51,27 @@ comment! */ "this is a single " /* now comes the concatenation! */ + // after +
           }
         },
         "this is the second argument");
+    // Text blocks are extracted.
+    gettext ("""
+        a one-liner block """);
+    gettext ("""
+        ibam forte via sacra sicut meus est mos
+        nescio quid meditans nugarum totus in illis
+            
+        accurrit quidam notus mihi nomine tantum
+        arreptaque manu "quid agis, dulcissime rerum?"
+             """);
+    gettext ("""
+        \"""a text block in a text block\""" """);
+    gettext ("""
+        testing mixed ASCII and Unicode whitespace
+      \u0020 line 2
+       \u0020line 3
+        \u0020line 4
+        and trailing whitespace  
+        as well \u0020
+        also in the last line\u0020"""
+        );
   }
 }
 EOF
@@ -163,6 +184,35 @@ msgstr ""
 #: xg-j-2.java:46
 msgid "this is the second argument"
 msgstr ""
+
+#. Text blocks are extracted.
+#: xg-j-2.java:48
+msgid "a one-liner block"
+msgstr ""
+
+#: xg-j-2.java:50
+msgid ""
+"ibam forte via sacra sicut meus est mos\n"
+"nescio quid meditans nugarum totus in illis\n"
+"\n"
+"accurrit quidam notus mihi nomine tantum\n"
+"arreptaque manu \"quid agis, dulcissime rerum?\"\n"
+msgstr ""
+
+#: xg-j-2.java:57
+msgid "\"\"\"a text block in a text block\"\"\""
+msgstr ""
+
+#: xg-j-2.java:59
+msgid ""
+"testing mixed ASCII and Unicode whitespace\n"
+"line 2\n"
+"line 3\n"
+" line 4\n"
+"and trailing whitespace\n"
+"as well\n"
+"also in the last line"
+msgstr ""
 EOF
 
 : ${DIFF=diff}