Change the Perl backend to convert extracted strings and comments to UTF-8

author Bruno Haible <bruno@clisp.org>

Fri, 27 Jun 2003 12:35:05 +0000 (12:35 +0000)

committer Bruno Haible <bruno@clisp.org>

Tue, 23 Jun 2009 10:10:46 +0000 (12:10 +0200)
author Bruno Haible <bruno@clisp.org>
Fri, 27 Jun 2003 12:35:05 +0000 (12:35 +0000)
committer Bruno Haible <bruno@clisp.org>
Tue, 23 Jun 2009 10:10:46 +0000 (12:10 +0200)
diff --git a/gettext-tools/src/ChangeLog b/gettext-tools/src/ChangeLog

index 2026c962fada4a2578e22c4e98e7878cd0b9fcb4..8a4e029d3823dbd0fbcfdb044e2490138d7d8913 100644 (file)
--- a/gettext-tools/src/ChangeLog
+++ b/gettext-tools/src/ChangeLog
@@ -1,3 +1,20 @@
+2003-06-27  Bruno Haible  <bruno@clisp.org>
+
+       * xgettext.h (from_current_source_encoding): New declaration.
+       * xgettext.c (from_current_source_encoding): New function.
+       (CONVERT_STRING): Use it.
+       * x-perl.c: Include po-charset.h.
+       (get_here_document): Convert each line to UTF-8.
+       (phase2_getc): Convert each comment to UTF-8. Tell xgettext_comment_add
+       to not convert it.
+       (extract_quotelike_pass1_utf8): New function.
+       (extract_quotelike): Use extract_quotelike_pass1_utf8.
+       (extract_triple_quotelike): Likewise.
+       (extract_variable): Tell remember_a_message to not convert the string.
+       (interpolate_keywords): Likewise.
+       (extract_balanced): Tell remember_a_message, remember_a_message_plural
+       to not convert the string.
+
  2003-06-23  Guido Flohr  <guido@imperia.net>
  
         * x-perl.c (extract_quotelike_pass3): Fix handling of doubled
diff --git a/gettext-tools/src/x-perl.c b/gettext-tools/src/x-perl.c

index e11923b433454e0cdc5459119da30b87d976a1ce..145344bdd668561395b161c380960d4a321a4dc5 100644 (file)
--- a/gettext-tools/src/x-perl.c
+++ b/gettext-tools/src/x-perl.c
@@ -34,6 +34,7 @@
  #include "progname.h"
  #include "xmalloc.h"
  #include "exit.h"
+#include "po-charset.h"
  #include "ucs4-utf8.h"
  #include "uniname.h"
  #include "gettext.h"
@@ -203,7 +204,9 @@ phase1_ungetc (int c)
      }
  }
  
-/* Read a here document and return its contents.  */
+/* Read a here document and return its contents.
+   The delimiter is a UTF-8 encoded string; the resulting string is UTF-8
+   encoded as well.  */
  
  static char *
  get_here_document (const char *delimiter)
@@ -228,6 +231,7 @@ get_here_document (const char *delimiter)
    for (;;)
      {
        int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
+      char *my_line_utf8;
        bool chomp;
  
        if (read_bytes < 0)
@@ -251,6 +255,21 @@ get_here_document (const char *delimiter)
  
        ++here_eaten;
  
+      /* Convert to UTF-8.  */
+      my_line_utf8 =
+       from_current_source_encoding (my_linebuf, logical_file_name,
+                                     line_number + here_eaten);
+      if (my_line_utf8 != my_linebuf)
+       {
+         if (strlen (my_line_utf8) >= my_linebuf_size)
+           {
+             my_linebuf_size = strlen (my_line_utf8) + 1;
+             my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
+           }
+         strcpy (my_linebuf, my_line_utf8);
+         free (my_line_utf8);
+       }
+
        /* Undosify.  This is important for catching the end of <<EOF and
          <<'EOF'.  We could rely on stdio doing this for us but you
          it is not uncommon to to come across Perl scripts with CRLF
@@ -347,6 +366,7 @@ phase2_getc ()
    size_t buflen;
    int lineno;
    int c;
+  char *utf8_string;
  
    c = phase1_getc ();
    if (c == '#')
@@ -365,6 +385,7 @@ phase2_getc ()
               break;
             }
         }
+      /* Accumulate the comment.  */
        for (;;)
         {
           c = phase1_getc ();
@@ -383,7 +404,13 @@ phase2_getc ()
           buffer = xrealloc (buffer, bufmax);
         }
        buffer[buflen] = '\0';
-      xgettext_comment_add (buffer);
+      /* Convert it to UTF-8.  */
+      utf8_string =
+       from_current_source_encoding (buffer, logical_file_name, lineno);
+      /* Save it until we encounter the corresponding string.  */
+      xgettext_current_source_encoding = po_charset_utf8;
+      xgettext_comment_add (utf8_string);
+      xgettext_current_source_encoding = xgettext_global_source_encoding;
        last_comment_line = lineno;
      }
    return c;
@@ -430,10 +457,12 @@ enum token_type_ty
    token_type_named_op,          /* if, unless, while, ... */
    token_type_variable,          /* $... */
    token_type_symbol,           /* symbol, number */
-  token_type_keyword_symbol,    /* keyword symbol (used by parser) */
    token_type_regex_op,         /* s, tr, y, m.  */
    token_type_dot,               /* . */
-  token_type_other             /* regexp, misc. operator */
+  token_type_other,            /* regexp, misc. operator */
+  /* The following are not really token types, but variants used by
+     the parser.  */
+  token_type_keyword_symbol    /* keyword symbol */
  };
  typedef enum token_type_ty token_type_ty;
  
@@ -454,9 +483,12 @@ struct token_ty
  {
    token_type_ty type;
    string_type_ty string_type;  /* for token_type_string */
-  char *string;                        /* for token_type_named_op, token_type_string,
-                                  token_type_symbol, token_type_keyword_symbol,
-                                  token_type_variable */
+  char *string;                        /* for:                 in encoding:
+                                  token_type_named_op  ASCII
+                                  token_type_string    UTF-8
+                                  token_type_symbol    ASCII
+                                  token_type_variable  global_source_encoding
+                                */
    int line_number;
  };
  
@@ -496,8 +528,6 @@ token2string (const token_ty *token)
        return "token_type_variable";
      case token_type_symbol:
        return "token_type_symbol";
-    case token_type_keyword_symbol:
-      return "token_type_keyword_symbol";
      case token_type_regex_op:
        return "token_type_regex_op";
      case token_type_dot:
@@ -519,7 +549,6 @@ free_token (token_ty *tp)
      case token_type_named_op:
      case token_type_string:
      case token_type_symbol:
-    case token_type_keyword_symbol:
      case token_type_variable:
        free (tp->string);
        break;
@@ -634,6 +663,19 @@ extract_quotelike_pass1 (int delim)
      }
  }
  
+/* Like extract_quotelike_pass1, but return the complete string in UTF-8
+   encoding.  */
+static char *
+extract_quotelike_pass1_utf8 (int delim)
+{
+  char *string = extract_quotelike_pass1 (delim);
+  char *utf8_string =
+    from_current_source_encoding (string, logical_file_name, line_number);
+  if (utf8_string != string)
+    free (string);
+  return utf8_string;
+}
+
  
  /* ========= Reading of tokens and commands.  Extracting strings.  ========= */
  
@@ -718,7 +760,7 @@ extract_oct (const char *string, size_t len, unsigned int *result)
  static void
  extract_quotelike (token_ty *tp, int delim)
  {
-  char *string = extract_quotelike_pass1 (delim);
+  char *string = extract_quotelike_pass1_utf8 (delim);
    size_t len = strlen (string);
  
    tp->type = token_type_string;
@@ -742,7 +784,7 @@ extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
  
    tp->type = token_type_regex_op;
  
-  string = extract_quotelike_pass1 (delim);
+  string = extract_quotelike_pass1_utf8 (delim);
    if (interpolate)
      interpolate_keywords (mlp, string, line_number);
    free (string);
@@ -759,7 +801,7 @@ extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
           delim = phase2_getc ();
         }
      }
-  string = extract_quotelike_pass1 (delim);
+  string = extract_quotelike_pass1_utf8 (delim);
    if (interpolate)
      interpolate_keywords (mlp, string, line_number);
    free (string);
@@ -1358,7 +1400,9 @@ extract_variable (message_list_ty *mlp, token_ty *tp, int first)
                       lex_pos_ty pos;
                       pos.line_number = line_number;
                       pos.file_name = logical_file_name;
+                     xgettext_current_source_encoding = po_charset_utf8;
                       remember_a_message (mlp, xstrdup (t1->string), &pos);
+                     xgettext_current_source_encoding = xgettext_global_source_encoding;
                       free_token (t2);
                       free_token (t1);
                     }
@@ -1442,7 +1486,7 @@ extract_variable (message_list_ty *mlp, token_ty *tp, int first)
  
  /* Actually a simplified version of extract_variable().  It searches for
     variables inside a double-quoted string that may interpolate to
-   some keyword hash (reference).  */
+   some keyword hash (reference).  The string is UTF-8 encoded.  */
  static void
  interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
  {
@@ -1659,6 +1703,8 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
               extract_quotelike_pass3 (&token, EXIT_FAILURE);
               /* The string can only shrink with interpolation (because
                  we ignore \Q).  */
+             if (!(strlen (token.string) <= bufpos))
+               abort ();
               strcpy (buffer, token.string);
               free (token.string);
               state = wait_rbrace;
@@ -1735,7 +1781,9 @@ interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
               buffer[bufpos] = '\0';
               token.string = xstrdup (buffer);
               extract_quotelike_pass3 (&token, EXIT_FAILURE);
+             xgettext_current_source_encoding = po_charset_utf8;
               remember_a_message (mlp, token.string, &pos);
+             xgettext_current_source_encoding = xgettext_global_source_encoding;
               /* FALLTHROUGH */
             default:
               state = initial;
@@ -2300,7 +2348,6 @@ token_stack_dump (token_stack_ty *stack)
         case token_type_named_op:
         case token_type_string:
         case token_type_symbol:
-       case token_type_keyword_symbol:
         case token_type_variable:
           fprintf (stderr, "    string: %s\n", token->string);
           break;
@@ -2683,26 +2730,29 @@ extract_balanced (message_list_ty *mlp, int arg_sg, int arg_pl, int state,
           if (extract_all)
             {
               lex_pos_ty pos;
+             char *string;
  
               pos.file_name = logical_file_name;
               pos.line_number = tp->line_number;
-             remember_a_message (mlp, collect_message (mlp, tp,
-                                                       EXIT_SUCCESS),
-                                 &pos);
+             string = collect_message (mlp, tp, EXIT_SUCCESS);
+             xgettext_current_source_encoding = po_charset_utf8;
+             remember_a_message (mlp, string, &pos);
+             xgettext_current_source_encoding = xgettext_global_source_encoding;
             }
           else if (state)
             {
               lex_pos_ty pos;
+             char *string;
  
               pos.file_name = logical_file_name;
               pos.line_number = tp->line_number;
  
               if (arg_count == arg_sg)
                 {
-                 plural_mp =
-                   remember_a_message (mlp, collect_message (mlp, tp,
-                                                             EXIT_FAILURE),
-                                       &pos);
+                 string = collect_message (mlp, tp, EXIT_FAILURE);
+                 xgettext_current_source_encoding = po_charset_utf8;
+                 plural_mp = remember_a_message (mlp, string, &pos);
+                 xgettext_current_source_encoding = xgettext_global_source_encoding;
                   arg_sg = -1;
                 }
               else if (arg_count == arg_pl && plural_mp == NULL)
@@ -2714,10 +2764,10 @@ extract_balanced (message_list_ty *mlp, int arg_sg, int arg_pl, int state,
                 }
               else if (arg_count == arg_pl)
                 {
-                 remember_a_message_plural (plural_mp,
-                                            collect_message (mlp, tp,
-                                                             EXIT_FAILURE),
-                                            &pos);
+                 string = collect_message (mlp, tp, EXIT_FAILURE);
+                 xgettext_current_source_encoding = po_charset_utf8;
+                 remember_a_message_plural (plural_mp, string, &pos);
+                 xgettext_current_source_encoding = xgettext_global_source_encoding;
                   arg_pl = -1;
                 }
             }
diff --git a/gettext-tools/src/xgettext.c b/gettext-tools/src/xgettext.c

index c72ca0751eb2701b99abe1dd394a7945536d9196..81d085d940e7f8cadd30fab08ae4b83f4aa58326 100644 (file)
--- a/gettext-tools/src/xgettext.c
+++ b/gettext-tools/src/xgettext.c
@@ -1009,27 +1009,6 @@ static struct formatstring_parser *current_formatstring_parser1;
  static struct formatstring_parser *current_formatstring_parser2;
  
  
-/* Convert the given string from xgettext_current_source_encoding to
-   the output file encoding (i.e. ASCII or UTF-8).  */
-#define CONVERT_STRING(string) \
-  if (xgettext_current_source_encoding == po_charset_ascii)            \
-    {                                                                  \
-      if (!is_ascii_string (string))                                   \
-       {                                                               \
-         char buffer[21];                                              \
-         if (pos->line_number == (size_t)(-1))                         \
-           buffer[0] = '\0';                                           \
-         else                                                          \
-           sprintf (buffer, ":%ld", (long) pos->line_number);          \
-         error (EXIT_FAILURE, 0, _("Non-ASCII string at %s%s.\nPlease specify the source encoding through --from-code."), \
-                pos->file_name, buffer);                               \
-       }                                                               \
-    }                                                                  \
-  else if (xgettext_current_source_encoding != po_charset_utf8)                \
-    {                                                                  \
-      string = convert_string (xgettext_current_source_iconv, string); \
-    }
-
  #if !HAVE_ICONV
  /* If we don't have iconv(), the only supported values for
     xgettext_global_source_encoding and thus also for
@@ -1038,6 +1017,40 @@ static struct formatstring_parser *current_formatstring_parser2;
  #define convert_string(cd,string) (abort (), (string))
  #endif
  
+/* Convert the given string from xgettext_current_source_encoding to
+   the output file encoding (i.e. ASCII or UTF-8).
+   The resulting string is either the argument string, or freshly allocated.
+   The file_name and line_number are only used for error message purposes.  */
+char *
+from_current_source_encoding (const char *string,
+                             const char *file_name, size_t line_number)
+{
+  if (xgettext_current_source_encoding == po_charset_ascii)
+    {
+      if (!is_ascii_string (string))
+       {
+         char buffer[21];
+
+         if (line_number == (size_t)(-1))
+           buffer[0] = '\0';
+         else
+           sprintf (buffer, ":%ld", (long) line_number);
+         error (EXIT_FAILURE, 0, _("\
+Non-ASCII string at %s%s.\n\
+Please specify the source encoding through --from-code."),
+                file_name, buffer);
+       }
+    }
+  else if (xgettext_current_source_encoding != po_charset_utf8)
+    string = convert_string (xgettext_current_source_iconv, string);
+
+  return (char *) string;
+}
+
+#define CONVERT_STRING(string) \
+  string = from_current_source_encoding (string, pos->file_name, \
+                                        pos->line_number);
+
  
  message_ty *
  remember_a_message (message_list_ty *mlp, char *string, lex_pos_ty *pos)
@@ -1135,7 +1148,7 @@ meta information, not the empty string.\n")));
  
           CONVERT_STRING (s);
  
-         /* To reduce the possibility of unwanted matches be do a two
+         /* To reduce the possibility of unwanted matches we do a two
              step match: the line must contain `xgettext:' and one of
              the possible format description strings.  */
           if ((t = strstr (s, "xgettext:")) != NULL)
diff --git a/gettext-tools/src/xgettext.h b/gettext-tools/src/xgettext.h

index a37c54429fb89f2c970938f708fffcfa1ecc9ddb..8086054d52356e4355e534684238b9b5270bfaef 100644 (file)
--- a/gettext-tools/src/xgettext.h
+++ b/gettext-tools/src/xgettext.h
@@ -1,5 +1,5 @@
  /* xgettext common functions.
-   Copyright (C) 2001-2002 Free Software Foundation, Inc.
+   Copyright (C) 2001-2003 Free Software Foundation, Inc.
     Written by Peter Miller <millerp@canb.auug.org.au>
     and Bruno Haible <haible@clisp.cons.org>, 2001.
  
@@ -60,6 +60,14 @@ extern const char *xgettext_current_source_encoding;
  extern iconv_t xgettext_current_source_iconv;
  #endif
  
+/* Convert the given string from xgettext_current_source_encoding to
+   the output file encoding (i.e. ASCII or UTF-8).
+   The resulting string is either the argument string, or freshly allocated.
+   The file_name and line_number are only used for error message purposes.  */
+extern char *from_current_source_encoding (const char *string,
+                                          const char *file_name,
+                                          size_t line_number);
+
  /* List of messages whose msgids must not be extracted, or NULL.
     Used by remember_a_message().  */
  extern message_list_ty *exclude;
diff --git a/gettext-tools/tests/ChangeLog b/gettext-tools/tests/ChangeLog

index d68e150d544ed243ce792fc60953303acd36f658..cd470fc8c273ef3b4208816794e6a5b5c5393555 100644 (file)
--- a/gettext-tools/tests/ChangeLog
+++ b/gettext-tools/tests/ChangeLog
@@ -1,3 +1,8 @@
+2003-06-27  Bruno Haible  <bruno@clisp.org>
+
+       * xgettext-27: Also test Unicode character names in here documents.
+       * xgettext-28: Likewise.
+
  2003-06-24  Bruno Haible  <bruno@clisp.org>
  
         * xgettext-27: New file.
diff --git a/gettext-tools/tests/xgettext-27 b/gettext-tools/tests/xgettext-27

index ca60da55a9c0bcd559f1769b78bfd3b76b0a734b..2284a1cb3851decba85a534408d4df945a2fa5ce 100755 (executable)
--- a/gettext-tools/tests/xgettext-27
+++ b/gettext-tools/tests/xgettext-27
@@ -9,6 +9,9 @@ tmpfiles="$tmpfiles xg-test27.pl"
  cat <<\EOF > xg-test27.pl
  use charnames ':full';
  printf "%s\n", gettext "Böse Bübchen - wo sind sie blo\N{LATIN SMALL LETTER SHARP S}?";
+print gettext <<STR
+Die europäische Währung ist \N{EURO SIGN}.
+STR
  EOF
  
  tmpfiles="$tmpfiles xg-test27.po"
@@ -40,6 +43,10 @@ msgstr ""
  #: xg-test27.pl:2
  msgid "Böse Bübchen - wo sind sie bloß?"
  msgstr ""
+
+#: xg-test27.pl:4
+msgid "Die europäische Währung ist €.\n"
+msgstr ""
  EOF
  
  : ${DIFF=diff}
diff --git a/gettext-tools/tests/xgettext-28 b/gettext-tools/tests/xgettext-28

index 1f55fb42de2ca54588d7e0c57c413e7624600769..e84aafe14cb92a7c120880d82d060e30e42446ac 100755 (executable)
--- a/gettext-tools/tests/xgettext-28
+++ b/gettext-tools/tests/xgettext-28
@@ -9,6 +9,9 @@ tmpfiles="$tmpfiles xg-test28.pl"
  cat <<\EOF > xg-test28.pl
  use charnames ':full';
  printf "%s\n", gettext "Böse Bübchen - wo sind sie blo\N{LATIN SMALL LETTER SHARP S}?";
+print gettext <<STR
+Die europäische Währung ist \N{EURO SIGN}.
+STR
  EOF
  
  tmpfiles="$tmpfiles xg-test28.po"
@@ -40,6 +43,10 @@ msgstr ""
  #: xg-test28.pl:2
  msgid "Böse Bübchen - wo sind sie bloß?"
  msgstr ""
+
+#: xg-test28.pl:4
+msgid "Die europäische Währung ist €.\n"
+msgstr ""
  EOF
  
  : ${DIFF=diff}
author	Bruno Haible <bruno@clisp.org>
	Fri, 27 Jun 2003 12:35:05 +0000 (12:35 +0000)
committer	Bruno Haible <bruno@clisp.org>
	Tue, 23 Jun 2009 10:10:46 +0000 (12:10 +0200)
gettext-tools/src/ChangeLog		patch \| blob \| blame \| history
gettext-tools/src/x-perl.c		patch \| blob \| blame \| history
gettext-tools/src/xgettext.c		patch \| blob \| blame \| history
gettext-tools/src/xgettext.h		patch \| blob \| blame \| history
gettext-tools/tests/ChangeLog		patch \| blob \| blame \| history
gettext-tools/tests/xgettext-27		patch \| blob \| blame \| history
gettext-tools/tests/xgettext-28		patch \| blob \| blame \| history