Update.

[thirdparty/glibc.git] / locale / programs / linereader.c
diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c

index 6692164b60a87bcf8b61e1138f7827439cb5e185..99ed0f24802988e3b8664cc9c1fc80312f0090ed 100644 (file)
--- a/locale/programs/linereader.c
+++ b/locale/programs/linereader.c
@@ -1,6 +1,6 @@
-/* Copyright (C) 1996, 1997, 1998 Free Software Foundation, Inc.
+/* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
     This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@gnu.ai.mit.edu>, 1996.
+   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
  
     The GNU C Library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Library General Public License as
@@ -28,22 +28,20 @@
  #include <stdlib.h>
  #include <string.h>
  
+#include "charmap.h"
  #include "error.h"
  #include "linereader.h"
-#include "charset.h"
+#include "localedef.h"
  #include "stringtrans.h"
  
  
-void *xmalloc (size_t __n);
-void *xrealloc (void *__p, size_t __n);
-char *xstrdup (const char *__str);
-
-
+/* Prototypes for local functions.  */
  static struct token *get_toplvl_escape (struct linereader *lr);
  static struct token *get_symname (struct linereader *lr);
  static struct token *get_ident (struct linereader *lr);
  static struct token *get_string (struct linereader *lr,
-                                const struct charset_t *charset);
+                                const struct charmap_t *charmap,
+                                const struct repertoire_t *repertoire);
  
  
  struct linereader *
@@ -126,9 +124,14 @@ lr_next (struct linereader *lr)
  
    if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
      {
+#if 0
+      /* XXX Is this correct?  */
        /* An escaped newline character is substituted with a single <SP>.  */
        --n;
        lr->buf[n - 1] = ' ';
+#else
+      n -= 2;
+#endif
      }
  
    lr->buf[n] = '\0';
@@ -149,7 +152,8 @@ extern char *program_name;
  
  
  struct token *
-lr_token (struct linereader *lr, const struct charset_t *charset)
+lr_token (struct linereader *lr, const struct charmap_t *charmap,
+         const struct repertoire_t *repertoire)
  {
    int ch;
  
@@ -193,12 +197,29 @@ lr_token (struct linereader *lr, const struct charset_t *charset)
      return get_toplvl_escape (lr);
  
    /* Match ellipsis.  */
-  if (ch == '.' && strncmp (&lr->buf[lr->idx], "..", 2) == 0)
+  if (ch == '.')
      {
-      lr_getc (lr);
-      lr_getc (lr);
-      lr->token.tok = tok_ellipsis;
-      return &lr->token;
+      if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
+       {
+         lr_getc (lr);
+         lr_getc (lr);
+         lr_getc (lr);
+         lr->token.tok = tok_ellipsis4;
+         return &lr->token;
+       }
+      if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
+       {
+         lr_getc (lr);
+         lr_getc (lr);
+         lr->token.tok = tok_ellipsis3;
+         return &lr->token;
+       }
+      if (lr->buf[lr->idx] == '.')
+       {
+         lr_getc (lr);
+         lr->token.tok = tok_ellipsis2;
+         return &lr->token;
+       }
      }
  
    switch (ch)
@@ -238,7 +259,7 @@ lr_token (struct linereader *lr, const struct charset_t *charset)
        return &lr->token;
  
      case '"':
-      return get_string (lr, charset);
+      return get_string (lr, charmap, repertoire);
  
      case '-':
        ch = lr_getc (lr);
@@ -261,7 +282,7 @@ get_toplvl_escape (struct linereader *lr)
    /* This is supposed to be a numeric value.  We return the
       numerical value and the number of bytes.  */
    size_t start_idx = lr->idx - 1;
-  unsigned int value = 0;
+  char *bytes = lr->token.val.charcode.bytes;
    int nbytes = 0;
    int ch;
  
@@ -287,11 +308,11 @@ get_toplvl_escape (struct linereader *lr)
           || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
         {
         esc_error:
-         lr->token.val.str.start = &lr->buf[start_idx];
+         lr->token.val.str.startmb = &lr->buf[start_idx];
  
           while (ch != EOF && !isspace (ch))
             ch = lr_getc (lr);
-         lr->token.val.str.len = lr->idx - start_idx;
+         lr->token.val.str.lenmb = lr->idx - start_idx;
  
           lr->token.tok = tok_error;
           return &lr->token;
@@ -322,10 +343,7 @@ get_toplvl_escape (struct linereader *lr)
           ch = lr_getc (lr);
         }
  
-      value *= 256;
-      value += byte;
-
-      ++nbytes;
+      bytes[nbytes++] = byte;
      }
    while (ch == lr->escape_char && nbytes < 4);
  
@@ -335,23 +353,52 @@ get_toplvl_escape (struct linereader *lr)
    lr_ungetn (lr, 1);
  
    lr->token.tok = tok_charcode;
-  lr->token.val.charcode.val = value;
    lr->token.val.charcode.nbytes = nbytes;
  
    return &lr->token;
  }
  
  
-#define ADDC(ch)                                                           \
-  do                                                                       \
-    {                                                                      \
-      if (bufact == bufmax)                                                \
-       {                                                                   \
-         bufmax *= 2;                                                      \
-         buf = xrealloc (buf, bufmax);                                     \
-       }                                                                   \
-      buf[bufact++] = (ch);                                                \
-    }                                                                      \
+#define ADDC(ch) \
+  do                                                                         \
+    {                                                                        \
+      if (bufact == bufmax)                                                  \
+       {                                                                     \
+         bufmax *= 2;                                                        \
+         buf = xrealloc (buf, bufmax);                                       \
+       }                                                                     \
+      buf[bufact++] = (ch);                                                  \
+    }                                                                        \
+  while (0)
+
+
+#define ADDS(s, l) \
+  do                                                                         \
+    {                                                                        \
+      size_t _l = (l);                                                       \
+      if (bufact + _l > bufmax)                                                      \
+       {                                                                     \
+         if (bufact < _l)                                                    \
+           bufact = _l;                                                      \
+         bufmax *= 2;                                                        \
+         buf = xrealloc (buf, bufmax);                                       \
+       }                                                                     \
+      memcpy (&buf[bufact], s, _l);                                          \
+      bufact += _l;                                                          \
+    }                                                                        \
+  while (0)
+
+
+#define ADDWC(ch) \
+  do                                                                         \
+    {                                                                        \
+      if (buf2act == buf2max)                                                \
+       {                                                                     \
+         buf2max *= 2;                                                       \
+         buf2 = xrealloc (buf2, buf2max * 4);                                \
+       }                                                                     \
+      buf2[buf2act++] = (ch);                                                \
+    }                                                                        \
    while (0)
  
  
@@ -399,9 +446,8 @@ get_symname (struct linereader *lr)
        if (cp == &buf[bufact - 1])
         {
           /* Yes, it is.  */
-         lr->token.tok = bufact == 6 ? tok_ucs2 : tok_ucs4;
-         lr->token.val.charcode.val = strtoul (buf, NULL, 16);
-         lr->token.val.charcode.nbytes = lr->token.tok == tok_ucs2 ? 2 : 4;
+         lr->token.tok = tok_ucs4;
+         lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
  
           return &lr->token;
         }
@@ -422,8 +468,8 @@ get_symname (struct linereader *lr)
        buf[bufact] = '\0';
        buf = xrealloc (buf, bufact + 1);
  
-      lr->token.val.str.start = buf;
-      lr->token.val.str.len = bufact - 1;
+      lr->token.val.str.startmb = buf;
+      lr->token.val.str.lenmb = bufact - 1;
      }
  
    return &lr->token;
@@ -446,8 +492,18 @@ get_ident (struct linereader *lr)
  
    while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
          && ch != '<' && ch != ',')
-    /* XXX Handle escape sequences?  */
-    ADDC (ch);
+    {
+      if (ch == lr->escape_char)
+       {
+         ch = lr_getc (lr);
+         if (ch == '\n' || ch == EOF)
+           {
+             lr_error (lr, _("invalid escape sequence"));
+             break;
+           }
+       }
+      ADDC (ch);
+    }
  
    lr_ungetn (lr, 1);
  
@@ -465,8 +521,8 @@ get_ident (struct linereader *lr)
        buf[bufact] = '\0';
        buf = xrealloc (buf, bufact + 1);
  
-      lr->token.val.str.start = buf;
-      lr->token.val.str.len = bufact;
+      lr->token.val.str.startmb = buf;
+      lr->token.val.str.lenmb = bufact;
      }
  
    return &lr->token;
@@ -474,113 +530,247 @@ get_ident (struct linereader *lr)
  
  
  static struct token *
-get_string (struct linereader *lr, const struct charset_t *charset)
+get_string (struct linereader *lr, const struct charmap_t *charmap,
+           const struct repertoire_t *repertoire)
  {
-  int illegal_string = 0;
-  char *buf, *cp;
+  int return_widestr = lr->return_widestr;
+  char *buf;
+  char *buf2 = NULL;
    size_t bufact;
    size_t bufmax = 56;
-  int ch;
  
+  /* We must return two different strings.  */
    buf = xmalloc (bufmax);
    bufact = 0;
  
-  while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
-    if (ch != '<' || charset == NULL)
-      {
-       if (ch == lr->escape_char)
-         {
-           ch = lr_getc (lr);
-           if (ch == '\n' || ch == EOF)
-             break;
-         }
+  /* We know it'll be a string.  */
+  lr->token.tok = tok_string;
+
+  /* If we need not translate the strings (i.e., expand <...> parts)
+     we can run a simple loop.  */
+  if (!lr->translate_strings)
+    {
+      int ch;
+
+      buf2 = NULL;
+      while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
         ADDC (ch);
-      }
-    else
-      {
-       /* We have to get the value of the symbol.  */
-       unsigned int value;
-       size_t startidx = bufact;
-
-       if (!lr->translate_strings)
-         ADDC ('<');
-
-       while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
-         {
-           if (ch == lr->escape_char)
-             {
-               ch = lr_getc (lr);
-               if (ch == '\n' || ch == EOF)
-                 break;
-             }
-           ADDC (ch);
-         }
-
-       if (ch == '\n' || ch == EOF)
-         lr_error (lr, _("unterminated string"));
-       else
-         if (!lr->translate_strings)
-           ADDC ('>');
-
-       if (lr->translate_strings)
-         {
-           value = charset_find_value (charset, &buf[startidx],
-                                       bufact - startidx);
-           if ((wchar_t) value == ILLEGAL_CHAR_VALUE)
+
+      /* Catch errors with trailing escape character.  */
+      if (bufact > 0 && buf[bufact - 1] == lr->escape_char
+         && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
+       {
+         lr_error (lr, _("illegal escape sequence at end of string"));
+         --bufact;
+       }
+      else if (ch == '\n' || ch == EOF)
+       lr_error (lr, _("unterminated string"));
+
+      ADDC ('\0');
+    }
+  else
+    {
+      int illegal_string = 0;
+      size_t buf2act = 0;
+      size_t buf2max = 56 * sizeof (uint32_t);
+      int ch;
+      int warned = 0;
+
+      /* We have to provide the wide character result as well.  */
+      if (return_widestr)
+       buf2 = xmalloc (buf2max);
+
+      /* Read until the end of the string (or end of the line or file).  */
+      while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
+       {
+         size_t startidx;
+         uint32_t wch;
+         struct charseq *seq;
+
+         if (ch != '<')
+           {
+             /* The standards leave it up to the implementation to decide
+                what to do with character which stand for themself.  We
+                could jump through hoops to find out the value relative to
+                the charmap and the repertoire map, but instead we leave
+                it up to the locale definition author to write a better
+                definition.  We assume here that every character which
+                stands for itself is encoded using ISO 8859-1.  Using the
+                escape character is allowed.  */
+             if (ch == lr->escape_char)
+               {
+                 ch = lr_getc (lr);
+                 if (ch == '\n' || ch == EOF)
+                   break;
+               }
+
+             if (verbose && !warned)
+               {
+                 lr_error (lr, _("\
+non-symbolic character value should not be used"));
+                 warned = 1;
+               }
+
+             ADDC (ch);
+             if (return_widestr)
+               ADDWC ((uint32_t) ch);
+
+             continue;
+           }
+
+         /* Now we have to search for the end of the symbolic name, i.e.,
+            the closing '>'.  */
+         startidx = bufact;
+         while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
+           {
+             if (ch == lr->escape_char)
+               {
+                 ch = lr_getc (lr);
+                 if (ch == '\n' || ch == EOF)
+                   break;
+               }
+             ADDC (ch);
+           }
+         if (ch == '\n' || ch == EOF)
+           /* Not a correct string.  */
+           break;
+         if (bufact == startidx)
+           {
+             /* <> is no correct name.  Ignore it and also signal an
+                error.  */
               illegal_string = 1;
-           bufact = startidx;
+             continue;
+           }
  
-           if (bufmax - bufact < 8)
-             {
-               bufmax *= 2;
-               buf = (char *) xrealloc (buf, bufmax);
-             }
+         /* It might be a Uxxxx symbol.  */
+         if (buf[startidx] == 'U'
+             && (bufact - startidx == 5 || bufact - startidx == 9))
+           {
+             char *cp = buf + startidx + 1;
+             while (cp < &buf[bufact] && isxdigit (*cp))
+               ++cp;
+
+             if (cp == &buf[bufact])
+               {
+                 const char *symbol = NULL;
+
+                 /* Yes, it is.  */
+                 ADDC ('\0');
+                 wch = strtoul (buf + startidx + 1, NULL, 16);
+
+                 /* Now forget about the name we just added.  */
+                 bufact = startidx;
+
+                 if (return_widestr)
+                   ADDWC (wch);
+
+                 /* Now determine from the repertoire the name of the
+                    character and find it in the charmap.  */
+                 if (repertoire != NULL)
+                   symbol = repertoire_find_symbol (repertoire, wch);
+
+                 if (symbol == NULL)
+                   {
+                     /* We cannot generate a string since we cannot map
+                        from the Unicode number to the character symbol.  */
+                     lr_error (lr,
+                               _("character <U%0*X> not in repertoire map"),
+                               wch > 0xffff ? 8 : 4, wch);
+
+                     illegal_string = 1;
+                   }
+                 else
+                   {
+                     seq = charmap_find_value (charmap, symbol,
+                                               strlen (symbol));
+
+                     if (seq == NULL)
+                       {
+                         /* Not a known name.  */
+                         lr_error (lr,
+                                   _("symbol `%s' not in charmap"), symbol);
+                         illegal_string = 1;
+                       }
+                     else
+                       ADDS (seq->bytes, seq->nbytes);
+                   }
+
+                 continue;
+               }
+           }
+
+         if (return_widestr)
+           {
+             /* We now have the symbolic name in buf[startidx] to
+                buf[bufact-1].  Now find out the value for this
+                character in the repertoire map as well as in the
+                charmap (in this order).  */
+             wch = repertoire_find_value (repertoire, &buf[startidx],
+                                          bufact - startidx);
+             if (wch == ILLEGAL_CHAR_VALUE)
+               {
+                 /* This name is not in the repertoire map.  */
+                 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
+                           bufact - startidx, &buf[startidx]);
+                 illegal_string = 1;
+               }
+             else
+               ADDWC (wch);
+           }
+
+         /* Now the same for the multibyte representation.  */
+         seq = charmap_find_value (charmap, &buf[startidx],
+                                   bufact - startidx);
  
-           cp = &buf[bufact];
-           if (encode_char (value, &cp))
+         if (seq == NULL)
+           {
+             /* This name is not in the charmap.  */
+             lr_error (lr, _("symbol `%.*s' not in charmap"),
+                       bufact - startidx, &buf[startidx]);
               illegal_string = 1;
  
-           bufact = cp - buf;
-         }
-      }
+             /* Now forget about the name we just added.  */
+             bufact = startidx;
+           }
+         else
+           {
+             /* Now forget about the name we just added.  */
+             bufact = startidx;
  
-  /* Catch errors with trailing escape character.  */
-  if (bufact > 0 && buf[bufact - 1] == lr->escape_char
-      && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
-    {
-      lr_error (lr, _("illegal escape sequence at end of string"));
-      --bufact;
-    }
-  else if (ch == '\n' || ch == EOF)
-    lr_error (lr, _("unterminated string"));
+             ADDS (seq->bytes, seq->nbytes);
+           }
+       }
  
-  /* Terminate string if necessary.  */
-  if (lr->translate_strings)
-    {
-      cp = &buf[bufact];
-      if (encode_char (0, &cp))
-       illegal_string = 1;
+      if (ch == '\n' || ch == EOF)
+       {
+         lr_error (lr, _("unterminated string"));
+         illegal_string = 1;
+       }
  
-      bufact = cp - buf;
-    }
-  else
-    ADDC ('\0');
+      if (illegal_string)
+       {
+         free (buf);
+         if (buf2 != NULL)
+           free (buf2);
+         lr->token.val.str.startmb = NULL;
+         lr->token.val.str.lenmb = 0;
  
-  lr->token.tok = tok_string;
+         return &lr->token;
+       }
  
-  if (illegal_string)
-    {
-      free (buf);
-      lr->token.val.str.start = NULL;
-      lr->token.val.str.len = 0;
-    }
-  else
-    {
-      buf = xrealloc (buf, bufact + 1);
+      ADDC ('\0');
  
-      lr->token.val.str.start = buf;
-      lr->token.val.str.len = bufact;
+      if (return_widestr)
+       {
+         ADDWC (0);
+         lr->token.val.str.startwc = xrealloc (buf2,
+                                               buf2act * sizeof (uint32_t));
+         lr->token.val.str.lenwc = buf2act;
+       }
      }
  
+  lr->token.val.str.startmb = xrealloc (buf, bufact);
+  lr->token.val.str.lenmb = bufact;
+
    return &lr->token;
  }