[thirdparty/glibc.git] / locale / programs / linereader.c

/* Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Library General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU C Library; see the file COPYING.LIB.  If not,
   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.  */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include <ctype.h>
#include <errno.h>
#include <libintl.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>

#include "charmap.h"
#include "error.h"
#include "linereader.h"
#include "localedef.h"


/* Prototypes for local functions.  Each of these helpers is called
   from lr_token after the first character of the token has already
   been read, and each returns a pointer to the token stored in LR.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
				 const struct charmap_t *charmap,
				 const struct repertoire_t *repertoire);


/* Open FNAME for reading and wrap it in a line reader which looks up
   keywords with HF.  A null FNAME, "-" and "/dev/stdin" all select the
   standard input stream, which is registered under the name
   "<stdin>".  Returns NULL if the file cannot be opened; otherwise
   whatever lr_create returns.  */
struct linereader *
lr_open (const char *fname, kw_hash_fct_t hf)
{
  int use_stdin = (fname == NULL
		   || strcmp (fname, "-") == 0
		   || strcmp (fname, "/dev/stdin") == 0);
  FILE *stream;

  if (use_stdin)
    return lr_create (stdin, "<stdin>", hf);

  stream = fopen (fname, "r");
  return stream != NULL ? lr_create (stream, fname, hf) : NULL;
}

/* Create a line reader for the already opened stream FP.  FNAME is
   copied and kept as the name of the input; HF is the keyword lookup
   function used by the tokenizer.  The reader starts at line 1 with
   '#' as comment character, '\\' as escape character, string
   translation enabled and wide string results disabled, and the first
   line of the stream is read right away.

   Returns the new reader.  If the first line cannot be read the stream
   is closed, everything allocated here is released, and NULL is
   returned with errno as left by getdelim.  */
struct linereader *
lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
{
  struct linereader *result;
  int n;

  result = (struct linereader *) xmalloc (sizeof (*result));

  result->fp = fp;
  result->fname = xstrdup (fname);
  result->buf = NULL;
  result->bufsize = 0;
  result->lineno = 1;
  result->idx = 0;
  result->comment_char = '#';
  result->escape_char = '\\';
  result->translate_strings = 1;
  /* get_string reads this flag, so it must not be left with whatever
     the freshly allocated memory happens to contain.  */
  result->return_widestr = 0;
  result->hash_fct = hf;

  n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
  if (n < 0)
    {
      int save = errno;
      fclose (result->fp);
      /* getdelim may have allocated a buffer even though it failed.  */
      free (result->buf);
      free ((char *) result->fname);
      free (result);
      errno = save;
      return NULL;
    }

  /* An escaped newline at the end of the line is dropped, exactly as
     lr_next does for all following lines.  */
  if (n > 1 && result->buf[n - 2] == result->escape_char
      && result->buf[n - 1] == '\n')
    n -= 2;

  result->buf[n] = '\0';
  result->bufact = n;

  return result;
}


/* Return nonzero if the line buffer of LR is empty, i.e. no characters
   are available, and zero otherwise.  This is a pure query; the reader
   is not modified.  */
int
lr_eof (struct linereader *lr)
{
  /* Compare, do not assign: the former `lr->bufact = 0' always
     returned 0 and discarded the current line as a side effect.  */
  return lr->bufact == 0;
}


/* Close the stream of LR and release the line buffer and the reader
   object itself.  The file name stored in LR is not released here.  */
void
lr_close (struct linereader *lr)
{
  FILE *stream = lr->fp;
  char *line = lr->buf;

  fclose (stream);
  free (line);
  free (lr);
}


/* Read the next line of LR's stream into the line buffer, replacing
   the previous content, and reset the read index to its start.  The
   line counter is incremented for every line read.  Returns 0 on
   success and -1 if getdelim fails (end of input or read error); in
   that case the reader is left untouched.  */
int
lr_next (struct linereader *lr)
{
  int len = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);

  if (len < 0)
    return -1;

  ++lr->lineno;

  /* An escape character directly in front of the newline continues the
     line: both characters are removed.  (Substituting the pair with a
     single <SP> was considered at one point but is not done.)  */
  if (len > 1
      && lr->buf[len - 1] == '\n'
      && lr->buf[len - 2] == lr->escape_char)
    len -= 2;

  lr->idx = 0;
  lr->bufact = len;
  lr->buf[len] = '\0';

  return 0;
}


/* Defined in error.c.  */
/* This variable is incremented each time `error' is called.  */
/* NOTE(review): neither of the two objects declared here is referenced
   directly by the functions in this file; presumably they are needed
   by the lr_error machinery declared in linereader.h -- confirm.  */
extern unsigned int error_message_count;

/* The calling program should define program_name and set it to the
   name of the executing program.  */
extern char *program_name;


/* Read the next token from LR and return a pointer to it.  The token
   is stored in LR itself, so it is overwritten by the next call.

   Leading white space is skipped.  A newline yields tok_eol and the
   end of the input tok_eof.  A comment extends to the end of the line
   and also yields tok_eol, except in a line which was continued with
   an escaped newline (see below).  Escape sequences, ellipses, numbers,
   punctuation, "-1", strings, <symbolic names> and identifiers are
   recognized; CHARMAP and REPERTOIRE are only handed on to get_string
   for the translation of symbolic names inside strings.  */
struct token *
lr_token (struct linereader *lr, const struct charmap_t *charmap,
	  const struct repertoire_t *repertoire)
{
  int ch;

  while (1)
    {
      /* Skip white space.  Both EOF and newline end the search with a
	 token of their own, so after this loop CH is a real character
	 which is not white space.  */
      do
	{
	  ch = lr_getc (lr);

	  if (ch == EOF)
	    {
	      lr->token.tok = tok_eof;
	      return &lr->token;
	    }

	  if (ch == '\n')
	    {
	      lr->token.tok = tok_eol;
	      return &lr->token;
	    }
	}
      while (isspace (ch));

      if (ch != lr->comment_char)
	break;

      /* Is there a newline at the end of the buffer?  */
      if (lr->buf[lr->bufact - 1] != '\n')
	{
	  /* No.  Some people want this to mean that only the line in
	     the file not the logical, concatenated line is ignored.
	     Let's try this.  */
	  lr->idx = lr->bufact;
	  continue;
	}

      /* Ignore rest of line.  */
      lr_ignore_rest (lr, 0);
      lr->token.tok = tok_eol;
      return &lr->token;
    }

  /* Match escape sequences.  */
  if (ch == lr->escape_char)
    return get_toplvl_escape (lr);

  /* Match ellipsis.  The first dot is already consumed, the remaining
     characters are examined in the line buffer (which is NUL
     terminated) and only consumed when the whole pattern matches.  */
  if (ch == '.')
    {
      if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
	{
	  int cnt;
	  for (cnt = 0; cnt < 10; ++cnt)
	    lr_getc (lr);
	  lr->token.tok = tok_ellipsis4_2;
	  return &lr->token;
	}
      if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
	{
	  lr_getc (lr);
	  lr_getc (lr);
	  lr_getc (lr);
	  lr->token.tok = tok_ellipsis4;
	  return &lr->token;
	}
      if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
	{
	  lr_getc (lr);
	  lr_getc (lr);
	  lr->token.tok = tok_ellipsis3;
	  return &lr->token;
	}
      if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
	{
	  int cnt;
	  for (cnt = 0; cnt < 6; ++cnt)
	    lr_getc (lr);
	  lr->token.tok = tok_ellipsis2_2;
	  return &lr->token;
	}
      if (lr->buf[lr->idx] == '.')
	{
	  lr_getc (lr);
	  lr->token.tok = tok_ellipsis2;
	  return &lr->token;
	}
    }

  switch (ch)
    {
    case '<':
      return get_symname (lr);

    case '0' ... '9':
      lr->token.tok = tok_number;
      lr->token.val.num = ch - '0';

      while (isdigit (ch = lr_getc (lr)))
	{
	  lr->token.val.num *= 10;
	  lr->token.val.num += ch - '0';
	}
      if (isalpha (ch))
	lr_error (lr, _("garbage at end of number"));
      /* Give back the character which ended the number.  EOF is not a
	 character of the input, so there is nothing to give back; the
	 EOF returns above do not push anything back either.  */
      if (ch != EOF)
	lr_ungetn (lr, 1);

      return &lr->token;

    case ';':
      lr->token.tok = tok_semicolon;
      return &lr->token;

    case ',':
      lr->token.tok = tok_comma;
      return &lr->token;

    case '(':
      lr->token.tok = tok_open_brace;
      return &lr->token;

    case ')':
      lr->token.tok = tok_close_brace;
      return &lr->token;

    case '"':
      return get_string (lr, charmap, repertoire);

    case '-':
      /* "-1" is a token of its own.  Look at the next character in the
	 line buffer without consuming it, the same way the ellipsis
	 code does: get_ident takes the first character of the
	 identifier from the position just before the read index, so
	 the index must still be right behind the '-' when we fall
	 through to it.  A '1' which only follows after an escaped
	 newline is not in the buffer yet and is therefore not
	 recognized here.  */
      if (lr->buf[lr->idx] == '1')
	{
	  lr_getc (lr);
	  lr->token.tok = tok_minus1;
	  return &lr->token;
	}
      break;
    }

  return get_ident (lr);
}


/* Return the value of CH as a digit in BASE (8, 10 or 16), or -1 if CH
   is not a digit of that base.  */
static int
escape_digit_value (int ch, unsigned int base)
{
  if (base == 16)
    {
      if (!isxdigit (ch))
	return -1;
      return isdigit (ch) ? ch - '0' : tolower (ch) - 'a' + 10;
    }

  if (ch < '0' || ch >= (int) ('0' + base))
    return -1;

  return ch - '0';
}


/* Read a character code given as a sequence of escaped numbers; the
   first escape character has already been consumed by the caller.
   Each element is the escape character followed by two or three octal
   digits, by `d' and two or three decimal digits, or by `x' and two
   hexadecimal digits.  At most four consecutive elements are read.

   On success the token is tok_charcode with the bytes and their count
   stored in val.charcode.  If an element does not have two valid
   digits the input is skipped up to the next white space and the token
   is tok_error, with val.str describing the offending text inside the
   line buffer.  */
static struct token *
get_toplvl_escape (struct linereader *lr)
{
  /* This is supposed to be a numeric value.  We return the
     numerical value and the number of bytes.  */
  size_t start_idx = lr->idx - 1;
  char *bytes = lr->token.val.charcode.bytes;
  int nbytes = 0;
  int ch;

  do
    {
      unsigned int byte;
      unsigned int base = 8;
      int digit;

      ch = lr_getc (lr);

      if (ch == 'd')
	{
	  base = 10;
	  ch = lr_getc (lr);
	}
      else if (ch == 'x')
	{
	  base = 16;
	  ch = lr_getc (lr);
	}

      /* Two digits are mandatory.  */
      digit = escape_digit_value (ch, base);
      if (digit < 0)
	goto esc_error;
      byte = digit;

      ch = lr_getc (lr);
      digit = escape_digit_value (ch, base);
      if (digit < 0)
	goto esc_error;
      byte = byte * base + digit;

      /* Octal and decimal numbers may have a third digit.  It must be
	 a digit of the base in use: an `8' or `9' after two octal
	 digits is not part of the number and is reported as garbage
	 below.  */
      ch = lr_getc (lr);
      if (base != 16)
	{
	  digit = escape_digit_value (ch, base);
	  if (digit >= 0)
	    {
	      byte = byte * base + digit;
	      ch = lr_getc (lr);
	    }
	}

      /* NOTE(review): values above 255 are silently truncated by this
	 store.  */
      bytes[nbytes++] = byte;
    }
  while (ch == lr->escape_char && nbytes < 4);

  if (!isspace (ch))
    lr_error (lr, _("garbage at end of character code specification"));

  /* Give back the character following the code.  EOF is not a
     character of the input, so there is nothing to give back.  */
  if (ch != EOF)
    lr_ungetn (lr, 1);

  lr->token.tok = tok_charcode;
  lr->token.val.charcode.nbytes = nbytes;

  return &lr->token;

 esc_error:
  lr->token.val.str.startmb = &lr->buf[start_idx];

  while (ch != EOF && !isspace (ch))
    ch = lr_getc (lr);
  lr->token.val.str.lenmb = lr->idx - start_idx;

  lr->token.tok = tok_error;
  return &lr->token;
}


/* Append the single byte C to the growing byte buffer of the calling
   function.  The caller must have `buf', `bufact' (bytes in use) and
   `bufmax' (bytes allocated) in scope; the buffer is doubled through
   xrealloc when it is full.  */
#define ADDC(c) \
  do									      \
    {									      \
      if (bufmax == bufact)						      \
	buf = xrealloc (buf, bufmax *= 2);				      \
      buf[bufact] = (c);						      \
      ++bufact;								      \
    }									      \
  while (0)


/* Append the L bytes starting at S to the growing byte buffer of the
   calling function.  The caller must have `buf', `bufact' (bytes in
   use) and `bufmax' (bytes allocated) in scope.  If the bytes do not
   fit the buffer is enlarged through xrealloc to twice its size, or to
   exactly the required size should doubling not be enough.  The bytes
   already in the buffer are never disturbed.  */
#define ADDS(s, l) \
  do									      \
    {									      \
      size_t _l = (l);							      \
      if (bufact + _l > bufmax)						      \
	{								      \
	  size_t _need = bufact + _l;					      \
	  bufmax *= 2;							      \
	  if (bufmax < _need)						      \
	    bufmax = _need;						      \
	  buf = xrealloc (buf, bufmax);					      \
	}								      \
      memcpy (&buf[bufact], (s), _l);					      \
      bufact += _l;							      \
    }									      \
  while (0)


/* Append the wide character WC to the growing wide character buffer of
   the calling function.  The caller must have `buf2', `buf2act'
   (elements in use) and `buf2max' (capacity in elements) in scope.
   When the buffer is full its capacity is doubled, allocating four
   bytes per element through xrealloc.  */
#define ADDWC(wc) \
  do									      \
    {									      \
      if (buf2max == buf2act)						      \
	buf2 = xrealloc (buf2, (buf2max *= 2) * 4);			      \
      buf2[buf2act] = (wc);						      \
      ++buf2act;							      \
    }									      \
  while (0)


/* Read a symbolic name; the opening `<' has already been consumed by
   the caller.  Characters are collected up to and including the
   closing `>'.  A character following the escape character is taken
   literally, so an escaped `>' does not end the name.  A newline or the
   end of the input ends the name as well; this is reported as an
   error but a token is produced nevertheless.

   The result is one of
   - tok_ucs4 for names of the form Uxxxx or Uxxxxxxxx (hexadecimal
     digits), with the value in val.ucs4,
   - the token of a reserved word which is written in brackets,
   - tok_bsymbol for everything else.  In this case val.str.startmb is
     a newly allocated copy of the name which the caller owns and
     val.str.lenmb its length.  The copy keeps the terminating
     character right behind the name, followed by a NUL byte.  */
static struct token *
get_symname (struct linereader *lr)
{
  /* Symbol in brackets.  We must distinguish three kinds:
     1. reserved words
     2. ISO 10646 position values
     3. all other.  */
  char *buf;
  size_t bufact = 0;
  size_t bufmax = 56;
  const struct keyword_t *kw;
  int ch;

  buf = (char *) xmalloc (bufmax);

  /* The character which ends the name is stored as well; the code
     below relies on the last byte in the buffer being this
     terminator.  The end of the input is handled like the end of the
     line, otherwise the loop would never be left.  */
  do
    {
      ch = lr_getc (lr);
      if (ch == lr->escape_char)
	{
	  int c2 = lr_getc (lr);
	  if (c2 == EOF)
	    c2 = '\n';
	  ADDC (c2);

	  if (c2 == '\n')
	    ch = '\n';
	}
      else
	{
	  if (ch == EOF)
	    ch = '\n';
	  ADDC (ch);
	}
    }
  while (ch != '>' && ch != '\n');

  if (ch == '\n')
    lr_error (lr, _("unterminated symbolic name"));

  /* Test for ISO 10646 position value.  */
  if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
    {
      char *cp = buf + 1;
      while (cp < &buf[bufact - 1] && isxdigit ((unsigned char) *cp))
	++cp;

      if (cp == &buf[bufact - 1])
	{
	  /* Yes, it is.  The terminator stored behind the digits stops
	     the conversion.  */
	  lr->token.tok = tok_ucs4;
	  lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);

	  /* The name itself is not part of this token.  */
	  free (buf);

	  return &lr->token;
	}
    }

  /* It is a symbolic name.  Test for reserved words.  */
  kw = lr->hash_fct (buf, bufact - 1);

  if (kw != NULL && kw->symname_or_ident == 1)
    {
      lr->token.tok = kw->token;
      free (buf);
    }
  else
    {
      lr->token.tok = tok_bsymbol;

      /* Resize first: when the name fills the buffer completely there
	 is no room for the NUL byte yet.  */
      buf = xrealloc (buf, bufact + 1);
      buf[bufact] = '\0';

      lr->token.val.str.startmb = buf;
      lr->token.val.str.lenmb = bufact - 1;
    }

  return &lr->token;
}


/* Read an identifier.  Its first character has already been consumed
   by the caller and is taken from the line buffer position just before
   the read index.  The identifier extends up to the next white space,
   `"', `;', `<', `,' or the end of the input; a character following
   the escape character is taken literally.  The character which ends
   the identifier is left in the input.

   The result is either the token of a reserved word or tok_ident.  In
   the latter case val.str.startmb is a newly allocated, NUL terminated
   copy of the identifier which the caller owns, and val.str.lenmb its
   length.  */
static struct token *
get_ident (struct linereader *lr)
{
  char *buf;
  size_t bufact;
  size_t bufmax = 56;
  const struct keyword_t *kw;
  int ch;

  buf = xmalloc (bufmax);
  bufact = 0;

  ADDC (lr->buf[lr->idx - 1]);

  /* EOF is tested first: it ends the identifier like a delimiter does
     and must never be stored as if it were a character.  */
  while ((ch = lr_getc (lr)) != EOF && !isspace (ch) && ch != '"'
	 && ch != ';' && ch != '<' && ch != ',')
    {
      if (ch == lr->escape_char)
	{
	  ch = lr_getc (lr);
	  if (ch == '\n' || ch == EOF)
	    {
	      lr_error (lr, _("invalid escape sequence"));
	      break;
	    }
	}
      ADDC (ch);
    }

  /* Give back the delimiter.  EOF is not a character of the input, so
     there is nothing to give back.  */
  if (ch != EOF)
    lr_ungetn (lr, 1);

  kw = lr->hash_fct (buf, bufact);

  if (kw != NULL && kw->symname_or_ident == 0)
    {
      lr->token.tok = kw->token;
      free (buf);
    }
  else
    {
      lr->token.tok = tok_ident;

      /* Resize first: when the identifier fills the buffer completely
	 there is no room for the NUL byte yet.  */
      buf = xrealloc (buf, bufact + 1);
      buf[bufact] = '\0';

      lr->token.val.str.startmb = buf;
      lr->token.val.str.lenmb = bufact;
    }

  return &lr->token;
}


static struct token *
get_string (struct linereader *lr, const struct charmap_t *charmap,
	    const struct repertoire_t *repertoire)
{
  int return_widestr = lr->return_widestr;
  char *buf;
  wchar_t *buf2 = NULL;
  size_t bufact;
  size_t bufmax = 56;

  /* We must return two different strings.  */
  buf = xmalloc (bufmax);
  bufact = 0;

  /* We know it'll be a string.  */
  lr->token.tok = tok_string;

  /* If we need not translate the strings (i.e., expand <...> parts)
     we can run a simple loop.  */
  if (!lr->translate_strings)
    {
      int ch;

      buf2 = NULL;
      while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
	ADDC (ch);

      /* Catch errors with trailing escape character.  */
      if (bufact > 0 && buf[bufact - 1] == lr->escape_char
	  && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
	{
	  lr_error (lr, _("illegal escape sequence at end of string"));
	  --bufact;
	}
      else if (ch == '\n' || ch == EOF)
	lr_error (lr, _("unterminated string"));

      ADDC ('\0');
    }
  else
    {
      int illegal_string = 0;
      size_t buf2act = 0;
      size_t buf2max = 56 * sizeof (uint32_t);
      int ch;
      int warned = 0;

      /* We have to provide the wide character result as well.  */
      if (return_widestr)
	buf2 = xmalloc (buf2max);

      /* Read until the end of the string (or end of the line or file).  */
      while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
	{
	  size_t startidx;
	  uint32_t wch;
	  struct charseq *seq;

	  if (ch != '<')
	    {
	      /* The standards leave it up to the implementation to decide
		 what to do with character which stand for themself.  We
		 could jump through hoops to find out the value relative to
		 the charmap and the repertoire map, but instead we leave
		 it up to the locale definition author to write a better
		 definition.  We assume here that every character which
		 stands for itself is encoded using ISO 8859-1.  Using the
		 escape character is allowed.  */
	      if (ch == lr->escape_char)
		{
		  ch = lr_getc (lr);
		  if (ch == '\n' || ch == EOF)
		    break;
		}

	      if (verbose && !warned)
		{
		  lr_error (lr, _("\
non-symbolic character value should not be used"));
		  warned = 1;
		}

	      ADDC (ch);
	      if (return_widestr)
		ADDWC ((uint32_t) ch);

	      continue;
	    }

	  /* Now we have to search for the end of the symbolic name, i.e.,
	     the closing '>'.  */
	  startidx = bufact;
	  while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
	    {
	      if (ch == lr->escape_char)
		{
		  ch = lr_getc (lr);
		  if (ch == '\n' || ch == EOF)
		    break;
		}
	      ADDC (ch);
	    }
	  if (ch == '\n' || ch == EOF)
	    /* Not a correct string.  */
	    break;
	  if (bufact == startidx)
	    {
	      /* <> is no correct name.  Ignore it and also signal an
		 error.  */
	      illegal_string = 1;
	      continue;
	    }

	  /* It might be a Uxxxx symbol.  */
	  if (buf[startidx] == 'U'
	      && (bufact - startidx == 5 || bufact - startidx == 9))
	    {
	      char *cp = buf + startidx + 1;
	      while (cp < &buf[bufact] && isxdigit (*cp))
		++cp;

	      if (cp == &buf[bufact])
		{
		  char utmp[10];
		  const char *symbol = NULL;

		  /* Yes, it is.  */
		  ADDC ('\0');
		  wch = strtoul (buf + startidx + 1, NULL, 16);

		  /* Now forget about the name we just added.  */
		  bufact = startidx;

		  if (return_widestr)
		    ADDWC (wch);

		  /* See whether the charmap contains the Uxxxxxxxx names.  */
		  snprintf (utmp, sizeof (utmp), "U%08X", wch);
		  seq = charmap_find_value (charmap, utmp, 9);

		  if (seq == NULL)
		    {
		     /* No, this isn't the case.  Now determine from
			the repertoire the name of the character and
			find it in the charmap.  */
		      if (repertoire != NULL)
			symbol = repertoire_find_symbol (repertoire, wch);

		      if (symbol == NULL)
			/* We cannot generate a string since we
			   cannot map from the Unicode number to the
			   character symbol.  */
			illegal_string = 1;
		      else
			{
			  seq = charmap_find_value (charmap, symbol,
						    strlen (symbol));

			  if (seq == NULL)
			    /* Not a known name.  */
			    illegal_string = 1;
			}
		    }

		  if (seq != NULL)
		    ADDS (seq->bytes, seq->nbytes);

		  continue;
		}
	    }

	  /* We now have the symbolic name in buf[startidx] to
	     buf[bufact-1].  Now find out the value for this character
	     in the charmap as well as in the repertoire map (in this
	     order).  */
	  seq = charmap_find_value (charmap, &buf[startidx],
				    bufact - startidx);

	  if (seq == NULL)
	    {
	      /* This name is not in the charmap.  */
	      lr_error (lr, _("symbol `%.*s' not in charmap"),
			(int) (bufact - startidx), &buf[startidx]);
	      illegal_string = 1;
	    }

	  if (return_widestr)
	    {
	      /* Now the same for the multibyte representation.  */
	      if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
		wch = seq->ucs4;
	      else
		{
		  wch = repertoire_find_value (repertoire, &buf[startidx],
					       bufact - startidx);
		  if (seq != NULL)
		    seq->ucs4 = wch;
		}

	      if (wch == ILLEGAL_CHAR_VALUE)
		{
		  /* This name is not in the repertoire map.  */
		  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
			    (int) (bufact - startidx), &buf[startidx]);
		  illegal_string = 1;
		}
	      else
		ADDWC (wch);
	    }

	  /* Now forget about the name we just added.  */
	  bufact = startidx;

	  /* And copy the bytes.  */
	  if (seq != NULL)
	    ADDS (seq->bytes, seq->nbytes);
	}

      if (ch == '\n' || ch == EOF)
	{
	  lr_error (lr, _("unterminated string"));
	  illegal_string = 1;
	}

      if (illegal_string)
	{
	  free (buf);
	  if (buf2 != NULL)
	    free (buf2);
	  lr->token.val.str.startmb = NULL;
	  lr->token.val.str.lenmb = 0;

	  return &lr->token;
	}

      ADDC ('\0');

      if (return_widestr)
	{
	  ADDWC (0);
	  lr->token.val.str.startwc = xrealloc (buf2,
						buf2act * sizeof (uint32_t));
	  lr->token.val.str.lenwc = buf2act;
	}
    }

  lr->token.val.str.startmb = xrealloc (buf, bufact);
  lr->token.val.str.lenmb = bufact;

  return &lr->token;
}
Commit	Line	Data
d569d333	1	/* Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
5290baf0	2	This file is part of the GNU C Library.
4b10dd6c	3	Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
19bc17a9	4
5290baf0 UD	5	The GNU C Library is free software; you can redistribute it and/or
	6	modify it under the terms of the GNU Library General Public License as
	7	published by the Free Software Foundation; either version 2 of the
	8	License, or (at your option) any later version.
19bc17a9	9
5290baf0 UD	10	The GNU C Library is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	Library General Public License for more details.
19bc17a9	14
5290baf0 UD	15	You should have received a copy of the GNU Library General Public
	16	License along with the GNU C Library; see the file COPYING.LIB. If not,
	17	write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
	18	Boston, MA 02111-1307, USA. */
19bc17a9 RM	19
	20	#ifdef HAVE_CONFIG_H
	21	# include <config.h>
	22	#endif
	23
	24	#include <ctype.h>
	25	#include <errno.h>
	26	#include <libintl.h>
	27	#include <stdarg.h>
	28	#include <stdlib.h>
	29	#include <string.h>
	30
4b10dd6c	31	#include "charmap.h"
19bc17a9 RM	32	#include "error.h"
19bc17a9 RM	33	#include "linereader.h"
4b10dd6c	34	#include "localedef.h"
19bc17a9 RM	35
19bc17a9 RM	36
4b10dd6c	37	/* Prototypes for local functions. */
19bc17a9 RM	38	static struct token get_toplvl_escape (struct linereader lr);
	39	static struct token get_symname (struct linereader lr);
	40	static struct token get_ident (struct linereader lr);
	41	static struct token get_string (struct linereader lr,
4b10dd6c UD	42	const struct charmap_t *charmap,
4b10dd6c UD	43	const struct repertoire_t *repertoire);
19bc17a9 RM	44
	45
	46	struct linereader *
	47	lr_open (const char *fname, kw_hash_fct_t hf)
	48	{
	49	FILE *fp;
19bc17a9 RM	50
	51	if (fname == NULL \|\| strcmp (fname, "-") == 0
	52	\|\| strcmp (fname, "/dev/stdin") == 0)
3e076219	53	return lr_create (stdin, "<stdin>", hf);
19bc17a9 RM	54	else
	55	{
	56	fp = fopen (fname, "r");
	57	if (fp == NULL)
	58	return NULL;
3e076219	59	return lr_create (fp, fname, hf);
19bc17a9	60	}
3e076219 UD	61	}
	62
	63	struct linereader *
	64	lr_create (FILE fp, const char fname, kw_hash_fct_t hf)
	65	{
	66	struct linereader *result;
	67	int n;
19bc17a9 RM	68
	69	result = (struct linereader ) xmalloc (sizeof (result));
	70
	71	result->fp = fp;
3e076219	72	result->fname = xstrdup (fname);
19bc17a9 RM	73	result->buf = NULL;
	74	result->bufsize = 0;
	75	result->lineno = 1;
	76	result->idx = 0;
	77	result->comment_char = '#';
	78	result->escape_char = '\\';
	79	result->translate_strings = 1;
	80
	81	n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
	82	if (n < 0)
	83	{
	84	int save = errno;
	85	fclose (result->fp);
46ec036d	86	free ((char *) result->fname);
19bc17a9 RM	87	free (result);
	88	errno = save;
	89	return NULL;
	90	}
	91
	92	if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
	93	n -= 2;
	94
	95	result->buf[n] = '\0';
	96	result->bufact = n;
	97	result->hash_fct = hf;
	98
	99	return result;
	100	}
	101
	102
	103	int
	104	lr_eof (struct linereader *lr)
	105	{
	106	return lr->bufact = 0;
	107	}
	108
	109
	110	void
	111	lr_close (struct linereader *lr)
	112	{
	113	fclose (lr->fp);
	114	free (lr->buf);
	115	free (lr);
	116	}
	117
	118
	119	int
	120	lr_next (struct linereader *lr)
	121	{
	122	int n;
	123
	124	n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
	125	if (n < 0)
	126	return -1;
	127
	128	++lr->lineno;
	129
	130	if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
	131	{
4b10dd6c UD	132	#if 0
4b10dd6c UD	133	/* XXX Is this correct? */
19bc17a9 RM	134	/* An escaped newline character is substituted with a single <SP>. */
	135	--n;
	136	lr->buf[n - 1] = ' ';
4b10dd6c UD	137	#else
	138	n -= 2;
	139	#endif
19bc17a9 RM	140	}
	141
	142	lr->buf[n] = '\0';
	143	lr->bufact = n;
	144	lr->idx = 0;
	145
	146	return 0;
	147	}
	148
	149
	150	/* Defined in error.c. */
	151	/* This variable is incremented each time `error' is called. */
	152	extern unsigned int error_message_count;
	153
	154	/* The calling program should define program_name and set it to the
	155	name of the executing program. */
	156	extern char *program_name;
	157
	158
	159	struct token *
4b10dd6c UD	160	lr_token (struct linereader lr, const struct charmap_t charmap,
4b10dd6c UD	161	const struct repertoire_t *repertoire)
19bc17a9 RM	162	{
	163	int ch;
	164
	165	while (1)
	166	{
	167	do
	168	{
	169	ch = lr_getc (lr);
	170
76fbcfdd UD	171	if (ch == EOF)
	172	{
	173	lr->token.tok = tok_eof;
	174	return &lr->token;
	175	};
	176
19bc17a9 RM	177	if (ch == '\n')
	178	{
	179	lr->token.tok = tok_eol;
	180	return &lr->token;
	181	}
	182	}
	183	while (isspace (ch));
	184
	185	if (ch == EOF)
	186	{
	187	lr->token.tok = tok_eof;
	188	return &lr->token;
	189	};
	190
	191	if (ch != lr->comment_char)
	192	break;
	193
a0dc5206 UD	194	/* Is there an newline at the end of the buffer? */
	195	if (lr->buf[lr->bufact - 1] != '\n')
	196	{
	197	/* No. Some people want this to mean that only the line in
	198	the file not the logical, concatenated line is ignored.
	199	Let's try this. */
	200	lr->idx = lr->bufact;
	201	continue;
	202	}
	203
19bc17a9 RM	204	/* Ignore rest of line. */
	205	lr_ignore_rest (lr, 0);
	206	lr->token.tok = tok_eol;
	207	return &lr->token;
	208	}
	209
	210	/* Match escape sequences. */
	211	if (ch == lr->escape_char)
	212	return get_toplvl_escape (lr);
	213
	214	/* Match ellipsis. */
4b10dd6c	215	if (ch == '.')
19bc17a9	216	{
a0dc5206 UD	217	if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
	218	{
	219	int cnt;
	220	for (cnt = 0; cnt < 10; ++cnt)
	221	lr_getc (lr);
	222	lr->token.tok = tok_ellipsis4_2;
	223	return &lr->token;
	224	}
4b10dd6c UD	225	if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
	226	{
	227	lr_getc (lr);
	228	lr_getc (lr);
	229	lr_getc (lr);
	230	lr->token.tok = tok_ellipsis4;
	231	return &lr->token;
	232	}
	233	if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
	234	{
	235	lr_getc (lr);
	236	lr_getc (lr);
	237	lr->token.tok = tok_ellipsis3;
	238	return &lr->token;
	239	}
a0dc5206 UD	240	if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
	241	{
	242	int cnt;
	243	for (cnt = 0; cnt < 6; ++cnt)
	244	lr_getc (lr);
	245	lr->token.tok = tok_ellipsis2_2;
	246	return &lr->token;
	247	}
4b10dd6c UD	248	if (lr->buf[lr->idx] == '.')
	249	{
	250	lr_getc (lr);
	251	lr->token.tok = tok_ellipsis2;
	252	return &lr->token;
	253	}
19bc17a9 RM	254	}
	255
	256	switch (ch)
	257	{
	258	case '<':
	259	return get_symname (lr);
	260
	261	case '0' ... '9':
	262	lr->token.tok = tok_number;
	263	lr->token.val.num = ch - '0';
	264
	265	while (isdigit (ch = lr_getc (lr)))
	266	{
	267	lr->token.val.num *= 10;
	268	lr->token.val.num += ch - '0';
	269	}
	270	if (isalpha (ch))
5290baf0	271	lr_error (lr, _("garbage at end of number"));
19bc17a9 RM	272	lr_ungetn (lr, 1);
	273
	274	return &lr->token;
	275
	276	case ';':
	277	lr->token.tok = tok_semicolon;
	278	return &lr->token;
	279
	280	case ',':
	281	lr->token.tok = tok_comma;
	282	return &lr->token;
	283
	284	case '(':
	285	lr->token.tok = tok_open_brace;
	286	return &lr->token;
	287
	288	case ')':
	289	lr->token.tok = tok_close_brace;
	290	return &lr->token;
	291
	292	case '"':
4b10dd6c	293	return get_string (lr, charmap, repertoire);
19bc17a9 RM	294
	295	case '-':
	296	ch = lr_getc (lr);
	297	if (ch == '1')
	298	{
	299	lr->token.tok = tok_minus1;
	300	return &lr->token;
	301	}
	302	lr_ungetn (lr, 2);
	303	break;
	304	}
	305
	306	return get_ident (lr);
	307	}
	308
	309
	310	static struct token *
	311	get_toplvl_escape (struct linereader *lr)
	312	{
	313	/* This is supposed to be a numeric value. We return the
	314	numerical value and the number of bytes. */
	315	size_t start_idx = lr->idx - 1;
4b10dd6c	316	char *bytes = lr->token.val.charcode.bytes;
19bc17a9 RM	317	int nbytes = 0;
	318	int ch;
	319
	320	do
	321	{
	322	unsigned int byte = 0;
	323	unsigned int base = 8;
	324
	325	ch = lr_getc (lr);
	326
	327	if (ch == 'd')
	328	{
	329	base = 10;
	330	ch = lr_getc (lr);
	331	}
	332	else if (ch == 'x')
	333	{
	334	base = 16;
	335	ch = lr_getc (lr);
	336	}
	337
	338	if ((base == 16 && !isxdigit (ch))
ba1ffaa1	339	\|\| (base != 16 && (ch < '0' \|\| ch >= (int) ('0' + base))))
19bc17a9 RM	340	{
19bc17a9 RM	341	esc_error:
4b10dd6c	342	lr->token.val.str.startmb = &lr->buf[start_idx];
19bc17a9	343
76fbcfdd	344	while (ch != EOF && !isspace (ch))
19bc17a9	345	ch = lr_getc (lr);
4b10dd6c	346	lr->token.val.str.lenmb = lr->idx - start_idx;
19bc17a9 RM	347
	348	lr->token.tok = tok_error;
	349	return &lr->token;
	350	}
	351
	352	if (isdigit (ch))
	353	byte = ch - '0';
	354	else
4b10dd6c	355	byte = tolower (ch) - 'a' + 10;
19bc17a9 RM	356
	357	ch = lr_getc (lr);
	358	if ((base == 16 && !isxdigit (ch))
ba1ffaa1	359	\|\| (base != 16 && (ch < '0' \|\| ch >= (int) ('0' + base))))
19bc17a9 RM	360	goto esc_error;
	361
	362	byte *= base;
	363	if (isdigit (ch))
	364	byte += ch - '0';
	365	else
4b10dd6c	366	byte += tolower (ch) - 'a' + 10;
19bc17a9 RM	367
	368	ch = lr_getc (lr);
	369	if (base != 16 && isdigit (ch))
	370	{
	371	byte *= base;
679f5a56	372	byte += ch - '0';
19bc17a9 RM	373
	374	ch = lr_getc (lr);
	375	}
	376
4b10dd6c	377	bytes[nbytes++] = byte;
19bc17a9 RM	378	}
	379	while (ch == lr->escape_char && nbytes < 4);
	380
	381	if (!isspace (ch))
	382	lr_error (lr, _("garbage at end of character code specification"));
	383
	384	lr_ungetn (lr, 1);
	385
	386	lr->token.tok = tok_charcode;
19bc17a9 RM	387	lr->token.val.charcode.nbytes = nbytes;
	388
	389	return &lr->token;
	390	}
	391
	392
4b10dd6c UD	393	#define ADDC(ch) \
	394	do \
	395	{ \
	396	if (bufact == bufmax) \
	397	{ \
	398	bufmax *= 2; \
	399	buf = xrealloc (buf, bufmax); \
	400	} \
	401	buf[bufact++] = (ch); \
	402	} \
	403	while (0)
	404
	405
	406	#define ADDS(s, l) \
	407	do \
	408	{ \
	409	size_t _l = (l); \
	410	if (bufact + _l > bufmax) \
	411	{ \
	412	if (bufact < _l) \
	413	bufact = _l; \
	414	bufmax *= 2; \
	415	buf = xrealloc (buf, bufmax); \
	416	} \
	417	memcpy (&buf[bufact], s, _l); \
	418	bufact += _l; \
	419	} \
	420	while (0)
	421
	422
	423	#define ADDWC(ch) \
	424	do \
	425	{ \
	426	if (buf2act == buf2max) \
	427	{ \
	428	buf2max *= 2; \
	429	buf2 = xrealloc (buf2, buf2max * 4); \
	430	} \
	431	buf2[buf2act++] = (ch); \
	432	} \
19bc17a9 RM	433	while (0)
	434
	435
	436	static struct token *
	437	get_symname (struct linereader *lr)
	438	{
	439	/* Symbol in brackets. We must distinguish three kinds:
	440	1. reserved words
	441	2. ISO 10646 position values
	442	3. all other. */
	443	char *buf;
	444	size_t bufact = 0;
	445	size_t bufmax = 56;
	446	const struct keyword_t *kw;
	447	int ch;
	448
	449	buf = (char *) xmalloc (bufmax);
	450
	451	do
	452	{
	453	ch = lr_getc (lr);
	454	if (ch == lr->escape_char)
	455	{
	456	int c2 = lr_getc (lr);
	457	ADDC (c2);
	458
	459	if (c2 == '\n')
	460	ch = '\n';
	461	}
	462	else
	463	ADDC (ch);
	464	}
	465	while (ch != '>' && ch != '\n');
	466
	467	if (ch == '\n')
	468	lr_error (lr, _("unterminated symbolic name"));
	469
	470	/* Test for ISO 10646 position value. */
	471	if (buf[0] == 'U' && (bufact == 6 \|\| bufact == 10))
	472	{
	473	char *cp = buf + 1;
	474	while (cp < &buf[bufact - 1] && isxdigit (*cp))
	475	++cp;
	476
	477	if (cp == &buf[bufact - 1])
	478	{
	479	/* Yes, it is. */
4b10dd6c UD	480	lr->token.tok = tok_ucs4;
4b10dd6c UD	481	lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
19bc17a9 RM	482
	483	return &lr->token;
	484	}
	485	}
	486
	487	/* It is a symbolic name. Test for reserved words. */
	488	kw = lr->hash_fct (buf, bufact - 1);
	489
	490	if (kw != NULL && kw->symname_or_ident == 1)
	491	{
	492	lr->token.tok = kw->token;
	493	free (buf);
	494	}
	495	else
	496	{
	497	lr->token.tok = tok_bsymbol;
	498
	499	buf[bufact] = '\0';
	500	buf = xrealloc (buf, bufact + 1);
	501
4b10dd6c UD	502	lr->token.val.str.startmb = buf;
4b10dd6c UD	503	lr->token.val.str.lenmb = bufact - 1;
19bc17a9 RM	504	}
	505
	506	return &lr->token;
	507	}
	508
	509
	510	static struct token *
	511	get_ident (struct linereader *lr)
	512	{
	513	char *buf;
	514	size_t bufact;
	515	size_t bufmax = 56;
	516	const struct keyword_t *kw;
	517	int ch;
	518
	519	buf = xmalloc (bufmax);
	520	bufact = 0;
	521
	522	ADDC (lr->buf[lr->idx - 1]);
	523
	524	while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
	525	&& ch != '<' && ch != ',')
4b10dd6c UD	526	{
	527	if (ch == lr->escape_char)
	528	{
	529	ch = lr_getc (lr);
	530	if (ch == '\n' \|\| ch == EOF)
	531	{
	532	lr_error (lr, _("invalid escape sequence"));
	533	break;
	534	}
	535	}
	536	ADDC (ch);
	537	}
19bc17a9 RM	538
	539	lr_ungetn (lr, 1);
	540
	541	kw = lr->hash_fct (buf, bufact);
	542
	543	if (kw != NULL && kw->symname_or_ident == 0)
	544	{
	545	lr->token.tok = kw->token;
	546	free (buf);
	547	}
	548	else
	549	{
	550	lr->token.tok = tok_ident;
	551
	552	buf[bufact] = '\0';
	553	buf = xrealloc (buf, bufact + 1);
	554
4b10dd6c UD	555	lr->token.val.str.startmb = buf;
4b10dd6c UD	556	lr->token.val.str.lenmb = bufact;
19bc17a9 RM	557	}
	558
	559	return &lr->token;
	560	}
	561
	562
	563	static struct token *
4b10dd6c UD	564	get_string (struct linereader lr, const struct charmap_t charmap,
4b10dd6c UD	565	const struct repertoire_t *repertoire)
19bc17a9	566	{
4b10dd6c UD	567	int return_widestr = lr->return_widestr;
4b10dd6c UD	568	char *buf;
a9c27b3e	569	wchar_t *buf2 = NULL;
19bc17a9 RM	570	size_t bufact;
19bc17a9 RM	571	size_t bufmax = 56;
19bc17a9	572
4b10dd6c	573	/* We must return two different strings. */
19bc17a9 RM	574	buf = xmalloc (bufmax);
	575	bufact = 0;
	576
4b10dd6c UD	577	/* We know it'll be a string. */
	578	lr->token.tok = tok_string;
	579
	580	/* If we need not translate the strings (i.e., expand <...> parts)
	581	we can run a simple loop. */
	582	if (!lr->translate_strings)
	583	{
	584	int ch;
	585
	586	buf2 = NULL;
	587	while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
19bc17a9	588	ADDC (ch);
4b10dd6c UD	589
	590	/* Catch errors with trailing escape character. */
	591	if (bufact > 0 && buf[bufact - 1] == lr->escape_char
	592	&& (bufact == 1 \|\| buf[bufact - 2] != lr->escape_char))
	593	{
	594	lr_error (lr, _("illegal escape sequence at end of string"));
	595	--bufact;
	596	}
	597	else if (ch == '\n' \|\| ch == EOF)
	598	lr_error (lr, _("unterminated string"));
	599
	600	ADDC ('\0');
	601	}
	602	else
	603	{
	604	int illegal_string = 0;
	605	size_t buf2act = 0;
	606	size_t buf2max = 56 * sizeof (uint32_t);
	607	int ch;
	608	int warned = 0;
	609
	610	/* We have to provide the wide character result as well. */
	611	if (return_widestr)
	612	buf2 = xmalloc (buf2max);
	613
	614	/* Read until the end of the string (or end of the line or file). */
	615	while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
	616	{
	617	size_t startidx;
	618	uint32_t wch;
	619	struct charseq *seq;
	620
	621	if (ch != '<')
	622	{
	623	/* The standards leave it up to the implementation to decide
	624	what to do with character which stand for themself. We
	625	could jump through hoops to find out the value relative to
	626	the charmap and the repertoire map, but instead we leave
	627	it up to the locale definition author to write a better
	628	definition. We assume here that every character which
	629	stands for itself is encoded using ISO 8859-1. Using the
	630	escape character is allowed. */
	631	if (ch == lr->escape_char)
	632	{
	633	ch = lr_getc (lr);
	634	if (ch == '\n' \|\| ch == EOF)
	635	break;
	636	}
	637
	638	if (verbose && !warned)
	639	{
	640	lr_error (lr, _("\
	641	non-symbolic character value should not be used"));
	642	warned = 1;
	643	}
	644
	645	ADDC (ch);
	646	if (return_widestr)
	647	ADDWC ((uint32_t) ch);
	648
	649	continue;
	650	}
	651
	652	/* Now we have to search for the end of the symbolic name, i.e.,
653	the closing '>'. */
654	startidx = bufact;
655	while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
656	{
657	if (ch == lr->escape_char)
658	{
659	ch = lr_getc (lr);
660	if (ch == '\n' \|\| ch == EOF)
661	break;
662	}
663	ADDC (ch);
664	}
665	if (ch == '\n' \|\| ch == EOF)
666	/* Not a correct string. */
667	break;
668	if (bufact == startidx)
669	{
670	/* <> is no correct name. Ignore it and also signal an
671	error. */
19bc17a9	672	illegal_string = 1;
4b10dd6c UD	673	continue;
4b10dd6c UD	674	}
19bc17a9	675
4b10dd6c UD	676	/* It might be a Uxxxx symbol. */
	677	if (buf[startidx] == 'U'
	678	&& (bufact - startidx == 5 \|\| bufact - startidx == 9))
	679	{
	680	char *cp = buf + startidx + 1;
	681	while (cp < &buf[bufact] && isxdigit (*cp))
	682	++cp;
	683
	684	if (cp == &buf[bufact])
	685	{
3c833378	686	char utmp[10];
4b10dd6c UD	687	const char *symbol = NULL;
	688
	689	/* Yes, it is. */
	690	ADDC ('\0');
	691	wch = strtoul (buf + startidx + 1, NULL, 16);
	692
	693	/* Now forget about the name we just added. */
	694	bufact = startidx;
	695
	696	if (return_widestr)
	697	ADDWC (wch);
	698
3c833378 UD	699	/* See whether the charmap contains the Uxxxxxxxx names. */
	700	snprintf (utmp, sizeof (utmp), "U%08X", wch);
	701	seq = charmap_find_value (charmap, utmp, 9);
4b10dd6c	702
3c833378	703	if (seq == NULL)
4b10dd6c	704	{
3c833378 UD	705	/* No, this isn't the case. Now determine from
	706	the repertoire the name of the character and
	707	find it in the charmap. */
	708	if (repertoire != NULL)
	709	symbol = repertoire_find_symbol (repertoire, wch);
4b10dd6c	710
3c833378	711	if (symbol == NULL)
d364e525 UD	712	/* We cannot generate a string since we
	713	cannot map from the Unicode number to the
	714	character symbol. */
	715	illegal_string = 1;
4b10dd6c	716	else
3c833378 UD	717	{
	718	seq = charmap_find_value (charmap, symbol,
	719	strlen (symbol));
	720
	721	if (seq == NULL)
d364e525 UD	722	/* Not a known name. */
d364e525 UD	723	illegal_string = 1;
3c833378	724	}
4b10dd6c UD	725	}
4b10dd6c UD	726
3c833378 UD	727	if (seq != NULL)
	728	ADDS (seq->bytes, seq->nbytes);
	729
4b10dd6c UD	730	continue;
	731	}
	732	}
	733
3c833378 UD	734	/* We now have the symbolic name in buf[startidx] to
	735	buf[bufact-1]. Now find out the value for this character
	736	in the charmap as well as in the repertoire map (in this
	737	order). */
	738	seq = charmap_find_value (charmap, &buf[startidx],
	739	bufact - startidx);
	740
	741	if (seq == NULL)
	742	{
	743	/* This name is not in the charmap. */
	744	lr_error (lr, _("symbol `%.*s' not in charmap"),
	745	(int) (bufact - startidx), &buf[startidx]);
	746	illegal_string = 1;
	747	}
	748
4b10dd6c UD	749	if (return_widestr)
4b10dd6c UD	750	{
3c833378 UD	751	/* Now the same for the multibyte representation. */
	752	if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
	753	wch = seq->ucs4;
	754	else
	755	{
	756	wch = repertoire_find_value (repertoire, &buf[startidx],
	757	bufact - startidx);
	758	if (seq != NULL)
	759	seq->ucs4 = wch;
	760	}
	761
4b10dd6c UD	762	if (wch == ILLEGAL_CHAR_VALUE)
	763	{
	764	/* This name is not in the repertoire map. */
	765	lr_error (lr, _("symbol `%.*s' not in repertoire map"),
70e51ab9	766	(int) (bufact - startidx), &buf[startidx]);
4b10dd6c UD	767	illegal_string = 1;
	768	}
	769	else
	770	ADDWC (wch);
	771	}
	772
3c833378 UD	773	/* Now forget about the name we just added. */
3c833378 UD	774	bufact = startidx;
19bc17a9	775
3c833378 UD	776	/* And copy the bytes. */
	777	if (seq != NULL)
	778	ADDS (seq->bytes, seq->nbytes);
4b10dd6c	779	}
19bc17a9	780
4b10dd6c UD	781	if (ch == '\n' \|\| ch == EOF)
	782	{
	783	lr_error (lr, _("unterminated string"));
	784	illegal_string = 1;
	785	}
19bc17a9	786
4b10dd6c UD	787	if (illegal_string)
	788	{
	789	free (buf);
	790	if (buf2 != NULL)
	791	free (buf2);
	792	lr->token.val.str.startmb = NULL;
	793	lr->token.val.str.lenmb = 0;
19bc17a9	794
4b10dd6c UD	795	return &lr->token;
4b10dd6c UD	796	}
19bc17a9	797
4b10dd6c	798	ADDC ('\0');
19bc17a9	799
4b10dd6c UD	800	if (return_widestr)
	801	{
	802	ADDWC (0);
	803	lr->token.val.str.startwc = xrealloc (buf2,
	804	buf2act * sizeof (uint32_t));
	805	lr->token.val.str.lenwc = buf2act;
	806	}
19bc17a9 RM	807	}
19bc17a9 RM	808
4b10dd6c UD	809	lr->token.val.str.startmb = xrealloc (buf, bufact);
	810	lr->token.val.str.lenmb = bufact;
	811
19bc17a9 RM	812	return &lr->token;
19bc17a9 RM	813	}