locale/programs/linereader.c

   1 /* Copyright (C) 1996-2023 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3
   4    This program is free software; you can redistribute it and/or modify
   5    it under the terms of the GNU General Public License as published
   6    by the Free Software Foundation; version 2 of the License, or
   7    (at your option) any later version.
   8
   9    This program is distributed in the hope that it will be useful,
  10    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12    GNU General Public License for more details.
  13
  14    You should have received a copy of the GNU General Public License
  15    along with this program; if not, see <https://www.gnu.org/licenses/>.  */
  16
  17 #ifdef HAVE_CONFIG_H
  18 # include <config.h>
  19 #endif
  20
  21 #include <assert.h>
  22 #include <ctype.h>
  23 #include <errno.h>
  24 #include <libintl.h>
  25 #include <stdarg.h>
  26 #include <stdlib.h>
  27 #include <string.h>
  28 #include <stdint.h>
  29
  30 #include "localedef.h"
  31 #include "charmap.h"
  32 #include "error.h"
  33 #include "linereader.h"
  34 #include "locfile.h"
  35
  36 /* Prototypes for local functions.  */
  37 static struct token *get_toplvl_escape (struct linereader *lr);
  38 static struct token *get_symname (struct linereader *lr);
  39 static struct token *get_ident (struct linereader *lr);
  40 static struct token *get_string (struct linereader *lr,
  41                                  const struct charmap_t *charmap,
  42                                  struct localedef_t *locale,
  43                                  const struct repertoire_t *repertoire,
  44                                  int verbose);
  45 static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);
  46
  47
  48 struct linereader *
  49 lr_open (const char *fname, kw_hash_fct_t hf)
  50 {
  51   FILE *fp;
  52
  53   if (fname == NULL || strcmp (fname, "-") == 0
  54       || strcmp (fname, "/dev/stdin") == 0)
  55     return lr_create (stdin, "<stdin>", hf);
  56   else
  57     {
  58       fp = fopen (fname, "rm");
  59       if (fp == NULL)
  60         return NULL;
  61       return lr_create (fp, fname, hf);
  62     }
  63 }
  64
  65 struct linereader *
  66 lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
  67 {
  68   struct linereader *result;
  69   int n;
  70
  71   result = (struct linereader *) xmalloc (sizeof (*result));
  72
  73   result->fp = fp;
  74   result->fname = xstrdup (fname);
  75   result->buf = NULL;
  76   result->bufsize = 0;
  77   result->lineno = 1;
  78   result->idx = 0;
  79   result->comment_char = '#';
  80   result->escape_char = '\\';
  81   result->translate_strings = 1;
  82   result->return_widestr = 0;
  83
  84   n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
  85   if (n < 0)
  86     {
  87       int save = errno;
  88       fclose (result->fp);
  89       free ((char *) result->fname);
  90       free (result);
  91       errno = save;
  92       return NULL;
  93     }
  94
  95   if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
  96     n -= 2;
  97
  98   result->buf[n] = '\0';
  99   result->bufact = n;
 100   result->hash_fct = hf;
 101
 102   return result;
 103 }
 104
 105
 106 int
 107 lr_eof (struct linereader *lr)
 108 {
 109   return lr->bufact = 0;
 110 }
 111
 112
 113 void
 114 lr_ignore_rest (struct linereader *lr, int verbose)
 115 {
 116   if (verbose)
 117     {
 118       while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
 119              && lr->buf[lr->idx] != lr->comment_char)
 120         if (lr->buf[lr->idx] == '\0')
 121           {
 122             if (lr_next (lr) < 0)
 123               return;
 124           }
 125         else
 126           ++lr->idx;
 127
 128       if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
 129           && lr->buf[lr->idx] != lr->comment_char)
 130         lr_error (lr, _("trailing garbage at end of line"));
 131     }
 132
 133   /* Ignore continued line.  */
 134   while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
 135     if (lr_next (lr) < 0)
 136       break;
 137
 138   lr->idx = lr->bufact;
 139 }
 140
 141
 142 void
 143 lr_close (struct linereader *lr)
 144 {
 145   fclose (lr->fp);
 146   free (lr->buf);
 147   free (lr);
 148 }
 149
 150
 151 int
 152 lr_next (struct linereader *lr)
 153 {
 154   int n;
 155
 156   n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
 157   if (n < 0)
 158     return -1;
 159
 160   ++lr->lineno;
 161
 162   if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
 163     {
 164 #if 0
 165       /* XXX Is this correct?  */
 166       /* An escaped newline character is substituted with a single <SP>.  */
 167       --n;
 168       lr->buf[n - 1] = ' ';
 169 #else
 170       n -= 2;
 171 #endif
 172     }
 173
 174   lr->buf[n] = '\0';
 175   lr->bufact = n;
 176   lr->idx = 0;
 177
 178   return 0;
 179 }
 180
 181
 182 /* Defined in error.c.  */
 183 /* This variable is incremented each time `error' is called.  */
 184 extern unsigned int error_message_count;
 185
 186 /* The calling program should define program_name and set it to the
 187    name of the executing program.  */
 188 extern char *program_name;
 189
 190
 191 struct token *
 192 lr_token (struct linereader *lr, const struct charmap_t *charmap,
 193           struct localedef_t *locale, const struct repertoire_t *repertoire,
 194           int verbose)
 195 {
 196   int ch;
 197
 198   while (1)
 199     {
 200       do
 201         {
 202           ch = lr_getc (lr);
 203
 204           if (ch == EOF)
 205             {
 206               lr->token.tok = tok_eof;
 207               return &lr->token;
 208             };
 209
 210           if (ch == '\n')
 211             {
 212               lr->token.tok = tok_eol;
 213               return &lr->token;
 214             }
 215         }
 216       while (isspace (ch));
 217
 218       if (ch != lr->comment_char)
 219         break;
 220
 221       /* Is there an newline at the end of the buffer?  */
 222       if (lr->buf[lr->bufact - 1] != '\n')
 223         {
 224           /* No.  Some people want this to mean that only the line in
 225              the file not the logical, concatenated line is ignored.
 226              Let's try this.  */
 227           lr->idx = lr->bufact;
 228           continue;
 229         }
 230
 231       /* Ignore rest of line.  */
 232       lr_ignore_rest (lr, 0);
 233       lr->token.tok = tok_eol;
 234       return &lr->token;
 235     }
 236
 237   /* Match escape sequences.  */
 238   if (ch == lr->escape_char)
 239     return get_toplvl_escape (lr);
 240
 241   /* Match ellipsis.  */
 242   if (ch == '.')
 243     {
 244       if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
 245         {
 246           int cnt;
 247           for (cnt = 0; cnt < 10; ++cnt)
 248             lr_getc (lr);
 249           lr->token.tok = tok_ellipsis4_2;
 250           return &lr->token;
 251         }
 252       if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
 253         {
 254           lr_getc (lr);
 255           lr_getc (lr);
 256           lr_getc (lr);
 257           lr->token.tok = tok_ellipsis4;
 258           return &lr->token;
 259         }
 260       if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
 261         {
 262           lr_getc (lr);
 263           lr_getc (lr);
 264           lr->token.tok = tok_ellipsis3;
 265           return &lr->token;
 266         }
 267       if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
 268         {
 269           int cnt;
 270           for (cnt = 0; cnt < 6; ++cnt)
 271             lr_getc (lr);
 272           lr->token.tok = tok_ellipsis2_2;
 273           return &lr->token;
 274         }
 275       if (lr->buf[lr->idx] == '.')
 276         {
 277           lr_getc (lr);
 278           lr->token.tok = tok_ellipsis2;
 279           return &lr->token;
 280         }
 281     }
 282
 283   switch (ch)
 284     {
 285     case '<':
 286       return get_symname (lr);
 287
 288     case '0' ... '9':
 289       lr->token.tok = tok_number;
 290       lr->token.val.num = ch - '0';
 291
 292       while (isdigit (ch = lr_getc (lr)))
 293         {
 294           lr->token.val.num *= 10;
 295           lr->token.val.num += ch - '0';
 296         }
 297       if (isalpha (ch))
 298         lr_error (lr, _("garbage at end of number"));
 299       lr_ungetn (lr, 1);
 300
 301       return &lr->token;
 302
 303     case ';':
 304       lr->token.tok = tok_semicolon;
 305       return &lr->token;
 306
 307     case ',':
 308       lr->token.tok = tok_comma;
 309       return &lr->token;
 310
 311     case '(':
 312       lr->token.tok = tok_open_brace;
 313       return &lr->token;
 314
 315     case ')':
 316       lr->token.tok = tok_close_brace;
 317       return &lr->token;
 318
 319     case '"':
 320       return get_string (lr, charmap, locale, repertoire, verbose);
 321
 322     case '-':
 323       ch = lr_getc (lr);
 324       if (ch == '1')
 325         {
 326           lr->token.tok = tok_minus1;
 327           return &lr->token;
 328         }
 329       lr_ungetn (lr, 2);
 330       break;
 331
 332     case 0x80 ... 0xff:         /* UTF-8 sequence.  */
 333       {
 334         uint32_t wch;
 335         if (!utf8_decode (lr, ch, &wch))
 336           {
 337             lr->token.tok = tok_error;
 338             return &lr->token;
 339           }
 340         lr->token.tok = tok_ucs4;
 341         lr->token.val.ucs4 = wch;
 342         return &lr->token;
 343       }
 344     }
 345
 346   return get_ident (lr);
 347 }
 348
 349
 350 static struct token *
 351 get_toplvl_escape (struct linereader *lr)
 352 {
 353   /* This is supposed to be a numeric value.  We return the
 354      numerical value and the number of bytes.  */
 355   size_t start_idx = lr->idx - 1;
 356   unsigned char *bytes = lr->token.val.charcode.bytes;
 357   size_t nbytes = 0;
 358   int ch;
 359
 360   do
 361     {
 362       unsigned int byte = 0;
 363       unsigned int base = 8;
 364
 365       ch = lr_getc (lr);
 366
 367       if (ch == 'd')
 368         {
 369           base = 10;
 370           ch = lr_getc (lr);
 371         }
 372       else if (ch == 'x')
 373         {
 374           base = 16;
 375           ch = lr_getc (lr);
 376         }
 377
 378       if ((base == 16 && !isxdigit (ch))
 379           || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
 380         {
 381         esc_error:
 382           lr->token.val.str.startmb = &lr->buf[start_idx];
 383
 384           while (ch != EOF && !isspace (ch))
 385             ch = lr_getc (lr);
 386           lr->token.val.str.lenmb = lr->idx - start_idx;
 387
 388           lr->token.tok = tok_error;
 389           return &lr->token;
 390         }
 391
 392       if (isdigit (ch))
 393         byte = ch - '0';
 394       else
 395         byte = tolower (ch) - 'a' + 10;
 396
 397       ch = lr_getc (lr);
 398       if ((base == 16 && !isxdigit (ch))
 399           || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
 400         goto esc_error;
 401
 402       byte *= base;
 403       if (isdigit (ch))
 404         byte += ch - '0';
 405       else
 406         byte += tolower (ch) - 'a' + 10;
 407
 408       ch = lr_getc (lr);
 409       if (base != 16 && isdigit (ch))
 410         {
 411           byte *= base;
 412           byte += ch - '0';
 413
 414           ch = lr_getc (lr);
 415         }
 416
 417       bytes[nbytes++] = byte;
 418     }
 419   while (ch == lr->escape_char
 420          && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
 421
 422   if (!isspace (ch))
 423     lr_error (lr, _("garbage at end of character code specification"));
 424
 425   lr_ungetn (lr, 1);
 426
 427   lr->token.tok = tok_charcode;
 428   lr->token.val.charcode.nbytes = nbytes;
 429
 430   return &lr->token;
 431 }
 432
 433 /* Multibyte string buffer.  */
 434 struct lr_buffer
 435 {
 436   size_t act;
 437   size_t max;
 438   char *buf;
 439 };
 440
 441 /* Initialize *LRB with a default-sized buffer.  */
 442 static void
 443 lr_buffer_init (struct lr_buffer *lrb)
 444 {
 445  lrb->act = 0;
 446  lrb->max = 56;
 447  lrb->buf = xmalloc (lrb->max);
 448 }
 449
 450 /* Transfers the buffer string from *LRB to LR->token.mbstr.  */
 451 static void
 452 lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr)
 453 {
 454   lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1);
 455   lr->token.val.str.startmb[lrb->act] = '\0';
 456   lr->token.val.str.lenmb = lrb->act;
 457 }
 458
 459 /* Adds CH to *LRB.  */
 460 static void
 461 addc (struct lr_buffer *lrb, char ch)
 462 {
 463   if (lrb->act == lrb->max)
 464     {
 465       lrb->max *= 2;
 466       lrb->buf = xrealloc (lrb->buf, lrb->max);
 467     }
 468   lrb->buf[lrb->act++] = ch;
 469 }
 470
 471 /* Adds L bytes at S to *LRB.  */
 472 static void
 473 adds (struct lr_buffer *lrb, const unsigned char *s, size_t l)
 474 {
 475   if (lrb->max - lrb->act < l)
 476     {
 477       size_t required_size = lrb->act + l;
 478       size_t new_max = 2 * lrb->max;
 479       if (new_max < required_size)
 480         new_max = required_size;
 481       lrb->buf = xrealloc (lrb->buf, new_max);
 482       lrb->max = new_max;
 483     }
 484   memcpy (lrb->buf + lrb->act, s, l);
 485   lrb->act += l;
 486 }
 487
 488 #define ADDWC(ch) \
 489   do                                                                          \
 490     {                                                                         \
 491       if (buf2act == buf2max)                                                 \
 492         {                                                                     \
 493           buf2max *= 2;                                                       \
 494           buf2 = xrealloc (buf2, buf2max * 4);                                \
 495         }                                                                     \
 496       buf2[buf2act++] = (ch);                                                 \
 497     }                                                                         \
 498   while (0)
 499
 500
 501 static struct token *
 502 get_symname (struct linereader *lr)
 503 {
 504   /* Symbol in brackets.  We must distinguish three kinds:
 505      1. reserved words
 506      2. ISO 10646 position values
 507      3. all other.  */
 508   const struct keyword_t *kw;
 509   int ch;
 510   struct lr_buffer lrb;
 511
 512   lr_buffer_init (&lrb);
 513
 514   do
 515     {
 516       ch = lr_getc (lr);
 517       if (ch == lr->escape_char)
 518         {
 519           int c2 = lr_getc (lr);
 520           addc (&lrb, c2);
 521
 522           if (c2 == '\n')
 523             ch = '\n';
 524         }
 525       else
 526         addc (&lrb, ch);
 527     }
 528   while (ch != '>' && ch != '\n');
 529
 530   if (ch == '\n')
 531     lr_error (lr, _("unterminated symbolic name"));
 532
 533   /* Test for ISO 10646 position value.  */
 534   if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10))
 535     {
 536       char *cp = lrb.buf + 1;
 537       while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp))
 538         ++cp;
 539
 540       if (cp == &lrb.buf[lrb.act - 1])
 541         {
 542           /* Yes, it is.  */
 543           lr->token.tok = tok_ucs4;
 544           lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16);
 545
 546           return &lr->token;
 547         }
 548     }
 549
 550   /* It is a symbolic name.  Test for reserved words.  */
 551   kw = lr->hash_fct (lrb.buf, lrb.act - 1);
 552
 553   if (kw != NULL && kw->symname_or_ident == 1)
 554     {
 555       lr->token.tok = kw->token;
 556       free (lrb.buf);
 557     }
 558   else
 559     {
 560       lr->token.tok = tok_bsymbol;
 561       lr_buffer_to_token (&lrb, lr);
 562       --lr->token.val.str.lenmb;  /* Hide the training '>'.  */
 563     }
 564
 565   return &lr->token;
 566 }
 567
 568
 569 static struct token *
 570 get_ident (struct linereader *lr)
 571 {
 572   const struct keyword_t *kw;
 573   int ch;
 574   struct lr_buffer lrb;
 575
 576   lr_buffer_init (&lrb);
 577
 578   addc (&lrb, lr->buf[lr->idx - 1]);
 579
 580   while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
 581          && ch != '<' && ch != ',' && ch != EOF)
 582     {
 583       if (ch == lr->escape_char)
 584         {
 585           ch = lr_getc (lr);
 586           if (ch == '\n' || ch == EOF)
 587             {
 588               lr_error (lr, _("invalid escape sequence"));
 589               break;
 590             }
 591         }
 592       addc (&lrb, ch);
 593     }
 594
 595   lr_ungetc (lr, ch);
 596
 597   kw = lr->hash_fct (lrb.buf, lrb.act);
 598
 599   if (kw != NULL && kw->symname_or_ident == 0)
 600     {
 601       lr->token.tok = kw->token;
 602       free (lrb.buf);
 603     }
 604   else
 605     {
 606       lr->token.tok = tok_ident;
 607       lr_buffer_to_token (&lrb, lr);
 608     }
 609
 610   return &lr->token;
 611 }
 612
 613 /* Process a decoded Unicode codepoint WCH in a string, placing the
 614    multibyte sequence into LRB.  Return false if the character is not
 615    found in CHARMAP/REPERTOIRE.  */
 616 static bool
 617 translate_unicode_codepoint (struct localedef_t *locale,
 618                              const struct charmap_t *charmap,
 619                              const struct repertoire_t *repertoire,
 620                              uint32_t wch, struct lr_buffer *lrb)
 621 {
 622   /* See whether the charmap contains the Uxxxxxxxx names.  */
 623   char utmp[10];
 624   snprintf (utmp, sizeof (utmp), "U%08X", wch);
 625   struct charseq *seq = charmap_find_value (charmap, utmp, 9);
 626
 627   if (seq == NULL)
 628     {
 629       /* No, this isn't the case.  Now determine from
 630          the repertoire the name of the character and
 631          find it in the charmap.  */
 632       if (repertoire != NULL)
 633         {
 634           const char *symbol = repertoire_find_symbol (repertoire, wch);
 635           if (symbol != NULL)
 636             seq = charmap_find_value (charmap, symbol, strlen (symbol));
 637         }
 638
 639       if (seq == NULL)
 640         {
 641 #ifndef NO_TRANSLITERATION
 642           /* Transliterate if possible.  */
 643           if (locale != NULL)
 644             {
 645               if ((locale->avail & CTYPE_LOCALE) == 0)
 646                 {
 647                   /* Load the CTYPE data now.  */
 648                   int old_needed = locale->needed;
 649
 650                   locale->needed = 0;
 651                   locale = load_locale (LC_CTYPE, locale->name,
 652                                         locale->repertoire_name,
 653                                         charmap, locale);
 654                   locale->needed = old_needed;
 655                 }
 656
 657               uint32_t *translit;
 658               if ((locale->avail & CTYPE_LOCALE) != 0
 659                   && ((translit = find_translit (locale, charmap, wch))
 660                       != NULL))
 661                 /* The CTYPE data contains a matching
 662                    transliteration.  */
 663                 {
 664                   for (int i = 0; translit[i] != 0; ++i)
 665                     {
 666                       snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
 667                       seq = charmap_find_value (charmap, utmp, 9);
 668                       assert (seq != NULL);
 669                       adds (lrb, seq->bytes, seq->nbytes);
 670                     }
 671                   return true;
 672                 }
 673             }
 674 #endif  /* NO_TRANSLITERATION */
 675
 676           /* Not a known name.  */
 677           return false;
 678         }
 679     }
 680
 681   if (seq != NULL)
 682     {
 683       adds (lrb, seq->bytes, seq->nbytes);
 684       return true;
 685     }
 686   else
 687     return false;
 688 }
 689
 690 /* Returns true if ch is not EOF (that is, non-negative) and a valid
 691    UTF-8 trailing byte.  */
 692 static bool
 693 utf8_valid_trailing (int ch)
 694 {
 695   return ch >= 0 && (ch & 0xc0) == 0x80;
 696 }
 697
 698 /* Reports an error for a broken UTF-8 sequence.  CH2 to CH4 may be
 699    EOF.  Always returns false.  */
 700 static bool
 701 utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
 702                      int ch4)
 703 {
 704   char buf[30];
 705
 706   if (ch2 < 0)
 707     snprintf (buf, sizeof (buf), "0x%02x", ch1);
 708   else if (ch3 < 0)
 709     snprintf (buf, sizeof (buf), "0x%02x 0x%02x", ch1, ch2);
 710   else if (ch4 < 0)
 711     snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
 712   else
 713     snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x",
 714               ch1, ch2, ch3, ch4);
 715
 716   lr_error (lr, _("invalid UTF-8 sequence %s"), buf);
 717   return false;
 718 }
 719
 720 /* Reads a UTF-8 sequence from LR, with the leading byte CH1, and
 721    stores the decoded codepoint in *WCH.  Returns false on failure and
 722    reports an error.  */
 723 static bool
 724 utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
 725 {
 726   /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.  */
 727   if (ch1 < 0xc2)
 728     return utf8_sequence_error (lr, ch1, -1, -1, -1);
 729
 730   int ch2 = lr_getc (lr);
 731   if (!utf8_valid_trailing (ch2))
 732     return utf8_sequence_error (lr, ch1, ch2, -1, -1);
 733
 734   if (ch1 <= 0xdf)
 735     {
 736       uint32_t result = ((ch1 & 0x1f)  << 6) | (ch2 & 0x3f);
 737       if (result < 0x80)
 738         return utf8_sequence_error (lr, ch1, ch2, -1, -1);
 739       *wch = result;
 740       return true;
 741     }
 742
 743   int ch3 = lr_getc (lr);
 744   if (!utf8_valid_trailing (ch3) || ch1 < 0xe0)
 745     return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
 746
 747   if (ch1 <= 0xef)
 748     {
 749       uint32_t result = (((ch1 & 0x0f)  << 12)
 750                          | ((ch2 & 0x3f) << 6)
 751                          | (ch3 & 0x3f));
 752       if (result < 0x800)
 753         return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
 754       *wch = result;
 755       return true;
 756     }
 757
 758   int ch4 = lr_getc (lr);
 759   if (!utf8_valid_trailing (ch4) || ch1 < 0xf0 || ch1 > 0xf4)
 760     return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
 761
 762   uint32_t result = (((ch1 & 0x07)  << 18)
 763                      | ((ch2 & 0x3f) << 12)
 764                      | ((ch3 & 0x3f) << 6)
 765                      | (ch4 & 0x3f));
 766   if (result < 0x10000)
 767     return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
 768   *wch = result;
 769   return true;
 770 }
 771
 772 static struct token *
 773 get_string (struct linereader *lr, const struct charmap_t *charmap,
 774             struct localedef_t *locale, const struct repertoire_t *repertoire,
 775             int verbose)
 776 {
 777   int return_widestr = lr->return_widestr;
 778   struct lr_buffer lrb;
 779   wchar_t *buf2 = NULL;
 780
 781   lr_buffer_init (&lrb);
 782
 783   /* We know it'll be a string.  */
 784   lr->token.tok = tok_string;
 785
 786   /* If we need not translate the strings (i.e., expand <...> parts)
 787      we can run a simple loop.  */
 788   if (!lr->translate_strings)
 789     {
 790       int ch;
 791
 792       buf2 = NULL;
 793       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
 794         {
 795           if (ch >= 0x80)
 796             lr_error (lr, _("illegal 8-bit character in untranslated string"));
 797           addc (&lrb, ch);
 798         }
 799
 800       /* Catch errors with trailing escape character.  */
 801       if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
 802           && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char))
 803         {
 804           lr_error (lr, _("illegal escape sequence at end of string"));
 805           --lrb.act;
 806         }
 807       else if (ch == '\n' || ch == EOF)
 808         lr_error (lr, _("unterminated string"));
 809
 810       addc (&lrb, '\0');
 811     }
 812   else
 813     {
 814       bool illegal_string = false;
 815       size_t buf2act = 0;
 816       size_t buf2max = 56 * sizeof (uint32_t);
 817       int ch;
 818
 819       /* We have to provide the wide character result as well.  */
 820       if (return_widestr)
 821         buf2 = xmalloc (buf2max);
 822
 823       /* Read until the end of the string (or end of the line or file).  */
 824       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
 825         {
 826           size_t startidx;
 827           uint32_t wch;
 828           struct charseq *seq;
 829
 830           if (ch != '<')
 831             {
 832               /* The standards leave it up to the implementation to
 833                  decide what to do with characters which stand for
 834                  themselves.  This implementation treats the input
 835                  file as encoded in UTF-8.  */
 836               if (ch == lr->escape_char)
 837                 {
 838                   ch = lr_getc (lr);
 839                   if (ch >= 0x80)
 840                     {
 841                       lr_error (lr, _("illegal 8-bit escape sequence"));
 842                       illegal_string = true;
 843                       break;
 844                     }
 845                   if (ch == '\n' || ch == EOF)
 846                     break;
 847                   addc (&lrb, ch);
 848                   wch = ch;
 849                 }
 850               else if (ch < 0x80)
 851                 {
 852                   wch = ch;
 853                   addc (&lrb, ch);
 854                 }
 855               else              /* UTF-8 sequence.  */
 856                 {
 857                   if (!utf8_decode (lr, ch, &wch))
 858                     {
 859                       illegal_string = true;
 860                       break;
 861                     }
 862                   if (!translate_unicode_codepoint (locale, charmap,
 863                                                     repertoire, wch, &lrb))
 864                     {
 865                       /* Ignore the rest of the string.  Callers may
 866                          skip this string because it cannot be encoded
 867                          in the output character set.  */
 868                       illegal_string = true;
 869                       continue;
 870                     }
 871                 }
 872
 873               if (return_widestr)
 874                 ADDWC (wch);
 875
 876               continue;
 877             }
 878
 879           /* Now we have to search for the end of the symbolic name, i.e.,
 880              the closing '>'.  */
 881           startidx = lrb.act;
 882           while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
 883             {
 884               if (ch == lr->escape_char)
 885                 {
 886                   ch = lr_getc (lr);
 887                   if (ch == '\n' || ch == EOF)
 888                     break;
 889                 }
 890               addc (&lrb, ch);
 891             }
 892           if (ch == '\n' || ch == EOF)
 893             /* Not a correct string.  */
 894             break;
 895           if (lrb.act == startidx)
 896             {
 897               /* <> is no correct name.  Ignore it and also signal an
 898                  error.  */
 899               illegal_string = true;
 900               continue;
 901             }
 902
 903           /* It might be a Uxxxx symbol.  */
 904           if (lrb.buf[startidx] == 'U'
 905               && (lrb.act - startidx == 5 || lrb.act - startidx == 9))
 906             {
 907               char *cp = lrb.buf + startidx + 1;
 908               while (cp < &lrb.buf[lrb.act] && isxdigit (*cp))
 909                 ++cp;
 910
 911               if (cp == &lrb.buf[lrb.act])
 912                 {
 913                   /* Yes, it is.  */
 914                   addc (&lrb, '\0');
 915                   wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
 916
 917                   /* Now forget about the name we just added.  */
 918                   lrb.act = startidx;
 919
 920                   if (return_widestr)
 921                     ADDWC (wch);
 922
 923                   if (!translate_unicode_codepoint (locale, charmap,
 924                                                     repertoire, wch, &lrb))
 925                     illegal_string = true;
 926                   continue;
 927                 }
 928             }
 929
 930           /* We now have the symbolic name in lrb.buf[startidx] to
 931              lrb.buf[lrb.act-1].  Now find out the value for this character
 932              in the charmap as well as in the repertoire map (in this
 933              order).  */
 934           seq = charmap_find_value (charmap, &lrb.buf[startidx],
 935                                     lrb.act - startidx);
 936
 937           if (seq == NULL)
 938             {
 939               /* This name is not in the charmap.  */
 940               lr_error (lr, _("symbol `%.*s' not in charmap"),
 941                         (int) (lrb.act - startidx), &lrb.buf[startidx]);
 942               illegal_string = true;
 943             }
 944
 945           if (return_widestr)
 946             {
 947               /* Now the same for the multibyte representation.  */
 948               if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
 949                 wch = seq->ucs4;
 950               else
 951                 {
 952                   wch = repertoire_find_value (repertoire, &lrb.buf[startidx],
 953                                                lrb.act - startidx);
 954                   if (seq != NULL)
 955                     seq->ucs4 = wch;
 956                 }
 957
 958               if (wch == ILLEGAL_CHAR_VALUE)
 959                 {
 960                   /* This name is not in the repertoire map.  */
 961                   lr_error (lr, _("symbol `%.*s' not in repertoire map"),
 962                             (int) (lrb.act - startidx), &lrb.buf[startidx]);
 963                   illegal_string = true;
 964                 }
 965               else
 966                 ADDWC (wch);
 967             }
 968
 969           /* Now forget about the name we just added.  */
 970           lrb.act = startidx;
 971
 972           /* And copy the bytes.  */
 973           if (seq != NULL)
 974             adds (&lrb, seq->bytes, seq->nbytes);
 975         }
 976
 977       if (ch == '\n' || ch == EOF)
 978         {
 979           lr_error (lr, _("unterminated string"));
 980           illegal_string = true;
 981         }
 982
 983       if (illegal_string)
 984         {
 985           free (lrb.buf);
 986           free (buf2);
 987           lr->token.val.str.startmb = NULL;
 988           lr->token.val.str.lenmb = 0;
 989           lr->token.val.str.startwc = NULL;
 990           lr->token.val.str.lenwc = 0;
 991
 992           return &lr->token;
 993         }
 994
 995       addc (&lrb, '\0');
 996
 997       if (return_widestr)
 998         {
 999           ADDWC (0);
1000           lr->token.val.str.startwc = xrealloc (buf2,
1001                                                 buf2act * sizeof (uint32_t));
1002           lr->token.val.str.lenwc = buf2act;
1003         }
1004     }
1005
1006   lr_buffer_to_token (&lrb, lr);
1007
1008   return &lr->token;
1009 }