]> git.ipfire.org Git - thirdparty/glibc.git/blame - locale/programs/linereader.c
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / locale / programs / linereader.c
CommitLineData
dff8da6b 1/* Copyright (C) 1996-2024 Free Software Foundation, Inc.
5290baf0 2 This file is part of the GNU C Library.
19bc17a9 3
43bc8ac6 4 This program is free software; you can redistribute it and/or modify
2e2efe65
RM
5 it under the terms of the GNU General Public License as published
6 by the Free Software Foundation; version 2 of the License, or
7 (at your option) any later version.
19bc17a9 8
43bc8ac6 9 This program is distributed in the hope that it will be useful,
5290baf0 10 but WITHOUT ANY WARRANTY; without even the implied warranty of
43bc8ac6
UD
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
19bc17a9 13
43bc8ac6 14 You should have received a copy of the GNU General Public License
5a82c748 15 along with this program; if not, see <https://www.gnu.org/licenses/>. */
19bc17a9
RM
16
17#ifdef HAVE_CONFIG_H
18# include <config.h>
19#endif
20
47e8b443 21#include <assert.h>
19bc17a9
RM
22#include <ctype.h>
23#include <errno.h>
24#include <libintl.h>
25#include <stdarg.h>
26#include <stdlib.h>
27#include <string.h>
e054f494 28#include <stdint.h>
19bc17a9 29
f2b98f97 30#include "localedef.h"
4b10dd6c 31#include "charmap.h"
19bc17a9
RM
32#include "error.h"
33#include "linereader.h"
47e8b443 34#include "locfile.h"
93693c4d 35
4b10dd6c 36/* Prototypes for local functions. */
19bc17a9
RM
37static struct token *get_toplvl_escape (struct linereader *lr);
38static struct token *get_symname (struct linereader *lr);
39static struct token *get_ident (struct linereader *lr);
40static struct token *get_string (struct linereader *lr,
4b10dd6c 41 const struct charmap_t *charmap,
47e8b443 42 struct localedef_t *locale,
93693c4d
UD
43 const struct repertoire_t *repertoire,
44 int verbose);
b15538d7 45static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);
19bc17a9
RM
46
47
48struct linereader *
49lr_open (const char *fname, kw_hash_fct_t hf)
50{
51 FILE *fp;
19bc17a9
RM
52
53 if (fname == NULL || strcmp (fname, "-") == 0
54 || strcmp (fname, "/dev/stdin") == 0)
3e076219 55 return lr_create (stdin, "<stdin>", hf);
19bc17a9
RM
56 else
57 {
2e2dc1a5 58 fp = fopen (fname, "rm");
19bc17a9
RM
59 if (fp == NULL)
60 return NULL;
3e076219 61 return lr_create (fp, fname, hf);
19bc17a9 62 }
3e076219
UD
63}
64
65struct linereader *
66lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf)
67{
68 struct linereader *result;
69 int n;
19bc17a9
RM
70
71 result = (struct linereader *) xmalloc (sizeof (*result));
72
73 result->fp = fp;
3e076219 74 result->fname = xstrdup (fname);
19bc17a9
RM
75 result->buf = NULL;
76 result->bufsize = 0;
77 result->lineno = 1;
78 result->idx = 0;
79 result->comment_char = '#';
80 result->escape_char = '\\';
81 result->translate_strings = 1;
7c11c4a1 82 result->return_widestr = 0;
19bc17a9
RM
83
84 n = getdelim (&result->buf, &result->bufsize, '\n', result->fp);
85 if (n < 0)
86 {
87 int save = errno;
88 fclose (result->fp);
46ec036d 89 free ((char *) result->fname);
19bc17a9
RM
90 free (result);
91 errno = save;
92 return NULL;
93 }
94
95 if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n')
96 n -= 2;
97
98 result->buf[n] = '\0';
99 result->bufact = n;
100 result->hash_fct = hf;
101
102 return result;
103}
104
105
106int
107lr_eof (struct linereader *lr)
108{
109 return lr->bufact = 0;
110}
111
112
dd9423a6
UD
113void
114lr_ignore_rest (struct linereader *lr, int verbose)
115{
116 if (verbose)
117 {
118 while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n'
119 && lr->buf[lr->idx] != lr->comment_char)
120 if (lr->buf[lr->idx] == '\0')
121 {
122 if (lr_next (lr) < 0)
123 return;
124 }
125 else
126 ++lr->idx;
127
128 if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp)
129 && lr->buf[lr->idx] != lr->comment_char)
130 lr_error (lr, _("trailing garbage at end of line"));
131 }
132
133 /* Ignore continued line. */
134 while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n')
135 if (lr_next (lr) < 0)
136 break;
137
138 lr->idx = lr->bufact;
139}
140
141
19bc17a9
RM
142void
143lr_close (struct linereader *lr)
144{
145 fclose (lr->fp);
146 free (lr->buf);
147 free (lr);
148}
149
150
151int
152lr_next (struct linereader *lr)
153{
154 int n;
155
156 n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp);
157 if (n < 0)
158 return -1;
159
160 ++lr->lineno;
161
162 if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n')
163 {
4b10dd6c
UD
164#if 0
165 /* XXX Is this correct? */
19bc17a9
RM
166 /* An escaped newline character is substituted with a single <SP>. */
167 --n;
168 lr->buf[n - 1] = ' ';
4b10dd6c
UD
169#else
170 n -= 2;
171#endif
19bc17a9
RM
172 }
173
174 lr->buf[n] = '\0';
175 lr->bufact = n;
176 lr->idx = 0;
177
178 return 0;
179}
180
181
182/* Defined in error.c. */
183/* This variable is incremented each time `error' is called. */
184extern unsigned int error_message_count;
185
186/* The calling program should define program_name and set it to the
187 name of the executing program. */
188extern char *program_name;
189
190
191struct token *
4b10dd6c 192lr_token (struct linereader *lr, const struct charmap_t *charmap,
47e8b443
UD
193 struct localedef_t *locale, const struct repertoire_t *repertoire,
194 int verbose)
19bc17a9
RM
195{
196 int ch;
197
198 while (1)
199 {
200 do
201 {
202 ch = lr_getc (lr);
203
76fbcfdd
UD
204 if (ch == EOF)
205 {
206 lr->token.tok = tok_eof;
207 return &lr->token;
208 };
209
19bc17a9
RM
210 if (ch == '\n')
211 {
212 lr->token.tok = tok_eol;
213 return &lr->token;
214 }
215 }
216 while (isspace (ch));
217
19bc17a9
RM
218 if (ch != lr->comment_char)
219 break;
220
a0dc5206
UD
221 /* Is there an newline at the end of the buffer? */
222 if (lr->buf[lr->bufact - 1] != '\n')
223 {
224 /* No. Some people want this to mean that only the line in
225 the file not the logical, concatenated line is ignored.
226 Let's try this. */
227 lr->idx = lr->bufact;
228 continue;
229 }
230
19bc17a9
RM
231 /* Ignore rest of line. */
232 lr_ignore_rest (lr, 0);
233 lr->token.tok = tok_eol;
234 return &lr->token;
235 }
236
237 /* Match escape sequences. */
238 if (ch == lr->escape_char)
239 return get_toplvl_escape (lr);
240
241 /* Match ellipsis. */
4b10dd6c 242 if (ch == '.')
19bc17a9 243 {
a0dc5206
UD
244 if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0)
245 {
246 int cnt;
247 for (cnt = 0; cnt < 10; ++cnt)
248 lr_getc (lr);
249 lr->token.tok = tok_ellipsis4_2;
250 return &lr->token;
251 }
4b10dd6c
UD
252 if (strncmp (&lr->buf[lr->idx], "...", 3) == 0)
253 {
254 lr_getc (lr);
255 lr_getc (lr);
256 lr_getc (lr);
257 lr->token.tok = tok_ellipsis4;
258 return &lr->token;
259 }
260 if (strncmp (&lr->buf[lr->idx], "..", 2) == 0)
261 {
262 lr_getc (lr);
263 lr_getc (lr);
264 lr->token.tok = tok_ellipsis3;
265 return &lr->token;
266 }
a0dc5206
UD
267 if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0)
268 {
269 int cnt;
270 for (cnt = 0; cnt < 6; ++cnt)
271 lr_getc (lr);
272 lr->token.tok = tok_ellipsis2_2;
273 return &lr->token;
274 }
4b10dd6c
UD
275 if (lr->buf[lr->idx] == '.')
276 {
277 lr_getc (lr);
278 lr->token.tok = tok_ellipsis2;
279 return &lr->token;
280 }
19bc17a9
RM
281 }
282
283 switch (ch)
284 {
285 case '<':
286 return get_symname (lr);
287
288 case '0' ... '9':
289 lr->token.tok = tok_number;
290 lr->token.val.num = ch - '0';
291
292 while (isdigit (ch = lr_getc (lr)))
293 {
294 lr->token.val.num *= 10;
295 lr->token.val.num += ch - '0';
296 }
297 if (isalpha (ch))
5290baf0 298 lr_error (lr, _("garbage at end of number"));
19bc17a9
RM
299 lr_ungetn (lr, 1);
300
301 return &lr->token;
302
303 case ';':
304 lr->token.tok = tok_semicolon;
305 return &lr->token;
306
307 case ',':
308 lr->token.tok = tok_comma;
309 return &lr->token;
310
311 case '(':
312 lr->token.tok = tok_open_brace;
313 return &lr->token;
314
315 case ')':
316 lr->token.tok = tok_close_brace;
317 return &lr->token;
318
319 case '"':
47e8b443 320 return get_string (lr, charmap, locale, repertoire, verbose);
19bc17a9
RM
321
322 case '-':
323 ch = lr_getc (lr);
324 if (ch == '1')
325 {
326 lr->token.tok = tok_minus1;
327 return &lr->token;
328 }
329 lr_ungetn (lr, 2);
330 break;
b15538d7
FW
331
332 case 0x80 ... 0xff: /* UTF-8 sequence. */
9d77023b
FW
333 {
334 uint32_t wch;
335 if (!utf8_decode (lr, ch, &wch))
336 {
337 lr->token.tok = tok_error;
338 return &lr->token;
339 }
340 lr->token.tok = tok_ucs4;
341 lr->token.val.ucs4 = wch;
342 return &lr->token;
343 }
19bc17a9
RM
344 }
345
346 return get_ident (lr);
347}
348
349
350static struct token *
351get_toplvl_escape (struct linereader *lr)
352{
353 /* This is supposed to be a numeric value. We return the
354 numerical value and the number of bytes. */
355 size_t start_idx = lr->idx - 1;
9cfe5381
RM
356 unsigned char *bytes = lr->token.val.charcode.bytes;
357 size_t nbytes = 0;
19bc17a9
RM
358 int ch;
359
360 do
361 {
362 unsigned int byte = 0;
363 unsigned int base = 8;
364
365 ch = lr_getc (lr);
366
367 if (ch == 'd')
368 {
369 base = 10;
370 ch = lr_getc (lr);
371 }
372 else if (ch == 'x')
373 {
374 base = 16;
375 ch = lr_getc (lr);
376 }
377
378 if ((base == 16 && !isxdigit (ch))
ba1ffaa1 379 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
19bc17a9
RM
380 {
381 esc_error:
4b10dd6c 382 lr->token.val.str.startmb = &lr->buf[start_idx];
19bc17a9 383
76fbcfdd 384 while (ch != EOF && !isspace (ch))
19bc17a9 385 ch = lr_getc (lr);
4b10dd6c 386 lr->token.val.str.lenmb = lr->idx - start_idx;
19bc17a9
RM
387
388 lr->token.tok = tok_error;
389 return &lr->token;
390 }
391
392 if (isdigit (ch))
393 byte = ch - '0';
394 else
4b10dd6c 395 byte = tolower (ch) - 'a' + 10;
19bc17a9
RM
396
397 ch = lr_getc (lr);
398 if ((base == 16 && !isxdigit (ch))
ba1ffaa1 399 || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
19bc17a9
RM
400 goto esc_error;
401
402 byte *= base;
403 if (isdigit (ch))
404 byte += ch - '0';
405 else
4b10dd6c 406 byte += tolower (ch) - 'a' + 10;
19bc17a9
RM
407
408 ch = lr_getc (lr);
409 if (base != 16 && isdigit (ch))
410 {
411 byte *= base;
679f5a56 412 byte += ch - '0';
19bc17a9
RM
413
414 ch = lr_getc (lr);
415 }
416
4b10dd6c 417 bytes[nbytes++] = byte;
19bc17a9 418 }
c50ec4e0 419 while (ch == lr->escape_char
6dd67bd5 420 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));
19bc17a9
RM
421
422 if (!isspace (ch))
423 lr_error (lr, _("garbage at end of character code specification"));
424
425 lr_ungetn (lr, 1);
426
427 lr->token.tok = tok_charcode;
19bc17a9
RM
428 lr->token.val.charcode.nbytes = nbytes;
429
430 return &lr->token;
431}
432
5dcbff58
FW
/* Growable multibyte string buffer.  */
struct lr_buffer
{
  size_t act;	/* Number of bytes currently stored in BUF.  */
  size_t max;	/* Number of bytes allocated for BUF.  */
  char *buf;	/* Heap-allocated storage.  */
};
19bc17a9 440
5dcbff58
FW
441/* Initialize *LRB with a default-sized buffer. */
442static void
443lr_buffer_init (struct lr_buffer *lrb)
444{
445 lrb->act = 0;
446 lrb->max = 56;
447 lrb->buf = xmalloc (lrb->max);
448}
4b10dd6c 449
5dcbff58
FW
450/* Transfers the buffer string from *LRB to LR->token.mbstr. */
451static void
452lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr)
453{
454 lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1);
455 lr->token.val.str.startmb[lrb->act] = '\0';
456 lr->token.val.str.lenmb = lrb->act;
457}
4b10dd6c 458
5dcbff58
FW
459/* Adds CH to *LRB. */
460static void
461addc (struct lr_buffer *lrb, char ch)
462{
463 if (lrb->act == lrb->max)
464 {
465 lrb->max *= 2;
466 lrb->buf = xrealloc (lrb->buf, lrb->max);
467 }
468 lrb->buf[lrb->act++] = ch;
469}
4b10dd6c 470
5dcbff58
FW
471/* Adds L bytes at S to *LRB. */
472static void
473adds (struct lr_buffer *lrb, const unsigned char *s, size_t l)
474{
475 if (lrb->max - lrb->act < l)
476 {
477 size_t required_size = lrb->act + l;
478 size_t new_max = 2 * lrb->max;
479 if (new_max < required_size)
480 new_max = required_size;
481 lrb->buf = xrealloc (lrb->buf, new_max);
482 lrb->max = new_max;
483 }
484 memcpy (lrb->buf + lrb->act, s, l);
485 lrb->act += l;
486}
4b10dd6c
UD
487
/* Append the wide character CH to the array BUF2.  The macro uses the
   caller's local variables: BUF2ACT is the number of elements stored and
   BUF2MAX the number of elements allocated (each element takes four
   bytes).  The capacity is doubled when the array is full.  */
#define ADDWC(ch) \
  do									      \
    {									      \
      if (buf2act == buf2max)						      \
	{								      \
	  buf2max *= 2;							      \
	  buf2 = xrealloc (buf2, buf2max * 4);				      \
	}								      \
      buf2[buf2act] = (ch);						      \
      ++buf2act;							      \
    }									      \
  while (0)
499
500
501static struct token *
502get_symname (struct linereader *lr)
503{
504 /* Symbol in brackets. We must distinguish three kinds:
505 1. reserved words
506 2. ISO 10646 position values
507 3. all other. */
19bc17a9
RM
508 const struct keyword_t *kw;
509 int ch;
5dcbff58 510 struct lr_buffer lrb;
19bc17a9 511
5dcbff58 512 lr_buffer_init (&lrb);
19bc17a9
RM
513
514 do
515 {
516 ch = lr_getc (lr);
517 if (ch == lr->escape_char)
518 {
519 int c2 = lr_getc (lr);
5dcbff58 520 addc (&lrb, c2);
19bc17a9
RM
521
522 if (c2 == '\n')
523 ch = '\n';
524 }
525 else
5dcbff58 526 addc (&lrb, ch);
19bc17a9
RM
527 }
528 while (ch != '>' && ch != '\n');
529
530 if (ch == '\n')
531 lr_error (lr, _("unterminated symbolic name"));
532
533 /* Test for ISO 10646 position value. */
5dcbff58 534 if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10))
19bc17a9 535 {
5dcbff58
FW
536 char *cp = lrb.buf + 1;
537 while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp))
19bc17a9
RM
538 ++cp;
539
5dcbff58 540 if (cp == &lrb.buf[lrb.act - 1])
19bc17a9
RM
541 {
542 /* Yes, it is. */
4b10dd6c 543 lr->token.tok = tok_ucs4;
5dcbff58 544 lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16);
19bc17a9
RM
545
546 return &lr->token;
547 }
548 }
549
550 /* It is a symbolic name. Test for reserved words. */
5dcbff58 551 kw = lr->hash_fct (lrb.buf, lrb.act - 1);
19bc17a9
RM
552
553 if (kw != NULL && kw->symname_or_ident == 1)
554 {
555 lr->token.tok = kw->token;
5dcbff58 556 free (lrb.buf);
19bc17a9
RM
557 }
558 else
559 {
560 lr->token.tok = tok_bsymbol;
5dcbff58
FW
561 lr_buffer_to_token (&lrb, lr);
562 --lr->token.val.str.lenmb; /* Hide the training '>'. */
19bc17a9
RM
563 }
564
565 return &lr->token;
566}
567
568
569static struct token *
570get_ident (struct linereader *lr)
571{
19bc17a9
RM
572 const struct keyword_t *kw;
573 int ch;
5dcbff58 574 struct lr_buffer lrb;
19bc17a9 575
5dcbff58 576 lr_buffer_init (&lrb);
19bc17a9 577
5dcbff58 578 addc (&lrb, lr->buf[lr->idx - 1]);
19bc17a9
RM
579
580 while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
f126ef67 581 && ch != '<' && ch != ',' && ch != EOF)
4b10dd6c
UD
582 {
583 if (ch == lr->escape_char)
584 {
585 ch = lr_getc (lr);
586 if (ch == '\n' || ch == EOF)
587 {
588 lr_error (lr, _("invalid escape sequence"));
589 break;
590 }
591 }
5dcbff58 592 addc (&lrb, ch);
4b10dd6c 593 }
19bc17a9 594
f126ef67 595 lr_ungetc (lr, ch);
19bc17a9 596
5dcbff58 597 kw = lr->hash_fct (lrb.buf, lrb.act);
19bc17a9
RM
598
599 if (kw != NULL && kw->symname_or_ident == 0)
600 {
601 lr->token.tok = kw->token;
5dcbff58 602 free (lrb.buf);
19bc17a9
RM
603 }
604 else
605 {
606 lr->token.tok = tok_ident;
5dcbff58 607 lr_buffer_to_token (&lrb, lr);
19bc17a9
RM
608 }
609
610 return &lr->token;
611}
612
7dcaabb9
FW
613/* Process a decoded Unicode codepoint WCH in a string, placing the
614 multibyte sequence into LRB. Return false if the character is not
615 found in CHARMAP/REPERTOIRE. */
616static bool
617translate_unicode_codepoint (struct localedef_t *locale,
618 const struct charmap_t *charmap,
619 const struct repertoire_t *repertoire,
620 uint32_t wch, struct lr_buffer *lrb)
621{
622 /* See whether the charmap contains the Uxxxxxxxx names. */
623 char utmp[10];
624 snprintf (utmp, sizeof (utmp), "U%08X", wch);
625 struct charseq *seq = charmap_find_value (charmap, utmp, 9);
626
627 if (seq == NULL)
628 {
629 /* No, this isn't the case. Now determine from
630 the repertoire the name of the character and
631 find it in the charmap. */
632 if (repertoire != NULL)
633 {
634 const char *symbol = repertoire_find_symbol (repertoire, wch);
635 if (symbol != NULL)
636 seq = charmap_find_value (charmap, symbol, strlen (symbol));
637 }
638
639 if (seq == NULL)
640 {
641#ifndef NO_TRANSLITERATION
642 /* Transliterate if possible. */
643 if (locale != NULL)
644 {
645 if ((locale->avail & CTYPE_LOCALE) == 0)
646 {
647 /* Load the CTYPE data now. */
648 int old_needed = locale->needed;
649
650 locale->needed = 0;
651 locale = load_locale (LC_CTYPE, locale->name,
652 locale->repertoire_name,
653 charmap, locale);
654 locale->needed = old_needed;
655 }
656
657 uint32_t *translit;
658 if ((locale->avail & CTYPE_LOCALE) != 0
659 && ((translit = find_translit (locale, charmap, wch))
660 != NULL))
661 /* The CTYPE data contains a matching
662 transliteration. */
663 {
664 for (int i = 0; translit[i] != 0; ++i)
665 {
666 snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
667 seq = charmap_find_value (charmap, utmp, 9);
668 assert (seq != NULL);
669 adds (lrb, seq->bytes, seq->nbytes);
670 }
671 return true;
672 }
673 }
674#endif /* NO_TRANSLITERATION */
675
676 /* Not a known name. */
677 return false;
678 }
679 }
680
681 if (seq != NULL)
682 {
683 adds (lrb, seq->bytes, seq->nbytes);
684 return true;
685 }
686 else
687 return false;
688}
689
b15538d7
FW
/* Returns true if CH is not EOF (that is, non-negative) and has the bit
   pattern 10xxxxxx of a UTF-8 trailing byte.  */
static bool
utf8_valid_trailing (int ch)
{
  if (ch < 0)
    return false;

  return (ch & 0xc0) == 0x80;
}
697
/* Reports an error for a broken UTF-8 sequence.  CH1 is the leading
   byte; CH2 to CH4 may be EOF (negative), and only the bytes before the
   first negative value are shown in the message.  Always returns
   false.  */
static bool
utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
		     int ch4)
{
  char buf[38];
  int shown;

  /* Count the bytes that were actually read.  */
  if (ch2 < 0)
    shown = 1;
  else if (ch3 < 0)
    shown = 2;
  else if (ch4 < 0)
    shown = 3;
  else
    shown = 4;

  switch (shown)
    {
    case 1:
      snprintf (buf, sizeof (buf), "0x%02x", ch1);
      break;
    case 2:
      snprintf (buf, sizeof (buf), "0x%02x 0x%02x", ch1, ch2);
      break;
    case 3:
      snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
      break;
    default:
      snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x",
		ch1, ch2, ch3, ch4);
      break;
    }

  lr_error (lr, _("invalid UTF-8 sequence %s"), buf);
  return false;
}
719
/* Reads a UTF-8 sequence from LR, with the leading byte CH1, and
   stores the decoded codepoint in *WCH.  Returns false on failure and
   reports an error; *WCH is not written in that case.
   NOTE(review): surrogate code points and values above U+10FFFF that
   start with 0xf4 appear to pass these checks -- confirm whether that
   is intended.  */
static bool
utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
{
  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.  */

  /* Bytes below 0xc2 are trailing bytes or would start an overlong
     two-byte form.  */
  if (ch1 < 0xc2)
    return utf8_sequence_error (lr, ch1, -1, -1, -1);

  int ch2 = lr_getc (lr);
  if (!utf8_valid_trailing (ch2))
    return utf8_sequence_error (lr, ch1, ch2, -1, -1);

  /* Two-byte sequence.  */
  if (ch1 <= 0xdf)
    {
      uint32_t cp = ((ch1 & 0x1f) << 6) | (ch2 & 0x3f);
      if (cp < 0x80)
	return utf8_sequence_error (lr, ch1, ch2, -1, -1);
      *wch = cp;
      return true;
    }

  int ch3 = lr_getc (lr);
  if (!utf8_valid_trailing (ch3) || ch1 < 0xe0)
    return utf8_sequence_error (lr, ch1, ch2, ch3, -1);

  /* Three-byte sequence; values below 0x800 are overlong.  */
  if (ch1 <= 0xef)
    {
      uint32_t cp = (((ch1 & 0x0f) << 12)
		     | ((ch2 & 0x3f) << 6)
		     | (ch3 & 0x3f));
      if (cp < 0x800)
	return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
      *wch = cp;
      return true;
    }

  int ch4 = lr_getc (lr);
  if (!utf8_valid_trailing (ch4) || ch1 < 0xf0 || ch1 > 0xf4)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);

  /* Four-byte sequence; values below 0x10000 are overlong.  */
  uint32_t cp = (((ch1 & 0x07) << 18)
		 | ((ch2 & 0x3f) << 12)
		 | ((ch3 & 0x3f) << 6)
		 | (ch4 & 0x3f));
  if (cp < 0x10000)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
  *wch = cp;
  return true;
}
19bc17a9
RM
771
772static struct token *
4b10dd6c 773get_string (struct linereader *lr, const struct charmap_t *charmap,
47e8b443
UD
774 struct localedef_t *locale, const struct repertoire_t *repertoire,
775 int verbose)
19bc17a9 776{
4b10dd6c 777 int return_widestr = lr->return_widestr;
5dcbff58 778 struct lr_buffer lrb;
a9c27b3e 779 wchar_t *buf2 = NULL;
19bc17a9 780
5dcbff58 781 lr_buffer_init (&lrb);
19bc17a9 782
4b10dd6c
UD
783 /* We know it'll be a string. */
784 lr->token.tok = tok_string;
785
786 /* If we need not translate the strings (i.e., expand <...> parts)
787 we can run a simple loop. */
788 if (!lr->translate_strings)
789 {
790 int ch;
791
792 buf2 = NULL;
793 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
b15538d7
FW
794 {
795 if (ch >= 0x80)
796 lr_error (lr, _("illegal 8-bit character in untranslated string"));
797 addc (&lrb, ch);
798 }
4b10dd6c
UD
799
800 /* Catch errors with trailing escape character. */
5dcbff58
FW
801 if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
802 && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char))
4b10dd6c
UD
803 {
804 lr_error (lr, _("illegal escape sequence at end of string"));
5dcbff58 805 --lrb.act;
4b10dd6c
UD
806 }
807 else if (ch == '\n' || ch == EOF)
808 lr_error (lr, _("unterminated string"));
809
5dcbff58 810 addc (&lrb, '\0');
4b10dd6c
UD
811 }
812 else
813 {
7dcaabb9 814 bool illegal_string = false;
4b10dd6c
UD
815 size_t buf2act = 0;
816 size_t buf2max = 56 * sizeof (uint32_t);
817 int ch;
4b10dd6c
UD
818
819 /* We have to provide the wide character result as well. */
820 if (return_widestr)
821 buf2 = xmalloc (buf2max);
822
823 /* Read until the end of the string (or end of the line or file). */
824 while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
825 {
826 size_t startidx;
827 uint32_t wch;
828 struct charseq *seq;
829
830 if (ch != '<')
831 {
b15538d7
FW
832 /* The standards leave it up to the implementation to
833 decide what to do with characters which stand for
834 themselves. This implementation treats the input
835 file as encoded in UTF-8. */
4b10dd6c
UD
836 if (ch == lr->escape_char)
837 {
838 ch = lr_getc (lr);
b15538d7
FW
839 if (ch >= 0x80)
840 {
841 lr_error (lr, _("illegal 8-bit escape sequence"));
842 illegal_string = true;
843 break;
844 }
4b10dd6c
UD
845 if (ch == '\n' || ch == EOF)
846 break;
b15538d7
FW
847 addc (&lrb, ch);
848 wch = ch;
849 }
850 else if (ch < 0x80)
851 {
852 wch = ch;
853 addc (&lrb, ch);
854 }
855 else /* UTF-8 sequence. */
856 {
857 if (!utf8_decode (lr, ch, &wch))
858 {
859 illegal_string = true;
860 break;
861 }
862 if (!translate_unicode_codepoint (locale, charmap,
863 repertoire, wch, &lrb))
864 {
865 /* Ignore the rest of the string. Callers may
866 skip this string because it cannot be encoded
867 in the output character set. */
868 illegal_string = true;
869 continue;
870 }
4b10dd6c
UD
871 }
872
4b10dd6c 873 if (return_widestr)
b15538d7 874 ADDWC (wch);
4b10dd6c
UD
875
876 continue;
877 }
878
879 /* Now we have to search for the end of the symbolic name, i.e.,
880 the closing '>'. */
5dcbff58 881 startidx = lrb.act;
4b10dd6c
UD
882 while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
883 {
884 if (ch == lr->escape_char)
885 {
886 ch = lr_getc (lr);
887 if (ch == '\n' || ch == EOF)
888 break;
889 }
5dcbff58 890 addc (&lrb, ch);
4b10dd6c
UD
891 }
892 if (ch == '\n' || ch == EOF)
893 /* Not a correct string. */
894 break;
5dcbff58 895 if (lrb.act == startidx)
4b10dd6c
UD
896 {
897 /* <> is no correct name. Ignore it and also signal an
898 error. */
7dcaabb9 899 illegal_string = true;
4b10dd6c
UD
900 continue;
901 }
19bc17a9 902
4b10dd6c 903 /* It might be a Uxxxx symbol. */
5dcbff58
FW
904 if (lrb.buf[startidx] == 'U'
905 && (lrb.act - startidx == 5 || lrb.act - startidx == 9))
4b10dd6c 906 {
5dcbff58
FW
907 char *cp = lrb.buf + startidx + 1;
908 while (cp < &lrb.buf[lrb.act] && isxdigit (*cp))
4b10dd6c
UD
909 ++cp;
910
5dcbff58 911 if (cp == &lrb.buf[lrb.act])
4b10dd6c 912 {
4b10dd6c 913 /* Yes, it is. */
5dcbff58
FW
914 addc (&lrb, '\0');
915 wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
4b10dd6c
UD
916
917 /* Now forget about the name we just added. */
5dcbff58 918 lrb.act = startidx;
4b10dd6c
UD
919
920 if (return_widestr)
921 ADDWC (wch);
922
7dcaabb9
FW
923 if (!translate_unicode_codepoint (locale, charmap,
924 repertoire, wch, &lrb))
925 illegal_string = true;
4b10dd6c
UD
926 continue;
927 }
928 }
929
5dcbff58
FW
930 /* We now have the symbolic name in lrb.buf[startidx] to
931 lrb.buf[lrb.act-1]. Now find out the value for this character
3c833378
UD
932 in the charmap as well as in the repertoire map (in this
933 order). */
5dcbff58
FW
934 seq = charmap_find_value (charmap, &lrb.buf[startidx],
935 lrb.act - startidx);
3c833378
UD
936
937 if (seq == NULL)
938 {
939 /* This name is not in the charmap. */
940 lr_error (lr, _("symbol `%.*s' not in charmap"),
5dcbff58 941 (int) (lrb.act - startidx), &lrb.buf[startidx]);
7dcaabb9 942 illegal_string = true;
3c833378
UD
943 }
944
4b10dd6c
UD
945 if (return_widestr)
946 {
3c833378
UD
947 /* Now the same for the multibyte representation. */
948 if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
949 wch = seq->ucs4;
950 else
951 {
5dcbff58
FW
952 wch = repertoire_find_value (repertoire, &lrb.buf[startidx],
953 lrb.act - startidx);
3c833378
UD
954 if (seq != NULL)
955 seq->ucs4 = wch;
956 }
957
4b10dd6c
UD
958 if (wch == ILLEGAL_CHAR_VALUE)
959 {
960 /* This name is not in the repertoire map. */
961 lr_error (lr, _("symbol `%.*s' not in repertoire map"),
5dcbff58 962 (int) (lrb.act - startidx), &lrb.buf[startidx]);
7dcaabb9 963 illegal_string = true;
4b10dd6c
UD
964 }
965 else
966 ADDWC (wch);
967 }
968
3c833378 969 /* Now forget about the name we just added. */
5dcbff58 970 lrb.act = startidx;
19bc17a9 971
3c833378
UD
972 /* And copy the bytes. */
973 if (seq != NULL)
5dcbff58 974 adds (&lrb, seq->bytes, seq->nbytes);
4b10dd6c 975 }
19bc17a9 976
4b10dd6c
UD
977 if (ch == '\n' || ch == EOF)
978 {
979 lr_error (lr, _("unterminated string"));
7dcaabb9 980 illegal_string = true;
4b10dd6c 981 }
19bc17a9 982
4b10dd6c
UD
983 if (illegal_string)
984 {
5dcbff58 985 free (lrb.buf);
72e6cdfa 986 free (buf2);
4b10dd6c
UD
987 lr->token.val.str.startmb = NULL;
988 lr->token.val.str.lenmb = 0;
d5fd1f3f
UD
989 lr->token.val.str.startwc = NULL;
990 lr->token.val.str.lenwc = 0;
19bc17a9 991
4b10dd6c
UD
992 return &lr->token;
993 }
19bc17a9 994
5dcbff58 995 addc (&lrb, '\0');
19bc17a9 996
4b10dd6c
UD
997 if (return_widestr)
998 {
999 ADDWC (0);
1000 lr->token.val.str.startwc = xrealloc (buf2,
1001 buf2act * sizeof (uint32_t));
1002 lr->token.val.str.lenwc = buf2act;
1003 }
19bc17a9
RM
1004 }
1005
5dcbff58 1006 lr_buffer_to_token (&lrb, lr);
4b10dd6c 1007
19bc17a9
RM
1008 return &lr->token;
1009}