]>
Commit | Line | Data |
---|---|---|
dff8da6b | 1 | /* Copyright (C) 1996-2024 Free Software Foundation, Inc. |
5290baf0 | 2 | This file is part of the GNU C Library. |
19bc17a9 | 3 | |
43bc8ac6 | 4 | This program is free software; you can redistribute it and/or modify |
2e2efe65 RM |
5 | it under the terms of the GNU General Public License as published |
6 | by the Free Software Foundation; version 2 of the License, or | |
7 | (at your option) any later version. | |
19bc17a9 | 8 | |
43bc8ac6 | 9 | This program is distributed in the hope that it will be useful, |
5290baf0 | 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
43bc8ac6 UD |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
12 | GNU General Public License for more details. | |
19bc17a9 | 13 | |
43bc8ac6 | 14 | You should have received a copy of the GNU General Public License |
5a82c748 | 15 | along with this program; if not, see <https://www.gnu.org/licenses/>. */ |
19bc17a9 RM |
16 | |
17 | #ifdef HAVE_CONFIG_H | |
18 | # include <config.h> | |
19 | #endif | |
20 | ||
47e8b443 | 21 | #include <assert.h> |
19bc17a9 RM |
22 | #include <ctype.h> |
23 | #include <errno.h> | |
24 | #include <libintl.h> | |
25 | #include <stdarg.h> | |
26 | #include <stdlib.h> | |
27 | #include <string.h> | |
e054f494 | 28 | #include <stdint.h> |
19bc17a9 | 29 | |
f2b98f97 | 30 | #include "localedef.h" |
4b10dd6c | 31 | #include "charmap.h" |
19bc17a9 RM |
32 | #include "error.h" |
33 | #include "linereader.h" | |
47e8b443 | 34 | #include "locfile.h" |
93693c4d | 35 | |
4b10dd6c | 36 | /* Prototypes for local functions. */ |
19bc17a9 RM |
37 | static struct token *get_toplvl_escape (struct linereader *lr); |
38 | static struct token *get_symname (struct linereader *lr); | |
39 | static struct token *get_ident (struct linereader *lr); | |
40 | static struct token *get_string (struct linereader *lr, | |
4b10dd6c | 41 | const struct charmap_t *charmap, |
47e8b443 | 42 | struct localedef_t *locale, |
93693c4d UD |
43 | const struct repertoire_t *repertoire, |
44 | int verbose); | |
b15538d7 | 45 | static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch); |
19bc17a9 RM |
46 | |
47 | ||
48 | struct linereader * | |
49 | lr_open (const char *fname, kw_hash_fct_t hf) | |
50 | { | |
51 | FILE *fp; | |
19bc17a9 RM |
52 | |
53 | if (fname == NULL || strcmp (fname, "-") == 0 | |
54 | || strcmp (fname, "/dev/stdin") == 0) | |
3e076219 | 55 | return lr_create (stdin, "<stdin>", hf); |
19bc17a9 RM |
56 | else |
57 | { | |
2e2dc1a5 | 58 | fp = fopen (fname, "rm"); |
19bc17a9 RM |
59 | if (fp == NULL) |
60 | return NULL; | |
3e076219 | 61 | return lr_create (fp, fname, hf); |
19bc17a9 | 62 | } |
3e076219 UD |
63 | } |
64 | ||
65 | struct linereader * | |
66 | lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf) | |
67 | { | |
68 | struct linereader *result; | |
69 | int n; | |
19bc17a9 RM |
70 | |
71 | result = (struct linereader *) xmalloc (sizeof (*result)); | |
72 | ||
73 | result->fp = fp; | |
3e076219 | 74 | result->fname = xstrdup (fname); |
19bc17a9 RM |
75 | result->buf = NULL; |
76 | result->bufsize = 0; | |
77 | result->lineno = 1; | |
78 | result->idx = 0; | |
79 | result->comment_char = '#'; | |
80 | result->escape_char = '\\'; | |
81 | result->translate_strings = 1; | |
7c11c4a1 | 82 | result->return_widestr = 0; |
19bc17a9 RM |
83 | |
84 | n = getdelim (&result->buf, &result->bufsize, '\n', result->fp); | |
85 | if (n < 0) | |
86 | { | |
87 | int save = errno; | |
88 | fclose (result->fp); | |
46ec036d | 89 | free ((char *) result->fname); |
19bc17a9 RM |
90 | free (result); |
91 | errno = save; | |
92 | return NULL; | |
93 | } | |
94 | ||
95 | if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n') | |
96 | n -= 2; | |
97 | ||
98 | result->buf[n] = '\0'; | |
99 | result->bufact = n; | |
100 | result->hash_fct = hf; | |
101 | ||
102 | return result; | |
103 | } | |
104 | ||
105 | ||
106 | int | |
107 | lr_eof (struct linereader *lr) | |
108 | { | |
109 | return lr->bufact = 0; | |
110 | } | |
111 | ||
112 | ||
dd9423a6 UD |
113 | void |
114 | lr_ignore_rest (struct linereader *lr, int verbose) | |
115 | { | |
116 | if (verbose) | |
117 | { | |
118 | while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n' | |
119 | && lr->buf[lr->idx] != lr->comment_char) | |
120 | if (lr->buf[lr->idx] == '\0') | |
121 | { | |
122 | if (lr_next (lr) < 0) | |
123 | return; | |
124 | } | |
125 | else | |
126 | ++lr->idx; | |
127 | ||
128 | if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp) | |
129 | && lr->buf[lr->idx] != lr->comment_char) | |
130 | lr_error (lr, _("trailing garbage at end of line")); | |
131 | } | |
132 | ||
133 | /* Ignore continued line. */ | |
134 | while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n') | |
135 | if (lr_next (lr) < 0) | |
136 | break; | |
137 | ||
138 | lr->idx = lr->bufact; | |
139 | } | |
140 | ||
141 | ||
19bc17a9 RM |
142 | void |
143 | lr_close (struct linereader *lr) | |
144 | { | |
145 | fclose (lr->fp); | |
146 | free (lr->buf); | |
147 | free (lr); | |
148 | } | |
149 | ||
150 | ||
151 | int | |
152 | lr_next (struct linereader *lr) | |
153 | { | |
154 | int n; | |
155 | ||
156 | n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp); | |
157 | if (n < 0) | |
158 | return -1; | |
159 | ||
160 | ++lr->lineno; | |
161 | ||
162 | if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n') | |
163 | { | |
4b10dd6c UD |
164 | #if 0 |
165 | /* XXX Is this correct? */ | |
19bc17a9 RM |
166 | /* An escaped newline character is substituted with a single <SP>. */ |
167 | --n; | |
168 | lr->buf[n - 1] = ' '; | |
4b10dd6c UD |
169 | #else |
170 | n -= 2; | |
171 | #endif | |
19bc17a9 RM |
172 | } |
173 | ||
174 | lr->buf[n] = '\0'; | |
175 | lr->bufact = n; | |
176 | lr->idx = 0; | |
177 | ||
178 | return 0; | |
179 | } | |
180 | ||
181 | ||
182 | /* Defined in error.c. */ | |
183 | /* This variable is incremented each time `error' is called. */ | |
184 | extern unsigned int error_message_count; | |
185 | ||
186 | /* The calling program should define program_name and set it to the | |
187 | name of the executing program. */ | |
188 | extern char *program_name; | |
189 | ||
190 | ||
191 | struct token * | |
4b10dd6c | 192 | lr_token (struct linereader *lr, const struct charmap_t *charmap, |
47e8b443 UD |
193 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
194 | int verbose) | |
19bc17a9 RM |
195 | { |
196 | int ch; | |
197 | ||
198 | while (1) | |
199 | { | |
200 | do | |
201 | { | |
202 | ch = lr_getc (lr); | |
203 | ||
76fbcfdd UD |
204 | if (ch == EOF) |
205 | { | |
206 | lr->token.tok = tok_eof; | |
207 | return &lr->token; | |
208 | }; | |
209 | ||
19bc17a9 RM |
210 | if (ch == '\n') |
211 | { | |
212 | lr->token.tok = tok_eol; | |
213 | return &lr->token; | |
214 | } | |
215 | } | |
216 | while (isspace (ch)); | |
217 | ||
19bc17a9 RM |
218 | if (ch != lr->comment_char) |
219 | break; | |
220 | ||
a0dc5206 UD |
221 | /* Is there an newline at the end of the buffer? */ |
222 | if (lr->buf[lr->bufact - 1] != '\n') | |
223 | { | |
224 | /* No. Some people want this to mean that only the line in | |
225 | the file not the logical, concatenated line is ignored. | |
226 | Let's try this. */ | |
227 | lr->idx = lr->bufact; | |
228 | continue; | |
229 | } | |
230 | ||
19bc17a9 RM |
231 | /* Ignore rest of line. */ |
232 | lr_ignore_rest (lr, 0); | |
233 | lr->token.tok = tok_eol; | |
234 | return &lr->token; | |
235 | } | |
236 | ||
237 | /* Match escape sequences. */ | |
238 | if (ch == lr->escape_char) | |
239 | return get_toplvl_escape (lr); | |
240 | ||
241 | /* Match ellipsis. */ | |
4b10dd6c | 242 | if (ch == '.') |
19bc17a9 | 243 | { |
a0dc5206 UD |
244 | if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0) |
245 | { | |
246 | int cnt; | |
247 | for (cnt = 0; cnt < 10; ++cnt) | |
248 | lr_getc (lr); | |
249 | lr->token.tok = tok_ellipsis4_2; | |
250 | return &lr->token; | |
251 | } | |
4b10dd6c UD |
252 | if (strncmp (&lr->buf[lr->idx], "...", 3) == 0) |
253 | { | |
254 | lr_getc (lr); | |
255 | lr_getc (lr); | |
256 | lr_getc (lr); | |
257 | lr->token.tok = tok_ellipsis4; | |
258 | return &lr->token; | |
259 | } | |
260 | if (strncmp (&lr->buf[lr->idx], "..", 2) == 0) | |
261 | { | |
262 | lr_getc (lr); | |
263 | lr_getc (lr); | |
264 | lr->token.tok = tok_ellipsis3; | |
265 | return &lr->token; | |
266 | } | |
a0dc5206 UD |
267 | if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0) |
268 | { | |
269 | int cnt; | |
270 | for (cnt = 0; cnt < 6; ++cnt) | |
271 | lr_getc (lr); | |
272 | lr->token.tok = tok_ellipsis2_2; | |
273 | return &lr->token; | |
274 | } | |
4b10dd6c UD |
275 | if (lr->buf[lr->idx] == '.') |
276 | { | |
277 | lr_getc (lr); | |
278 | lr->token.tok = tok_ellipsis2; | |
279 | return &lr->token; | |
280 | } | |
19bc17a9 RM |
281 | } |
282 | ||
283 | switch (ch) | |
284 | { | |
285 | case '<': | |
286 | return get_symname (lr); | |
287 | ||
288 | case '0' ... '9': | |
289 | lr->token.tok = tok_number; | |
290 | lr->token.val.num = ch - '0'; | |
291 | ||
292 | while (isdigit (ch = lr_getc (lr))) | |
293 | { | |
294 | lr->token.val.num *= 10; | |
295 | lr->token.val.num += ch - '0'; | |
296 | } | |
297 | if (isalpha (ch)) | |
5290baf0 | 298 | lr_error (lr, _("garbage at end of number")); |
19bc17a9 RM |
299 | lr_ungetn (lr, 1); |
300 | ||
301 | return &lr->token; | |
302 | ||
303 | case ';': | |
304 | lr->token.tok = tok_semicolon; | |
305 | return &lr->token; | |
306 | ||
307 | case ',': | |
308 | lr->token.tok = tok_comma; | |
309 | return &lr->token; | |
310 | ||
311 | case '(': | |
312 | lr->token.tok = tok_open_brace; | |
313 | return &lr->token; | |
314 | ||
315 | case ')': | |
316 | lr->token.tok = tok_close_brace; | |
317 | return &lr->token; | |
318 | ||
319 | case '"': | |
47e8b443 | 320 | return get_string (lr, charmap, locale, repertoire, verbose); |
19bc17a9 RM |
321 | |
322 | case '-': | |
323 | ch = lr_getc (lr); | |
324 | if (ch == '1') | |
325 | { | |
326 | lr->token.tok = tok_minus1; | |
327 | return &lr->token; | |
328 | } | |
329 | lr_ungetn (lr, 2); | |
330 | break; | |
b15538d7 FW |
331 | |
332 | case 0x80 ... 0xff: /* UTF-8 sequence. */ | |
9d77023b FW |
333 | { |
334 | uint32_t wch; | |
335 | if (!utf8_decode (lr, ch, &wch)) | |
336 | { | |
337 | lr->token.tok = tok_error; | |
338 | return &lr->token; | |
339 | } | |
340 | lr->token.tok = tok_ucs4; | |
341 | lr->token.val.ucs4 = wch; | |
342 | return &lr->token; | |
343 | } | |
19bc17a9 RM |
344 | } |
345 | ||
346 | return get_ident (lr); | |
347 | } | |
348 | ||
349 | ||
/* Read a character code written as a sequence of escaped numeric bytes.
   Each byte follows an escape character and is octal by default,
   decimal after 'd' and hexadecimal after 'x'.  The decoded bytes are
   stored in LR->token.val.charcode and the token becomes tok_charcode.
   If a byte is malformed the token becomes tok_error and
   LR->token.val.str describes the offending text inside LR->buf.  */
static struct token *
get_toplvl_escape (struct linereader *lr)
{
  /* This is supposed to be a numeric value.  We return the
     numerical value and the number of bytes.  */
  /* Index of the character read just before this function was entered;
     used as the start of the text reported on error.  */
  size_t start_idx = lr->idx - 1;
  unsigned char *bytes = lr->token.val.charcode.bytes;
  size_t nbytes = 0;
  int ch;

  do
    {
      unsigned int byte = 0;
      unsigned int base = 8;

      ch = lr_getc (lr);

      /* An optional prefix letter selects the base.  */
      if (ch == 'd')
	{
	  base = 10;
	  ch = lr_getc (lr);
	}
      else if (ch == 'x')
	{
	  base = 16;
	  ch = lr_getc (lr);
	}

      /* The first digit is mandatory and must be valid in BASE.  */
      if ((base == 16 && !isxdigit (ch))
	  || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
	{
	esc_error:
	  /* Also reached by the goto below.  Report everything from the
	     start of the escape up to the next white space.  */
	  lr->token.val.str.startmb = &lr->buf[start_idx];

	  while (ch != EOF && !isspace (ch))
	    ch = lr_getc (lr);
	  lr->token.val.str.lenmb = lr->idx - start_idx;

	  lr->token.tok = tok_error;
	  return &lr->token;
	}

      if (isdigit (ch))
	byte = ch - '0';
      else
	byte = tolower (ch) - 'a' + 10;

      /* The second digit is mandatory as well.  */
      ch = lr_getc (lr);
      if ((base == 16 && !isxdigit (ch))
	  || (base != 16 && (ch < '0' || ch >= (int) ('0' + base))))
	goto esc_error;

      byte *= base;
      if (isdigit (ch))
	byte += ch - '0';
      else
	byte += tolower (ch) - 'a' + 10;

      /* Octal and decimal values may have a third digit.
	 NOTE(review): this digit is only tested with isdigit, not
	 against BASE, and BYTE may exceed 255 before it is stored in an
	 unsigned char below -- confirm whether that is intended.  */
      ch = lr_getc (lr);
      if (base != 16 && isdigit (ch))
	{
	  byte *= base;
	  byte += ch - '0';

	  ch = lr_getc (lr);
	}

      bytes[nbytes++] = byte;
    }
  /* Another escape character directly after the byte starts the next
     byte, as long as there is room left in the token.  */
  while (ch == lr->escape_char
	 && nbytes < (int) sizeof (lr->token.val.charcode.bytes));

  if (!isspace (ch))
    lr_error (lr, _("garbage at end of character code specification"));

  /* Give back the character which ended the sequence.  */
  lr_ungetn (lr, 1);

  lr->token.tok = tok_charcode;
  lr->token.val.charcode.nbytes = nbytes;

  return &lr->token;
}
432 | ||
5dcbff58 FW |
/* Multibyte string buffer.  A growable byte array used while a token's
   text is collected.  */
struct lr_buffer
{
  size_t act;			/* Number of bytes stored in BUF so far.  */
  size_t max;			/* Number of bytes allocated for BUF.  */
  char *buf;			/* The heap-allocated storage.  */
};
19bc17a9 | 440 | |
5dcbff58 FW |
441 | /* Initialize *LRB with a default-sized buffer. */ |
442 | static void | |
443 | lr_buffer_init (struct lr_buffer *lrb) | |
444 | { | |
445 | lrb->act = 0; | |
446 | lrb->max = 56; | |
447 | lrb->buf = xmalloc (lrb->max); | |
448 | } | |
4b10dd6c | 449 | |
5dcbff58 FW |
450 | /* Transfers the buffer string from *LRB to LR->token.mbstr. */ |
451 | static void | |
452 | lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr) | |
453 | { | |
454 | lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1); | |
455 | lr->token.val.str.startmb[lrb->act] = '\0'; | |
456 | lr->token.val.str.lenmb = lrb->act; | |
457 | } | |
4b10dd6c | 458 | |
5dcbff58 FW |
459 | /* Adds CH to *LRB. */ |
460 | static void | |
461 | addc (struct lr_buffer *lrb, char ch) | |
462 | { | |
463 | if (lrb->act == lrb->max) | |
464 | { | |
465 | lrb->max *= 2; | |
466 | lrb->buf = xrealloc (lrb->buf, lrb->max); | |
467 | } | |
468 | lrb->buf[lrb->act++] = ch; | |
469 | } | |
4b10dd6c | 470 | |
5dcbff58 FW |
471 | /* Adds L bytes at S to *LRB. */ |
472 | static void | |
473 | adds (struct lr_buffer *lrb, const unsigned char *s, size_t l) | |
474 | { | |
475 | if (lrb->max - lrb->act < l) | |
476 | { | |
477 | size_t required_size = lrb->act + l; | |
478 | size_t new_max = 2 * lrb->max; | |
479 | if (new_max < required_size) | |
480 | new_max = required_size; | |
481 | lrb->buf = xrealloc (lrb->buf, new_max); | |
482 | lrb->max = new_max; | |
483 | } | |
484 | memcpy (lrb->buf + lrb->act, s, l); | |
485 | lrb->act += l; | |
486 | } | |
4b10dd6c UD |
487 | |
/* Append the wide character CH to the wide string under construction.
   The macro relies on three variables being in scope where it is
   expanded: buf2 (the storage), buf2act (number of elements stored) and
   buf2max (the limit buf2act is compared against).  When the limit is
   reached it is doubled and the storage resized to buf2max * 4 bytes,
   i.e. buf2max is treated as a count of 4-byte elements.  */
#define ADDWC(ch) \
  do									      \
    {									      \
      if (buf2act == buf2max)						      \
	{								      \
	  buf2max *= 2;							      \
	  buf2 = xrealloc (buf2, buf2max * 4);				      \
	}								      \
      buf2[buf2act++] = (ch);						      \
    }									      \
  while (0)
499 | ||
500 | ||
501 | static struct token * | |
502 | get_symname (struct linereader *lr) | |
503 | { | |
504 | /* Symbol in brackets. We must distinguish three kinds: | |
505 | 1. reserved words | |
506 | 2. ISO 10646 position values | |
507 | 3. all other. */ | |
19bc17a9 RM |
508 | const struct keyword_t *kw; |
509 | int ch; | |
5dcbff58 | 510 | struct lr_buffer lrb; |
19bc17a9 | 511 | |
5dcbff58 | 512 | lr_buffer_init (&lrb); |
19bc17a9 RM |
513 | |
514 | do | |
515 | { | |
516 | ch = lr_getc (lr); | |
517 | if (ch == lr->escape_char) | |
518 | { | |
519 | int c2 = lr_getc (lr); | |
5dcbff58 | 520 | addc (&lrb, c2); |
19bc17a9 RM |
521 | |
522 | if (c2 == '\n') | |
523 | ch = '\n'; | |
524 | } | |
525 | else | |
5dcbff58 | 526 | addc (&lrb, ch); |
19bc17a9 RM |
527 | } |
528 | while (ch != '>' && ch != '\n'); | |
529 | ||
530 | if (ch == '\n') | |
531 | lr_error (lr, _("unterminated symbolic name")); | |
532 | ||
533 | /* Test for ISO 10646 position value. */ | |
5dcbff58 | 534 | if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10)) |
19bc17a9 | 535 | { |
5dcbff58 FW |
536 | char *cp = lrb.buf + 1; |
537 | while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp)) | |
19bc17a9 RM |
538 | ++cp; |
539 | ||
5dcbff58 | 540 | if (cp == &lrb.buf[lrb.act - 1]) |
19bc17a9 RM |
541 | { |
542 | /* Yes, it is. */ | |
4b10dd6c | 543 | lr->token.tok = tok_ucs4; |
5dcbff58 | 544 | lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16); |
19bc17a9 RM |
545 | |
546 | return &lr->token; | |
547 | } | |
548 | } | |
549 | ||
550 | /* It is a symbolic name. Test for reserved words. */ | |
5dcbff58 | 551 | kw = lr->hash_fct (lrb.buf, lrb.act - 1); |
19bc17a9 RM |
552 | |
553 | if (kw != NULL && kw->symname_or_ident == 1) | |
554 | { | |
555 | lr->token.tok = kw->token; | |
5dcbff58 | 556 | free (lrb.buf); |
19bc17a9 RM |
557 | } |
558 | else | |
559 | { | |
560 | lr->token.tok = tok_bsymbol; | |
5dcbff58 FW |
561 | lr_buffer_to_token (&lrb, lr); |
562 | --lr->token.val.str.lenmb; /* Hide the training '>'. */ | |
19bc17a9 RM |
563 | } |
564 | ||
565 | return &lr->token; | |
566 | } | |
567 | ||
568 | ||
569 | static struct token * | |
570 | get_ident (struct linereader *lr) | |
571 | { | |
19bc17a9 RM |
572 | const struct keyword_t *kw; |
573 | int ch; | |
5dcbff58 | 574 | struct lr_buffer lrb; |
19bc17a9 | 575 | |
5dcbff58 | 576 | lr_buffer_init (&lrb); |
19bc17a9 | 577 | |
5dcbff58 | 578 | addc (&lrb, lr->buf[lr->idx - 1]); |
19bc17a9 RM |
579 | |
580 | while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';' | |
f126ef67 | 581 | && ch != '<' && ch != ',' && ch != EOF) |
4b10dd6c UD |
582 | { |
583 | if (ch == lr->escape_char) | |
584 | { | |
585 | ch = lr_getc (lr); | |
586 | if (ch == '\n' || ch == EOF) | |
587 | { | |
588 | lr_error (lr, _("invalid escape sequence")); | |
589 | break; | |
590 | } | |
591 | } | |
5dcbff58 | 592 | addc (&lrb, ch); |
4b10dd6c | 593 | } |
19bc17a9 | 594 | |
f126ef67 | 595 | lr_ungetc (lr, ch); |
19bc17a9 | 596 | |
5dcbff58 | 597 | kw = lr->hash_fct (lrb.buf, lrb.act); |
19bc17a9 RM |
598 | |
599 | if (kw != NULL && kw->symname_or_ident == 0) | |
600 | { | |
601 | lr->token.tok = kw->token; | |
5dcbff58 | 602 | free (lrb.buf); |
19bc17a9 RM |
603 | } |
604 | else | |
605 | { | |
606 | lr->token.tok = tok_ident; | |
5dcbff58 | 607 | lr_buffer_to_token (&lrb, lr); |
19bc17a9 RM |
608 | } |
609 | ||
610 | return &lr->token; | |
611 | } | |
612 | ||
7dcaabb9 FW |
613 | /* Process a decoded Unicode codepoint WCH in a string, placing the |
614 | multibyte sequence into LRB. Return false if the character is not | |
615 | found in CHARMAP/REPERTOIRE. */ | |
616 | static bool | |
617 | translate_unicode_codepoint (struct localedef_t *locale, | |
618 | const struct charmap_t *charmap, | |
619 | const struct repertoire_t *repertoire, | |
620 | uint32_t wch, struct lr_buffer *lrb) | |
621 | { | |
622 | /* See whether the charmap contains the Uxxxxxxxx names. */ | |
623 | char utmp[10]; | |
624 | snprintf (utmp, sizeof (utmp), "U%08X", wch); | |
625 | struct charseq *seq = charmap_find_value (charmap, utmp, 9); | |
626 | ||
627 | if (seq == NULL) | |
628 | { | |
629 | /* No, this isn't the case. Now determine from | |
630 | the repertoire the name of the character and | |
631 | find it in the charmap. */ | |
632 | if (repertoire != NULL) | |
633 | { | |
634 | const char *symbol = repertoire_find_symbol (repertoire, wch); | |
635 | if (symbol != NULL) | |
636 | seq = charmap_find_value (charmap, symbol, strlen (symbol)); | |
637 | } | |
638 | ||
639 | if (seq == NULL) | |
640 | { | |
641 | #ifndef NO_TRANSLITERATION | |
642 | /* Transliterate if possible. */ | |
643 | if (locale != NULL) | |
644 | { | |
645 | if ((locale->avail & CTYPE_LOCALE) == 0) | |
646 | { | |
647 | /* Load the CTYPE data now. */ | |
648 | int old_needed = locale->needed; | |
649 | ||
650 | locale->needed = 0; | |
651 | locale = load_locale (LC_CTYPE, locale->name, | |
652 | locale->repertoire_name, | |
653 | charmap, locale); | |
654 | locale->needed = old_needed; | |
655 | } | |
656 | ||
657 | uint32_t *translit; | |
658 | if ((locale->avail & CTYPE_LOCALE) != 0 | |
659 | && ((translit = find_translit (locale, charmap, wch)) | |
660 | != NULL)) | |
661 | /* The CTYPE data contains a matching | |
662 | transliteration. */ | |
663 | { | |
664 | for (int i = 0; translit[i] != 0; ++i) | |
665 | { | |
666 | snprintf (utmp, sizeof (utmp), "U%08X", translit[i]); | |
667 | seq = charmap_find_value (charmap, utmp, 9); | |
668 | assert (seq != NULL); | |
669 | adds (lrb, seq->bytes, seq->nbytes); | |
670 | } | |
671 | return true; | |
672 | } | |
673 | } | |
674 | #endif /* NO_TRANSLITERATION */ | |
675 | ||
676 | /* Not a known name. */ | |
677 | return false; | |
678 | } | |
679 | } | |
680 | ||
681 | if (seq != NULL) | |
682 | { | |
683 | adds (lrb, seq->bytes, seq->nbytes); | |
684 | return true; | |
685 | } | |
686 | else | |
687 | return false; | |
688 | } | |
689 | ||
b15538d7 FW |
/* Tell whether CH can be a trailing byte of a UTF-8 sequence: it must
   not be negative (which rules out EOF) and its two most significant
   bits within the byte must be 10.  */
static bool
utf8_valid_trailing (int ch)
{
  if (ch < 0)
    return false;

  return (ch & 0xc0) == 0x80;
}
697 | ||
/* Report a malformed UTF-8 sequence through lr_error and return false
   so that callers can return the result directly.  CH1 is the leading
   byte; CH2 to CH4 are the following bytes, where a negative value
   (such as EOF) means that this byte and all later ones are left out
   of the message.  */
static bool
utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
		     int ch4)
{
  char text[38];

  if (ch2 >= 0 && ch3 >= 0 && ch4 >= 0)
    snprintf (text, sizeof (text), "0x%02x 0x%02x 0x%02x 0x%02x",
	      ch1, ch2, ch3, ch4);
  else if (ch2 >= 0 && ch3 >= 0)
    snprintf (text, sizeof (text), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
  else if (ch2 >= 0)
    snprintf (text, sizeof (text), "0x%02x 0x%02x", ch1, ch2);
  else
    snprintf (text, sizeof (text), "0x%02x", ch1);

  lr_error (lr, _("invalid UTF-8 sequence %s"), text);
  return false;
}
719 | ||
/* Reads a UTF-8 sequence from LR, with the leading byte CH1, and
   stores the decoded codepoint in *WCH.  Returns false on failure and
   reports an error; *WCH is not written in that case.  Besides
   truncated sequences and bad trailing bytes, the following are
   rejected: overlong encodings, the surrogate range U+D800..U+DFFF and
   values above U+10FFFF.  */
static bool
utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
{
  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.  */
  /* 0x80..0xbf are trailing bytes, 0xc0 and 0xc1 could only start
     overlong two-byte sequences.  */
  if (ch1 < 0xc2)
    return utf8_sequence_error (lr, ch1, -1, -1, -1);

  int ch2 = lr_getc (lr);
  if (!utf8_valid_trailing (ch2))
    return utf8_sequence_error (lr, ch1, ch2, -1, -1);

  /* Two-byte sequence.  */
  if (ch1 <= 0xdf)
    {
      uint32_t result = ((ch1 & 0x1f) << 6) | (ch2 & 0x3f);
      if (result < 0x80)
	return utf8_sequence_error (lr, ch1, ch2, -1, -1);
      *wch = result;
      return true;
    }

  int ch3 = lr_getc (lr);
  if (!utf8_valid_trailing (ch3) || ch1 < 0xe0)
    return utf8_sequence_error (lr, ch1, ch2, ch3, -1);

  /* Three-byte sequence.  */
  if (ch1 <= 0xef)
    {
      uint32_t result = (((ch1 & 0x0f) << 12)
			 | ((ch2 & 0x3f) << 6)
			 | (ch3 & 0x3f));
      /* Reject overlong forms and UTF-16 surrogates, which are not
	 valid in UTF-8.  */
      if (result < 0x800 || (result >= 0xd800 && result <= 0xdfff))
	return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
      *wch = result;
      return true;
    }

  int ch4 = lr_getc (lr);
  if (!utf8_valid_trailing (ch4) || ch1 < 0xf0 || ch1 > 0xf4)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);

  /* Four-byte sequence.  */
  uint32_t result = (((ch1 & 0x07) << 18)
		     | ((ch2 & 0x3f) << 12)
		     | ((ch3 & 0x3f) << 6)
		     | (ch4 & 0x3f));
  /* Reject overlong forms and values beyond the last code point; a
     leading byte 0xf4 can encode up to 0x13ffff.  */
  if (result < 0x10000 || result > 0x10ffff)
    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
  *wch = result;
  return true;
}
19bc17a9 RM |
771 | |
772 | static struct token * | |
4b10dd6c | 773 | get_string (struct linereader *lr, const struct charmap_t *charmap, |
47e8b443 UD |
774 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
775 | int verbose) | |
19bc17a9 | 776 | { |
4b10dd6c | 777 | int return_widestr = lr->return_widestr; |
5dcbff58 | 778 | struct lr_buffer lrb; |
a9c27b3e | 779 | wchar_t *buf2 = NULL; |
19bc17a9 | 780 | |
5dcbff58 | 781 | lr_buffer_init (&lrb); |
19bc17a9 | 782 | |
4b10dd6c UD |
783 | /* We know it'll be a string. */ |
784 | lr->token.tok = tok_string; | |
785 | ||
786 | /* If we need not translate the strings (i.e., expand <...> parts) | |
787 | we can run a simple loop. */ | |
788 | if (!lr->translate_strings) | |
789 | { | |
790 | int ch; | |
791 | ||
792 | buf2 = NULL; | |
793 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | |
b15538d7 FW |
794 | { |
795 | if (ch >= 0x80) | |
796 | lr_error (lr, _("illegal 8-bit character in untranslated string")); | |
797 | addc (&lrb, ch); | |
798 | } | |
4b10dd6c UD |
799 | |
800 | /* Catch errors with trailing escape character. */ | |
5dcbff58 FW |
801 | if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char |
802 | && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char)) | |
4b10dd6c UD |
803 | { |
804 | lr_error (lr, _("illegal escape sequence at end of string")); | |
5dcbff58 | 805 | --lrb.act; |
4b10dd6c UD |
806 | } |
807 | else if (ch == '\n' || ch == EOF) | |
808 | lr_error (lr, _("unterminated string")); | |
809 | ||
5dcbff58 | 810 | addc (&lrb, '\0'); |
4b10dd6c UD |
811 | } |
812 | else | |
813 | { | |
7dcaabb9 | 814 | bool illegal_string = false; |
4b10dd6c UD |
815 | size_t buf2act = 0; |
816 | size_t buf2max = 56 * sizeof (uint32_t); | |
817 | int ch; | |
4b10dd6c UD |
818 | |
819 | /* We have to provide the wide character result as well. */ | |
820 | if (return_widestr) | |
821 | buf2 = xmalloc (buf2max); | |
822 | ||
823 | /* Read until the end of the string (or end of the line or file). */ | |
824 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | |
825 | { | |
826 | size_t startidx; | |
827 | uint32_t wch; | |
828 | struct charseq *seq; | |
829 | ||
830 | if (ch != '<') | |
831 | { | |
b15538d7 FW |
832 | /* The standards leave it up to the implementation to |
833 | decide what to do with characters which stand for | |
834 | themselves. This implementation treats the input | |
835 | file as encoded in UTF-8. */ | |
4b10dd6c UD |
836 | if (ch == lr->escape_char) |
837 | { | |
838 | ch = lr_getc (lr); | |
b15538d7 FW |
839 | if (ch >= 0x80) |
840 | { | |
841 | lr_error (lr, _("illegal 8-bit escape sequence")); | |
842 | illegal_string = true; | |
843 | break; | |
844 | } | |
4b10dd6c UD |
845 | if (ch == '\n' || ch == EOF) |
846 | break; | |
b15538d7 FW |
847 | addc (&lrb, ch); |
848 | wch = ch; | |
849 | } | |
850 | else if (ch < 0x80) | |
851 | { | |
852 | wch = ch; | |
853 | addc (&lrb, ch); | |
854 | } | |
855 | else /* UTF-8 sequence. */ | |
856 | { | |
857 | if (!utf8_decode (lr, ch, &wch)) | |
858 | { | |
859 | illegal_string = true; | |
860 | break; | |
861 | } | |
862 | if (!translate_unicode_codepoint (locale, charmap, | |
863 | repertoire, wch, &lrb)) | |
864 | { | |
865 | /* Ignore the rest of the string. Callers may | |
866 | skip this string because it cannot be encoded | |
867 | in the output character set. */ | |
868 | illegal_string = true; | |
869 | continue; | |
870 | } | |
4b10dd6c UD |
871 | } |
872 | ||
4b10dd6c | 873 | if (return_widestr) |
b15538d7 | 874 | ADDWC (wch); |
4b10dd6c UD |
875 | |
876 | continue; | |
877 | } | |
878 | ||
879 | /* Now we have to search for the end of the symbolic name, i.e., | |
880 | the closing '>'. */ | |
5dcbff58 | 881 | startidx = lrb.act; |
4b10dd6c UD |
882 | while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) |
883 | { | |
884 | if (ch == lr->escape_char) | |
885 | { | |
886 | ch = lr_getc (lr); | |
887 | if (ch == '\n' || ch == EOF) | |
888 | break; | |
889 | } | |
5dcbff58 | 890 | addc (&lrb, ch); |
4b10dd6c UD |
891 | } |
892 | if (ch == '\n' || ch == EOF) | |
893 | /* Not a correct string. */ | |
894 | break; | |
5dcbff58 | 895 | if (lrb.act == startidx) |
4b10dd6c UD |
896 | { |
897 | /* <> is no correct name. Ignore it and also signal an | |
898 | error. */ | |
7dcaabb9 | 899 | illegal_string = true; |
4b10dd6c UD |
900 | continue; |
901 | } | |
19bc17a9 | 902 | |
4b10dd6c | 903 | /* It might be a Uxxxx symbol. */ |
5dcbff58 FW |
904 | if (lrb.buf[startidx] == 'U' |
905 | && (lrb.act - startidx == 5 || lrb.act - startidx == 9)) | |
4b10dd6c | 906 | { |
5dcbff58 FW |
907 | char *cp = lrb.buf + startidx + 1; |
908 | while (cp < &lrb.buf[lrb.act] && isxdigit (*cp)) | |
4b10dd6c UD |
909 | ++cp; |
910 | ||
5dcbff58 | 911 | if (cp == &lrb.buf[lrb.act]) |
4b10dd6c | 912 | { |
4b10dd6c | 913 | /* Yes, it is. */ |
5dcbff58 FW |
914 | addc (&lrb, '\0'); |
915 | wch = strtoul (lrb.buf + startidx + 1, NULL, 16); | |
4b10dd6c UD |
916 | |
917 | /* Now forget about the name we just added. */ | |
5dcbff58 | 918 | lrb.act = startidx; |
4b10dd6c UD |
919 | |
920 | if (return_widestr) | |
921 | ADDWC (wch); | |
922 | ||
7dcaabb9 FW |
923 | if (!translate_unicode_codepoint (locale, charmap, |
924 | repertoire, wch, &lrb)) | |
925 | illegal_string = true; | |
4b10dd6c UD |
926 | continue; |
927 | } | |
928 | } | |
929 | ||
5dcbff58 FW |
930 | /* We now have the symbolic name in lrb.buf[startidx] to |
931 | lrb.buf[lrb.act-1]. Now find out the value for this character | |
3c833378 UD |
932 | in the charmap as well as in the repertoire map (in this |
933 | order). */ | |
5dcbff58 FW |
934 | seq = charmap_find_value (charmap, &lrb.buf[startidx], |
935 | lrb.act - startidx); | |
3c833378 UD |
936 | |
937 | if (seq == NULL) | |
938 | { | |
939 | /* This name is not in the charmap. */ | |
940 | lr_error (lr, _("symbol `%.*s' not in charmap"), | |
5dcbff58 | 941 | (int) (lrb.act - startidx), &lrb.buf[startidx]); |
7dcaabb9 | 942 | illegal_string = true; |
3c833378 UD |
943 | } |
944 | ||
4b10dd6c UD |
945 | if (return_widestr) |
946 | { | |
3c833378 UD |
947 | /* Now the same for the multibyte representation. */ |
948 | if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE) | |
949 | wch = seq->ucs4; | |
950 | else | |
951 | { | |
5dcbff58 FW |
952 | wch = repertoire_find_value (repertoire, &lrb.buf[startidx], |
953 | lrb.act - startidx); | |
3c833378 UD |
954 | if (seq != NULL) |
955 | seq->ucs4 = wch; | |
956 | } | |
957 | ||
4b10dd6c UD |
958 | if (wch == ILLEGAL_CHAR_VALUE) |
959 | { | |
960 | /* This name is not in the repertoire map. */ | |
961 | lr_error (lr, _("symbol `%.*s' not in repertoire map"), | |
5dcbff58 | 962 | (int) (lrb.act - startidx), &lrb.buf[startidx]); |
7dcaabb9 | 963 | illegal_string = true; |
4b10dd6c UD |
964 | } |
965 | else | |
966 | ADDWC (wch); | |
967 | } | |
968 | ||
3c833378 | 969 | /* Now forget about the name we just added. */ |
5dcbff58 | 970 | lrb.act = startidx; |
19bc17a9 | 971 | |
3c833378 UD |
972 | /* And copy the bytes. */ |
973 | if (seq != NULL) | |
5dcbff58 | 974 | adds (&lrb, seq->bytes, seq->nbytes); |
4b10dd6c | 975 | } |
19bc17a9 | 976 | |
4b10dd6c UD |
977 | if (ch == '\n' || ch == EOF) |
978 | { | |
979 | lr_error (lr, _("unterminated string")); | |
7dcaabb9 | 980 | illegal_string = true; |
4b10dd6c | 981 | } |
19bc17a9 | 982 | |
4b10dd6c UD |
983 | if (illegal_string) |
984 | { | |
5dcbff58 | 985 | free (lrb.buf); |
72e6cdfa | 986 | free (buf2); |
4b10dd6c UD |
987 | lr->token.val.str.startmb = NULL; |
988 | lr->token.val.str.lenmb = 0; | |
d5fd1f3f UD |
989 | lr->token.val.str.startwc = NULL; |
990 | lr->token.val.str.lenwc = 0; | |
19bc17a9 | 991 | |
4b10dd6c UD |
992 | return &lr->token; |
993 | } | |
19bc17a9 | 994 | |
5dcbff58 | 995 | addc (&lrb, '\0'); |
19bc17a9 | 996 | |
4b10dd6c UD |
997 | if (return_widestr) |
998 | { | |
999 | ADDWC (0); | |
1000 | lr->token.val.str.startwc = xrealloc (buf2, | |
1001 | buf2act * sizeof (uint32_t)); | |
1002 | lr->token.val.str.lenwc = buf2act; | |
1003 | } | |
19bc17a9 RM |
1004 | } |
1005 | ||
5dcbff58 | 1006 | lr_buffer_to_token (&lrb, lr); |
4b10dd6c | 1007 | |
19bc17a9 RM |
1008 | return &lr->token; |
1009 | } |