]>
Commit | Line | Data |
---|---|---|
f7a9f785 | 1 | /* Copyright (C) 1996-2016 Free Software Foundation, Inc. |
5290baf0 | 2 | This file is part of the GNU C Library. |
4b10dd6c | 3 | Contributed by Ulrich Drepper <drepper@gnu.org>, 1996. |
19bc17a9 | 4 | |
43bc8ac6 | 5 | This program is free software; you can redistribute it and/or modify |
2e2efe65 RM |
6 | it under the terms of the GNU General Public License as published |
7 | by the Free Software Foundation; version 2 of the License, or | |
8 | (at your option) any later version. | |
19bc17a9 | 9 | |
43bc8ac6 | 10 | This program is distributed in the hope that it will be useful, |
5290baf0 | 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
43bc8ac6 UD |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | GNU General Public License for more details. | |
19bc17a9 | 14 | |
43bc8ac6 | 15 | You should have received a copy of the GNU General Public License |
59ba27a6 | 16 | along with this program; if not, see <http://www.gnu.org/licenses/>. */ |
19bc17a9 RM |
17 | |
18 | #ifdef HAVE_CONFIG_H | |
19 | # include <config.h> | |
20 | #endif | |
21 | ||
47e8b443 | 22 | #include <assert.h> |
19bc17a9 RM |
23 | #include <ctype.h> |
24 | #include <errno.h> | |
25 | #include <libintl.h> | |
26 | #include <stdarg.h> | |
27 | #include <stdlib.h> | |
28 | #include <string.h> | |
e054f494 | 29 | #include <stdint.h> |
19bc17a9 | 30 | |
f2b98f97 | 31 | #include "localedef.h" |
4b10dd6c | 32 | #include "charmap.h" |
19bc17a9 RM |
33 | #include "error.h" |
34 | #include "linereader.h" | |
47e8b443 | 35 | #include "locfile.h" |
93693c4d | 36 | |
/* Forward declarations of the tokenizer helpers that are private to this
   file.  Each of them returns a pointer to a token.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 struct localedef_t *locale,
                                 const struct repertoire_t *repertoire,
                                 int verbose);
19bc17a9 RM |
46 | |
47 | ||
48 | struct linereader * | |
49 | lr_open (const char *fname, kw_hash_fct_t hf) | |
50 | { | |
51 | FILE *fp; | |
19bc17a9 RM |
52 | |
53 | if (fname == NULL || strcmp (fname, "-") == 0 | |
54 | || strcmp (fname, "/dev/stdin") == 0) | |
3e076219 | 55 | return lr_create (stdin, "<stdin>", hf); |
19bc17a9 RM |
56 | else |
57 | { | |
2e2dc1a5 | 58 | fp = fopen (fname, "rm"); |
19bc17a9 RM |
59 | if (fp == NULL) |
60 | return NULL; | |
3e076219 | 61 | return lr_create (fp, fname, hf); |
19bc17a9 | 62 | } |
3e076219 UD |
63 | } |
64 | ||
65 | struct linereader * | |
66 | lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf) | |
67 | { | |
68 | struct linereader *result; | |
69 | int n; | |
19bc17a9 RM |
70 | |
71 | result = (struct linereader *) xmalloc (sizeof (*result)); | |
72 | ||
73 | result->fp = fp; | |
3e076219 | 74 | result->fname = xstrdup (fname); |
19bc17a9 RM |
75 | result->buf = NULL; |
76 | result->bufsize = 0; | |
77 | result->lineno = 1; | |
78 | result->idx = 0; | |
79 | result->comment_char = '#'; | |
80 | result->escape_char = '\\'; | |
81 | result->translate_strings = 1; | |
7c11c4a1 | 82 | result->return_widestr = 0; |
19bc17a9 RM |
83 | |
84 | n = getdelim (&result->buf, &result->bufsize, '\n', result->fp); | |
85 | if (n < 0) | |
86 | { | |
87 | int save = errno; | |
88 | fclose (result->fp); | |
46ec036d | 89 | free ((char *) result->fname); |
19bc17a9 RM |
90 | free (result); |
91 | errno = save; | |
92 | return NULL; | |
93 | } | |
94 | ||
95 | if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n') | |
96 | n -= 2; | |
97 | ||
98 | result->buf[n] = '\0'; | |
99 | result->bufact = n; | |
100 | result->hash_fct = hf; | |
101 | ||
102 | return result; | |
103 | } | |
104 | ||
105 | ||
106 | int | |
107 | lr_eof (struct linereader *lr) | |
108 | { | |
109 | return lr->bufact = 0; | |
110 | } | |
111 | ||
112 | ||
dd9423a6 UD |
113 | void |
114 | lr_ignore_rest (struct linereader *lr, int verbose) | |
115 | { | |
116 | if (verbose) | |
117 | { | |
118 | while (isspace (lr->buf[lr->idx]) && lr->buf[lr->idx] != '\n' | |
119 | && lr->buf[lr->idx] != lr->comment_char) | |
120 | if (lr->buf[lr->idx] == '\0') | |
121 | { | |
122 | if (lr_next (lr) < 0) | |
123 | return; | |
124 | } | |
125 | else | |
126 | ++lr->idx; | |
127 | ||
128 | if (lr->buf[lr->idx] != '\n' && ! feof (lr->fp) | |
129 | && lr->buf[lr->idx] != lr->comment_char) | |
130 | lr_error (lr, _("trailing garbage at end of line")); | |
131 | } | |
132 | ||
133 | /* Ignore continued line. */ | |
134 | while (lr->bufact > 0 && lr->buf[lr->bufact - 1] != '\n') | |
135 | if (lr_next (lr) < 0) | |
136 | break; | |
137 | ||
138 | lr->idx = lr->bufact; | |
139 | } | |
140 | ||
141 | ||
19bc17a9 RM |
142 | void |
143 | lr_close (struct linereader *lr) | |
144 | { | |
145 | fclose (lr->fp); | |
146 | free (lr->buf); | |
147 | free (lr); | |
148 | } | |
149 | ||
150 | ||
151 | int | |
152 | lr_next (struct linereader *lr) | |
153 | { | |
154 | int n; | |
155 | ||
156 | n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp); | |
157 | if (n < 0) | |
158 | return -1; | |
159 | ||
160 | ++lr->lineno; | |
161 | ||
162 | if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n') | |
163 | { | |
4b10dd6c UD |
164 | #if 0 |
165 | /* XXX Is this correct? */ | |
19bc17a9 RM |
166 | /* An escaped newline character is substituted with a single <SP>. */ |
167 | --n; | |
168 | lr->buf[n - 1] = ' '; | |
4b10dd6c UD |
169 | #else |
170 | n -= 2; | |
171 | #endif | |
19bc17a9 RM |
172 | } |
173 | ||
174 | lr->buf[n] = '\0'; | |
175 | lr->bufact = n; | |
176 | lr->idx = 0; | |
177 | ||
178 | return 0; | |
179 | } | |
180 | ||
181 | ||
/* Defined in error.c: counter that goes up by one on every call to
   `error'.  */
extern unsigned int error_message_count;

/* Name of the executing program.  The calling program is expected to
   define this object and to set it.  */
extern char *program_name;
189 | ||
190 | ||
191 | struct token * | |
4b10dd6c | 192 | lr_token (struct linereader *lr, const struct charmap_t *charmap, |
47e8b443 UD |
193 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
194 | int verbose) | |
19bc17a9 RM |
195 | { |
196 | int ch; | |
197 | ||
198 | while (1) | |
199 | { | |
200 | do | |
201 | { | |
202 | ch = lr_getc (lr); | |
203 | ||
76fbcfdd UD |
204 | if (ch == EOF) |
205 | { | |
206 | lr->token.tok = tok_eof; | |
207 | return &lr->token; | |
208 | }; | |
209 | ||
19bc17a9 RM |
210 | if (ch == '\n') |
211 | { | |
212 | lr->token.tok = tok_eol; | |
213 | return &lr->token; | |
214 | } | |
215 | } | |
216 | while (isspace (ch)); | |
217 | ||
19bc17a9 RM |
218 | if (ch != lr->comment_char) |
219 | break; | |
220 | ||
a0dc5206 UD |
221 | /* Is there an newline at the end of the buffer? */ |
222 | if (lr->buf[lr->bufact - 1] != '\n') | |
223 | { | |
224 | /* No. Some people want this to mean that only the line in | |
225 | the file not the logical, concatenated line is ignored. | |
226 | Let's try this. */ | |
227 | lr->idx = lr->bufact; | |
228 | continue; | |
229 | } | |
230 | ||
19bc17a9 RM |
231 | /* Ignore rest of line. */ |
232 | lr_ignore_rest (lr, 0); | |
233 | lr->token.tok = tok_eol; | |
234 | return &lr->token; | |
235 | } | |
236 | ||
237 | /* Match escape sequences. */ | |
238 | if (ch == lr->escape_char) | |
239 | return get_toplvl_escape (lr); | |
240 | ||
241 | /* Match ellipsis. */ | |
4b10dd6c | 242 | if (ch == '.') |
19bc17a9 | 243 | { |
a0dc5206 UD |
244 | if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0) |
245 | { | |
246 | int cnt; | |
247 | for (cnt = 0; cnt < 10; ++cnt) | |
248 | lr_getc (lr); | |
249 | lr->token.tok = tok_ellipsis4_2; | |
250 | return &lr->token; | |
251 | } | |
4b10dd6c UD |
252 | if (strncmp (&lr->buf[lr->idx], "...", 3) == 0) |
253 | { | |
254 | lr_getc (lr); | |
255 | lr_getc (lr); | |
256 | lr_getc (lr); | |
257 | lr->token.tok = tok_ellipsis4; | |
258 | return &lr->token; | |
259 | } | |
260 | if (strncmp (&lr->buf[lr->idx], "..", 2) == 0) | |
261 | { | |
262 | lr_getc (lr); | |
263 | lr_getc (lr); | |
264 | lr->token.tok = tok_ellipsis3; | |
265 | return &lr->token; | |
266 | } | |
a0dc5206 UD |
267 | if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0) |
268 | { | |
269 | int cnt; | |
270 | for (cnt = 0; cnt < 6; ++cnt) | |
271 | lr_getc (lr); | |
272 | lr->token.tok = tok_ellipsis2_2; | |
273 | return &lr->token; | |
274 | } | |
4b10dd6c UD |
275 | if (lr->buf[lr->idx] == '.') |
276 | { | |
277 | lr_getc (lr); | |
278 | lr->token.tok = tok_ellipsis2; | |
279 | return &lr->token; | |
280 | } | |
19bc17a9 RM |
281 | } |
282 | ||
283 | switch (ch) | |
284 | { | |
285 | case '<': | |
286 | return get_symname (lr); | |
287 | ||
288 | case '0' ... '9': | |
289 | lr->token.tok = tok_number; | |
290 | lr->token.val.num = ch - '0'; | |
291 | ||
292 | while (isdigit (ch = lr_getc (lr))) | |
293 | { | |
294 | lr->token.val.num *= 10; | |
295 | lr->token.val.num += ch - '0'; | |
296 | } | |
297 | if (isalpha (ch)) | |
5290baf0 | 298 | lr_error (lr, _("garbage at end of number")); |
19bc17a9 RM |
299 | lr_ungetn (lr, 1); |
300 | ||
301 | return &lr->token; | |
302 | ||
303 | case ';': | |
304 | lr->token.tok = tok_semicolon; | |
305 | return &lr->token; | |
306 | ||
307 | case ',': | |
308 | lr->token.tok = tok_comma; | |
309 | return &lr->token; | |
310 | ||
311 | case '(': | |
312 | lr->token.tok = tok_open_brace; | |
313 | return &lr->token; | |
314 | ||
315 | case ')': | |
316 | lr->token.tok = tok_close_brace; | |
317 | return &lr->token; | |
318 | ||
319 | case '"': | |
47e8b443 | 320 | return get_string (lr, charmap, locale, repertoire, verbose); |
19bc17a9 RM |
321 | |
322 | case '-': | |
323 | ch = lr_getc (lr); | |
324 | if (ch == '1') | |
325 | { | |
326 | lr->token.tok = tok_minus1; | |
327 | return &lr->token; | |
328 | } | |
329 | lr_ungetn (lr, 2); | |
330 | break; | |
331 | } | |
332 | ||
333 | return get_ident (lr); | |
334 | } | |
335 | ||
336 | ||
337 | static struct token * | |
338 | get_toplvl_escape (struct linereader *lr) | |
339 | { | |
340 | /* This is supposed to be a numeric value. We return the | |
341 | numerical value and the number of bytes. */ | |
342 | size_t start_idx = lr->idx - 1; | |
9cfe5381 RM |
343 | unsigned char *bytes = lr->token.val.charcode.bytes; |
344 | size_t nbytes = 0; | |
19bc17a9 RM |
345 | int ch; |
346 | ||
347 | do | |
348 | { | |
349 | unsigned int byte = 0; | |
350 | unsigned int base = 8; | |
351 | ||
352 | ch = lr_getc (lr); | |
353 | ||
354 | if (ch == 'd') | |
355 | { | |
356 | base = 10; | |
357 | ch = lr_getc (lr); | |
358 | } | |
359 | else if (ch == 'x') | |
360 | { | |
361 | base = 16; | |
362 | ch = lr_getc (lr); | |
363 | } | |
364 | ||
365 | if ((base == 16 && !isxdigit (ch)) | |
ba1ffaa1 | 366 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
19bc17a9 RM |
367 | { |
368 | esc_error: | |
4b10dd6c | 369 | lr->token.val.str.startmb = &lr->buf[start_idx]; |
19bc17a9 | 370 | |
76fbcfdd | 371 | while (ch != EOF && !isspace (ch)) |
19bc17a9 | 372 | ch = lr_getc (lr); |
4b10dd6c | 373 | lr->token.val.str.lenmb = lr->idx - start_idx; |
19bc17a9 RM |
374 | |
375 | lr->token.tok = tok_error; | |
376 | return &lr->token; | |
377 | } | |
378 | ||
379 | if (isdigit (ch)) | |
380 | byte = ch - '0'; | |
381 | else | |
4b10dd6c | 382 | byte = tolower (ch) - 'a' + 10; |
19bc17a9 RM |
383 | |
384 | ch = lr_getc (lr); | |
385 | if ((base == 16 && !isxdigit (ch)) | |
ba1ffaa1 | 386 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
19bc17a9 RM |
387 | goto esc_error; |
388 | ||
389 | byte *= base; | |
390 | if (isdigit (ch)) | |
391 | byte += ch - '0'; | |
392 | else | |
4b10dd6c | 393 | byte += tolower (ch) - 'a' + 10; |
19bc17a9 RM |
394 | |
395 | ch = lr_getc (lr); | |
396 | if (base != 16 && isdigit (ch)) | |
397 | { | |
398 | byte *= base; | |
679f5a56 | 399 | byte += ch - '0'; |
19bc17a9 RM |
400 | |
401 | ch = lr_getc (lr); | |
402 | } | |
403 | ||
4b10dd6c | 404 | bytes[nbytes++] = byte; |
19bc17a9 | 405 | } |
c50ec4e0 | 406 | while (ch == lr->escape_char |
6dd67bd5 | 407 | && nbytes < (int) sizeof (lr->token.val.charcode.bytes)); |
19bc17a9 RM |
408 | |
409 | if (!isspace (ch)) | |
410 | lr_error (lr, _("garbage at end of character code specification")); | |
411 | ||
412 | lr_ungetn (lr, 1); | |
413 | ||
414 | lr->token.tok = tok_charcode; | |
19bc17a9 RM |
415 | lr->token.val.charcode.nbytes = nbytes; |
416 | ||
417 | return &lr->token; | |
418 | } | |
419 | ||
420 | ||
4b10dd6c UD |
/* Helpers for growing the token buffers.  They operate on variables of
   the function they are used in: `buf', `bufact' (bytes in use) and
   `bufmax' (bytes allocated) for ADDC and ADDS; `buf2', `buf2act' and
   `buf2max' (both counted in elements, not bytes) for ADDWC.  */

/* Append the single byte CH to buf, doubling the buffer when full.  */
#define ADDC(ch) \
  do                                                                          \
    {                                                                         \
      if (bufact == bufmax)                                                   \
        {                                                                     \
          bufmax *= 2;                                                        \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      buf[bufact++] = (ch);                                                   \
    }                                                                         \
  while (0)


/* Append the L bytes starting at S to buf.  When they do not fit, the
   capacity is first raised to at least L and then doubled, which always
   leaves room for bufact + L bytes.  The write position bufact is only
   advanced after the copy.  */
#define ADDS(s, l) \
  do                                                                          \
    {                                                                         \
      size_t _l = (l);                                                        \
      if (bufact + _l > bufmax)                                               \
        {                                                                     \
          if (bufmax < _l)                                                    \
            bufmax = _l;                                                      \
          bufmax *= 2;                                                        \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      memcpy (&buf[bufact], s, _l);                                           \
      bufact += _l;                                                           \
    }                                                                         \
  while (0)


/* Append the wide character CH to buf2, doubling the number of elements
   when the buffer is full.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2act == buf2max)                                                 \
        {                                                                     \
          buf2max *= 2;                                                       \
          buf2 = xrealloc (buf2, buf2max * sizeof (buf2[0]));                 \
        }                                                                     \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
462 | ||
463 | ||
464 | static struct token * | |
465 | get_symname (struct linereader *lr) | |
466 | { | |
467 | /* Symbol in brackets. We must distinguish three kinds: | |
468 | 1. reserved words | |
469 | 2. ISO 10646 position values | |
470 | 3. all other. */ | |
471 | char *buf; | |
472 | size_t bufact = 0; | |
473 | size_t bufmax = 56; | |
474 | const struct keyword_t *kw; | |
475 | int ch; | |
476 | ||
477 | buf = (char *) xmalloc (bufmax); | |
478 | ||
479 | do | |
480 | { | |
481 | ch = lr_getc (lr); | |
482 | if (ch == lr->escape_char) | |
483 | { | |
484 | int c2 = lr_getc (lr); | |
485 | ADDC (c2); | |
486 | ||
487 | if (c2 == '\n') | |
488 | ch = '\n'; | |
489 | } | |
490 | else | |
491 | ADDC (ch); | |
492 | } | |
493 | while (ch != '>' && ch != '\n'); | |
494 | ||
495 | if (ch == '\n') | |
496 | lr_error (lr, _("unterminated symbolic name")); | |
497 | ||
498 | /* Test for ISO 10646 position value. */ | |
499 | if (buf[0] == 'U' && (bufact == 6 || bufact == 10)) | |
500 | { | |
501 | char *cp = buf + 1; | |
502 | while (cp < &buf[bufact - 1] && isxdigit (*cp)) | |
503 | ++cp; | |
504 | ||
505 | if (cp == &buf[bufact - 1]) | |
506 | { | |
507 | /* Yes, it is. */ | |
4b10dd6c UD |
508 | lr->token.tok = tok_ucs4; |
509 | lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16); | |
19bc17a9 RM |
510 | |
511 | return &lr->token; | |
512 | } | |
513 | } | |
514 | ||
515 | /* It is a symbolic name. Test for reserved words. */ | |
516 | kw = lr->hash_fct (buf, bufact - 1); | |
517 | ||
518 | if (kw != NULL && kw->symname_or_ident == 1) | |
519 | { | |
520 | lr->token.tok = kw->token; | |
521 | free (buf); | |
522 | } | |
523 | else | |
524 | { | |
525 | lr->token.tok = tok_bsymbol; | |
526 | ||
19bc17a9 | 527 | buf = xrealloc (buf, bufact + 1); |
b16dba4c | 528 | buf[bufact] = '\0'; |
19bc17a9 | 529 | |
4b10dd6c UD |
530 | lr->token.val.str.startmb = buf; |
531 | lr->token.val.str.lenmb = bufact - 1; | |
19bc17a9 RM |
532 | } |
533 | ||
534 | return &lr->token; | |
535 | } | |
536 | ||
537 | ||
538 | static struct token * | |
539 | get_ident (struct linereader *lr) | |
540 | { | |
541 | char *buf; | |
542 | size_t bufact; | |
543 | size_t bufmax = 56; | |
544 | const struct keyword_t *kw; | |
545 | int ch; | |
546 | ||
547 | buf = xmalloc (bufmax); | |
548 | bufact = 0; | |
549 | ||
550 | ADDC (lr->buf[lr->idx - 1]); | |
551 | ||
552 | while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';' | |
f126ef67 | 553 | && ch != '<' && ch != ',' && ch != EOF) |
4b10dd6c UD |
554 | { |
555 | if (ch == lr->escape_char) | |
556 | { | |
557 | ch = lr_getc (lr); | |
558 | if (ch == '\n' || ch == EOF) | |
559 | { | |
560 | lr_error (lr, _("invalid escape sequence")); | |
561 | break; | |
562 | } | |
563 | } | |
564 | ADDC (ch); | |
565 | } | |
19bc17a9 | 566 | |
f126ef67 | 567 | lr_ungetc (lr, ch); |
19bc17a9 RM |
568 | |
569 | kw = lr->hash_fct (buf, bufact); | |
570 | ||
571 | if (kw != NULL && kw->symname_or_ident == 0) | |
572 | { | |
573 | lr->token.tok = kw->token; | |
574 | free (buf); | |
575 | } | |
576 | else | |
577 | { | |
578 | lr->token.tok = tok_ident; | |
579 | ||
19bc17a9 | 580 | buf = xrealloc (buf, bufact + 1); |
b16dba4c | 581 | buf[bufact] = '\0'; |
19bc17a9 | 582 | |
4b10dd6c UD |
583 | lr->token.val.str.startmb = buf; |
584 | lr->token.val.str.lenmb = bufact; | |
19bc17a9 RM |
585 | } |
586 | ||
587 | return &lr->token; | |
588 | } | |
589 | ||
590 | ||
591 | static struct token * | |
4b10dd6c | 592 | get_string (struct linereader *lr, const struct charmap_t *charmap, |
47e8b443 UD |
593 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
594 | int verbose) | |
19bc17a9 | 595 | { |
4b10dd6c UD |
596 | int return_widestr = lr->return_widestr; |
597 | char *buf; | |
a9c27b3e | 598 | wchar_t *buf2 = NULL; |
19bc17a9 RM |
599 | size_t bufact; |
600 | size_t bufmax = 56; | |
19bc17a9 | 601 | |
4b10dd6c | 602 | /* We must return two different strings. */ |
19bc17a9 RM |
603 | buf = xmalloc (bufmax); |
604 | bufact = 0; | |
605 | ||
4b10dd6c UD |
606 | /* We know it'll be a string. */ |
607 | lr->token.tok = tok_string; | |
608 | ||
609 | /* If we need not translate the strings (i.e., expand <...> parts) | |
610 | we can run a simple loop. */ | |
611 | if (!lr->translate_strings) | |
612 | { | |
613 | int ch; | |
614 | ||
615 | buf2 = NULL; | |
616 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | |
19bc17a9 | 617 | ADDC (ch); |
4b10dd6c UD |
618 | |
619 | /* Catch errors with trailing escape character. */ | |
620 | if (bufact > 0 && buf[bufact - 1] == lr->escape_char | |
621 | && (bufact == 1 || buf[bufact - 2] != lr->escape_char)) | |
622 | { | |
623 | lr_error (lr, _("illegal escape sequence at end of string")); | |
624 | --bufact; | |
625 | } | |
626 | else if (ch == '\n' || ch == EOF) | |
627 | lr_error (lr, _("unterminated string")); | |
628 | ||
629 | ADDC ('\0'); | |
630 | } | |
631 | else | |
632 | { | |
633 | int illegal_string = 0; | |
634 | size_t buf2act = 0; | |
635 | size_t buf2max = 56 * sizeof (uint32_t); | |
636 | int ch; | |
637 | int warned = 0; | |
638 | ||
639 | /* We have to provide the wide character result as well. */ | |
640 | if (return_widestr) | |
641 | buf2 = xmalloc (buf2max); | |
642 | ||
643 | /* Read until the end of the string (or end of the line or file). */ | |
644 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | |
645 | { | |
646 | size_t startidx; | |
647 | uint32_t wch; | |
648 | struct charseq *seq; | |
649 | ||
650 | if (ch != '<') | |
651 | { | |
652 | /* The standards leave it up to the implementation to decide | |
653 | what to do with character which stand for themself. We | |
654 | could jump through hoops to find out the value relative to | |
655 | the charmap and the repertoire map, but instead we leave | |
656 | it up to the locale definition author to write a better | |
657 | definition. We assume here that every character which | |
658 | stands for itself is encoded using ISO 8859-1. Using the | |
659 | escape character is allowed. */ | |
660 | if (ch == lr->escape_char) | |
661 | { | |
662 | ch = lr_getc (lr); | |
663 | if (ch == '\n' || ch == EOF) | |
664 | break; | |
665 | } | |
666 | ||
667 | if (verbose && !warned) | |
668 | { | |
669 | lr_error (lr, _("\ | |
670 | non-symbolic character value should not be used")); | |
671 | warned = 1; | |
672 | } | |
673 | ||
674 | ADDC (ch); | |
675 | if (return_widestr) | |
676 | ADDWC ((uint32_t) ch); | |
677 | ||
678 | continue; | |
679 | } | |
680 | ||
681 | /* Now we have to search for the end of the symbolic name, i.e., | |
682 | the closing '>'. */ | |
683 | startidx = bufact; | |
684 | while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) | |
685 | { | |
686 | if (ch == lr->escape_char) | |
687 | { | |
688 | ch = lr_getc (lr); | |
689 | if (ch == '\n' || ch == EOF) | |
690 | break; | |
691 | } | |
692 | ADDC (ch); | |
693 | } | |
694 | if (ch == '\n' || ch == EOF) | |
695 | /* Not a correct string. */ | |
696 | break; | |
697 | if (bufact == startidx) | |
698 | { | |
699 | /* <> is no correct name. Ignore it and also signal an | |
700 | error. */ | |
19bc17a9 | 701 | illegal_string = 1; |
4b10dd6c UD |
702 | continue; |
703 | } | |
19bc17a9 | 704 | |
4b10dd6c UD |
705 | /* It might be a Uxxxx symbol. */ |
706 | if (buf[startidx] == 'U' | |
707 | && (bufact - startidx == 5 || bufact - startidx == 9)) | |
708 | { | |
709 | char *cp = buf + startidx + 1; | |
710 | while (cp < &buf[bufact] && isxdigit (*cp)) | |
711 | ++cp; | |
712 | ||
713 | if (cp == &buf[bufact]) | |
714 | { | |
3c833378 | 715 | char utmp[10]; |
4b10dd6c UD |
716 | |
717 | /* Yes, it is. */ | |
718 | ADDC ('\0'); | |
719 | wch = strtoul (buf + startidx + 1, NULL, 16); | |
720 | ||
721 | /* Now forget about the name we just added. */ | |
722 | bufact = startidx; | |
723 | ||
724 | if (return_widestr) | |
725 | ADDWC (wch); | |
726 | ||
3c833378 UD |
727 | /* See whether the charmap contains the Uxxxxxxxx names. */ |
728 | snprintf (utmp, sizeof (utmp), "U%08X", wch); | |
729 | seq = charmap_find_value (charmap, utmp, 9); | |
4b10dd6c | 730 | |
3c833378 | 731 | if (seq == NULL) |
4b10dd6c | 732 | { |
3c833378 UD |
733 | /* No, this isn't the case. Now determine from |
734 | the repertoire the name of the character and | |
735 | find it in the charmap. */ | |
736 | if (repertoire != NULL) | |
3c833378 | 737 | { |
47e8b443 | 738 | const char *symbol; |
3c833378 | 739 | |
47e8b443 UD |
740 | symbol = repertoire_find_symbol (repertoire, wch); |
741 | ||
742 | if (symbol != NULL) | |
743 | seq = charmap_find_value (charmap, symbol, | |
744 | strlen (symbol)); | |
745 | } | |
746 | ||
747 | if (seq == NULL) | |
748 | { | |
749 | #ifndef NO_TRANSLITERATION | |
750 | /* Transliterate if possible. */ | |
751 | if (locale != NULL) | |
752 | { | |
753 | uint32_t *translit; | |
754 | ||
755 | if ((locale->avail & CTYPE_LOCALE) == 0) | |
756 | { | |
757 | /* Load the CTYPE data now. */ | |
758 | int old_needed = locale->needed; | |
759 | ||
760 | locale->needed = 0; | |
69f6a804 | 761 | locale = load_locale (LC_CTYPE, |
47e8b443 UD |
762 | locale->name, |
763 | locale->repertoire_name, | |
764 | charmap, locale); | |
765 | locale->needed = old_needed; | |
766 | } | |
767 | ||
768 | if ((locale->avail & CTYPE_LOCALE) != 0 | |
769 | && ((translit = find_translit (locale, | |
770 | charmap, wch)) | |
771 | != NULL)) | |
772 | /* The CTYPE data contains a matching | |
773 | transliteration. */ | |
774 | { | |
775 | int i; | |
776 | ||
777 | for (i = 0; translit[i] != 0; ++i) | |
778 | { | |
779 | char utmp[10]; | |
780 | ||
781 | snprintf (utmp, sizeof (utmp), "U%08X", | |
782 | translit[i]); | |
783 | seq = charmap_find_value (charmap, utmp, | |
784 | 9); | |
785 | assert (seq != NULL); | |
786 | ADDS (seq->bytes, seq->nbytes); | |
787 | } | |
788 | ||
789 | continue; | |
790 | } | |
791 | } | |
792 | #endif /* NO_TRANSLITERATION */ | |
793 | ||
794 | /* Not a known name. */ | |
795 | illegal_string = 1; | |
3c833378 | 796 | } |
4b10dd6c UD |
797 | } |
798 | ||
3c833378 UD |
799 | if (seq != NULL) |
800 | ADDS (seq->bytes, seq->nbytes); | |
801 | ||
4b10dd6c UD |
802 | continue; |
803 | } | |
804 | } | |
805 | ||
3c833378 UD |
806 | /* We now have the symbolic name in buf[startidx] to |
807 | buf[bufact-1]. Now find out the value for this character | |
808 | in the charmap as well as in the repertoire map (in this | |
809 | order). */ | |
810 | seq = charmap_find_value (charmap, &buf[startidx], | |
811 | bufact - startidx); | |
812 | ||
813 | if (seq == NULL) | |
814 | { | |
815 | /* This name is not in the charmap. */ | |
816 | lr_error (lr, _("symbol `%.*s' not in charmap"), | |
817 | (int) (bufact - startidx), &buf[startidx]); | |
818 | illegal_string = 1; | |
819 | } | |
820 | ||
4b10dd6c UD |
821 | if (return_widestr) |
822 | { | |
3c833378 UD |
823 | /* Now the same for the multibyte representation. */ |
824 | if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE) | |
825 | wch = seq->ucs4; | |
826 | else | |
827 | { | |
828 | wch = repertoire_find_value (repertoire, &buf[startidx], | |
829 | bufact - startidx); | |
830 | if (seq != NULL) | |
831 | seq->ucs4 = wch; | |
832 | } | |
833 | ||
4b10dd6c UD |
834 | if (wch == ILLEGAL_CHAR_VALUE) |
835 | { | |
836 | /* This name is not in the repertoire map. */ | |
837 | lr_error (lr, _("symbol `%.*s' not in repertoire map"), | |
70e51ab9 | 838 | (int) (bufact - startidx), &buf[startidx]); |
4b10dd6c UD |
839 | illegal_string = 1; |
840 | } | |
841 | else | |
842 | ADDWC (wch); | |
843 | } | |
844 | ||
3c833378 UD |
845 | /* Now forget about the name we just added. */ |
846 | bufact = startidx; | |
19bc17a9 | 847 | |
3c833378 UD |
848 | /* And copy the bytes. */ |
849 | if (seq != NULL) | |
850 | ADDS (seq->bytes, seq->nbytes); | |
4b10dd6c | 851 | } |
19bc17a9 | 852 | |
4b10dd6c UD |
853 | if (ch == '\n' || ch == EOF) |
854 | { | |
855 | lr_error (lr, _("unterminated string")); | |
856 | illegal_string = 1; | |
857 | } | |
19bc17a9 | 858 | |
4b10dd6c UD |
859 | if (illegal_string) |
860 | { | |
861 | free (buf); | |
72e6cdfa | 862 | free (buf2); |
4b10dd6c UD |
863 | lr->token.val.str.startmb = NULL; |
864 | lr->token.val.str.lenmb = 0; | |
d5fd1f3f UD |
865 | lr->token.val.str.startwc = NULL; |
866 | lr->token.val.str.lenwc = 0; | |
19bc17a9 | 867 | |
4b10dd6c UD |
868 | return &lr->token; |
869 | } | |
19bc17a9 | 870 | |
4b10dd6c | 871 | ADDC ('\0'); |
19bc17a9 | 872 | |
4b10dd6c UD |
873 | if (return_widestr) |
874 | { | |
875 | ADDWC (0); | |
876 | lr->token.val.str.startwc = xrealloc (buf2, | |
877 | buf2act * sizeof (uint32_t)); | |
878 | lr->token.val.str.lenwc = buf2act; | |
879 | } | |
19bc17a9 RM |
880 | } |
881 | ||
4b10dd6c UD |
882 | lr->token.val.str.startmb = xrealloc (buf, bufact); |
883 | lr->token.val.str.lenmb = bufact; | |
884 | ||
19bc17a9 RM |
885 | return &lr->token; |
886 | } |