]>
Commit | Line | Data |
---|---|---|
d569d333 | 1 | /* Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc. |
5290baf0 | 2 | This file is part of the GNU C Library. |
4b10dd6c | 3 | Contributed by Ulrich Drepper <drepper@gnu.org>, 1996. |
19bc17a9 | 4 | |
5290baf0 UD |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Library General Public License as | |
7 | published by the Free Software Foundation; either version 2 of the | |
8 | License, or (at your option) any later version. | |
19bc17a9 | 9 | |
5290baf0 UD |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
13 | Library General Public License for more details. | |
19bc17a9 | 14 | |
5290baf0 UD |
15 | You should have received a copy of the GNU Library General Public |
16 | License along with the GNU C Library; see the file COPYING.LIB. If not, | |
17 | write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, | |
18 | Boston, MA 02111-1307, USA. */ | |
19bc17a9 RM |
19 | |
20 | #ifdef HAVE_CONFIG_H | |
21 | # include <config.h> | |
22 | #endif | |
23 | ||
24 | #include <ctype.h> | |
25 | #include <errno.h> | |
26 | #include <libintl.h> | |
27 | #include <stdarg.h> | |
28 | #include <stdlib.h> | |
29 | #include <string.h> | |
30 | ||
4b10dd6c | 31 | #include "charmap.h" |
19bc17a9 RM |
32 | #include "error.h" |
33 | #include "linereader.h" | |
4b10dd6c | 34 | #include "localedef.h" |
19bc17a9 RM |
35 | |
36 | ||
/* Forward declarations of the tokenizer helpers private to this file.
   lr_token calls each of them after it has already consumed the first
   character of the token.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 const struct repertoire_t *repertoire);
19bc17a9 RM |
44 | |
45 | ||
46 | struct linereader * | |
47 | lr_open (const char *fname, kw_hash_fct_t hf) | |
48 | { | |
49 | FILE *fp; | |
19bc17a9 RM |
50 | |
51 | if (fname == NULL || strcmp (fname, "-") == 0 | |
52 | || strcmp (fname, "/dev/stdin") == 0) | |
3e076219 | 53 | return lr_create (stdin, "<stdin>", hf); |
19bc17a9 RM |
54 | else |
55 | { | |
56 | fp = fopen (fname, "r"); | |
57 | if (fp == NULL) | |
58 | return NULL; | |
3e076219 | 59 | return lr_create (fp, fname, hf); |
19bc17a9 | 60 | } |
3e076219 UD |
61 | } |
62 | ||
63 | struct linereader * | |
64 | lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf) | |
65 | { | |
66 | struct linereader *result; | |
67 | int n; | |
19bc17a9 RM |
68 | |
69 | result = (struct linereader *) xmalloc (sizeof (*result)); | |
70 | ||
71 | result->fp = fp; | |
3e076219 | 72 | result->fname = xstrdup (fname); |
19bc17a9 RM |
73 | result->buf = NULL; |
74 | result->bufsize = 0; | |
75 | result->lineno = 1; | |
76 | result->idx = 0; | |
77 | result->comment_char = '#'; | |
78 | result->escape_char = '\\'; | |
79 | result->translate_strings = 1; | |
80 | ||
81 | n = getdelim (&result->buf, &result->bufsize, '\n', result->fp); | |
82 | if (n < 0) | |
83 | { | |
84 | int save = errno; | |
85 | fclose (result->fp); | |
46ec036d | 86 | free ((char *) result->fname); |
19bc17a9 RM |
87 | free (result); |
88 | errno = save; | |
89 | return NULL; | |
90 | } | |
91 | ||
92 | if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n') | |
93 | n -= 2; | |
94 | ||
95 | result->buf[n] = '\0'; | |
96 | result->bufact = n; | |
97 | result->hash_fct = hf; | |
98 | ||
99 | return result; | |
100 | } | |
101 | ||
102 | ||
103 | int | |
104 | lr_eof (struct linereader *lr) | |
105 | { | |
106 | return lr->bufact = 0; | |
107 | } | |
108 | ||
109 | ||
110 | void | |
111 | lr_close (struct linereader *lr) | |
112 | { | |
113 | fclose (lr->fp); | |
114 | free (lr->buf); | |
115 | free (lr); | |
116 | } | |
117 | ||
118 | ||
119 | int | |
120 | lr_next (struct linereader *lr) | |
121 | { | |
122 | int n; | |
123 | ||
124 | n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp); | |
125 | if (n < 0) | |
126 | return -1; | |
127 | ||
128 | ++lr->lineno; | |
129 | ||
130 | if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n') | |
131 | { | |
4b10dd6c UD |
132 | #if 0 |
133 | /* XXX Is this correct? */ | |
19bc17a9 RM |
134 | /* An escaped newline character is substituted with a single <SP>. */ |
135 | --n; | |
136 | lr->buf[n - 1] = ' '; | |
4b10dd6c UD |
137 | #else |
138 | n -= 2; | |
139 | #endif | |
19bc17a9 RM |
140 | } |
141 | ||
142 | lr->buf[n] = '\0'; | |
143 | lr->bufact = n; | |
144 | lr->idx = 0; | |
145 | ||
146 | return 0; | |
147 | } | |
148 | ||
149 | ||
/* Counter defined in error.c; it is incremented on every call of
   `error'.  */
extern unsigned int error_message_count;

/* Name of the running program.  The program using this file has to
   define this variable and set it.  */
extern char *program_name;
157 | ||
158 | ||
159 | struct token * | |
4b10dd6c UD |
160 | lr_token (struct linereader *lr, const struct charmap_t *charmap, |
161 | const struct repertoire_t *repertoire) | |
19bc17a9 RM |
162 | { |
163 | int ch; | |
164 | ||
165 | while (1) | |
166 | { | |
167 | do | |
168 | { | |
169 | ch = lr_getc (lr); | |
170 | ||
76fbcfdd UD |
171 | if (ch == EOF) |
172 | { | |
173 | lr->token.tok = tok_eof; | |
174 | return &lr->token; | |
175 | }; | |
176 | ||
19bc17a9 RM |
177 | if (ch == '\n') |
178 | { | |
179 | lr->token.tok = tok_eol; | |
180 | return &lr->token; | |
181 | } | |
182 | } | |
183 | while (isspace (ch)); | |
184 | ||
185 | if (ch == EOF) | |
186 | { | |
187 | lr->token.tok = tok_eof; | |
188 | return &lr->token; | |
189 | }; | |
190 | ||
191 | if (ch != lr->comment_char) | |
192 | break; | |
193 | ||
a0dc5206 UD |
194 | /* Is there an newline at the end of the buffer? */ |
195 | if (lr->buf[lr->bufact - 1] != '\n') | |
196 | { | |
197 | /* No. Some people want this to mean that only the line in | |
198 | the file not the logical, concatenated line is ignored. | |
199 | Let's try this. */ | |
200 | lr->idx = lr->bufact; | |
201 | continue; | |
202 | } | |
203 | ||
19bc17a9 RM |
204 | /* Ignore rest of line. */ |
205 | lr_ignore_rest (lr, 0); | |
206 | lr->token.tok = tok_eol; | |
207 | return &lr->token; | |
208 | } | |
209 | ||
210 | /* Match escape sequences. */ | |
211 | if (ch == lr->escape_char) | |
212 | return get_toplvl_escape (lr); | |
213 | ||
214 | /* Match ellipsis. */ | |
4b10dd6c | 215 | if (ch == '.') |
19bc17a9 | 216 | { |
a0dc5206 UD |
217 | if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0) |
218 | { | |
219 | int cnt; | |
220 | for (cnt = 0; cnt < 10; ++cnt) | |
221 | lr_getc (lr); | |
222 | lr->token.tok = tok_ellipsis4_2; | |
223 | return &lr->token; | |
224 | } | |
4b10dd6c UD |
225 | if (strncmp (&lr->buf[lr->idx], "...", 3) == 0) |
226 | { | |
227 | lr_getc (lr); | |
228 | lr_getc (lr); | |
229 | lr_getc (lr); | |
230 | lr->token.tok = tok_ellipsis4; | |
231 | return &lr->token; | |
232 | } | |
233 | if (strncmp (&lr->buf[lr->idx], "..", 2) == 0) | |
234 | { | |
235 | lr_getc (lr); | |
236 | lr_getc (lr); | |
237 | lr->token.tok = tok_ellipsis3; | |
238 | return &lr->token; | |
239 | } | |
a0dc5206 UD |
240 | if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0) |
241 | { | |
242 | int cnt; | |
243 | for (cnt = 0; cnt < 6; ++cnt) | |
244 | lr_getc (lr); | |
245 | lr->token.tok = tok_ellipsis2_2; | |
246 | return &lr->token; | |
247 | } | |
4b10dd6c UD |
248 | if (lr->buf[lr->idx] == '.') |
249 | { | |
250 | lr_getc (lr); | |
251 | lr->token.tok = tok_ellipsis2; | |
252 | return &lr->token; | |
253 | } | |
19bc17a9 RM |
254 | } |
255 | ||
256 | switch (ch) | |
257 | { | |
258 | case '<': | |
259 | return get_symname (lr); | |
260 | ||
261 | case '0' ... '9': | |
262 | lr->token.tok = tok_number; | |
263 | lr->token.val.num = ch - '0'; | |
264 | ||
265 | while (isdigit (ch = lr_getc (lr))) | |
266 | { | |
267 | lr->token.val.num *= 10; | |
268 | lr->token.val.num += ch - '0'; | |
269 | } | |
270 | if (isalpha (ch)) | |
5290baf0 | 271 | lr_error (lr, _("garbage at end of number")); |
19bc17a9 RM |
272 | lr_ungetn (lr, 1); |
273 | ||
274 | return &lr->token; | |
275 | ||
276 | case ';': | |
277 | lr->token.tok = tok_semicolon; | |
278 | return &lr->token; | |
279 | ||
280 | case ',': | |
281 | lr->token.tok = tok_comma; | |
282 | return &lr->token; | |
283 | ||
284 | case '(': | |
285 | lr->token.tok = tok_open_brace; | |
286 | return &lr->token; | |
287 | ||
288 | case ')': | |
289 | lr->token.tok = tok_close_brace; | |
290 | return &lr->token; | |
291 | ||
292 | case '"': | |
4b10dd6c | 293 | return get_string (lr, charmap, repertoire); |
19bc17a9 RM |
294 | |
295 | case '-': | |
296 | ch = lr_getc (lr); | |
297 | if (ch == '1') | |
298 | { | |
299 | lr->token.tok = tok_minus1; | |
300 | return &lr->token; | |
301 | } | |
302 | lr_ungetn (lr, 2); | |
303 | break; | |
304 | } | |
305 | ||
306 | return get_ident (lr); | |
307 | } | |
308 | ||
309 | ||
310 | static struct token * | |
311 | get_toplvl_escape (struct linereader *lr) | |
312 | { | |
313 | /* This is supposed to be a numeric value. We return the | |
314 | numerical value and the number of bytes. */ | |
315 | size_t start_idx = lr->idx - 1; | |
4b10dd6c | 316 | char *bytes = lr->token.val.charcode.bytes; |
19bc17a9 RM |
317 | int nbytes = 0; |
318 | int ch; | |
319 | ||
320 | do | |
321 | { | |
322 | unsigned int byte = 0; | |
323 | unsigned int base = 8; | |
324 | ||
325 | ch = lr_getc (lr); | |
326 | ||
327 | if (ch == 'd') | |
328 | { | |
329 | base = 10; | |
330 | ch = lr_getc (lr); | |
331 | } | |
332 | else if (ch == 'x') | |
333 | { | |
334 | base = 16; | |
335 | ch = lr_getc (lr); | |
336 | } | |
337 | ||
338 | if ((base == 16 && !isxdigit (ch)) | |
ba1ffaa1 | 339 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
19bc17a9 RM |
340 | { |
341 | esc_error: | |
4b10dd6c | 342 | lr->token.val.str.startmb = &lr->buf[start_idx]; |
19bc17a9 | 343 | |
76fbcfdd | 344 | while (ch != EOF && !isspace (ch)) |
19bc17a9 | 345 | ch = lr_getc (lr); |
4b10dd6c | 346 | lr->token.val.str.lenmb = lr->idx - start_idx; |
19bc17a9 RM |
347 | |
348 | lr->token.tok = tok_error; | |
349 | return &lr->token; | |
350 | } | |
351 | ||
352 | if (isdigit (ch)) | |
353 | byte = ch - '0'; | |
354 | else | |
4b10dd6c | 355 | byte = tolower (ch) - 'a' + 10; |
19bc17a9 RM |
356 | |
357 | ch = lr_getc (lr); | |
358 | if ((base == 16 && !isxdigit (ch)) | |
ba1ffaa1 | 359 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
19bc17a9 RM |
360 | goto esc_error; |
361 | ||
362 | byte *= base; | |
363 | if (isdigit (ch)) | |
364 | byte += ch - '0'; | |
365 | else | |
4b10dd6c | 366 | byte += tolower (ch) - 'a' + 10; |
19bc17a9 RM |
367 | |
368 | ch = lr_getc (lr); | |
369 | if (base != 16 && isdigit (ch)) | |
370 | { | |
371 | byte *= base; | |
679f5a56 | 372 | byte += ch - '0'; |
19bc17a9 RM |
373 | |
374 | ch = lr_getc (lr); | |
375 | } | |
376 | ||
4b10dd6c | 377 | bytes[nbytes++] = byte; |
19bc17a9 RM |
378 | } |
379 | while (ch == lr->escape_char && nbytes < 4); | |
380 | ||
381 | if (!isspace (ch)) | |
382 | lr_error (lr, _("garbage at end of character code specification")); | |
383 | ||
384 | lr_ungetn (lr, 1); | |
385 | ||
386 | lr->token.tok = tok_charcode; | |
19bc17a9 RM |
387 | lr->token.val.charcode.nbytes = nbytes; |
388 | ||
389 | return &lr->token; | |
390 | } | |
391 | ||
392 | ||
/* Helpers to append to the growing buffers of the functions below.
   They operate on variables of the function which uses them:

     buf,  bufact,  bufmax    byte buffer, used length, capacity;
     buf2, buf2act, buf2max   wide character buffer, used length and
                              capacity, both counted in elements.

   The capacities must not be zero.  */

/* Append the single byte CH to buf, doubling the buffer if it is
   full.  */
#define ADDC(ch) \
  do                                                                          \
    {                                                                         \
      if (bufact == bufmax)                                                   \
        {                                                                     \
          bufmax *= 2;                                                        \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      buf[bufact++] = (ch);                                                   \
    }                                                                         \
  while (0)


/* Append the L bytes starting at S to buf.  If they do not fit, the
   capacity becomes twice the larger of the old capacity and L, which
   is always enough.  The used length must not be touched before the
   copy since it determines where the new bytes go.  */
#define ADDS(s, l) \
  do                                                                          \
    {                                                                         \
      size_t _l = (l);                                                        \
      if (bufact + _l > bufmax)                                               \
        {                                                                     \
          if (bufmax < _l)                                                    \
            bufmax = _l;                                                      \
          bufmax *= 2;                                                        \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      memcpy (&buf[bufact], (s), _l);                                         \
      bufact += _l;                                                           \
    }                                                                         \
  while (0)


/* Append the wide character CH to buf2, doubling the buffer if it is
   full.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2act == buf2max)                                                 \
        {                                                                     \
          buf2max *= 2;                                                       \
          buf2 = xrealloc (buf2, buf2max * sizeof (*buf2));                   \
        }                                                                     \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
434 | ||
435 | ||
436 | static struct token * | |
437 | get_symname (struct linereader *lr) | |
438 | { | |
439 | /* Symbol in brackets. We must distinguish three kinds: | |
440 | 1. reserved words | |
441 | 2. ISO 10646 position values | |
442 | 3. all other. */ | |
443 | char *buf; | |
444 | size_t bufact = 0; | |
445 | size_t bufmax = 56; | |
446 | const struct keyword_t *kw; | |
447 | int ch; | |
448 | ||
449 | buf = (char *) xmalloc (bufmax); | |
450 | ||
451 | do | |
452 | { | |
453 | ch = lr_getc (lr); | |
454 | if (ch == lr->escape_char) | |
455 | { | |
456 | int c2 = lr_getc (lr); | |
457 | ADDC (c2); | |
458 | ||
459 | if (c2 == '\n') | |
460 | ch = '\n'; | |
461 | } | |
462 | else | |
463 | ADDC (ch); | |
464 | } | |
465 | while (ch != '>' && ch != '\n'); | |
466 | ||
467 | if (ch == '\n') | |
468 | lr_error (lr, _("unterminated symbolic name")); | |
469 | ||
470 | /* Test for ISO 10646 position value. */ | |
471 | if (buf[0] == 'U' && (bufact == 6 || bufact == 10)) | |
472 | { | |
473 | char *cp = buf + 1; | |
474 | while (cp < &buf[bufact - 1] && isxdigit (*cp)) | |
475 | ++cp; | |
476 | ||
477 | if (cp == &buf[bufact - 1]) | |
478 | { | |
479 | /* Yes, it is. */ | |
4b10dd6c UD |
480 | lr->token.tok = tok_ucs4; |
481 | lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16); | |
19bc17a9 RM |
482 | |
483 | return &lr->token; | |
484 | } | |
485 | } | |
486 | ||
487 | /* It is a symbolic name. Test for reserved words. */ | |
488 | kw = lr->hash_fct (buf, bufact - 1); | |
489 | ||
490 | if (kw != NULL && kw->symname_or_ident == 1) | |
491 | { | |
492 | lr->token.tok = kw->token; | |
493 | free (buf); | |
494 | } | |
495 | else | |
496 | { | |
497 | lr->token.tok = tok_bsymbol; | |
498 | ||
499 | buf[bufact] = '\0'; | |
500 | buf = xrealloc (buf, bufact + 1); | |
501 | ||
4b10dd6c UD |
502 | lr->token.val.str.startmb = buf; |
503 | lr->token.val.str.lenmb = bufact - 1; | |
19bc17a9 RM |
504 | } |
505 | ||
506 | return &lr->token; | |
507 | } | |
508 | ||
509 | ||
510 | static struct token * | |
511 | get_ident (struct linereader *lr) | |
512 | { | |
513 | char *buf; | |
514 | size_t bufact; | |
515 | size_t bufmax = 56; | |
516 | const struct keyword_t *kw; | |
517 | int ch; | |
518 | ||
519 | buf = xmalloc (bufmax); | |
520 | bufact = 0; | |
521 | ||
522 | ADDC (lr->buf[lr->idx - 1]); | |
523 | ||
524 | while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';' | |
525 | && ch != '<' && ch != ',') | |
4b10dd6c UD |
526 | { |
527 | if (ch == lr->escape_char) | |
528 | { | |
529 | ch = lr_getc (lr); | |
530 | if (ch == '\n' || ch == EOF) | |
531 | { | |
532 | lr_error (lr, _("invalid escape sequence")); | |
533 | break; | |
534 | } | |
535 | } | |
536 | ADDC (ch); | |
537 | } | |
19bc17a9 RM |
538 | |
539 | lr_ungetn (lr, 1); | |
540 | ||
541 | kw = lr->hash_fct (buf, bufact); | |
542 | ||
543 | if (kw != NULL && kw->symname_or_ident == 0) | |
544 | { | |
545 | lr->token.tok = kw->token; | |
546 | free (buf); | |
547 | } | |
548 | else | |
549 | { | |
550 | lr->token.tok = tok_ident; | |
551 | ||
552 | buf[bufact] = '\0'; | |
553 | buf = xrealloc (buf, bufact + 1); | |
554 | ||
4b10dd6c UD |
555 | lr->token.val.str.startmb = buf; |
556 | lr->token.val.str.lenmb = bufact; | |
19bc17a9 RM |
557 | } |
558 | ||
559 | return &lr->token; | |
560 | } | |
561 | ||
562 | ||
563 | static struct token * | |
4b10dd6c UD |
564 | get_string (struct linereader *lr, const struct charmap_t *charmap, |
565 | const struct repertoire_t *repertoire) | |
19bc17a9 | 566 | { |
4b10dd6c UD |
567 | int return_widestr = lr->return_widestr; |
568 | char *buf; | |
a9c27b3e | 569 | wchar_t *buf2 = NULL; |
19bc17a9 RM |
570 | size_t bufact; |
571 | size_t bufmax = 56; | |
19bc17a9 | 572 | |
4b10dd6c | 573 | /* We must return two different strings. */ |
19bc17a9 RM |
574 | buf = xmalloc (bufmax); |
575 | bufact = 0; | |
576 | ||
4b10dd6c UD |
577 | /* We know it'll be a string. */ |
578 | lr->token.tok = tok_string; | |
579 | ||
580 | /* If we need not translate the strings (i.e., expand <...> parts) | |
581 | we can run a simple loop. */ | |
582 | if (!lr->translate_strings) | |
583 | { | |
584 | int ch; | |
585 | ||
586 | buf2 = NULL; | |
587 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | |
19bc17a9 | 588 | ADDC (ch); |
4b10dd6c UD |
589 | |
590 | /* Catch errors with trailing escape character. */ | |
591 | if (bufact > 0 && buf[bufact - 1] == lr->escape_char | |
592 | && (bufact == 1 || buf[bufact - 2] != lr->escape_char)) | |
593 | { | |
594 | lr_error (lr, _("illegal escape sequence at end of string")); | |
595 | --bufact; | |
596 | } | |
597 | else if (ch == '\n' || ch == EOF) | |
598 | lr_error (lr, _("unterminated string")); | |
599 | ||
600 | ADDC ('\0'); | |
601 | } | |
602 | else | |
603 | { | |
604 | int illegal_string = 0; | |
605 | size_t buf2act = 0; | |
606 | size_t buf2max = 56 * sizeof (uint32_t); | |
607 | int ch; | |
608 | int warned = 0; | |
609 | ||
610 | /* We have to provide the wide character result as well. */ | |
611 | if (return_widestr) | |
612 | buf2 = xmalloc (buf2max); | |
613 | ||
614 | /* Read until the end of the string (or end of the line or file). */ | |
615 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | |
616 | { | |
617 | size_t startidx; | |
618 | uint32_t wch; | |
619 | struct charseq *seq; | |
620 | ||
621 | if (ch != '<') | |
622 | { | |
623 | /* The standards leave it up to the implementation to decide | |
624 | what to do with character which stand for themself. We | |
625 | could jump through hoops to find out the value relative to | |
626 | the charmap and the repertoire map, but instead we leave | |
627 | it up to the locale definition author to write a better | |
628 | definition. We assume here that every character which | |
629 | stands for itself is encoded using ISO 8859-1. Using the | |
630 | escape character is allowed. */ | |
631 | if (ch == lr->escape_char) | |
632 | { | |
633 | ch = lr_getc (lr); | |
634 | if (ch == '\n' || ch == EOF) | |
635 | break; | |
636 | } | |
637 | ||
638 | if (verbose && !warned) | |
639 | { | |
640 | lr_error (lr, _("\ | |
641 | non-symbolic character value should not be used")); | |
642 | warned = 1; | |
643 | } | |
644 | ||
645 | ADDC (ch); | |
646 | if (return_widestr) | |
647 | ADDWC ((uint32_t) ch); | |
648 | ||
649 | continue; | |
650 | } | |
651 | ||
652 | /* Now we have to search for the end of the symbolic name, i.e., | |
653 | the closing '>'. */ | |
654 | startidx = bufact; | |
655 | while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) | |
656 | { | |
657 | if (ch == lr->escape_char) | |
658 | { | |
659 | ch = lr_getc (lr); | |
660 | if (ch == '\n' || ch == EOF) | |
661 | break; | |
662 | } | |
663 | ADDC (ch); | |
664 | } | |
665 | if (ch == '\n' || ch == EOF) | |
666 | /* Not a correct string. */ | |
667 | break; | |
668 | if (bufact == startidx) | |
669 | { | |
670 | /* <> is no correct name. Ignore it and also signal an | |
671 | error. */ | |
19bc17a9 | 672 | illegal_string = 1; |
4b10dd6c UD |
673 | continue; |
674 | } | |
19bc17a9 | 675 | |
4b10dd6c UD |
676 | /* It might be a Uxxxx symbol. */ |
677 | if (buf[startidx] == 'U' | |
678 | && (bufact - startidx == 5 || bufact - startidx == 9)) | |
679 | { | |
680 | char *cp = buf + startidx + 1; | |
681 | while (cp < &buf[bufact] && isxdigit (*cp)) | |
682 | ++cp; | |
683 | ||
684 | if (cp == &buf[bufact]) | |
685 | { | |
3c833378 | 686 | char utmp[10]; |
4b10dd6c UD |
687 | const char *symbol = NULL; |
688 | ||
689 | /* Yes, it is. */ | |
690 | ADDC ('\0'); | |
691 | wch = strtoul (buf + startidx + 1, NULL, 16); | |
692 | ||
693 | /* Now forget about the name we just added. */ | |
694 | bufact = startidx; | |
695 | ||
696 | if (return_widestr) | |
697 | ADDWC (wch); | |
698 | ||
3c833378 UD |
699 | /* See whether the charmap contains the Uxxxxxxxx names. */ |
700 | snprintf (utmp, sizeof (utmp), "U%08X", wch); | |
701 | seq = charmap_find_value (charmap, utmp, 9); | |
4b10dd6c | 702 | |
3c833378 | 703 | if (seq == NULL) |
4b10dd6c | 704 | { |
3c833378 UD |
705 | /* No, this isn't the case. Now determine from |
706 | the repertoire the name of the character and | |
707 | find it in the charmap. */ | |
708 | if (repertoire != NULL) | |
709 | symbol = repertoire_find_symbol (repertoire, wch); | |
4b10dd6c | 710 | |
3c833378 | 711 | if (symbol == NULL) |
d364e525 UD |
712 | /* We cannot generate a string since we |
713 | cannot map from the Unicode number to the | |
714 | character symbol. */ | |
715 | illegal_string = 1; | |
4b10dd6c | 716 | else |
3c833378 UD |
717 | { |
718 | seq = charmap_find_value (charmap, symbol, | |
719 | strlen (symbol)); | |
720 | ||
721 | if (seq == NULL) | |
d364e525 UD |
722 | /* Not a known name. */ |
723 | illegal_string = 1; | |
3c833378 | 724 | } |
4b10dd6c UD |
725 | } |
726 | ||
3c833378 UD |
727 | if (seq != NULL) |
728 | ADDS (seq->bytes, seq->nbytes); | |
729 | ||
4b10dd6c UD |
730 | continue; |
731 | } | |
732 | } | |
733 | ||
3c833378 UD |
734 | /* We now have the symbolic name in buf[startidx] to |
735 | buf[bufact-1]. Now find out the value for this character | |
736 | in the charmap as well as in the repertoire map (in this | |
737 | order). */ | |
738 | seq = charmap_find_value (charmap, &buf[startidx], | |
739 | bufact - startidx); | |
740 | ||
741 | if (seq == NULL) | |
742 | { | |
743 | /* This name is not in the charmap. */ | |
744 | lr_error (lr, _("symbol `%.*s' not in charmap"), | |
745 | (int) (bufact - startidx), &buf[startidx]); | |
746 | illegal_string = 1; | |
747 | } | |
748 | ||
4b10dd6c UD |
749 | if (return_widestr) |
750 | { | |
3c833378 UD |
751 | /* Now the same for the multibyte representation. */ |
752 | if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE) | |
753 | wch = seq->ucs4; | |
754 | else | |
755 | { | |
756 | wch = repertoire_find_value (repertoire, &buf[startidx], | |
757 | bufact - startidx); | |
758 | if (seq != NULL) | |
759 | seq->ucs4 = wch; | |
760 | } | |
761 | ||
4b10dd6c UD |
762 | if (wch == ILLEGAL_CHAR_VALUE) |
763 | { | |
764 | /* This name is not in the repertoire map. */ | |
765 | lr_error (lr, _("symbol `%.*s' not in repertoire map"), | |
70e51ab9 | 766 | (int) (bufact - startidx), &buf[startidx]); |
4b10dd6c UD |
767 | illegal_string = 1; |
768 | } | |
769 | else | |
770 | ADDWC (wch); | |
771 | } | |
772 | ||
3c833378 UD |
773 | /* Now forget about the name we just added. */ |
774 | bufact = startidx; | |
19bc17a9 | 775 | |
3c833378 UD |
776 | /* And copy the bytes. */ |
777 | if (seq != NULL) | |
778 | ADDS (seq->bytes, seq->nbytes); | |
4b10dd6c | 779 | } |
19bc17a9 | 780 | |
4b10dd6c UD |
781 | if (ch == '\n' || ch == EOF) |
782 | { | |
783 | lr_error (lr, _("unterminated string")); | |
784 | illegal_string = 1; | |
785 | } | |
19bc17a9 | 786 | |
4b10dd6c UD |
787 | if (illegal_string) |
788 | { | |
789 | free (buf); | |
790 | if (buf2 != NULL) | |
791 | free (buf2); | |
792 | lr->token.val.str.startmb = NULL; | |
793 | lr->token.val.str.lenmb = 0; | |
19bc17a9 | 794 | |
4b10dd6c UD |
795 | return &lr->token; |
796 | } | |
19bc17a9 | 797 | |
4b10dd6c | 798 | ADDC ('\0'); |
19bc17a9 | 799 | |
4b10dd6c UD |
800 | if (return_widestr) |
801 | { | |
802 | ADDWC (0); | |
803 | lr->token.val.str.startwc = xrealloc (buf2, | |
804 | buf2act * sizeof (uint32_t)); | |
805 | lr->token.val.str.lenwc = buf2act; | |
806 | } | |
19bc17a9 RM |
807 | } |
808 | ||
4b10dd6c UD |
809 | lr->token.val.str.startmb = xrealloc (buf, bufact); |
810 | lr->token.val.str.lenmb = bufact; | |
811 | ||
19bc17a9 RM |
812 | return &lr->token; |
813 | } |