]>
Commit | Line | Data |
---|---|---|
f2b98f97 | 1 | /* Copyright (C) 1996-2001, 2002 Free Software Foundation, Inc. |
5290baf0 | 2 | This file is part of the GNU C Library. |
4b10dd6c | 3 | Contributed by Ulrich Drepper <drepper@gnu.org>, 1996. |
19bc17a9 | 4 | |
5290baf0 | 5 | The GNU C Library is free software; you can redistribute it and/or |
41bdb6e2 AJ |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either | |
8 | version 2.1 of the License, or (at your option) any later version. | |
19bc17a9 | 9 | |
5290baf0 UD |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
41bdb6e2 | 13 | Lesser General Public License for more details. |
19bc17a9 | 14 | |
41bdb6e2 AJ |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, write to the Free | |
17 | Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
18 | 02111-1307 USA. */ | |
19bc17a9 RM |
19 | |
20 | #ifdef HAVE_CONFIG_H | |
21 | # include <config.h> | |
22 | #endif | |
23 | ||
47e8b443 | 24 | #include <assert.h> |
19bc17a9 RM |
25 | #include <ctype.h> |
26 | #include <errno.h> | |
27 | #include <libintl.h> | |
28 | #include <stdarg.h> | |
29 | #include <stdlib.h> | |
30 | #include <string.h> | |
31 | ||
f2b98f97 | 32 | #include "localedef.h" |
4b10dd6c | 33 | #include "charmap.h" |
19bc17a9 RM |
34 | #include "error.h" |
35 | #include "linereader.h" | |
47e8b443 | 36 | #include "locfile.h" |
93693c4d | 37 | |
/* Forward declarations of the tokenizer helpers that are private to
   this file.  Each of them returns a pointer to the token object
   embedded in the line reader it was given.  */
static struct token *get_toplvl_escape (struct linereader *lr);
static struct token *get_symname (struct linereader *lr);
static struct token *get_ident (struct linereader *lr);
static struct token *get_string (struct linereader *lr,
                                 const struct charmap_t *charmap,
                                 struct localedef_t *locale,
                                 const struct repertoire_t *repertoire,
                                 int verbose);
19bc17a9 RM |
47 | |
48 | ||
49 | struct linereader * | |
50 | lr_open (const char *fname, kw_hash_fct_t hf) | |
51 | { | |
52 | FILE *fp; | |
19bc17a9 RM |
53 | |
54 | if (fname == NULL || strcmp (fname, "-") == 0 | |
55 | || strcmp (fname, "/dev/stdin") == 0) | |
3e076219 | 56 | return lr_create (stdin, "<stdin>", hf); |
19bc17a9 RM |
57 | else |
58 | { | |
2e2dc1a5 | 59 | fp = fopen (fname, "rm"); |
19bc17a9 RM |
60 | if (fp == NULL) |
61 | return NULL; | |
3e076219 | 62 | return lr_create (fp, fname, hf); |
19bc17a9 | 63 | } |
3e076219 UD |
64 | } |
65 | ||
66 | struct linereader * | |
67 | lr_create (FILE *fp, const char *fname, kw_hash_fct_t hf) | |
68 | { | |
69 | struct linereader *result; | |
70 | int n; | |
19bc17a9 RM |
71 | |
72 | result = (struct linereader *) xmalloc (sizeof (*result)); | |
73 | ||
74 | result->fp = fp; | |
3e076219 | 75 | result->fname = xstrdup (fname); |
19bc17a9 RM |
76 | result->buf = NULL; |
77 | result->bufsize = 0; | |
78 | result->lineno = 1; | |
79 | result->idx = 0; | |
80 | result->comment_char = '#'; | |
81 | result->escape_char = '\\'; | |
82 | result->translate_strings = 1; | |
83 | ||
84 | n = getdelim (&result->buf, &result->bufsize, '\n', result->fp); | |
85 | if (n < 0) | |
86 | { | |
87 | int save = errno; | |
88 | fclose (result->fp); | |
46ec036d | 89 | free ((char *) result->fname); |
19bc17a9 RM |
90 | free (result); |
91 | errno = save; | |
92 | return NULL; | |
93 | } | |
94 | ||
95 | if (n > 1 && result->buf[n - 2] == '\\' && result->buf[n - 1] == '\n') | |
96 | n -= 2; | |
97 | ||
98 | result->buf[n] = '\0'; | |
99 | result->bufact = n; | |
100 | result->hash_fct = hf; | |
101 | ||
102 | return result; | |
103 | } | |
104 | ||
105 | ||
106 | int | |
107 | lr_eof (struct linereader *lr) | |
108 | { | |
109 | return lr->bufact = 0; | |
110 | } | |
111 | ||
112 | ||
113 | void | |
114 | lr_close (struct linereader *lr) | |
115 | { | |
116 | fclose (lr->fp); | |
117 | free (lr->buf); | |
118 | free (lr); | |
119 | } | |
120 | ||
121 | ||
122 | int | |
123 | lr_next (struct linereader *lr) | |
124 | { | |
125 | int n; | |
126 | ||
127 | n = getdelim (&lr->buf, &lr->bufsize, '\n', lr->fp); | |
128 | if (n < 0) | |
129 | return -1; | |
130 | ||
131 | ++lr->lineno; | |
132 | ||
133 | if (n > 1 && lr->buf[n - 2] == lr->escape_char && lr->buf[n - 1] == '\n') | |
134 | { | |
4b10dd6c UD |
135 | #if 0 |
136 | /* XXX Is this correct? */ | |
19bc17a9 RM |
137 | /* An escaped newline character is substituted with a single <SP>. */ |
138 | --n; | |
139 | lr->buf[n - 1] = ' '; | |
4b10dd6c UD |
140 | #else |
141 | n -= 2; | |
142 | #endif | |
19bc17a9 RM |
143 | } |
144 | ||
145 | lr->buf[n] = '\0'; | |
146 | lr->bufact = n; | |
147 | lr->idx = 0; | |
148 | ||
149 | return 0; | |
150 | } | |
151 | ||
152 | ||
/* Counter defined in error.c; it is incremented each time `error'
   is called.  */
extern unsigned int error_message_count;

/* Name of the running program.  The calling program has to define
   this variable and set it.  */
extern char *program_name;
160 | ||
161 | ||
162 | struct token * | |
4b10dd6c | 163 | lr_token (struct linereader *lr, const struct charmap_t *charmap, |
47e8b443 UD |
164 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
165 | int verbose) | |
19bc17a9 RM |
166 | { |
167 | int ch; | |
168 | ||
169 | while (1) | |
170 | { | |
171 | do | |
172 | { | |
173 | ch = lr_getc (lr); | |
174 | ||
76fbcfdd UD |
175 | if (ch == EOF) |
176 | { | |
177 | lr->token.tok = tok_eof; | |
178 | return &lr->token; | |
179 | }; | |
180 | ||
19bc17a9 RM |
181 | if (ch == '\n') |
182 | { | |
183 | lr->token.tok = tok_eol; | |
184 | return &lr->token; | |
185 | } | |
186 | } | |
187 | while (isspace (ch)); | |
188 | ||
189 | if (ch == EOF) | |
190 | { | |
191 | lr->token.tok = tok_eof; | |
192 | return &lr->token; | |
193 | }; | |
194 | ||
195 | if (ch != lr->comment_char) | |
196 | break; | |
197 | ||
a0dc5206 UD |
198 | /* Is there an newline at the end of the buffer? */ |
199 | if (lr->buf[lr->bufact - 1] != '\n') | |
200 | { | |
201 | /* No. Some people want this to mean that only the line in | |
202 | the file not the logical, concatenated line is ignored. | |
203 | Let's try this. */ | |
204 | lr->idx = lr->bufact; | |
205 | continue; | |
206 | } | |
207 | ||
19bc17a9 RM |
208 | /* Ignore rest of line. */ |
209 | lr_ignore_rest (lr, 0); | |
210 | lr->token.tok = tok_eol; | |
211 | return &lr->token; | |
212 | } | |
213 | ||
214 | /* Match escape sequences. */ | |
215 | if (ch == lr->escape_char) | |
216 | return get_toplvl_escape (lr); | |
217 | ||
218 | /* Match ellipsis. */ | |
4b10dd6c | 219 | if (ch == '.') |
19bc17a9 | 220 | { |
a0dc5206 UD |
221 | if (strncmp (&lr->buf[lr->idx], "...(2)....", 10) == 0) |
222 | { | |
223 | int cnt; | |
224 | for (cnt = 0; cnt < 10; ++cnt) | |
225 | lr_getc (lr); | |
226 | lr->token.tok = tok_ellipsis4_2; | |
227 | return &lr->token; | |
228 | } | |
4b10dd6c UD |
229 | if (strncmp (&lr->buf[lr->idx], "...", 3) == 0) |
230 | { | |
231 | lr_getc (lr); | |
232 | lr_getc (lr); | |
233 | lr_getc (lr); | |
234 | lr->token.tok = tok_ellipsis4; | |
235 | return &lr->token; | |
236 | } | |
237 | if (strncmp (&lr->buf[lr->idx], "..", 2) == 0) | |
238 | { | |
239 | lr_getc (lr); | |
240 | lr_getc (lr); | |
241 | lr->token.tok = tok_ellipsis3; | |
242 | return &lr->token; | |
243 | } | |
a0dc5206 UD |
244 | if (strncmp (&lr->buf[lr->idx], ".(2)..", 6) == 0) |
245 | { | |
246 | int cnt; | |
247 | for (cnt = 0; cnt < 6; ++cnt) | |
248 | lr_getc (lr); | |
249 | lr->token.tok = tok_ellipsis2_2; | |
250 | return &lr->token; | |
251 | } | |
4b10dd6c UD |
252 | if (lr->buf[lr->idx] == '.') |
253 | { | |
254 | lr_getc (lr); | |
255 | lr->token.tok = tok_ellipsis2; | |
256 | return &lr->token; | |
257 | } | |
19bc17a9 RM |
258 | } |
259 | ||
260 | switch (ch) | |
261 | { | |
262 | case '<': | |
263 | return get_symname (lr); | |
264 | ||
265 | case '0' ... '9': | |
266 | lr->token.tok = tok_number; | |
267 | lr->token.val.num = ch - '0'; | |
268 | ||
269 | while (isdigit (ch = lr_getc (lr))) | |
270 | { | |
271 | lr->token.val.num *= 10; | |
272 | lr->token.val.num += ch - '0'; | |
273 | } | |
274 | if (isalpha (ch)) | |
5290baf0 | 275 | lr_error (lr, _("garbage at end of number")); |
19bc17a9 RM |
276 | lr_ungetn (lr, 1); |
277 | ||
278 | return &lr->token; | |
279 | ||
280 | case ';': | |
281 | lr->token.tok = tok_semicolon; | |
282 | return &lr->token; | |
283 | ||
284 | case ',': | |
285 | lr->token.tok = tok_comma; | |
286 | return &lr->token; | |
287 | ||
288 | case '(': | |
289 | lr->token.tok = tok_open_brace; | |
290 | return &lr->token; | |
291 | ||
292 | case ')': | |
293 | lr->token.tok = tok_close_brace; | |
294 | return &lr->token; | |
295 | ||
296 | case '"': | |
47e8b443 | 297 | return get_string (lr, charmap, locale, repertoire, verbose); |
19bc17a9 RM |
298 | |
299 | case '-': | |
300 | ch = lr_getc (lr); | |
301 | if (ch == '1') | |
302 | { | |
303 | lr->token.tok = tok_minus1; | |
304 | return &lr->token; | |
305 | } | |
306 | lr_ungetn (lr, 2); | |
307 | break; | |
308 | } | |
309 | ||
310 | return get_ident (lr); | |
311 | } | |
312 | ||
313 | ||
314 | static struct token * | |
315 | get_toplvl_escape (struct linereader *lr) | |
316 | { | |
317 | /* This is supposed to be a numeric value. We return the | |
318 | numerical value and the number of bytes. */ | |
319 | size_t start_idx = lr->idx - 1; | |
4b10dd6c | 320 | char *bytes = lr->token.val.charcode.bytes; |
19bc17a9 RM |
321 | int nbytes = 0; |
322 | int ch; | |
323 | ||
324 | do | |
325 | { | |
326 | unsigned int byte = 0; | |
327 | unsigned int base = 8; | |
328 | ||
329 | ch = lr_getc (lr); | |
330 | ||
331 | if (ch == 'd') | |
332 | { | |
333 | base = 10; | |
334 | ch = lr_getc (lr); | |
335 | } | |
336 | else if (ch == 'x') | |
337 | { | |
338 | base = 16; | |
339 | ch = lr_getc (lr); | |
340 | } | |
341 | ||
342 | if ((base == 16 && !isxdigit (ch)) | |
ba1ffaa1 | 343 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
19bc17a9 RM |
344 | { |
345 | esc_error: | |
4b10dd6c | 346 | lr->token.val.str.startmb = &lr->buf[start_idx]; |
19bc17a9 | 347 | |
76fbcfdd | 348 | while (ch != EOF && !isspace (ch)) |
19bc17a9 | 349 | ch = lr_getc (lr); |
4b10dd6c | 350 | lr->token.val.str.lenmb = lr->idx - start_idx; |
19bc17a9 RM |
351 | |
352 | lr->token.tok = tok_error; | |
353 | return &lr->token; | |
354 | } | |
355 | ||
356 | if (isdigit (ch)) | |
357 | byte = ch - '0'; | |
358 | else | |
4b10dd6c | 359 | byte = tolower (ch) - 'a' + 10; |
19bc17a9 RM |
360 | |
361 | ch = lr_getc (lr); | |
362 | if ((base == 16 && !isxdigit (ch)) | |
ba1ffaa1 | 363 | || (base != 16 && (ch < '0' || ch >= (int) ('0' + base)))) |
19bc17a9 RM |
364 | goto esc_error; |
365 | ||
366 | byte *= base; | |
367 | if (isdigit (ch)) | |
368 | byte += ch - '0'; | |
369 | else | |
4b10dd6c | 370 | byte += tolower (ch) - 'a' + 10; |
19bc17a9 RM |
371 | |
372 | ch = lr_getc (lr); | |
373 | if (base != 16 && isdigit (ch)) | |
374 | { | |
375 | byte *= base; | |
679f5a56 | 376 | byte += ch - '0'; |
19bc17a9 RM |
377 | |
378 | ch = lr_getc (lr); | |
379 | } | |
380 | ||
4b10dd6c | 381 | bytes[nbytes++] = byte; |
19bc17a9 | 382 | } |
c50ec4e0 | 383 | while (ch == lr->escape_char |
6dd67bd5 | 384 | && nbytes < (int) sizeof (lr->token.val.charcode.bytes)); |
19bc17a9 RM |
385 | |
386 | if (!isspace (ch)) | |
387 | lr_error (lr, _("garbage at end of character code specification")); | |
388 | ||
389 | lr_ungetn (lr, 1); | |
390 | ||
391 | lr->token.tok = tok_charcode; | |
19bc17a9 RM |
392 | lr->token.val.charcode.nbytes = nbytes; |
393 | ||
394 | return &lr->token; | |
395 | } | |
396 | ||
397 | ||
4b10dd6c UD |
/* Append the single byte CH to the growing buffer described by the
   caller's local variables `buf', `bufact' (bytes used) and `bufmax'
   (bytes allocated).  The capacity is doubled when the buffer is
   full.  CH is evaluated exactly once.  */
#define ADDC(ch) \
  do                                                                          \
    {                                                                         \
      if (bufmax == bufact)                                                   \
        buf = xrealloc (buf, bufmax *= 2);                                    \
      buf[bufact++] = (ch);                                                   \
    }                                                                         \
  while (0)
409 | ||
410 | ||
/* Append the L bytes starting at S to the growing buffer described by
   the caller's local variables `buf', `bufact' (bytes used) and
   `bufmax' (bytes allocated).

   When the data does not fit, the capacity is doubled and, should
   that still be too small, raised to exactly what is needed.  The
   previous version adjusted `bufact' instead of the capacity, which
   moved the write position and could copy past the end of the
   allocation.  */
#define ADDS(s, l) \
  do                                                                          \
    {                                                                         \
      size_t _l = (l);                                                        \
      if (bufact + _l > bufmax)                                               \
        {                                                                     \
          bufmax *= 2;                                                        \
          if (bufmax < bufact + _l)                                           \
            bufmax = bufact + _l;                                             \
          buf = xrealloc (buf, bufmax);                                       \
        }                                                                     \
      memcpy (&buf[bufact], s, _l);                                           \
      bufact += _l;                                                           \
    }                                                                         \
  while (0)
426 | ||
427 | ||
/* Append the wide character CH to the growing array described by the
   caller's local variables `buf2', `buf2act' (elements used) and
   `buf2max' (elements allocated).  The capacity is doubled when the
   array is full; an element is taken to be four bytes wide.  */
#define ADDWC(ch) \
  do                                                                          \
    {                                                                         \
      if (buf2max == buf2act)                                                 \
        buf2 = xrealloc (buf2, (buf2max *= 2) * 4);                           \
      buf2[buf2act++] = (ch);                                                 \
    }                                                                         \
  while (0)
439 | ||
440 | ||
441 | static struct token * | |
442 | get_symname (struct linereader *lr) | |
443 | { | |
444 | /* Symbol in brackets. We must distinguish three kinds: | |
445 | 1. reserved words | |
446 | 2. ISO 10646 position values | |
447 | 3. all other. */ | |
448 | char *buf; | |
449 | size_t bufact = 0; | |
450 | size_t bufmax = 56; | |
451 | const struct keyword_t *kw; | |
452 | int ch; | |
453 | ||
454 | buf = (char *) xmalloc (bufmax); | |
455 | ||
456 | do | |
457 | { | |
458 | ch = lr_getc (lr); | |
459 | if (ch == lr->escape_char) | |
460 | { | |
461 | int c2 = lr_getc (lr); | |
462 | ADDC (c2); | |
463 | ||
464 | if (c2 == '\n') | |
465 | ch = '\n'; | |
466 | } | |
467 | else | |
468 | ADDC (ch); | |
469 | } | |
470 | while (ch != '>' && ch != '\n'); | |
471 | ||
472 | if (ch == '\n') | |
473 | lr_error (lr, _("unterminated symbolic name")); | |
474 | ||
475 | /* Test for ISO 10646 position value. */ | |
476 | if (buf[0] == 'U' && (bufact == 6 || bufact == 10)) | |
477 | { | |
478 | char *cp = buf + 1; | |
479 | while (cp < &buf[bufact - 1] && isxdigit (*cp)) | |
480 | ++cp; | |
481 | ||
482 | if (cp == &buf[bufact - 1]) | |
483 | { | |
484 | /* Yes, it is. */ | |
4b10dd6c UD |
485 | lr->token.tok = tok_ucs4; |
486 | lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16); | |
19bc17a9 RM |
487 | |
488 | return &lr->token; | |
489 | } | |
490 | } | |
491 | ||
492 | /* It is a symbolic name. Test for reserved words. */ | |
493 | kw = lr->hash_fct (buf, bufact - 1); | |
494 | ||
495 | if (kw != NULL && kw->symname_or_ident == 1) | |
496 | { | |
497 | lr->token.tok = kw->token; | |
498 | free (buf); | |
499 | } | |
500 | else | |
501 | { | |
502 | lr->token.tok = tok_bsymbol; | |
503 | ||
504 | buf[bufact] = '\0'; | |
505 | buf = xrealloc (buf, bufact + 1); | |
506 | ||
4b10dd6c UD |
507 | lr->token.val.str.startmb = buf; |
508 | lr->token.val.str.lenmb = bufact - 1; | |
19bc17a9 RM |
509 | } |
510 | ||
511 | return &lr->token; | |
512 | } | |
513 | ||
514 | ||
515 | static struct token * | |
516 | get_ident (struct linereader *lr) | |
517 | { | |
518 | char *buf; | |
519 | size_t bufact; | |
520 | size_t bufmax = 56; | |
521 | const struct keyword_t *kw; | |
522 | int ch; | |
523 | ||
524 | buf = xmalloc (bufmax); | |
525 | bufact = 0; | |
526 | ||
527 | ADDC (lr->buf[lr->idx - 1]); | |
528 | ||
529 | while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';' | |
f126ef67 | 530 | && ch != '<' && ch != ',' && ch != EOF) |
4b10dd6c UD |
531 | { |
532 | if (ch == lr->escape_char) | |
533 | { | |
534 | ch = lr_getc (lr); | |
535 | if (ch == '\n' || ch == EOF) | |
536 | { | |
537 | lr_error (lr, _("invalid escape sequence")); | |
538 | break; | |
539 | } | |
540 | } | |
541 | ADDC (ch); | |
542 | } | |
19bc17a9 | 543 | |
f126ef67 | 544 | lr_ungetc (lr, ch); |
19bc17a9 RM |
545 | |
546 | kw = lr->hash_fct (buf, bufact); | |
547 | ||
548 | if (kw != NULL && kw->symname_or_ident == 0) | |
549 | { | |
550 | lr->token.tok = kw->token; | |
551 | free (buf); | |
552 | } | |
553 | else | |
554 | { | |
555 | lr->token.tok = tok_ident; | |
556 | ||
557 | buf[bufact] = '\0'; | |
558 | buf = xrealloc (buf, bufact + 1); | |
559 | ||
4b10dd6c UD |
560 | lr->token.val.str.startmb = buf; |
561 | lr->token.val.str.lenmb = bufact; | |
19bc17a9 RM |
562 | } |
563 | ||
564 | return &lr->token; | |
565 | } | |
566 | ||
567 | ||
568 | static struct token * | |
4b10dd6c | 569 | get_string (struct linereader *lr, const struct charmap_t *charmap, |
47e8b443 UD |
570 | struct localedef_t *locale, const struct repertoire_t *repertoire, |
571 | int verbose) | |
19bc17a9 | 572 | { |
4b10dd6c UD |
573 | int return_widestr = lr->return_widestr; |
574 | char *buf; | |
a9c27b3e | 575 | wchar_t *buf2 = NULL; |
19bc17a9 RM |
576 | size_t bufact; |
577 | size_t bufmax = 56; | |
19bc17a9 | 578 | |
4b10dd6c | 579 | /* We must return two different strings. */ |
19bc17a9 RM |
580 | buf = xmalloc (bufmax); |
581 | bufact = 0; | |
582 | ||
4b10dd6c UD |
583 | /* We know it'll be a string. */ |
584 | lr->token.tok = tok_string; | |
585 | ||
586 | /* If we need not translate the strings (i.e., expand <...> parts) | |
587 | we can run a simple loop. */ | |
588 | if (!lr->translate_strings) | |
589 | { | |
590 | int ch; | |
591 | ||
592 | buf2 = NULL; | |
593 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | |
19bc17a9 | 594 | ADDC (ch); |
4b10dd6c UD |
595 | |
596 | /* Catch errors with trailing escape character. */ | |
597 | if (bufact > 0 && buf[bufact - 1] == lr->escape_char | |
598 | && (bufact == 1 || buf[bufact - 2] != lr->escape_char)) | |
599 | { | |
600 | lr_error (lr, _("illegal escape sequence at end of string")); | |
601 | --bufact; | |
602 | } | |
603 | else if (ch == '\n' || ch == EOF) | |
604 | lr_error (lr, _("unterminated string")); | |
605 | ||
606 | ADDC ('\0'); | |
607 | } | |
608 | else | |
609 | { | |
610 | int illegal_string = 0; | |
611 | size_t buf2act = 0; | |
612 | size_t buf2max = 56 * sizeof (uint32_t); | |
613 | int ch; | |
614 | int warned = 0; | |
615 | ||
616 | /* We have to provide the wide character result as well. */ | |
617 | if (return_widestr) | |
618 | buf2 = xmalloc (buf2max); | |
619 | ||
620 | /* Read until the end of the string (or end of the line or file). */ | |
621 | while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) | |
622 | { | |
623 | size_t startidx; | |
624 | uint32_t wch; | |
625 | struct charseq *seq; | |
626 | ||
627 | if (ch != '<') | |
628 | { | |
629 | /* The standards leave it up to the implementation to decide | |
630 | what to do with character which stand for themself. We | |
631 | could jump through hoops to find out the value relative to | |
632 | the charmap and the repertoire map, but instead we leave | |
633 | it up to the locale definition author to write a better | |
634 | definition. We assume here that every character which | |
635 | stands for itself is encoded using ISO 8859-1. Using the | |
636 | escape character is allowed. */ | |
637 | if (ch == lr->escape_char) | |
638 | { | |
639 | ch = lr_getc (lr); | |
640 | if (ch == '\n' || ch == EOF) | |
641 | break; | |
642 | } | |
643 | ||
644 | if (verbose && !warned) | |
645 | { | |
646 | lr_error (lr, _("\ | |
647 | non-symbolic character value should not be used")); | |
648 | warned = 1; | |
649 | } | |
650 | ||
651 | ADDC (ch); | |
652 | if (return_widestr) | |
653 | ADDWC ((uint32_t) ch); | |
654 | ||
655 | continue; | |
656 | } | |
657 | ||
658 | /* Now we have to search for the end of the symbolic name, i.e., | |
659 | the closing '>'. */ | |
660 | startidx = bufact; | |
661 | while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF) | |
662 | { | |
663 | if (ch == lr->escape_char) | |
664 | { | |
665 | ch = lr_getc (lr); | |
666 | if (ch == '\n' || ch == EOF) | |
667 | break; | |
668 | } | |
669 | ADDC (ch); | |
670 | } | |
671 | if (ch == '\n' || ch == EOF) | |
672 | /* Not a correct string. */ | |
673 | break; | |
674 | if (bufact == startidx) | |
675 | { | |
676 | /* <> is no correct name. Ignore it and also signal an | |
677 | error. */ | |
19bc17a9 | 678 | illegal_string = 1; |
4b10dd6c UD |
679 | continue; |
680 | } | |
19bc17a9 | 681 | |
4b10dd6c UD |
682 | /* It might be a Uxxxx symbol. */ |
683 | if (buf[startidx] == 'U' | |
684 | && (bufact - startidx == 5 || bufact - startidx == 9)) | |
685 | { | |
686 | char *cp = buf + startidx + 1; | |
687 | while (cp < &buf[bufact] && isxdigit (*cp)) | |
688 | ++cp; | |
689 | ||
690 | if (cp == &buf[bufact]) | |
691 | { | |
3c833378 | 692 | char utmp[10]; |
4b10dd6c UD |
693 | |
694 | /* Yes, it is. */ | |
695 | ADDC ('\0'); | |
696 | wch = strtoul (buf + startidx + 1, NULL, 16); | |
697 | ||
698 | /* Now forget about the name we just added. */ | |
699 | bufact = startidx; | |
700 | ||
701 | if (return_widestr) | |
702 | ADDWC (wch); | |
703 | ||
3c833378 UD |
704 | /* See whether the charmap contains the Uxxxxxxxx names. */ |
705 | snprintf (utmp, sizeof (utmp), "U%08X", wch); | |
706 | seq = charmap_find_value (charmap, utmp, 9); | |
4b10dd6c | 707 | |
3c833378 | 708 | if (seq == NULL) |
4b10dd6c | 709 | { |
3c833378 UD |
710 | /* No, this isn't the case. Now determine from |
711 | the repertoire the name of the character and | |
712 | find it in the charmap. */ | |
713 | if (repertoire != NULL) | |
3c833378 | 714 | { |
47e8b443 | 715 | const char *symbol; |
3c833378 | 716 | |
47e8b443 UD |
717 | symbol = repertoire_find_symbol (repertoire, wch); |
718 | ||
719 | if (symbol != NULL) | |
720 | seq = charmap_find_value (charmap, symbol, | |
721 | strlen (symbol)); | |
722 | } | |
723 | ||
724 | if (seq == NULL) | |
725 | { | |
726 | #ifndef NO_TRANSLITERATION | |
727 | /* Transliterate if possible. */ | |
728 | if (locale != NULL) | |
729 | { | |
730 | uint32_t *translit; | |
731 | ||
732 | if ((locale->avail & CTYPE_LOCALE) == 0) | |
733 | { | |
734 | /* Load the CTYPE data now. */ | |
735 | int old_needed = locale->needed; | |
736 | ||
737 | locale->needed = 0; | |
69f6a804 | 738 | locale = load_locale (LC_CTYPE, |
47e8b443 UD |
739 | locale->name, |
740 | locale->repertoire_name, | |
741 | charmap, locale); | |
742 | locale->needed = old_needed; | |
743 | } | |
744 | ||
745 | if ((locale->avail & CTYPE_LOCALE) != 0 | |
746 | && ((translit = find_translit (locale, | |
747 | charmap, wch)) | |
748 | != NULL)) | |
749 | /* The CTYPE data contains a matching | |
750 | transliteration. */ | |
751 | { | |
752 | int i; | |
753 | ||
754 | for (i = 0; translit[i] != 0; ++i) | |
755 | { | |
756 | char utmp[10]; | |
757 | ||
758 | snprintf (utmp, sizeof (utmp), "U%08X", | |
759 | translit[i]); | |
760 | seq = charmap_find_value (charmap, utmp, | |
761 | 9); | |
762 | assert (seq != NULL); | |
763 | ADDS (seq->bytes, seq->nbytes); | |
764 | } | |
765 | ||
766 | continue; | |
767 | } | |
768 | } | |
769 | #endif /* NO_TRANSLITERATION */ | |
770 | ||
771 | /* Not a known name. */ | |
772 | illegal_string = 1; | |
3c833378 | 773 | } |
4b10dd6c UD |
774 | } |
775 | ||
3c833378 UD |
776 | if (seq != NULL) |
777 | ADDS (seq->bytes, seq->nbytes); | |
778 | ||
4b10dd6c UD |
779 | continue; |
780 | } | |
781 | } | |
782 | ||
3c833378 UD |
783 | /* We now have the symbolic name in buf[startidx] to |
784 | buf[bufact-1]. Now find out the value for this character | |
785 | in the charmap as well as in the repertoire map (in this | |
786 | order). */ | |
787 | seq = charmap_find_value (charmap, &buf[startidx], | |
788 | bufact - startidx); | |
789 | ||
790 | if (seq == NULL) | |
791 | { | |
792 | /* This name is not in the charmap. */ | |
793 | lr_error (lr, _("symbol `%.*s' not in charmap"), | |
794 | (int) (bufact - startidx), &buf[startidx]); | |
795 | illegal_string = 1; | |
796 | } | |
797 | ||
4b10dd6c UD |
798 | if (return_widestr) |
799 | { | |
3c833378 UD |
800 | /* Now the same for the multibyte representation. */ |
801 | if (seq != NULL && seq->ucs4 != UNINITIALIZED_CHAR_VALUE) | |
802 | wch = seq->ucs4; | |
803 | else | |
804 | { | |
805 | wch = repertoire_find_value (repertoire, &buf[startidx], | |
806 | bufact - startidx); | |
807 | if (seq != NULL) | |
808 | seq->ucs4 = wch; | |
809 | } | |
810 | ||
4b10dd6c UD |
811 | if (wch == ILLEGAL_CHAR_VALUE) |
812 | { | |
813 | /* This name is not in the repertoire map. */ | |
814 | lr_error (lr, _("symbol `%.*s' not in repertoire map"), | |
70e51ab9 | 815 | (int) (bufact - startidx), &buf[startidx]); |
4b10dd6c UD |
816 | illegal_string = 1; |
817 | } | |
818 | else | |
819 | ADDWC (wch); | |
820 | } | |
821 | ||
3c833378 UD |
822 | /* Now forget about the name we just added. */ |
823 | bufact = startidx; | |
19bc17a9 | 824 | |
3c833378 UD |
825 | /* And copy the bytes. */ |
826 | if (seq != NULL) | |
827 | ADDS (seq->bytes, seq->nbytes); | |
4b10dd6c | 828 | } |
19bc17a9 | 829 | |
4b10dd6c UD |
830 | if (ch == '\n' || ch == EOF) |
831 | { | |
832 | lr_error (lr, _("unterminated string")); | |
833 | illegal_string = 1; | |
834 | } | |
19bc17a9 | 835 | |
4b10dd6c UD |
836 | if (illegal_string) |
837 | { | |
838 | free (buf); | |
839 | if (buf2 != NULL) | |
840 | free (buf2); | |
841 | lr->token.val.str.startmb = NULL; | |
842 | lr->token.val.str.lenmb = 0; | |
d5fd1f3f UD |
843 | lr->token.val.str.startwc = NULL; |
844 | lr->token.val.str.lenwc = 0; | |
19bc17a9 | 845 | |
4b10dd6c UD |
846 | return &lr->token; |
847 | } | |
19bc17a9 | 848 | |
4b10dd6c | 849 | ADDC ('\0'); |
19bc17a9 | 850 | |
4b10dd6c UD |
851 | if (return_widestr) |
852 | { | |
853 | ADDWC (0); | |
854 | lr->token.val.str.startwc = xrealloc (buf2, | |
855 | buf2act * sizeof (uint32_t)); | |
856 | lr->token.val.str.lenwc = buf2act; | |
857 | } | |
19bc17a9 RM |
858 | } |
859 | ||
4b10dd6c UD |
860 | lr->token.val.str.startmb = xrealloc (buf, bufact); |
861 | lr->token.val.str.lenmb = bufact; | |
862 | ||
19bc17a9 RM |
863 | return &lr->token; |
864 | } |