]> git.ipfire.org Git - thirdparty/glibc.git/blame - locale/programs/charmap.c
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / locale / programs / charmap.c
CommitLineData
bfff8b1b 1/* Copyright (C) 1996-2017 Free Software Foundation, Inc.
df4ef2ab 2 This file is part of the GNU C Library.
4b10dd6c 3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.
19bc17a9 4
43bc8ac6 5 This program is free software; you can redistribute it and/or modify
2e2efe65
RM
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; version 2 of the License, or
8 (at your option) any later version.
19bc17a9 9
43bc8ac6 10 This program is distributed in the hope that it will be useful,
df4ef2ab 11 but WITHOUT ANY WARRANTY; without even the implied warranty of
43bc8ac6
UD
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
19bc17a9 14
43bc8ac6 15 You should have received a copy of the GNU General Public License
59ba27a6 16 along with this program; if not, see <http://www.gnu.org/licenses/>. */
19bc17a9
RM
17
18#ifdef HAVE_CONFIG_H
19# include <config.h>
20#endif
21
22#include <ctype.h>
23#include <errno.h>
24#include <libintl.h>
4b10dd6c 25#include <limits.h>
bb39c4ef 26#include <stdio.h>
19bc17a9
RM
27#include <stdlib.h>
28#include <string.h>
f2b98f97 29#include <error.h>
e054f494 30#include <stdint.h>
19bc17a9 31
f2b98f97 32#include "localedef.h"
19bc17a9 33#include "linereader.h"
4b10dd6c 34#include "charmap.h"
3e076219 35#include "charmap-dir.h"
19bc17a9 36
19bc17a9
RM
37#include <assert.h>
38
39
40/* Define the lookup function. */
41#include "charmap-kw.h"
42
43
19bc17a9 44/* Prototypes for local functions. */
93693c4d
UD
45static struct charmap_t *parse_charmap (struct linereader *cmfile,
46 int verbose, int be_quiet);
4b10dd6c 47static void new_width (struct linereader *cmfile, struct charmap_t *result,
75cd5204
RM
48 const char *from, const char *to,
49 unsigned long int width);
4b10dd6c 50static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
9cfe5381
RM
51 size_t nbytes, unsigned char *bytes,
52 const char *from, const char *to,
53 int decimal_ellipsis, int step);
19bc17a9 54
93693c4d 55
cb2eab1f
UD
56bool enc_not_ascii_compatible;
57
58
194c5f8d 59#ifdef NEED_NULL_POINTER
93693c4d 60static const char *null_pointer;
194c5f8d 61#endif
93693c4d 62
3e076219
UD
63static struct linereader *
64cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
65{
66 FILE *fp;
67
68 fp = charmap_open (directory, name);
69 if (fp == NULL)
70 return NULL;
71 else
72 {
73 size_t dlen = strlen (directory);
74 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
75 size_t nlen = strlen (name);
76 char *pathname;
77 char *p;
78
79 pathname = alloca (dlen + add_slash + nlen + 1);
80 p = stpcpy (pathname, directory);
81 if (add_slash)
82 *p++ = '/';
83 stpcpy (p, name);
84
85 return lr_create (fp, pathname, hf);
86 }
87}
19bc17a9 88
4b10dd6c 89struct charmap_t *
8a6537b0
UD
90charmap_read (const char *filename, int verbose, int error_not_found,
91 int be_quiet, int use_default)
19bc17a9 92{
4b10dd6c 93 struct charmap_t *result = NULL;
19bc17a9
RM
94
95 if (filename != NULL)
96 {
f14854aa 97 struct linereader *cmfile;
19bc17a9 98
f14854aa
UD
99 /* First try the name as found in the parameter. */
100 cmfile = lr_open (filename, charmap_hash);
101 if (cmfile == NULL)
102 {
103 /* No successful. So start looking through the directories
104 in the I18NPATH if this is a simple name. */
105 if (strchr (filename, '/') == NULL)
106 {
107 char *i18npath = getenv ("I18NPATH");
108 if (i18npath != NULL && *i18npath != '\0')
109 {
db2f05ba
RM
110 const size_t pathlen = strlen (i18npath);
111 char i18npathbuf[pathlen + 1];
112 char path[pathlen + sizeof ("/charmaps")];
f14854aa 113 char *next;
db2f05ba 114 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
f14854aa 115
f14854aa
UD
116 while (cmfile == NULL
117 && (next = strsep (&i18npath, ":")) != NULL)
118 {
3e076219
UD
119 stpcpy (stpcpy (path, next), "/charmaps");
120 cmfile = cmlr_open (path, filename, charmap_hash);
f14854aa
UD
121
122 if (cmfile == NULL)
47e8b443
UD
123 /* Try without the "/charmaps" part. */
124 cmfile = cmlr_open (next, filename, charmap_hash);
f14854aa
UD
125 }
126 }
b2386e4e
UD
127
128 if (cmfile == NULL)
47e8b443
UD
129 /* Try the default directory. */
130 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
f14854aa 131 }
19bc17a9
RM
132 }
133
f14854aa 134 if (cmfile != NULL)
c10d32c8 135 result = parse_charmap (cmfile, verbose, be_quiet);
19bc17a9 136
8a6537b0 137 if (result == NULL && error_not_found)
c10d32c8 138 WITH_CUR_LOCALE (error (0, errno, _("\
f2b98f97 139character map file `%s' not found"), filename));
19bc17a9
RM
140 }
141
93693c4d 142 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
51702635
UD
143 {
144 /* OK, one more try. We also accept the names given to the
145 character sets in the files. Sometimes they differ from the
146 file name. */
3e076219 147 CHARMAP_DIR *dir;
51702635 148
3e076219 149 dir = charmap_opendir (CHARMAP_PATH);
eb7c2001 150 if (dir != NULL)
51702635 151 {
3e076219 152 const char *dirent;
51702635 153
3e076219
UD
154 while ((dirent = charmap_readdir (dir)) != NULL)
155 {
156 char **aliases;
157 char **p;
158 int found;
159
160 aliases = charmap_aliases (CHARMAP_PATH, dirent);
161 found = 0;
162 for (p = aliases; *p; p++)
163 if (strcasecmp (*p, filename) == 0)
51702635 164 {
3e076219
UD
165 found = 1;
166 break;
51702635 167 }
3e076219
UD
168 charmap_free_aliases (aliases);
169
170 if (found)
171 {
172 struct linereader *cmfile;
51702635 173
3e076219
UD
174 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
175 if (cmfile != NULL)
93693c4d 176 result = parse_charmap (cmfile, verbose, be_quiet);
3e076219
UD
177
178 break;
179 }
180 }
181
182 charmap_closedir (dir);
51702635
UD
183 }
184 }
185
93693c4d 186 if (result == NULL && DEFAULT_CHARMAP != NULL)
19bc17a9 187 {
f14854aa 188 struct linereader *cmfile;
19bc17a9 189
3e076219
UD
190 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
191 if (cmfile != NULL)
93693c4d 192 result = parse_charmap (cmfile, verbose, be_quiet);
19bc17a9
RM
193
194 if (result == NULL)
f2b98f97
UD
195 WITH_CUR_LOCALE (error (4, errno, _("\
196default character map file `%s' not found"), DEFAULT_CHARMAP));
19bc17a9
RM
197 }
198
659f290a 199 if (result != NULL && result->code_set_name == NULL)
4a10c7fe
UD
200 /* The input file does not specify a code set name. This
201 shouldn't happen but we should cope with it. */
202 result->code_set_name = basename (filename);
203
bb39c4ef
UD
204 /* Test of ASCII compatibility of locale encoding.
205
206 Verify that the encoding to be used in a locale is ASCII compatible,
207 at least for the graphic characters, excluding the control characters,
208 '$' and '@'. This constraint comes from an ISO C 99 restriction.
209
210 ISO C 99 section 7.17.(2) (about wchar_t):
211 the null character shall have the code value zero and each member of
212 the basic character set shall have a code value equal to its value
213 when used as the lone character in an integer character constant.
214 ISO C 99 section 5.2.1.(3):
215 Both the basic source and basic execution character sets shall have
216 the following members: the 26 uppercase letters of the Latin alphabet
217 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
218 the 26 lowercase letters of the Latin alphabet
219 a b c d e f g h i j k l m n o p q r s t u v w x y z
220 the 10 decimal digits
221 0 1 2 3 4 5 6 7 8 9
222 the following 29 graphic characters
223 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
224 the space character, and control characters representing horizontal
225 tab, vertical tab, and form feed.
226
227 Therefore, for all members of the "basic character set", the 'char' code
228 must have the same value as the 'wchar_t' code, which in glibc is the
229 same as the Unicode code, which for all of the enumerated characters
230 is identical to the ASCII code. */
93693c4d 231 if (result != NULL && use_default)
bb39c4ef
UD
232 {
233 static const char basic_charset[] =
234 {
235 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
236 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
237 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
238 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
239 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
240 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
241 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
242 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
243 };
244 int failed = 0;
245 const char *p = basic_charset;
246
247 do
248 {
6dd67bd5 249 struct charseq *seq = charmap_find_symbol (result, p, 1);
bb39c4ef 250
6dd67bd5 251 if (seq == NULL || seq->ucs4 != (uint32_t) *p)
bb39c4ef
UD
252 failed = 1;
253 }
254 while (*p++ != '\0');
255
256 if (failed)
cb2eab1f
UD
257 {
258 WITH_CUR_LOCALE (fprintf (stderr, _("\
bb39c4ef 259character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
cb2eab1f
UD
260 result->code_set_name));
261 enc_not_ascii_compatible = true;
262 }
bb39c4ef
UD
263 }
264
19bc17a9
RM
265 return result;
266}
267
268
4b10dd6c 269static struct charmap_t *
93693c4d 270parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
19bc17a9 271{
4b10dd6c 272 struct charmap_t *result;
19bc17a9
RM
273 int state;
274 enum token_t expected_tok = tok_error;
275 const char *expected_str = NULL;
276 char *from_name = NULL;
277 char *to_name = NULL;
4b10dd6c 278 enum token_t ellipsis = 0;
a0dc5206 279 int step = 1;
19bc17a9 280
4b10dd6c
UD
281 /* We don't want symbolic names in string to be translated. */
282 cmfile->translate_strings = 0;
283
19bc17a9 284 /* Allocate room for result. */
4b10dd6c
UD
285 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
286 memset (result, '\0', sizeof (struct charmap_t));
75cd5204
RM
287 /* The default DEFAULT_WIDTH is 1. */
288 result->width_default = 1;
19bc17a9 289
df4ef2ab 290#define obstack_chunk_alloc malloc
19bc17a9
RM
291#define obstack_chunk_free free
292 obstack_init (&result->mem_pool);
293
4b10dd6c
UD
294 if (init_hash (&result->char_table, 256)
295 || init_hash (&result->byte_table, 256))
19bc17a9
RM
296 {
297 free (result);
298 return NULL;
299 }
300
301 /* We use a state machine to describe the charmap description file
302 format. */
303 state = 1;
304 while (1)
305 {
306 /* What's on? */
47e8b443 307 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
19bc17a9
RM
308 enum token_t nowtok = now->tok;
309 struct token *arg;
310
311 if (nowtok == tok_eof)
312 break;
313
314 switch (state)
315 {
316 case 1:
317 /* The beginning. We expect the special declarations, EOL or
318 `CHARMAP'. */
319 if (nowtok == tok_eol)
320 /* Ignore empty lines. */
321 continue;
322
323 if (nowtok == tok_charmap)
324 {
325 from_name = NULL;
326 to_name = NULL;
327
328 /* We have to set up the real work. Fill in some
329 default values. */
330 if (result->mb_cur_max == 0)
331 result->mb_cur_max = 1;
332 if (result->mb_cur_min == 0)
333 result->mb_cur_min = result->mb_cur_max;
880f421f 334 if (result->mb_cur_min > result->mb_cur_max)
19bc17a9 335 {
880f421f 336 if (!be_quiet)
f2b98f97 337 WITH_CUR_LOCALE (error (0, 0, _("\
19bc17a9 338%s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
f2b98f97 339 cmfile->fname));
19bc17a9
RM
340
341 result->mb_cur_min = result->mb_cur_max;
342 }
343
344 lr_ignore_rest (cmfile, 1);
345
346 state = 2;
347 continue;
348 }
349
350 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
351 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
352 && nowtok != tok_comment_char && nowtok != tok_g0esc
353 && nowtok != tok_g1esc && nowtok != tok_g2esc
4b10dd6c
UD
354 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
355 && nowtok != tok_include)
19bc17a9
RM
356 {
357 lr_error (cmfile, _("syntax error in prolog: %s"),
4b10dd6c 358 _("invalid definition"));
19bc17a9
RM
359
360 lr_ignore_rest (cmfile, 0);
361 continue;
362 }
363
364 /* We know that we need an argument. */
47e8b443 365 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
19bc17a9
RM
366
367 switch (nowtok)
368 {
369 case tok_code_set_name:
4b10dd6c 370 case tok_repertoiremap:
2a631990 371 if (arg->tok != tok_ident && arg->tok != tok_string)
19bc17a9
RM
372 {
373 badarg:
374 lr_error (cmfile, _("syntax error in prolog: %s"),
375 _("bad argument"));
376
377 lr_ignore_rest (cmfile, 0);
378 continue;
379 }
380
4b10dd6c
UD
381 if (nowtok == tok_code_set_name)
382 result->code_set_name = obstack_copy0 (&result->mem_pool,
383 arg->val.str.startmb,
384 arg->val.str.lenmb);
385 else
386 result->repertoiremap = obstack_copy0 (&result->mem_pool,
387 arg->val.str.startmb,
388 arg->val.str.lenmb);
19bc17a9
RM
389
390 lr_ignore_rest (cmfile, 1);
391 continue;
392
393 case tok_mb_cur_max:
394 case tok_mb_cur_min:
395 if (arg->tok != tok_number)
396 goto badarg;
397
4b10dd6c
UD
398 if (verbose
399 && ((nowtok == tok_mb_cur_max
400 && result->mb_cur_max != 0)
401 || (nowtok == tok_mb_cur_max
402 && result->mb_cur_max != 0)))
403 lr_error (cmfile, _("duplicate definition of <%s>"),
404 nowtok == tok_mb_cur_min
405 ? "mb_cur_min" : "mb_cur_max");
406
407 if (arg->val.num < 1)
19bc17a9
RM
408 {
409 lr_error (cmfile,
4b10dd6c
UD
410 _("value for <%s> must be 1 or greater"),
411 nowtok == tok_mb_cur_min
412 ? "mb_cur_min" : "mb_cur_max");
19bc17a9
RM
413
414 lr_ignore_rest (cmfile, 0);
415 continue;
416 }
417 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
ba1ffaa1 418 && (int) arg->val.num < result->mb_cur_min)
19bc17a9 419 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
ba1ffaa1 420 && (int) arg->val.num > result->mb_cur_max))
19bc17a9
RM
421 {
422 lr_error (cmfile, _("\
4b10dd6c
UD
423value of <%s> must be greater or equal than the value of <%s>"),
424 "mb_cur_max", "mb_cur_min");
19bc17a9
RM
425
426 lr_ignore_rest (cmfile, 0);
427 continue;
428 }
429
430 if (nowtok == tok_mb_cur_max)
431 result->mb_cur_max = arg->val.num;
432 else
433 result->mb_cur_min = arg->val.num;
434
435 lr_ignore_rest (cmfile, 1);
436 continue;
437
438 case tok_escape_char:
439 case tok_comment_char:
440 if (arg->tok != tok_ident)
441 goto badarg;
442
4b10dd6c 443 if (arg->val.str.lenmb != 1)
19bc17a9
RM
444 {
445 lr_error (cmfile, _("\
446argument to <%s> must be a single character"),
447 nowtok == tok_escape_char ? "escape_char"
448 : "comment_char");
449
450 lr_ignore_rest (cmfile, 0);
451 continue;
452 }
453
454 if (nowtok == tok_escape_char)
4b10dd6c 455 cmfile->escape_char = *arg->val.str.startmb;
19bc17a9 456 else
4b10dd6c 457 cmfile->comment_char = *arg->val.str.startmb;
19bc17a9
RM
458
459 lr_ignore_rest (cmfile, 1);
460 continue;
461
462 case tok_g0esc:
463 case tok_g1esc:
464 case tok_g2esc:
465 case tok_g3esc:
4b10dd6c 466 case tok_escseq:
19bc17a9
RM
467 lr_ignore_rest (cmfile, 0); /* XXX */
468 continue;
469
4b10dd6c
UD
470 case tok_include:
471 lr_error (cmfile, _("\
472character sets with locking states are not supported"));
473 exit (4);
474
19bc17a9
RM
475 default:
476 /* Cannot happen. */
477 assert (! "Should not happen");
478 }
479 break;
480
481 case 2:
482 /* We have seen `CHARMAP' and now are in the body. Each line
483 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
484 if (nowtok == tok_eol)
485 /* Ignore empty lines. */
486 continue;
487
488 if (nowtok == tok_end)
489 {
490 expected_tok = tok_charmap;
491 expected_str = "CHARMAP";
492 state = 90;
493 continue;
494 }
495
723faa38 496 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
497 {
498 lr_error (cmfile, _("syntax error in %s definition: %s"),
499 "CHARMAP", _("no symbolic name given"));
500
501 lr_ignore_rest (cmfile, 0);
502 continue;
503 }
504
505 /* If the previous line was not completely correct free the
506 used memory. */
507 if (from_name != NULL)
508 obstack_free (&result->mem_pool, from_name);
509
723faa38
UD
510 if (nowtok == tok_bsymbol)
511 from_name = (char *) obstack_copy0 (&result->mem_pool,
512 now->val.str.startmb,
513 now->val.str.lenmb);
514 else
515 {
a0dc5206 516 obstack_printf (&result->mem_pool, "U%08X",
723faa38
UD
517 cmfile->token.val.ucs4);
518 obstack_1grow (&result->mem_pool, '\0');
519 from_name = (char *) obstack_finish (&result->mem_pool);
520 }
19bc17a9
RM
521 to_name = NULL;
522
523 state = 3;
524 continue;
525
526 case 3:
527 /* We have two possibilities: We can see an ellipsis or an
528 encoding value. */
4b10dd6c 529 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
a0dc5206
UD
530 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
531 || nowtok == tok_ellipsis2_2)
19bc17a9 532 {
4b10dd6c 533 ellipsis = nowtok;
a0dc5206
UD
534 if (nowtok == tok_ellipsis4_2)
535 {
536 step = 2;
537 nowtok = tok_ellipsis4;
538 }
539 else if (nowtok == tok_ellipsis2_2)
540 {
541 step = 2;
542 nowtok = tok_ellipsis2;
543 }
19bc17a9
RM
544 state = 4;
545 continue;
546 }
547 /* FALLTHROUGH */
548
549 case 5:
4b10dd6c 550 if (nowtok != tok_charcode)
19bc17a9
RM
551 {
552 lr_error (cmfile, _("syntax error in %s definition: %s"),
4b10dd6c 553 "CHARMAP", _("invalid encoding given"));
19bc17a9
RM
554
555 lr_ignore_rest (cmfile, 0);
556
557 state = 2;
558 continue;
559 }
560
69f155d4
UD
561 if (now->val.charcode.nbytes < result->mb_cur_min)
562 lr_error (cmfile, _("too few bytes in character encoding"));
563 else if (now->val.charcode.nbytes > result->mb_cur_max)
564 lr_error (cmfile, _("too many bytes in character encoding"));
19bc17a9 565 else
4b10dd6c
UD
566 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
567 now->val.charcode.bytes, from_name, to_name,
a0dc5206 568 ellipsis != tok_ellipsis2, step);
19bc17a9
RM
569
570 /* Ignore trailing comment silently. */
571 lr_ignore_rest (cmfile, 0);
572
573 from_name = NULL;
574 to_name = NULL;
a0dc5206
UD
575 ellipsis = tok_none;
576 step = 1;
19bc17a9
RM
577
578 state = 2;
579 continue;
580
581 case 4:
723faa38 582 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
583 {
584 lr_error (cmfile, _("syntax error in %s definition: %s"),
585 "CHARMAP",
586 _("no symbolic name given for end of range"));
587
588 lr_ignore_rest (cmfile, 0);
589 continue;
590 }
591
69f155d4 592 /* Copy the to-name in a safe place. */
723faa38
UD
593 if (nowtok == tok_bsymbol)
594 to_name = (char *) obstack_copy0 (&result->mem_pool,
595 cmfile->token.val.str.startmb,
596 cmfile->token.val.str.lenmb);
597 else
598 {
a0dc5206 599 obstack_printf (&result->mem_pool, "U%08X",
723faa38
UD
600 cmfile->token.val.ucs4);
601 obstack_1grow (&result->mem_pool, '\0');
602 to_name = (char *) obstack_finish (&result->mem_pool);
603 }
19bc17a9 604
74015205 605 state = 5;
19bc17a9
RM
606 continue;
607
608 case 90:
609 if (nowtok != expected_tok)
610 lr_error (cmfile, _("\
11bf311e 611%1$s: definition does not end with `END %1$s'"), expected_str);
19bc17a9
RM
612
613 lr_ignore_rest (cmfile, nowtok == expected_tok);
614 state = 91;
615 continue;
616
617 case 91:
618 /* Waiting for WIDTH... */
75cd5204
RM
619 if (nowtok == tok_eol)
620 /* Ignore empty lines. */
621 continue;
622
19bc17a9
RM
623 if (nowtok == tok_width_default)
624 {
625 state = 92;
626 continue;
627 }
628
629 if (nowtok == tok_width)
630 {
631 lr_ignore_rest (cmfile, 1);
632 state = 93;
633 continue;
634 }
635
636 if (nowtok == tok_width_variable)
637 {
638 lr_ignore_rest (cmfile, 1);
639 state = 98;
640 continue;
641 }
642
643 lr_error (cmfile, _("\
644only WIDTH definitions are allowed to follow the CHARMAP definition"));
645
646 lr_ignore_rest (cmfile, 0);
647 continue;
648
649 case 92:
650 if (nowtok != tok_number)
651 lr_error (cmfile, _("value for %s must be an integer"),
652 "WIDTH_DEFAULT");
653 else
654 result->width_default = now->val.num;
655
656 lr_ignore_rest (cmfile, nowtok == tok_number);
657
658 state = 91;
659 continue;
660
661 case 93:
662 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
663 "%s...%s %d\n". */
664 if (nowtok == tok_eol)
665 /* ignore empty lines. */
666 continue;
667
668 if (nowtok == tok_end)
669 {
670 expected_tok = tok_width;
671 expected_str = "WIDTH";
672 state = 90;
673 continue;
674 }
675
8d6120a9 676 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
677 {
678 lr_error (cmfile, _("syntax error in %s definition: %s"),
679 "WIDTH", _("no symbolic name given"));
680
681 lr_ignore_rest (cmfile, 0);
682 continue;
683 }
684
685 if (from_name != NULL)
686 obstack_free (&result->mem_pool, from_name);
687
8d6120a9
UD
688 if (nowtok == tok_bsymbol)
689 from_name = (char *) obstack_copy0 (&result->mem_pool,
690 now->val.str.startmb,
691 now->val.str.lenmb);
692 else
693 {
694 obstack_printf (&result->mem_pool, "U%08X",
695 cmfile->token.val.ucs4);
696 obstack_1grow (&result->mem_pool, '\0');
697 from_name = (char *) obstack_finish (&result->mem_pool);
698 }
699
19bc17a9
RM
700 to_name = NULL;
701
702 state = 94;
703 continue;
704
705 case 94:
4b10dd6c 706 if (nowtok == tok_ellipsis3)
75cd5204
RM
707 {
708 state = 95;
709 continue;
710 }
19bc17a9
RM
711
712 case 96:
713 if (nowtok != tok_number)
714 lr_error (cmfile, _("value for %s must be an integer"),
715 "WIDTH");
716 else
717 {
75cd5204
RM
718 /* Store width for chars. */
719 new_width (cmfile, result, from_name, to_name, now->val.num);
720
19bc17a9 721 from_name = NULL;
75cd5204 722 to_name = NULL;
19bc17a9
RM
723 }
724
725 lr_ignore_rest (cmfile, nowtok == tok_number);
726
727 state = 93;
728 continue;
729
730 case 95:
8d6120a9 731 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
732 {
733 lr_error (cmfile, _("syntax error in %s definition: %s"),
734 "WIDTH", _("no symbolic name given for end of range"));
735
736 lr_ignore_rest (cmfile, 0);
737
738 state = 93;
739 continue;
740 }
741
8d6120a9
UD
742 if (nowtok == tok_bsymbol)
743 to_name = (char *) obstack_copy0 (&result->mem_pool,
744 now->val.str.startmb,
745 now->val.str.lenmb);
746 else
747 {
748 obstack_printf (&result->mem_pool, "U%08X",
749 cmfile->token.val.ucs4);
750 obstack_1grow (&result->mem_pool, '\0');
751 to_name = (char *) obstack_finish (&result->mem_pool);
752 }
19bc17a9 753
19bc17a9
RM
754 state = 96;
755 continue;
756
757 case 98:
758 /* We now expect `END WIDTH_VARIABLE' or lines of the format
759 "%s\n" or "%s...%s\n". */
760 if (nowtok == tok_eol)
761 /* ignore empty lines. */
762 continue;
763
764 if (nowtok == tok_end)
765 {
766 expected_tok = tok_width_variable;
767 expected_str = "WIDTH_VARIABLE";
768 state = 90;
769 continue;
770 }
771
620cdffb 772 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
773 {
774 lr_error (cmfile, _("syntax error in %s definition: %s"),
775 "WIDTH_VARIABLE", _("no symbolic name given"));
776
777 lr_ignore_rest (cmfile, 0);
778
779 continue;
780 }
781
782 if (from_name != NULL)
783 obstack_free (&result->mem_pool, from_name);
784
620cdffb
UD
785 if (nowtok == tok_bsymbol)
786 from_name = (char *) obstack_copy0 (&result->mem_pool,
787 now->val.str.startmb,
788 now->val.str.lenmb);
789 else
790 {
791 obstack_printf (&result->mem_pool, "U%08X",
792 cmfile->token.val.ucs4);
793 obstack_1grow (&result->mem_pool, '\0');
794 from_name = (char *) obstack_finish (&result->mem_pool);
795 }
19bc17a9
RM
796 to_name = NULL;
797
798 state = 99;
799 continue;
800
801 case 99:
4b10dd6c 802 if (nowtok == tok_ellipsis3)
19bc17a9
RM
803 state = 100;
804
805 /* Store info. */
806 from_name = NULL;
807
808 /* Warn */
809 state = 98;
810 continue;
811
812 case 100:
620cdffb
UD
813 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
814 {
815 lr_error (cmfile, _("syntax error in %s definition: %s"),
816 "WIDTH_VARIABLE",
817 _("no symbolic name given for end of range"));
818 lr_ignore_rest (cmfile, 0);
819 continue;
820 }
821
822 if (nowtok == tok_bsymbol)
823 to_name = (char *) obstack_copy0 (&result->mem_pool,
824 now->val.str.startmb,
825 now->val.str.lenmb);
19bc17a9
RM
826 else
827 {
620cdffb
UD
828 obstack_printf (&result->mem_pool, "U%08X",
829 cmfile->token.val.ucs4);
830 obstack_1grow (&result->mem_pool, '\0');
831 to_name = (char *) obstack_finish (&result->mem_pool);
19bc17a9
RM
832 }
833
620cdffb
UD
834 /* XXX Enter value into table. */
835
836 lr_ignore_rest (cmfile, 1);
19bc17a9
RM
837
838 state = 98;
839 continue;
840
841 default:
f2b98f97
UD
842 WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
843 __FILE__));
19bc17a9
RM
844 /* NOTREACHED */
845 }
846 break;
847 }
848
c84142e8 849 if (state != 91 && !be_quiet)
f2b98f97
UD
850 WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
851 cmfile->fname));
19bc17a9
RM
852
853 lr_close (cmfile);
854
855 return result;
856}
75cd5204
RM
857
858
859static void
4b10dd6c 860new_width (struct linereader *cmfile, struct charmap_t *result,
75cd5204
RM
861 const char *from, const char *to, unsigned long int width)
862{
4b10dd6c
UD
863 struct charseq *from_val;
864 struct charseq *to_val;
75cd5204 865
4b10dd6c
UD
866 from_val = charmap_find_value (result, from, strlen (from));
867 if (from_val == NULL)
75cd5204
RM
868 {
869 lr_error (cmfile, _("unknown character `%s'"), from);
870 return;
871 }
872
873 if (to == NULL)
874 to_val = from_val;
875 else
876 {
4b10dd6c
UD
877 to_val = charmap_find_value (result, to, strlen (to));
878 if (to_val == NULL)
75cd5204
RM
879 {
880 lr_error (cmfile, _("unknown character `%s'"), to);
881 return;
882 }
e57372d1
UD
883
884 /* Make sure the number of bytes for the end points of the range
885 is correct. */
886 if (from_val->nbytes != to_val->nbytes)
887 {
888 lr_error (cmfile, _("\
889number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
890 from_val->nbytes, to_val->nbytes);
891 return;
892 }
75cd5204
RM
893 }
894
895 if (result->nwidth_rules >= result->nwidth_rules_max)
896 {
897 size_t new_size = result->nwidth_rules + 32;
898 struct width_rule *new_rules =
899 (struct width_rule *) obstack_alloc (&result->mem_pool,
900 (new_size
901 * sizeof (struct width_rule)));
902
903 memcpy (new_rules, result->width_rules,
904 result->nwidth_rules_max * sizeof (struct width_rule));
905
906 result->width_rules = new_rules;
907 result->nwidth_rules_max = new_size;
908 }
909
910 result->width_rules[result->nwidth_rules].from = from_val;
911 result->width_rules[result->nwidth_rules].to = to_val;
912 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
913 ++result->nwidth_rules;
914}
4b10dd6c
UD
915
916
917struct charseq *
918charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
919{
920 void *result;
921
922 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
923 < 0 ? NULL : (struct charseq *) result);
924}
925
926
927static void
928charmap_new_char (struct linereader *lr, struct charmap_t *cm,
9cfe5381
RM
929 size_t nbytes, unsigned char *bytes,
930 const char *from, const char *to,
a0dc5206 931 int decimal_ellipsis, int step)
4b10dd6c
UD
932{
933 hash_table *ht = &cm->char_table;
934 hash_table *bt = &cm->byte_table;
935 struct obstack *ob = &cm->mem_pool;
936 char *from_end;
937 char *to_end;
938 const char *cp;
939 int prefix_len, len1, len2;
940 unsigned int from_nr, to_nr, cnt;
941 struct charseq *newp;
942
943 len1 = strlen (from);
944
945 if (to == NULL)
946 {
947 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
948 newp->nbytes = nbytes;
949 memcpy (newp->bytes, bytes, nbytes);
a0dc5206 950 newp->name = from;
2d05bb35 951
4b10dd6c 952 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
2d05bb35
UD
953 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
954 {
955 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
956 xxxx and xxxxxxxx are hexadecimal numbers. In this case
957 we use the value of xxxx or xxxxxxxx as the UCS4 value of
958 this character and we don't have to consult the repertoire
959 map.
960
961 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
962 and xxxxxxxx also give the code point in UCS4 but this must
963 be in the private, i.e., unassigned, area. This should be
964 used for characters which do not (yet) have an equivalent
965 in ISO 10646 and Unicode. */
966 char *endp;
967
968 errno = 0;
a0dc5206 969 newp->ucs4 = strtoul (from + 1, &endp, 16);
2d05bb35 970 if (endp - from != len1
6dd67bd5 971 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
2d05bb35
UD
972 || newp->ucs4 >= 0x80000000)
973 /* This wasn't successful. Signal this name cannot be a
974 correct UCS value. */
975 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
976 }
4b10dd6c
UD
977
978 insert_entry (ht, from, len1, newp);
979 insert_entry (bt, newp->bytes, nbytes, newp);
980 /* Please note that it isn't a bug if a symbol is defined more
981 than once. All later definitions are simply discarded. */
982 return;
983 }
984
985 /* We have a range: the names must have names with equal prefixes
986 and an equal number of digits, where the second number is greater
987 or equal than the first. */
988 len2 = strlen (to);
989
990 if (len1 != len2)
991 {
992 illegal_range:
993 lr_error (lr, _("invalid names for character range"));
994 return;
995 }
996
997 cp = &from[len1 - 1];
998 if (decimal_ellipsis)
999 while (isdigit (*cp) && cp >= from)
1000 --cp;
1001 else
1002 while (isxdigit (*cp) && cp >= from)
1003 {
1004 if (!isdigit (*cp) && !isupper (*cp))
1005 lr_error (lr, _("\
1006hexadecimal range format should use only capital characters"));
1007 --cp;
1008 }
1009
1010 prefix_len = (cp - from) + 1;
1011
1012 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1013 goto illegal_range;
1014
1015 errno = 0;
1016 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
6dd67bd5 1017 if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
4b10dd6c 1018 || ((to_nr = strtoul (&to[prefix_len], &to_end,
6dd67bd5 1019 decimal_ellipsis ? 10 : 16)) == UINT_MAX
4b10dd6c
UD
1020 && errno == ERANGE)
1021 || *to_end != '\0')
1022 {
11bf311e 1023 lr_error (lr, _("<%s> and <%s> are invalid names for range"), from, to);
4b10dd6c
UD
1024 return;
1025 }
1026
1027 if (from_nr > to_nr)
1028 {
11bf311e 1029 lr_error (lr, _("upper limit in range is smaller than lower limit"));
4b10dd6c
UD
1030 return;
1031 }
1032
a0dc5206 1033 for (cnt = from_nr; cnt <= to_nr; cnt += step)
4b10dd6c
UD
1034 {
1035 char *name_end;
1036 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1037 prefix_len, from, len1 - prefix_len, cnt);
9deb2b36 1038 obstack_1grow (ob, '\0');
4b10dd6c
UD
1039 name_end = obstack_finish (ob);
1040
1041 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1042 newp->nbytes = nbytes;
1043 memcpy (newp->bytes, bytes, nbytes);
1044 newp->name = name_end;
2d05bb35 1045
4b10dd6c 1046 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
2d05bb35
UD
1047 if ((name_end[0] == 'U' || name_end[0] == 'P')
1048 && (len1 == 5 || len1 == 9))
1049 {
1050 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1051 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1052 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1053 this character and we don't have to consult the repertoire
1054 map.
1055
1056 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1057 and xxxxxxxx also give the code point in UCS4 but this must
1058 be in the private, i.e., unassigned, area. This should be
1059 used for characters which do not (yet) have an equivalent
1060 in ISO 10646 and Unicode. */
1061 char *endp;
1062
1063 errno = 0;
601d2942 1064 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
2d05bb35 1065 if (endp - name_end != len1
6dd67bd5 1066 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
2d05bb35
UD
1067 || newp->ucs4 >= 0x80000000)
1068 /* This wasn't successful. Signal this name cannot be a
1069 correct UCS value. */
1070 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1071 }
4b10dd6c
UD
1072
1073 insert_entry (ht, name_end, len1, newp);
1074 insert_entry (bt, newp->bytes, nbytes, newp);
1075 /* Please note we don't examine the return value since it is no error
1076 if we have two definitions for a symbol. */
1077
1078 /* Increment the value in the byte sequence. */
1079 if (++bytes[nbytes - 1] == '\0')
1080 {
1081 int b = nbytes - 2;
1082
1083 do
1084 if (b < 0)
1085 {
1086 lr_error (lr,
1087 _("resulting bytes for range not representable."));
1088 return;
1089 }
1090 while (++bytes[b--] == 0);
1091 }
1092 }
1093}
1094
1095
1096struct charseq *
1097charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1098 size_t nbytes)
1099{
1100 void *result;
1101
1102 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1103 < 0 ? NULL : (struct charseq *) result);
1104}