]> git.ipfire.org Git - thirdparty/glibc.git/blame - locale/programs/charmap.c
2.5-18.1
[thirdparty/glibc.git] / locale / programs / charmap.c
CommitLineData
/* Copyright (C) 1996, 1998-2004, 2005, 2006 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@gnu.org>, 1996.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
19bc17a9
RM
17
18#ifdef HAVE_CONFIG_H
19# include <config.h>
20#endif
21
22#include <ctype.h>
23#include <errno.h>
24#include <libintl.h>
4b10dd6c 25#include <limits.h>
bb39c4ef 26#include <stdio.h>
19bc17a9
RM
27#include <stdlib.h>
28#include <string.h>
f2b98f97 29#include <error.h>
19bc17a9 30
f2b98f97 31#include "localedef.h"
19bc17a9 32#include "linereader.h"
4b10dd6c 33#include "charmap.h"
3e076219 34#include "charmap-dir.h"
19bc17a9 35
19bc17a9
RM
36#include <assert.h>
37
38
39/* Define the lookup function. */
40#include "charmap-kw.h"
41
42
/* Forward declarations of the helpers that are private to this file.  */

/* Run the charmap state machine over CMFILE and build the in-memory
   description.  */
static struct charmap_t *parse_charmap (struct linereader *cmfile,
                                        int verbose, int be_quiet);

/* Record a WIDTH rule for the character FROM or the range FROM..TO.  */
static void new_width (struct linereader *cmfile, struct charmap_t *result,
                       const char *from, const char *to,
                       unsigned long int width);

/* Enter one character, or a whole range of characters, into the name
   and byte tables of CM.  */
static void charmap_new_char (struct linereader *lr, struct charmap_t *cm,
                              size_t nbytes, unsigned char *bytes,
                              const char *from, const char *to,
                              int decimal_ellipsis, int step);


/* Set to true by charmap_read when the ASCII compatibility test of the
   loaded charmap fails.  */
bool enc_not_ascii_compatible;


#ifdef NEED_NULL_POINTER
/* NOTE(review): presumably referenced through a macro defined in a
   project header (e.g. DEFAULT_CHARMAP) -- confirm.  */
static const char *null_pointer;
#endif
93693c4d 61
3e076219
UD
62static struct linereader *
63cmlr_open (const char *directory, const char *name, kw_hash_fct_t hf)
64{
65 FILE *fp;
66
67 fp = charmap_open (directory, name);
68 if (fp == NULL)
69 return NULL;
70 else
71 {
72 size_t dlen = strlen (directory);
73 int add_slash = (dlen == 0 || directory[dlen - 1] != '/');
74 size_t nlen = strlen (name);
75 char *pathname;
76 char *p;
77
78 pathname = alloca (dlen + add_slash + nlen + 1);
79 p = stpcpy (pathname, directory);
80 if (add_slash)
81 *p++ = '/';
82 stpcpy (p, name);
83
84 return lr_create (fp, pathname, hf);
85 }
86}
19bc17a9 87
4b10dd6c 88struct charmap_t *
0ecb606c
JJ
89charmap_read (const char *filename, int verbose, int error_not_found,
90 int be_quiet, int use_default)
19bc17a9 91{
4b10dd6c 92 struct charmap_t *result = NULL;
19bc17a9
RM
93
94 if (filename != NULL)
95 {
f14854aa 96 struct linereader *cmfile;
19bc17a9 97
f14854aa
UD
98 /* First try the name as found in the parameter. */
99 cmfile = lr_open (filename, charmap_hash);
100 if (cmfile == NULL)
101 {
102 /* No successful. So start looking through the directories
103 in the I18NPATH if this is a simple name. */
104 if (strchr (filename, '/') == NULL)
105 {
106 char *i18npath = getenv ("I18NPATH");
107 if (i18npath != NULL && *i18npath != '\0')
108 {
db2f05ba
RM
109 const size_t pathlen = strlen (i18npath);
110 char i18npathbuf[pathlen + 1];
111 char path[pathlen + sizeof ("/charmaps")];
f14854aa 112 char *next;
db2f05ba 113 i18npath = memcpy (i18npathbuf, i18npath, pathlen + 1);
f14854aa 114
f14854aa
UD
115 while (cmfile == NULL
116 && (next = strsep (&i18npath, ":")) != NULL)
117 {
3e076219
UD
118 stpcpy (stpcpy (path, next), "/charmaps");
119 cmfile = cmlr_open (path, filename, charmap_hash);
f14854aa
UD
120
121 if (cmfile == NULL)
47e8b443
UD
122 /* Try without the "/charmaps" part. */
123 cmfile = cmlr_open (next, filename, charmap_hash);
f14854aa
UD
124 }
125 }
b2386e4e
UD
126
127 if (cmfile == NULL)
47e8b443
UD
128 /* Try the default directory. */
129 cmfile = cmlr_open (CHARMAP_PATH, filename, charmap_hash);
f14854aa 130 }
19bc17a9
RM
131 }
132
f14854aa 133 if (cmfile != NULL)
0ecb606c 134 result = parse_charmap (cmfile, verbose, be_quiet);
19bc17a9 135
0ecb606c
JJ
136 if (result == NULL && error_not_found)
137 WITH_CUR_LOCALE (error (0, errno, _("\
f2b98f97 138character map file `%s' not found"), filename));
19bc17a9
RM
139 }
140
93693c4d 141 if (result == NULL && filename != NULL && strchr (filename, '/') == NULL)
51702635
UD
142 {
143 /* OK, one more try. We also accept the names given to the
144 character sets in the files. Sometimes they differ from the
145 file name. */
3e076219 146 CHARMAP_DIR *dir;
51702635 147
3e076219 148 dir = charmap_opendir (CHARMAP_PATH);
eb7c2001 149 if (dir != NULL)
51702635 150 {
3e076219 151 const char *dirent;
51702635 152
3e076219
UD
153 while ((dirent = charmap_readdir (dir)) != NULL)
154 {
155 char **aliases;
156 char **p;
157 int found;
158
159 aliases = charmap_aliases (CHARMAP_PATH, dirent);
160 found = 0;
161 for (p = aliases; *p; p++)
162 if (strcasecmp (*p, filename) == 0)
51702635 163 {
3e076219
UD
164 found = 1;
165 break;
51702635 166 }
3e076219
UD
167 charmap_free_aliases (aliases);
168
169 if (found)
170 {
171 struct linereader *cmfile;
51702635 172
3e076219
UD
173 cmfile = cmlr_open (CHARMAP_PATH, dirent, charmap_hash);
174 if (cmfile != NULL)
93693c4d 175 result = parse_charmap (cmfile, verbose, be_quiet);
3e076219
UD
176
177 break;
178 }
179 }
180
181 charmap_closedir (dir);
51702635
UD
182 }
183 }
184
93693c4d 185 if (result == NULL && DEFAULT_CHARMAP != NULL)
19bc17a9 186 {
f14854aa 187 struct linereader *cmfile;
19bc17a9 188
3e076219
UD
189 cmfile = cmlr_open (CHARMAP_PATH, DEFAULT_CHARMAP, charmap_hash);
190 if (cmfile != NULL)
93693c4d 191 result = parse_charmap (cmfile, verbose, be_quiet);
19bc17a9
RM
192
193 if (result == NULL)
f2b98f97
UD
194 WITH_CUR_LOCALE (error (4, errno, _("\
195default character map file `%s' not found"), DEFAULT_CHARMAP));
19bc17a9
RM
196 }
197
659f290a 198 if (result != NULL && result->code_set_name == NULL)
4a10c7fe
UD
199 /* The input file does not specify a code set name. This
200 shouldn't happen but we should cope with it. */
201 result->code_set_name = basename (filename);
202
bb39c4ef
UD
203 /* Test of ASCII compatibility of locale encoding.
204
205 Verify that the encoding to be used in a locale is ASCII compatible,
206 at least for the graphic characters, excluding the control characters,
207 '$' and '@'. This constraint comes from an ISO C 99 restriction.
208
209 ISO C 99 section 7.17.(2) (about wchar_t):
210 the null character shall have the code value zero and each member of
211 the basic character set shall have a code value equal to its value
212 when used as the lone character in an integer character constant.
213 ISO C 99 section 5.2.1.(3):
214 Both the basic source and basic execution character sets shall have
215 the following members: the 26 uppercase letters of the Latin alphabet
216 A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
217 the 26 lowercase letters of the Latin alphabet
218 a b c d e f g h i j k l m n o p q r s t u v w x y z
219 the 10 decimal digits
220 0 1 2 3 4 5 6 7 8 9
221 the following 29 graphic characters
222 ! " # % & ' ( ) * + , - . / : ; < = > ? [ \ ] ^ _ { | } ~
223 the space character, and control characters representing horizontal
224 tab, vertical tab, and form feed.
225
226 Therefore, for all members of the "basic character set", the 'char' code
227 must have the same value as the 'wchar_t' code, which in glibc is the
228 same as the Unicode code, which for all of the enumerated characters
229 is identical to the ASCII code. */
93693c4d 230 if (result != NULL && use_default)
bb39c4ef
UD
231 {
232 static const char basic_charset[] =
233 {
234 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
235 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
236 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
237 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
238 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
239 '!', '"', '#', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
240 '.', '/', ':', ';', '<', '=', '>', '?', '[', '\\', ']', '^',
241 '_', '{', '|', '}', '~', ' ', '\t', '\v', '\f', '\0'
242 };
243 int failed = 0;
244 const char *p = basic_charset;
245
246 do
247 {
6dd67bd5 248 struct charseq *seq = charmap_find_symbol (result, p, 1);
bb39c4ef 249
6dd67bd5 250 if (seq == NULL || seq->ucs4 != (uint32_t) *p)
bb39c4ef
UD
251 failed = 1;
252 }
253 while (*p++ != '\0');
254
255 if (failed)
cb2eab1f
UD
256 {
257 WITH_CUR_LOCALE (fprintf (stderr, _("\
bb39c4ef 258character map `%s' is not ASCII compatible, locale not ISO C compliant\n"),
cb2eab1f
UD
259 result->code_set_name));
260 enc_not_ascii_compatible = true;
261 }
bb39c4ef
UD
262 }
263
19bc17a9
RM
264 return result;
265}
266
267
4b10dd6c 268static struct charmap_t *
93693c4d 269parse_charmap (struct linereader *cmfile, int verbose, int be_quiet)
19bc17a9 270{
4b10dd6c 271 struct charmap_t *result;
19bc17a9
RM
272 int state;
273 enum token_t expected_tok = tok_error;
274 const char *expected_str = NULL;
275 char *from_name = NULL;
276 char *to_name = NULL;
4b10dd6c 277 enum token_t ellipsis = 0;
a0dc5206 278 int step = 1;
19bc17a9 279
4b10dd6c
UD
280 /* We don't want symbolic names in string to be translated. */
281 cmfile->translate_strings = 0;
282
19bc17a9 283 /* Allocate room for result. */
4b10dd6c
UD
284 result = (struct charmap_t *) xmalloc (sizeof (struct charmap_t));
285 memset (result, '\0', sizeof (struct charmap_t));
75cd5204
RM
286 /* The default DEFAULT_WIDTH is 1. */
287 result->width_default = 1;
19bc17a9 288
df4ef2ab 289#define obstack_chunk_alloc malloc
19bc17a9
RM
290#define obstack_chunk_free free
291 obstack_init (&result->mem_pool);
292
4b10dd6c
UD
293 if (init_hash (&result->char_table, 256)
294 || init_hash (&result->byte_table, 256))
19bc17a9
RM
295 {
296 free (result);
297 return NULL;
298 }
299
300 /* We use a state machine to describe the charmap description file
301 format. */
302 state = 1;
303 while (1)
304 {
305 /* What's on? */
47e8b443 306 struct token *now = lr_token (cmfile, NULL, NULL, NULL, verbose);
19bc17a9
RM
307 enum token_t nowtok = now->tok;
308 struct token *arg;
309
310 if (nowtok == tok_eof)
311 break;
312
313 switch (state)
314 {
315 case 1:
316 /* The beginning. We expect the special declarations, EOL or
317 `CHARMAP'. */
318 if (nowtok == tok_eol)
319 /* Ignore empty lines. */
320 continue;
321
322 if (nowtok == tok_charmap)
323 {
324 from_name = NULL;
325 to_name = NULL;
326
327 /* We have to set up the real work. Fill in some
328 default values. */
329 if (result->mb_cur_max == 0)
330 result->mb_cur_max = 1;
331 if (result->mb_cur_min == 0)
332 result->mb_cur_min = result->mb_cur_max;
880f421f 333 if (result->mb_cur_min > result->mb_cur_max)
19bc17a9 334 {
880f421f 335 if (!be_quiet)
f2b98f97 336 WITH_CUR_LOCALE (error (0, 0, _("\
19bc17a9 337%s: <mb_cur_max> must be greater than <mb_cur_min>\n"),
f2b98f97 338 cmfile->fname));
19bc17a9
RM
339
340 result->mb_cur_min = result->mb_cur_max;
341 }
342
343 lr_ignore_rest (cmfile, 1);
344
345 state = 2;
346 continue;
347 }
348
349 if (nowtok != tok_code_set_name && nowtok != tok_mb_cur_max
350 && nowtok != tok_mb_cur_min && nowtok != tok_escape_char
351 && nowtok != tok_comment_char && nowtok != tok_g0esc
352 && nowtok != tok_g1esc && nowtok != tok_g2esc
4b10dd6c
UD
353 && nowtok != tok_g3esc && nowtok != tok_repertoiremap
354 && nowtok != tok_include)
19bc17a9
RM
355 {
356 lr_error (cmfile, _("syntax error in prolog: %s"),
4b10dd6c 357 _("invalid definition"));
19bc17a9
RM
358
359 lr_ignore_rest (cmfile, 0);
360 continue;
361 }
362
363 /* We know that we need an argument. */
47e8b443 364 arg = lr_token (cmfile, NULL, NULL, NULL, verbose);
19bc17a9
RM
365
366 switch (nowtok)
367 {
368 case tok_code_set_name:
4b10dd6c 369 case tok_repertoiremap:
2a631990 370 if (arg->tok != tok_ident && arg->tok != tok_string)
19bc17a9
RM
371 {
372 badarg:
373 lr_error (cmfile, _("syntax error in prolog: %s"),
374 _("bad argument"));
375
376 lr_ignore_rest (cmfile, 0);
377 continue;
378 }
379
4b10dd6c
UD
380 if (nowtok == tok_code_set_name)
381 result->code_set_name = obstack_copy0 (&result->mem_pool,
382 arg->val.str.startmb,
383 arg->val.str.lenmb);
384 else
385 result->repertoiremap = obstack_copy0 (&result->mem_pool,
386 arg->val.str.startmb,
387 arg->val.str.lenmb);
19bc17a9
RM
388
389 lr_ignore_rest (cmfile, 1);
390 continue;
391
392 case tok_mb_cur_max:
393 case tok_mb_cur_min:
394 if (arg->tok != tok_number)
395 goto badarg;
396
4b10dd6c
UD
397 if (verbose
398 && ((nowtok == tok_mb_cur_max
399 && result->mb_cur_max != 0)
400 || (nowtok == tok_mb_cur_max
401 && result->mb_cur_max != 0)))
402 lr_error (cmfile, _("duplicate definition of <%s>"),
403 nowtok == tok_mb_cur_min
404 ? "mb_cur_min" : "mb_cur_max");
405
406 if (arg->val.num < 1)
19bc17a9
RM
407 {
408 lr_error (cmfile,
4b10dd6c
UD
409 _("value for <%s> must be 1 or greater"),
410 nowtok == tok_mb_cur_min
411 ? "mb_cur_min" : "mb_cur_max");
19bc17a9
RM
412
413 lr_ignore_rest (cmfile, 0);
414 continue;
415 }
416 if ((nowtok == tok_mb_cur_max && result->mb_cur_min != 0
ba1ffaa1 417 && (int) arg->val.num < result->mb_cur_min)
19bc17a9 418 || (nowtok == tok_mb_cur_min && result->mb_cur_max != 0
ba1ffaa1 419 && (int) arg->val.num > result->mb_cur_max))
19bc17a9
RM
420 {
421 lr_error (cmfile, _("\
4b10dd6c
UD
422value of <%s> must be greater or equal than the value of <%s>"),
423 "mb_cur_max", "mb_cur_min");
19bc17a9
RM
424
425 lr_ignore_rest (cmfile, 0);
426 continue;
427 }
428
429 if (nowtok == tok_mb_cur_max)
430 result->mb_cur_max = arg->val.num;
431 else
432 result->mb_cur_min = arg->val.num;
433
434 lr_ignore_rest (cmfile, 1);
435 continue;
436
437 case tok_escape_char:
438 case tok_comment_char:
439 if (arg->tok != tok_ident)
440 goto badarg;
441
4b10dd6c 442 if (arg->val.str.lenmb != 1)
19bc17a9
RM
443 {
444 lr_error (cmfile, _("\
445argument to <%s> must be a single character"),
446 nowtok == tok_escape_char ? "escape_char"
447 : "comment_char");
448
449 lr_ignore_rest (cmfile, 0);
450 continue;
451 }
452
453 if (nowtok == tok_escape_char)
4b10dd6c 454 cmfile->escape_char = *arg->val.str.startmb;
19bc17a9 455 else
4b10dd6c 456 cmfile->comment_char = *arg->val.str.startmb;
19bc17a9
RM
457
458 lr_ignore_rest (cmfile, 1);
459 continue;
460
461 case tok_g0esc:
462 case tok_g1esc:
463 case tok_g2esc:
464 case tok_g3esc:
4b10dd6c 465 case tok_escseq:
19bc17a9
RM
466 lr_ignore_rest (cmfile, 0); /* XXX */
467 continue;
468
4b10dd6c
UD
469 case tok_include:
470 lr_error (cmfile, _("\
471character sets with locking states are not supported"));
472 exit (4);
473
19bc17a9
RM
474 default:
475 /* Cannot happen. */
476 assert (! "Should not happen");
477 }
478 break;
479
480 case 2:
481 /* We have seen `CHARMAP' and now are in the body. Each line
482 must have the format "%s %s %s\n" or "%s...%s %s %s\n". */
483 if (nowtok == tok_eol)
484 /* Ignore empty lines. */
485 continue;
486
487 if (nowtok == tok_end)
488 {
489 expected_tok = tok_charmap;
490 expected_str = "CHARMAP";
491 state = 90;
492 continue;
493 }
494
723faa38 495 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
496 {
497 lr_error (cmfile, _("syntax error in %s definition: %s"),
498 "CHARMAP", _("no symbolic name given"));
499
500 lr_ignore_rest (cmfile, 0);
501 continue;
502 }
503
504 /* If the previous line was not completely correct free the
505 used memory. */
506 if (from_name != NULL)
507 obstack_free (&result->mem_pool, from_name);
508
723faa38
UD
509 if (nowtok == tok_bsymbol)
510 from_name = (char *) obstack_copy0 (&result->mem_pool,
511 now->val.str.startmb,
512 now->val.str.lenmb);
513 else
514 {
a0dc5206 515 obstack_printf (&result->mem_pool, "U%08X",
723faa38
UD
516 cmfile->token.val.ucs4);
517 obstack_1grow (&result->mem_pool, '\0');
518 from_name = (char *) obstack_finish (&result->mem_pool);
519 }
19bc17a9
RM
520 to_name = NULL;
521
522 state = 3;
523 continue;
524
525 case 3:
526 /* We have two possibilities: We can see an ellipsis or an
527 encoding value. */
4b10dd6c 528 if (nowtok == tok_ellipsis3 || nowtok == tok_ellipsis4
a0dc5206
UD
529 || nowtok == tok_ellipsis2 || nowtok == tok_ellipsis4_2
530 || nowtok == tok_ellipsis2_2)
19bc17a9 531 {
4b10dd6c 532 ellipsis = nowtok;
a0dc5206
UD
533 if (nowtok == tok_ellipsis4_2)
534 {
535 step = 2;
536 nowtok = tok_ellipsis4;
537 }
538 else if (nowtok == tok_ellipsis2_2)
539 {
540 step = 2;
541 nowtok = tok_ellipsis2;
542 }
19bc17a9
RM
543 state = 4;
544 continue;
545 }
546 /* FALLTHROUGH */
547
548 case 5:
4b10dd6c 549 if (nowtok != tok_charcode)
19bc17a9
RM
550 {
551 lr_error (cmfile, _("syntax error in %s definition: %s"),
4b10dd6c 552 "CHARMAP", _("invalid encoding given"));
19bc17a9
RM
553
554 lr_ignore_rest (cmfile, 0);
555
556 state = 2;
557 continue;
558 }
559
69f155d4
UD
560 if (now->val.charcode.nbytes < result->mb_cur_min)
561 lr_error (cmfile, _("too few bytes in character encoding"));
562 else if (now->val.charcode.nbytes > result->mb_cur_max)
563 lr_error (cmfile, _("too many bytes in character encoding"));
19bc17a9 564 else
4b10dd6c
UD
565 charmap_new_char (cmfile, result, now->val.charcode.nbytes,
566 now->val.charcode.bytes, from_name, to_name,
a0dc5206 567 ellipsis != tok_ellipsis2, step);
19bc17a9
RM
568
569 /* Ignore trailing comment silently. */
570 lr_ignore_rest (cmfile, 0);
571
572 from_name = NULL;
573 to_name = NULL;
a0dc5206
UD
574 ellipsis = tok_none;
575 step = 1;
19bc17a9
RM
576
577 state = 2;
578 continue;
579
580 case 4:
723faa38 581 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
582 {
583 lr_error (cmfile, _("syntax error in %s definition: %s"),
584 "CHARMAP",
585 _("no symbolic name given for end of range"));
586
587 lr_ignore_rest (cmfile, 0);
588 continue;
589 }
590
69f155d4 591 /* Copy the to-name in a safe place. */
723faa38
UD
592 if (nowtok == tok_bsymbol)
593 to_name = (char *) obstack_copy0 (&result->mem_pool,
594 cmfile->token.val.str.startmb,
595 cmfile->token.val.str.lenmb);
596 else
597 {
a0dc5206 598 obstack_printf (&result->mem_pool, "U%08X",
723faa38
UD
599 cmfile->token.val.ucs4);
600 obstack_1grow (&result->mem_pool, '\0');
601 to_name = (char *) obstack_finish (&result->mem_pool);
602 }
19bc17a9 603
74015205 604 state = 5;
19bc17a9
RM
605 continue;
606
607 case 90:
608 if (nowtok != expected_tok)
609 lr_error (cmfile, _("\
610`%1$s' definition does not end with `END %1$s'"), expected_str);
611
612 lr_ignore_rest (cmfile, nowtok == expected_tok);
613 state = 91;
614 continue;
615
616 case 91:
617 /* Waiting for WIDTH... */
75cd5204
RM
618 if (nowtok == tok_eol)
619 /* Ignore empty lines. */
620 continue;
621
19bc17a9
RM
622 if (nowtok == tok_width_default)
623 {
624 state = 92;
625 continue;
626 }
627
628 if (nowtok == tok_width)
629 {
630 lr_ignore_rest (cmfile, 1);
631 state = 93;
632 continue;
633 }
634
635 if (nowtok == tok_width_variable)
636 {
637 lr_ignore_rest (cmfile, 1);
638 state = 98;
639 continue;
640 }
641
642 lr_error (cmfile, _("\
643only WIDTH definitions are allowed to follow the CHARMAP definition"));
644
645 lr_ignore_rest (cmfile, 0);
646 continue;
647
648 case 92:
649 if (nowtok != tok_number)
650 lr_error (cmfile, _("value for %s must be an integer"),
651 "WIDTH_DEFAULT");
652 else
653 result->width_default = now->val.num;
654
655 lr_ignore_rest (cmfile, nowtok == tok_number);
656
657 state = 91;
658 continue;
659
660 case 93:
661 /* We now expect `END WIDTH' or lines of the format "%s %d\n" or
662 "%s...%s %d\n". */
663 if (nowtok == tok_eol)
664 /* ignore empty lines. */
665 continue;
666
667 if (nowtok == tok_end)
668 {
669 expected_tok = tok_width;
670 expected_str = "WIDTH";
671 state = 90;
672 continue;
673 }
674
8d6120a9 675 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
676 {
677 lr_error (cmfile, _("syntax error in %s definition: %s"),
678 "WIDTH", _("no symbolic name given"));
679
680 lr_ignore_rest (cmfile, 0);
681 continue;
682 }
683
684 if (from_name != NULL)
685 obstack_free (&result->mem_pool, from_name);
686
8d6120a9
UD
687 if (nowtok == tok_bsymbol)
688 from_name = (char *) obstack_copy0 (&result->mem_pool,
689 now->val.str.startmb,
690 now->val.str.lenmb);
691 else
692 {
693 obstack_printf (&result->mem_pool, "U%08X",
694 cmfile->token.val.ucs4);
695 obstack_1grow (&result->mem_pool, '\0');
696 from_name = (char *) obstack_finish (&result->mem_pool);
697 }
698
19bc17a9
RM
699 to_name = NULL;
700
701 state = 94;
702 continue;
703
704 case 94:
4b10dd6c 705 if (nowtok == tok_ellipsis3)
75cd5204
RM
706 {
707 state = 95;
708 continue;
709 }
19bc17a9
RM
710
711 case 96:
712 if (nowtok != tok_number)
713 lr_error (cmfile, _("value for %s must be an integer"),
714 "WIDTH");
715 else
716 {
75cd5204
RM
717 /* Store width for chars. */
718 new_width (cmfile, result, from_name, to_name, now->val.num);
719
19bc17a9 720 from_name = NULL;
75cd5204 721 to_name = NULL;
19bc17a9
RM
722 }
723
724 lr_ignore_rest (cmfile, nowtok == tok_number);
725
726 state = 93;
727 continue;
728
729 case 95:
8d6120a9 730 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
731 {
732 lr_error (cmfile, _("syntax error in %s definition: %s"),
733 "WIDTH", _("no symbolic name given for end of range"));
734
735 lr_ignore_rest (cmfile, 0);
736
737 state = 93;
738 continue;
739 }
740
8d6120a9
UD
741 if (nowtok == tok_bsymbol)
742 to_name = (char *) obstack_copy0 (&result->mem_pool,
743 now->val.str.startmb,
744 now->val.str.lenmb);
745 else
746 {
747 obstack_printf (&result->mem_pool, "U%08X",
748 cmfile->token.val.ucs4);
749 obstack_1grow (&result->mem_pool, '\0');
750 to_name = (char *) obstack_finish (&result->mem_pool);
751 }
19bc17a9 752
19bc17a9
RM
753 state = 96;
754 continue;
755
756 case 98:
757 /* We now expect `END WIDTH_VARIABLE' or lines of the format
758 "%s\n" or "%s...%s\n". */
759 if (nowtok == tok_eol)
760 /* ignore empty lines. */
761 continue;
762
763 if (nowtok == tok_end)
764 {
765 expected_tok = tok_width_variable;
766 expected_str = "WIDTH_VARIABLE";
767 state = 90;
768 continue;
769 }
770
620cdffb 771 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
19bc17a9
RM
772 {
773 lr_error (cmfile, _("syntax error in %s definition: %s"),
774 "WIDTH_VARIABLE", _("no symbolic name given"));
775
776 lr_ignore_rest (cmfile, 0);
777
778 continue;
779 }
780
781 if (from_name != NULL)
782 obstack_free (&result->mem_pool, from_name);
783
620cdffb
UD
784 if (nowtok == tok_bsymbol)
785 from_name = (char *) obstack_copy0 (&result->mem_pool,
786 now->val.str.startmb,
787 now->val.str.lenmb);
788 else
789 {
790 obstack_printf (&result->mem_pool, "U%08X",
791 cmfile->token.val.ucs4);
792 obstack_1grow (&result->mem_pool, '\0');
793 from_name = (char *) obstack_finish (&result->mem_pool);
794 }
19bc17a9
RM
795 to_name = NULL;
796
797 state = 99;
798 continue;
799
800 case 99:
4b10dd6c 801 if (nowtok == tok_ellipsis3)
19bc17a9
RM
802 state = 100;
803
804 /* Store info. */
805 from_name = NULL;
806
807 /* Warn */
808 state = 98;
809 continue;
810
811 case 100:
620cdffb
UD
812 if (nowtok != tok_bsymbol && nowtok != tok_ucs4)
813 {
814 lr_error (cmfile, _("syntax error in %s definition: %s"),
815 "WIDTH_VARIABLE",
816 _("no symbolic name given for end of range"));
817 lr_ignore_rest (cmfile, 0);
818 continue;
819 }
820
821 if (nowtok == tok_bsymbol)
822 to_name = (char *) obstack_copy0 (&result->mem_pool,
823 now->val.str.startmb,
824 now->val.str.lenmb);
19bc17a9
RM
825 else
826 {
620cdffb
UD
827 obstack_printf (&result->mem_pool, "U%08X",
828 cmfile->token.val.ucs4);
829 obstack_1grow (&result->mem_pool, '\0');
830 to_name = (char *) obstack_finish (&result->mem_pool);
19bc17a9
RM
831 }
832
620cdffb
UD
833 /* XXX Enter value into table. */
834
835 lr_ignore_rest (cmfile, 1);
19bc17a9
RM
836
837 state = 98;
838 continue;
839
840 default:
f2b98f97
UD
841 WITH_CUR_LOCALE (error (5, 0, _("%s: error in state machine"),
842 __FILE__));
19bc17a9
RM
843 /* NOTREACHED */
844 }
845 break;
846 }
847
c84142e8 848 if (state != 91 && !be_quiet)
f2b98f97
UD
849 WITH_CUR_LOCALE (error (0, 0, _("%s: premature end of file"),
850 cmfile->fname));
19bc17a9
RM
851
852 lr_close (cmfile);
853
854 return result;
855}
75cd5204
RM
856
857
858static void
4b10dd6c 859new_width (struct linereader *cmfile, struct charmap_t *result,
75cd5204
RM
860 const char *from, const char *to, unsigned long int width)
861{
4b10dd6c
UD
862 struct charseq *from_val;
863 struct charseq *to_val;
75cd5204 864
4b10dd6c
UD
865 from_val = charmap_find_value (result, from, strlen (from));
866 if (from_val == NULL)
75cd5204
RM
867 {
868 lr_error (cmfile, _("unknown character `%s'"), from);
869 return;
870 }
871
872 if (to == NULL)
873 to_val = from_val;
874 else
875 {
4b10dd6c
UD
876 to_val = charmap_find_value (result, to, strlen (to));
877 if (to_val == NULL)
75cd5204
RM
878 {
879 lr_error (cmfile, _("unknown character `%s'"), to);
880 return;
881 }
e57372d1
UD
882
883 /* Make sure the number of bytes for the end points of the range
884 is correct. */
885 if (from_val->nbytes != to_val->nbytes)
886 {
887 lr_error (cmfile, _("\
888number of bytes for byte sequence of beginning and end of range not the same: %d vs %d"),
889 from_val->nbytes, to_val->nbytes);
890 return;
891 }
75cd5204
RM
892 }
893
894 if (result->nwidth_rules >= result->nwidth_rules_max)
895 {
896 size_t new_size = result->nwidth_rules + 32;
897 struct width_rule *new_rules =
898 (struct width_rule *) obstack_alloc (&result->mem_pool,
899 (new_size
900 * sizeof (struct width_rule)));
901
902 memcpy (new_rules, result->width_rules,
903 result->nwidth_rules_max * sizeof (struct width_rule));
904
905 result->width_rules = new_rules;
906 result->nwidth_rules_max = new_size;
907 }
908
909 result->width_rules[result->nwidth_rules].from = from_val;
910 result->width_rules[result->nwidth_rules].to = to_val;
911 result->width_rules[result->nwidth_rules].width = (unsigned int) width;
912 ++result->nwidth_rules;
913}
4b10dd6c
UD
914
915
916struct charseq *
917charmap_find_value (const struct charmap_t *cm, const char *name, size_t len)
918{
919 void *result;
920
921 return (find_entry ((hash_table *) &cm->char_table, name, len, &result)
922 < 0 ? NULL : (struct charseq *) result);
923}
924
925
926static void
927charmap_new_char (struct linereader *lr, struct charmap_t *cm,
0ecb606c
JJ
928 size_t nbytes, unsigned char *bytes,
929 const char *from, const char *to,
a0dc5206 930 int decimal_ellipsis, int step)
4b10dd6c
UD
931{
932 hash_table *ht = &cm->char_table;
933 hash_table *bt = &cm->byte_table;
934 struct obstack *ob = &cm->mem_pool;
935 char *from_end;
936 char *to_end;
937 const char *cp;
938 int prefix_len, len1, len2;
939 unsigned int from_nr, to_nr, cnt;
940 struct charseq *newp;
941
942 len1 = strlen (from);
943
944 if (to == NULL)
945 {
946 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
947 newp->nbytes = nbytes;
948 memcpy (newp->bytes, bytes, nbytes);
a0dc5206 949 newp->name = from;
2d05bb35 950
4b10dd6c 951 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
2d05bb35
UD
952 if ((from[0] == 'U' || from[0] == 'P') && (len1 == 5 || len1 == 9))
953 {
954 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
955 xxxx and xxxxxxxx are hexadecimal numbers. In this case
956 we use the value of xxxx or xxxxxxxx as the UCS4 value of
957 this character and we don't have to consult the repertoire
958 map.
959
960 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
961 and xxxxxxxx also give the code point in UCS4 but this must
962 be in the private, i.e., unassigned, area. This should be
963 used for characters which do not (yet) have an equivalent
964 in ISO 10646 and Unicode. */
965 char *endp;
966
967 errno = 0;
a0dc5206 968 newp->ucs4 = strtoul (from + 1, &endp, 16);
2d05bb35 969 if (endp - from != len1
6dd67bd5 970 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
2d05bb35
UD
971 || newp->ucs4 >= 0x80000000)
972 /* This wasn't successful. Signal this name cannot be a
973 correct UCS value. */
974 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
975 }
4b10dd6c
UD
976
977 insert_entry (ht, from, len1, newp);
978 insert_entry (bt, newp->bytes, nbytes, newp);
979 /* Please note that it isn't a bug if a symbol is defined more
980 than once. All later definitions are simply discarded. */
981 return;
982 }
983
984 /* We have a range: the names must have names with equal prefixes
985 and an equal number of digits, where the second number is greater
986 or equal than the first. */
987 len2 = strlen (to);
988
989 if (len1 != len2)
990 {
991 illegal_range:
992 lr_error (lr, _("invalid names for character range"));
993 return;
994 }
995
996 cp = &from[len1 - 1];
997 if (decimal_ellipsis)
998 while (isdigit (*cp) && cp >= from)
999 --cp;
1000 else
1001 while (isxdigit (*cp) && cp >= from)
1002 {
1003 if (!isdigit (*cp) && !isupper (*cp))
1004 lr_error (lr, _("\
1005hexadecimal range format should use only capital characters"));
1006 --cp;
1007 }
1008
1009 prefix_len = (cp - from) + 1;
1010
1011 if (cp == &from[len1 - 1] || strncmp (from, to, prefix_len) != 0)
1012 goto illegal_range;
1013
1014 errno = 0;
1015 from_nr = strtoul (&from[prefix_len], &from_end, decimal_ellipsis ? 10 : 16);
6dd67bd5 1016 if (*from_end != '\0' || (from_nr == UINT_MAX && errno == ERANGE)
4b10dd6c 1017 || ((to_nr = strtoul (&to[prefix_len], &to_end,
6dd67bd5 1018 decimal_ellipsis ? 10 : 16)) == UINT_MAX
4b10dd6c
UD
1019 && errno == ERANGE)
1020 || *to_end != '\0')
1021 {
2d05bb35 1022 lr_error (lr, _("<%s> and <%s> are illegal names for range"), from, to);
4b10dd6c
UD
1023 return;
1024 }
1025
1026 if (from_nr > to_nr)
1027 {
1028 lr_error (lr, _("upper limit in range is not higher then lower limit"));
1029 return;
1030 }
1031
a0dc5206 1032 for (cnt = from_nr; cnt <= to_nr; cnt += step)
4b10dd6c
UD
1033 {
1034 char *name_end;
1035 obstack_printf (ob, decimal_ellipsis ? "%.*s%0*d" : "%.*s%0*X",
1036 prefix_len, from, len1 - prefix_len, cnt);
9deb2b36 1037 obstack_1grow (ob, '\0');
4b10dd6c
UD
1038 name_end = obstack_finish (ob);
1039
1040 newp = (struct charseq *) obstack_alloc (ob, sizeof (*newp) + nbytes);
1041 newp->nbytes = nbytes;
1042 memcpy (newp->bytes, bytes, nbytes);
1043 newp->name = name_end;
2d05bb35 1044
4b10dd6c 1045 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
2d05bb35
UD
1046 if ((name_end[0] == 'U' || name_end[0] == 'P')
1047 && (len1 == 5 || len1 == 9))
1048 {
1049 /* Maybe the name is of the form `Uxxxx' or `Uxxxxxxxx' where
1050 xxxx and xxxxxxxx are hexadecimal numbers. In this case
1051 we use the value of xxxx or xxxxxxxx as the UCS4 value of
1052 this character and we don't have to consult the repertoire
1053 map.
1054
1055 If the name is of the form `Pxxxx' or `Pxxxxxxxx' the xxxx
1056 and xxxxxxxx also give the code point in UCS4 but this must
1057 be in the private, i.e., unassigned, area. This should be
1058 used for characters which do not (yet) have an equivalent
1059 in ISO 10646 and Unicode. */
1060 char *endp;
1061
1062 errno = 0;
601d2942 1063 newp->ucs4 = strtoul (name_end + 1, &endp, 16);
2d05bb35 1064 if (endp - name_end != len1
6dd67bd5 1065 || (newp->ucs4 == ~((uint32_t) 0) && errno == ERANGE)
2d05bb35
UD
1066 || newp->ucs4 >= 0x80000000)
1067 /* This wasn't successful. Signal this name cannot be a
1068 correct UCS value. */
1069 newp->ucs4 = UNINITIALIZED_CHAR_VALUE;
1070 }
4b10dd6c
UD
1071
1072 insert_entry (ht, name_end, len1, newp);
1073 insert_entry (bt, newp->bytes, nbytes, newp);
1074 /* Please note we don't examine the return value since it is no error
1075 if we have two definitions for a symbol. */
1076
1077 /* Increment the value in the byte sequence. */
1078 if (++bytes[nbytes - 1] == '\0')
1079 {
1080 int b = nbytes - 2;
1081
1082 do
1083 if (b < 0)
1084 {
1085 lr_error (lr,
1086 _("resulting bytes for range not representable."));
1087 return;
1088 }
1089 while (++bytes[b--] == 0);
1090 }
1091 }
1092}
1093
1094
1095struct charseq *
1096charmap_find_symbol (const struct charmap_t *cm, const char *bytes,
1097 size_t nbytes)
1098{
1099 void *result;
1100
1101 return (find_entry ((hash_table *) &cm->byte_table, bytes, nbytes, &result)
1102 < 0 ? NULL : (struct charseq *) result);
1103}