]> git.ipfire.org Git - thirdparty/glibc.git/blame - locale/programs/ld-ctype.c
(collate_output): Update.
[thirdparty/glibc.git] / locale / programs / ld-ctype.c
CommitLineData
f76d7052 1/* Copyright (C) 1995, 1996, 1997, 1998, 1999 Free Software Foundation, Inc.
c84142e8 2 This file is part of the GNU C Library.
4b10dd6c 3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
19bc17a9 4
c84142e8
UD
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
19bc17a9 9
c84142e8
UD
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
19bc17a9 14
c84142e8
UD
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
19bc17a9
RM
19
20#ifdef HAVE_CONFIG_H
21# include <config.h>
22#endif
23
a68b0d31 24#include <alloca.h>
4b10dd6c 25#include <byteswap.h>
19bc17a9 26#include <endian.h>
4b10dd6c 27#include <errno.h>
19bc17a9 28#include <limits.h>
4b10dd6c
UD
29#include <obstack.h>
30#include <stdlib.h>
19bc17a9 31#include <string.h>
4b10dd6c
UD
32#include <wchar.h>
33#include <wctype.h>
34#include <sys/uio.h>
19bc17a9 35
4b10dd6c 36#include "charmap.h"
19bc17a9
RM
37#include "localeinfo.h"
38#include "langinfo.h"
4b10dd6c 39#include "linereader.h"
19bc17a9 40#include "locfile-token.h"
4b10dd6c
UD
41#include "locfile.h"
42#include "localedef.h"
19bc17a9 43
19bc17a9
RM
44#include <assert.h>
45
46
#ifdef PREDEFINED_CLASSES
/* These are the extra bits not in wctype.h since these are not preallocated
   classes.  The constants are unsigned: shifting a signed 1 into bit 31
   is undefined, and the values are only ever combined with 32-bit
   unsigned class masks.  */
# define _ISwspecial1   (1U << 29)
# define _ISwspecial2   (1U << 30)
# define _ISwspecial3   (1U << 31)
#endif
19bc17a9
RM
54
55
/* The bit used for representing a special class.  CLASS is one of the
   `tok_*' class tokens; the position is its distance from `tok_upper'.  */
#define BITPOS(class) ((class) - tok_upper)
#define BIT(class) (_ISbit (BITPOS (class)))
#define BITw(class) (_ISwbit (BITPOS (class)))

/* Expand to the object `find_idx' returns a pointer to for VALUE in the
   table COLLECTION of CTYPE.  IDX is either empty or an array subscript
   that selects one of several parallel tables; it is appended to the
   table, its `_max' and its `_act' member alike.  The result is an
   lvalue.  The arguments that can be parenthesized are, and so is the
   whole expansion, so that the macro composes safely with surrounding
   operators.  */
#define ELEM(ctype, collection, idx, value)                                   \
  (*find_idx ((ctype), &(ctype)->collection idx,                              \
              &(ctype)->collection##_max idx,                                 \
              &(ctype)->collection##_act idx, (value)))


/* To be compatible with former implementations we for now restrict
   the number of bits for character classes to 16.  When compatibility
   is not necessary anymore increase the number to 32.  */
#define char_class_t uint16_t
#define char_class32_t uint32_t
4b10dd6c
UD
71
72
/* Description of a transliteration action: a from-string, which may
   consist of several characters, together with a set of to-strings,
   each of which may also consist of several characters.  All strings
   are made of 32bit values since this is what the gconv functions
   use.  */

/* One to-string.  Entries are chained through NEXT.  */
struct translit_to_t
{
  uint32_t *str;
  struct translit_to_t *next;
};

/* One from-string with the head of its chain of to-strings.  The
   entries themselves are chained through NEXT as well.  */
struct translit_t
{
  uint32_t *from;
  struct translit_to_t *to;
  struct translit_t *next;
};
19bc17a9
RM
92
93
94/* The real definition of the struct for the LC_CTYPE locale. */
95struct locale_ctype_t
96{
4b10dd6c 97 uint32_t *charnames;
19bc17a9
RM
98 size_t charnames_max;
99 size_t charnames_act;
100
4b10dd6c
UD
101 struct repertoire_t *repertoire;
102
103 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
104#define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
ba1ffaa1 105 size_t nr_charclass;
19bc17a9 106 const char *classnames[MAX_NR_CHARCLASS];
4b10dd6c
UD
107 uint32_t last_class_char;
108 uint32_t class256_collection[256];
109 uint32_t *class_collection;
19bc17a9
RM
110 size_t class_collection_max;
111 size_t class_collection_act;
4b10dd6c
UD
112 uint32_t class_done;
113
114 struct charseq **mbdigits;
115 size_t mbdigits_act;
116 size_t mbdigits_max;
117 uint32_t *wcdigits;
118 size_t wcdigits_act;
119 size_t wcdigits_max;
120
121 struct charseq *mboutdigits[10];
122 uint32_t wcoutdigits[10];
123 size_t outdigits_act;
19bc17a9
RM
124
125 /* If the following number ever turns out to be too small simply
126 increase it. But I doubt it will. --drepper@gnu */
127#define MAX_NR_CHARMAP 16
128 const char *mapnames[MAX_NR_CHARMAP];
4b10dd6c
UD
129 uint32_t *map_collection[MAX_NR_CHARMAP];
130 uint32_t map256_collection[2][256];
9a0a462c
UD
131 size_t map_collection_max[MAX_NR_CHARMAP];
132 size_t map_collection_act[MAX_NR_CHARMAP];
19bc17a9
RM
133 size_t map_collection_nr;
134 size_t last_map_idx;
4b10dd6c
UD
135 int tomap_done[MAX_NR_CHARMAP];
136
137 /* Transliteration information. */
138 const char *translit_copy_locale;
139 const char *translit_copy_repertoire;
140 struct translit_t *translit;
19bc17a9
RM
141
142 /* The arrays for the binary representation. */
4b10dd6c
UD
143 uint32_t plane_size;
144 uint32_t plane_cnt;
19bc17a9
RM
145 char_class_t *ctype_b;
146 char_class32_t *ctype32_b;
4a33c2f5
UD
147 uint32_t *names;
148 uint32_t **map;
49f2be5b 149 uint32_t **map32;
4b10dd6c
UD
150 uint32_t *class_name_ptr;
151 uint32_t *map_name_ptr;
75cd5204 152 unsigned char *width;
4b10dd6c 153 uint32_t mb_cur_max;
6990326c 154 const char *codeset_name;
4a33c2f5
UD
155 uint32_t translit_hash_size;
156 uint32_t translit_hash_layers;
157 uint32_t *translit_from_idx;
158 uint32_t *translit_from_tbl;
159 uint32_t *translit_to_idx;
160 uint32_t *translit_to_tbl;
4b10dd6c
UD
161 size_t translit_idx_size;
162 size_t translit_from_tbl_size;
163 size_t translit_to_tbl_size;
164
165 struct obstack mem_pool;
19bc17a9
RM
166};
167
168
4b10dd6c
UD
/* Allocation primitives used by the obstack macros.  */
#define obstack_chunk_alloc xmalloc
#define obstack_chunk_free free


/* Prototypes for local functions.  */
static void ctype_startup (struct linereader *lr,
                           struct localedef_t *locale,
                           struct charmap_t *charmap,
                           int ignore_content);
static void ctype_class_new (struct linereader *lr,
                             struct locale_ctype_t *ctype,
                             const char *name);
static void ctype_map_new (struct linereader *lr,
                           struct locale_ctype_t *ctype,
                           const char *name,
                           struct charmap_t *charmap);
static uint32_t *find_idx (struct locale_ctype_t *ctype,
                           uint32_t **table, size_t *max, size_t *act,
                           unsigned int idx);
static void set_class_defaults (struct locale_ctype_t *ctype,
                                struct charmap_t *charmap,
                                struct repertoire_t *repertoire);
static void allocate_arrays (struct locale_ctype_t *ctype,
                             struct charmap_t *charmap,
                             struct repertoire_t *repertoire);


/* Spelled-out names of the ten decimal digits, indexed by digit value.  */
static const char *longnames[] =
{
  "zero",
  "one",
  "two",
  "three",
  "four",
  "five",
  "six",
  "seven",
  "eight",
  "nine"
};

/* The ten decimal digits as characters, indexed by digit value.  */
static const unsigned char digits[] = "0123456789";
197
198
199static void
19bc17a9 200ctype_startup (struct linereader *lr, struct localedef_t *locale,
4b10dd6c 201 struct charmap_t *charmap, int ignore_content)
19bc17a9
RM
202{
203 unsigned int cnt;
204 struct locale_ctype_t *ctype;
205
4b10dd6c 206 if (!ignore_content)
19bc17a9 207 {
4b10dd6c
UD
208 /* Allocate the needed room. */
209 locale->categories[LC_CTYPE].ctype = ctype =
210 (struct locale_ctype_t *) xcalloc (1, sizeof (struct locale_ctype_t));
211
212 /* We have seen no names yet. */
213 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
214 ctype->charnames =
215 (unsigned int *) xmalloc (ctype->charnames_max
216 * sizeof (unsigned int));
217 for (cnt = 0; cnt < 256; ++cnt)
218 ctype->charnames[cnt] = cnt;
219 ctype->charnames_act = 256;
220
221 /* Fill character class information. */
222 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
223 /* The order of the following instructions determines the bit
224 positions! */
225 ctype_class_new (lr, ctype, "upper");
226 ctype_class_new (lr, ctype, "lower");
227 ctype_class_new (lr, ctype, "alpha");
228 ctype_class_new (lr, ctype, "digit");
229 ctype_class_new (lr, ctype, "xdigit");
230 ctype_class_new (lr, ctype, "space");
231 ctype_class_new (lr, ctype, "print");
232 ctype_class_new (lr, ctype, "graph");
233 ctype_class_new (lr, ctype, "blank");
234 ctype_class_new (lr, ctype, "cntrl");
235 ctype_class_new (lr, ctype, "punct");
236 ctype_class_new (lr, ctype, "alnum");
011ebfab 237#ifdef PREDEFINED_CLASSES
4b10dd6c
UD
238 /* The following are extensions from ISO 14652. */
239 ctype_class_new (lr, ctype, "left_to_right");
240 ctype_class_new (lr, ctype, "right_to_left");
241 ctype_class_new (lr, ctype, "num_terminator");
242 ctype_class_new (lr, ctype, "num_separator");
243 ctype_class_new (lr, ctype, "segment_separator");
244 ctype_class_new (lr, ctype, "block_separator");
245 ctype_class_new (lr, ctype, "direction_control");
246 ctype_class_new (lr, ctype, "sym_swap_layout");
247 ctype_class_new (lr, ctype, "char_shape_selector");
248 ctype_class_new (lr, ctype, "num_shape_selector");
249 ctype_class_new (lr, ctype, "non_spacing");
250 ctype_class_new (lr, ctype, "non_spacing_level3");
251 ctype_class_new (lr, ctype, "normal_connect");
252 ctype_class_new (lr, ctype, "r_connect");
253 ctype_class_new (lr, ctype, "no_connect");
254 ctype_class_new (lr, ctype, "no_connect-space");
255 ctype_class_new (lr, ctype, "vowel_connect");
011ebfab 256#endif
4b10dd6c
UD
257
258 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
259 ctype->class_collection
260 = (uint32_t *) xcalloc (sizeof (unsigned long int),
261 ctype->class_collection_max);
262 ctype->class_collection_act = 256;
263
264 /* Fill character map information. */
265 ctype->map_collection_nr = 0;
266 ctype->last_map_idx = MAX_NR_CHARMAP;
267 ctype_map_new (lr, ctype, "toupper", charmap);
268 ctype_map_new (lr, ctype, "tolower", charmap);
011ebfab 269#ifdef PREDEFINED_CLASSES
4b10dd6c 270 ctype_map_new (lr, ctype, "tosymmetric", charmap);
011ebfab 271#endif
4b10dd6c
UD
272
273 /* Fill first 256 entries in `toXXX' arrays. */
274 for (cnt = 0; cnt < 256; ++cnt)
275 {
276 ctype->map_collection[0][cnt] = cnt;
277 ctype->map_collection[1][cnt] = cnt;
9e2b7438 278#ifdef PREDEFINED_CLASSES
4b10dd6c 279 ctype->map_collection[2][cnt] = cnt;
9e2b7438 280#endif
4b10dd6c
UD
281 ctype->map256_collection[0][cnt] = cnt;
282 ctype->map256_collection[1][cnt] = cnt;
283 }
284
285 obstack_init (&ctype->mem_pool);
19bc17a9
RM
286 }
287}
288
289
290void
4b10dd6c 291ctype_finish (struct localedef_t *locale, struct charmap_t *charmap)
19bc17a9
RM
292{
293 /* See POSIX.2, table 2-6 for the meaning of the following table. */
294#define NCLASS 12
295 static const struct
296 {
297 const char *name;
298 const char allow[NCLASS];
299 }
300 valid_table[NCLASS] =
301 {
302 /* The order is important. See token.h for more information.
303 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
304 { "upper", "--MX-XDDXXX-" },
305 { "lower", "--MX-XDDXXX-" },
306 { "alpha", "---X-XDDXXX-" },
307 { "digit", "XXX--XDDXXX-" },
308 { "xdigit", "-----XDDXXX-" },
309 { "space", "XXXXX------X" },
310 { "print", "---------X--" },
311 { "graph", "---------X--" },
312 { "blank", "XXXXXM-----X" },
313 { "cntrl", "XXXXX-XX--XX" },
314 { "punct", "XXXXX-DD-X-X" },
315 { "alnum", "-----XDDXXX-" }
316 };
317 size_t cnt;
318 int cls1, cls2;
4b10dd6c
UD
319 uint32_t space_value;
320 struct charseq *space_seq;
19bc17a9 321 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
4b10dd6c 322 int warned;
19bc17a9 323
b9eb05d6
UD
324 /* Now resolve copying and also handle completely missing definitions. */
325 if (ctype == NULL)
326 {
70e51ab9
UD
327 const char *repertoire_name;
328
b9eb05d6
UD
329 /* First see whether we were supposed to copy. If yes, find the
330 actual definition. */
331 if (locale->copy_name[LC_CTYPE] != NULL)
332 {
333 /* Find the copying locale. This has to happen transitively since
334 the locale we are copying from might also copying another one. */
335 struct localedef_t *from = locale;
336
337 do
338 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
339 from->repertoire_name, charmap);
340 while (from->categories[LC_CTYPE].ctype == NULL
341 && from->copy_name[LC_CTYPE] != NULL);
342
343 ctype = locale->categories[LC_CTYPE].ctype
344 = from->categories[LC_CTYPE].ctype;
345 }
346
347 /* If there is still no definition issue an warning and create an
348 empty one. */
349 if (ctype == NULL)
350 {
f6ada7ad
UD
351 if (! be_quiet)
352 error (0, 0, _("No definition for %s category found"), "LC_CTYPE");
b9eb05d6
UD
353 ctype_startup (NULL, locale, charmap, 0);
354 ctype = locale->categories[LC_CTYPE].ctype;
355 }
70e51ab9
UD
356
357 /* Get the repertoire we have to use. */
358 repertoire_name = locale->repertoire_name ?: repertoire_global;
359 if (repertoire_name != NULL)
360 ctype->repertoire = repertoire_read (repertoire_name);
b9eb05d6
UD
361 }
362
19bc17a9 363 /* Set default value for classes not specified. */
4b10dd6c 364 set_class_defaults (ctype, charmap, ctype->repertoire);
19bc17a9
RM
365
366 /* Check according to table. */
42d7c593 367 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
19bc17a9 368 {
4b10dd6c 369 uint32_t tmp = ctype->class_collection[cnt];
19bc17a9 370
4b10dd6c
UD
371 if (tmp != 0)
372 {
373 for (cls1 = 0; cls1 < NCLASS; ++cls1)
374 if ((tmp & _ISwbit (cls1)) != 0)
375 for (cls2 = 0; cls2 < NCLASS; ++cls2)
376 if (valid_table[cls1].allow[cls2] != '-')
19bc17a9 377 {
4b10dd6c
UD
378 int eq = (tmp & _ISwbit (cls2)) != 0;
379 switch (valid_table[cls1].allow[cls2])
19bc17a9 380 {
4b10dd6c
UD
381 case 'M':
382 if (!eq)
383 {
384 uint32_t value = ctype->charnames[cnt];
385
386 if (!be_quiet)
387 error (0, 0, _("\
388character L'\\u%0*x' in class `%s' must be in class `%s'"),
389 value > 0xffff ? 8 : 4, value,
390 valid_table[cls1].name,
391 valid_table[cls2].name);
392 }
393 break;
394
395 case 'X':
396 if (eq)
397 {
398 uint32_t value = ctype->charnames[cnt];
399
400 if (!be_quiet)
401 error (0, 0, _("\
402character L'\\u%0*x' in class `%s' must not be in class `%s'"),
403 value > 0xffff ? 8 : 4, value,
404 valid_table[cls1].name,
405 valid_table[cls2].name);
406 }
407 break;
408
409 case 'D':
410 ctype->class_collection[cnt] |= _ISwbit (cls2);
411 break;
412
413 default:
414 error (5, 0, _("internal error in %s, line %u"),
415 __FUNCTION__, __LINE__);
19bc17a9 416 }
4b10dd6c
UD
417 }
418 }
419 }
420
421 for (cnt = 0; cnt < 256; ++cnt)
422 {
423 uint32_t tmp = ctype->class256_collection[cnt];
19bc17a9 424
4b10dd6c
UD
425 if (tmp != 0)
426 {
427 for (cls1 = 0; cls1 < NCLASS; ++cls1)
428 if ((tmp & _ISbit (cls1)) != 0)
429 for (cls2 = 0; cls2 < NCLASS; ++cls2)
430 if (valid_table[cls1].allow[cls2] != '-')
431 {
432 int eq = (tmp & _ISbit (cls2)) != 0;
433 switch (valid_table[cls1].allow[cls2])
19bc17a9 434 {
4b10dd6c
UD
435 case 'M':
436 if (!eq)
437 {
438 char buf[17];
439
dbbbaf53 440 sprintf (buf, "\\%zo", cnt);
4b10dd6c
UD
441
442 if (!be_quiet)
443 error (0, 0, _("\
444character '%s' in class `%s' must be in class `%s'"),
445 buf, valid_table[cls1].name,
446 valid_table[cls2].name);
447 }
448 break;
449
450 case 'X':
451 if (eq)
452 {
453 char buf[17];
454
dbbbaf53 455 sprintf (buf, "\\%zo", cnt);
4b10dd6c
UD
456
457 if (!be_quiet)
458 error (0, 0, _("\
459character '%s' in class `%s' must not be in class `%s'"),
460 buf, valid_table[cls1].name,
461 valid_table[cls2].name);
462 }
463 break;
464
465 case 'D':
466 ctype->class256_collection[cnt] |= _ISbit (cls2);
467 break;
468
469 default:
470 error (5, 0, _("internal error in %s, line %u"),
471 __FUNCTION__, __LINE__);
19bc17a9 472 }
4b10dd6c
UD
473 }
474 }
19bc17a9
RM
475 }
476
477 /* ... and now test <SP> as a special case. */
4b10dd6c
UD
478 space_value = repertoire_find_value (ctype->repertoire, "SP", 2);
479 if (space_value == ILLEGAL_CHAR_VALUE)
880f421f
UD
480 {
481 if (!be_quiet)
482 error (0, 0, _("character <SP> not defined in character map"));
483 }
c84142e8
UD
484 else if (((cnt = BITPOS (tok_space),
485 (ELEM (ctype, class_collection, , space_value)
4b10dd6c 486 & BITw (tok_space)) == 0)
c84142e8
UD
487 || (cnt = BITPOS (tok_blank),
488 (ELEM (ctype, class_collection, , space_value)
4b10dd6c 489 & BITw (tok_blank)) == 0)))
880f421f
UD
490 {
491 if (!be_quiet)
492 error (0, 0, _("<SP> character not in class `%s'"),
493 valid_table[cnt].name);
494 }
c84142e8
UD
495 else if (((cnt = BITPOS (tok_punct),
496 (ELEM (ctype, class_collection, , space_value)
4b10dd6c 497 & BITw (tok_punct)) != 0)
c84142e8
UD
498 || (cnt = BITPOS (tok_graph),
499 (ELEM (ctype, class_collection, , space_value)
4b10dd6c 500 & BITw (tok_graph))
880f421f
UD
501 != 0)))
502 {
503 if (!be_quiet)
504 error (0, 0, _("<SP> character must not be in class `%s'"),
505 valid_table[cnt].name);
506 }
19bc17a9 507 else
4b10dd6c
UD
508 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
509
510 space_seq = charmap_find_value (charmap, "SP", 2);
511 if (space_seq == NULL || space_seq->nbytes != 1)
512 {
513 if (!be_quiet)
514 error (0, 0, _("character <SP> not defined in character map"));
515 }
516 else if (((cnt = BITPOS (tok_space),
517 (ctype->class256_collection[space_seq->bytes[0]]
518 & BIT (tok_space)) == 0)
519 || (cnt = BITPOS (tok_blank),
520 (ctype->class256_collection[space_seq->bytes[0]]
521 & BIT (tok_blank)) == 0)))
522 {
523 if (!be_quiet)
524 error (0, 0, _("<SP> character not in class `%s'"),
525 valid_table[cnt].name);
526 }
527 else if (((cnt = BITPOS (tok_punct),
528 (ctype->class256_collection[space_seq->bytes[0]]
529 & BIT (tok_punct)) != 0)
530 || (cnt = BITPOS (tok_graph),
531 (ctype->class256_collection[space_seq->bytes[0]]
532 & BIT (tok_graph)) != 0)))
533 {
534 if (!be_quiet)
535 error (0, 0, _("<SP> character must not be in class `%s'"),
536 valid_table[cnt].name);
537 }
538 else
539 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
75cd5204
RM
540
541 /* Now that the tests are done make sure the name array contains all
542 characters which are handled in the WIDTH section of the
543 character set definition file. */
4b10dd6c
UD
544 if (charmap->width_rules != NULL)
545 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
75cd5204 546 {
827ff758
UD
547 unsigned char bytes[charmap->mb_cur_max];
548 int nbytes = charmap->width_rules[cnt].from->nbytes;
549
550 /* We have the range of character for which the width is
551 specified described using byte sequences of the multibyte
552 charset. We have to convert this to UCS4 now. And we
553 cannot simply convert the beginning and the end of the
554 sequence, we have to iterate over the byte sequence and
555 convert it for every single character. */
556 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
557
558 while (nbytes < charmap->width_rules[cnt].to->nbytes
559 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
560 nbytes) <= 0)
561 {
562 /* Find the UCS value for `bytes'. */
827ff758 563 int inner;
76e680a8
UD
564 uint32_t wch;
565 struct charseq *seq = charmap_find_symbol (charmap, bytes, nbytes);
566
567 if (seq == NULL)
568 wch = ILLEGAL_CHAR_VALUE;
569 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
570 wch = seq->ucs4;
571 else
572 wch = repertoire_find_value (ctype->repertoire, seq->name,
573 strlen (seq->name));
827ff758
UD
574
575 if (wch != ILLEGAL_CHAR_VALUE)
576 /* We are only interested in the side-effects of the
577 `find_idx' call. It will add appropriate entries in
578 the name array if this is necessary. */
579 (void) find_idx (ctype, NULL, NULL, NULL, wch);
580
581 /* "Increment" the bytes sequence. */
582 inner = nbytes - 1;
583 while (inner >= 0 && bytes[inner] == 0xff)
584 --inner;
585
586 if (inner < 0)
587 {
588 /* We have to extend the byte sequence. */
589 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
590 break;
591
592 bytes[0] = 1;
593 memset (&bytes[1], 0, nbytes);
594 ++nbytes;
595 }
596 else
597 {
598 ++bytes[inner];
599 while (++inner < nbytes)
600 bytes[inner] = 0;
601 }
602 }
4b10dd6c
UD
603 }
604
605 /* There must be a multiple of 10 digits. */
606 if (ctype->mbdigits_act % 10 != 0)
607 {
608 assert (ctype->mbdigits_act == ctype->wcdigits_act);
609 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
610 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
611 error (0, 0, _("`digit' category has not entries in groups of ten"));
612 }
613
614 /* Check the input digits. There must be a multiple of ten available.
42d7c593 615 In each group it could be that one or the other character is missing.
4b10dd6c
UD
616 In this case the whole group must be removed. */
617 cnt = 0;
618 while (cnt < ctype->mbdigits_act)
619 {
620 size_t inner;
621 for (inner = 0; inner < 10; ++inner)
622 if (ctype->mbdigits[cnt + inner] == NULL)
623 break;
624
625 if (inner == 10)
626 cnt += 10;
627 else
628 {
629 /* Remove the group. */
630 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
631 ((ctype->wcdigits_act - cnt - 10)
632 * sizeof (ctype->mbdigits[0])));
633 ctype->mbdigits_act -= 10;
634 }
635 }
636
637 /* If no input digits are given use the default. */
638 if (ctype->mbdigits_act == 0)
639 {
640 if (ctype->mbdigits_max == 0)
641 {
642 ctype->mbdigits = obstack_alloc (&charmap->mem_pool,
643 10 * sizeof (struct charseq *));
644 ctype->mbdigits_max = 10;
645 }
646
647 for (cnt = 0; cnt < 10; ++cnt)
648 {
649 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
650 digits + cnt, 1);
651 if (ctype->mbdigits[cnt] == NULL)
652 {
653 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
654 longnames[cnt],
655 strlen (longnames[cnt]));
656 if (ctype->mbdigits[cnt] == NULL)
657 {
658 /* Hum, this ain't good. */
659 error (0, 0, _("\
660no input digits defined and none of the standard names in the charmap"));
661
662 ctype->mbdigits[cnt] = obstack_alloc (&charmap->mem_pool,
663 sizeof (struct charseq) + 1);
664
665 /* This is better than nothing. */
666 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
667 ctype->mbdigits[cnt]->nbytes = 1;
668 }
669 }
670 }
671
672 ctype->mbdigits_act = 10;
673 }
674
675 /* Check the wide character input digits. There must be a multiple
42d7c593 676 of ten available. In each group it could be that one or the other
4b10dd6c
UD
677 character is missing. In this case the whole group must be
678 removed. */
679 cnt = 0;
680 while (cnt < ctype->wcdigits_act)
681 {
682 size_t inner;
683 for (inner = 0; inner < 10; ++inner)
684 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
685 break;
686
687 if (inner == 10)
688 cnt += 10;
689 else
690 {
691 /* Remove the group. */
692 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
693 ((ctype->wcdigits_act - cnt - 10)
694 * sizeof (ctype->wcdigits[0])));
695 ctype->wcdigits_act -= 10;
696 }
697 }
698
699 /* If no input digits are given use the default. */
700 if (ctype->wcdigits_act == 0)
701 {
702 if (ctype->wcdigits_max == 0)
703 {
704 ctype->wcdigits = obstack_alloc (&charmap->mem_pool,
705 10 * sizeof (uint32_t));
706 ctype->wcdigits_max = 10;
707 }
708
709 for (cnt = 0; cnt < 10; ++cnt)
710 ctype->wcdigits[cnt] = L'0' + cnt;
711
712 ctype->mbdigits_act = 10;
713 }
714
715 /* Check the outdigits. */
716 warned = 0;
717 for (cnt = 0; cnt < 10; ++cnt)
718 if (ctype->mboutdigits[cnt] == NULL)
719 {
720 static struct charseq replace[2];
721
722 if (!warned)
723 {
724 error (0, 0, _("\
725not all characters used in `outdigit' are available in the charmap"));
726 warned = 1;
727 }
728
729 replace[0].nbytes = 1;
730 replace[0].bytes[0] = '?';
731 replace[0].bytes[1] = '\0';
732 ctype->mboutdigits[cnt] = &replace[0];
733 }
734
735 warned = 0;
736 for (cnt = 0; cnt < 10; ++cnt)
737 if (ctype->wcoutdigits[cnt] == 0)
738 {
739 if (!warned)
740 {
741 error (0, 0, _("\
742not all characters used in `outdigit' are available in the repertoire"));
743 warned = 1;
744 }
745
746 ctype->wcoutdigits[cnt] = L'?';
75cd5204 747 }
19bc17a9
RM
748}
749
750
751void
4b10dd6c 752ctype_output (struct localedef_t *locale, struct charmap_t *charmap,
75cd5204 753 const char *output_path)
19bc17a9
RM
754{
755 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
756 const size_t nelems = (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)
5491da0d 757 + (ctype->map_collection_nr - 2));
75cd5204
RM
758 struct iovec iov[2 + nelems + ctype->nr_charclass
759 + ctype->map_collection_nr];
19bc17a9 760 struct locale_file data;
4b10dd6c 761 uint32_t idx[nelems + 1];
75cd5204 762 size_t elem, cnt, offset, total;
4b10dd6c 763 char *cp;
19bc17a9
RM
764
765 /* Now prepare the output: Find the sizes of the table we can use. */
4b10dd6c 766 allocate_arrays (ctype, charmap, ctype->repertoire);
19bc17a9
RM
767
768 data.magic = LIMAGIC (LC_CTYPE);
769 data.n = nelems;
770 iov[0].iov_base = (void *) &data;
771 iov[0].iov_len = sizeof (data);
772
773 iov[1].iov_base = (void *) idx;
774 iov[1].iov_len = sizeof (idx);
775
776 idx[0] = iov[0].iov_len + iov[1].iov_len;
777 offset = 0;
778
779 for (elem = 0; elem < nelems; ++elem)
780 {
781 if (elem < _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE))
782 switch (elem)
783 {
784#define CTYPE_DATA(name, base, len) \
785 case _NL_ITEM_INDEX (name): \
ce7a5ef4
RM
786 iov[2 + elem + offset].iov_base = (base); \
787 iov[2 + elem + offset].iov_len = (len); \
75cd5204
RM
788 if (elem + 1 < nelems) \
789 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
19bc17a9
RM
790 break
791
792 CTYPE_DATA (_NL_CTYPE_CLASS,
793 ctype->ctype_b,
794 (256 + 128) * sizeof (char_class_t));
795
4a33c2f5
UD
796 CTYPE_DATA (_NL_CTYPE_TOUPPER,
797 ctype->map[0],
f1d8b804 798 (256 + 128) * sizeof (uint32_t));
4a33c2f5
UD
799 CTYPE_DATA (_NL_CTYPE_TOLOWER,
800 ctype->map[1],
f1d8b804 801 (256 + 128) * sizeof (uint32_t));
19bc17a9 802
49f2be5b
UD
803 CTYPE_DATA (_NL_CTYPE_TOUPPER32,
804 ctype->map32[0],
f1d8b804 805 (ctype->plane_size * ctype->plane_cnt)
49f2be5b
UD
806 * sizeof (uint32_t));
807 CTYPE_DATA (_NL_CTYPE_TOLOWER32,
808 ctype->map32[1],
f1d8b804 809 (ctype->plane_size * ctype->plane_cnt)
49f2be5b
UD
810 * sizeof (uint32_t));
811
19bc17a9
RM
812 CTYPE_DATA (_NL_CTYPE_CLASS32,
813 ctype->ctype32_b,
814 (ctype->plane_size * ctype->plane_cnt
815 * sizeof (char_class32_t)));
816
4a33c2f5
UD
817 CTYPE_DATA (_NL_CTYPE_NAMES,
818 ctype->names, (ctype->plane_size * ctype->plane_cnt
819 * sizeof (uint32_t)));
820
821 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_SIZE,
822 &ctype->translit_hash_size, sizeof (uint32_t));
823 CTYPE_DATA (_NL_CTYPE_TRANSLIT_HASH_LAYERS,
824 &ctype->translit_hash_layers, sizeof (uint32_t));
825
826 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX,
827 ctype->translit_from_idx,
4b10dd6c
UD
828 ctype->translit_idx_size);
829
4a33c2f5
UD
830 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL,
831 ctype->translit_from_tbl,
4b10dd6c
UD
832 ctype->translit_from_tbl_size);
833
4a33c2f5
UD
834 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX,
835 ctype->translit_to_idx,
4b10dd6c
UD
836 ctype->translit_idx_size);
837
4a33c2f5
UD
838 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL,
839 ctype->translit_to_tbl, ctype->translit_to_tbl_size);
4b10dd6c 840
4a33c2f5 841 CTYPE_DATA (_NL_CTYPE_HASH_SIZE,
4b10dd6c 842 &ctype->plane_size, sizeof (uint32_t));
4a33c2f5 843 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS,
4b10dd6c 844 &ctype->plane_cnt, sizeof (uint32_t));
19bc17a9 845
75cd5204
RM
846 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
847 /* The class name array. */
848 total = 0;
849 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt, ++offset)
850 {
851 iov[2 + elem + offset].iov_base
852 = (void *) ctype->classnames[cnt];
853 iov[2 + elem + offset].iov_len
854 = strlen (ctype->classnames[cnt]) + 1;
855 total += iov[2 + elem + offset].iov_len;
856 }
ce7a5ef4
RM
857 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
858 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
859 total += 1 + (4 - ((total + 1) % 4));
75cd5204 860
4b10dd6c 861 idx[elem + 1] = idx[elem] + total;
75cd5204
RM
862 break;
863
864 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
865 /* The class name array. */
866 total = 0;
867 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt, ++offset)
868 {
869 iov[2 + elem + offset].iov_base
870 = (void *) ctype->mapnames[cnt];
871 iov[2 + elem + offset].iov_len
872 = strlen (ctype->mapnames[cnt]) + 1;
873 total += iov[2 + elem + offset].iov_len;
874 }
ce7a5ef4
RM
875 iov[2 + elem + offset].iov_base = (void *) "\0\0\0";
876 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
877 total += 1 + (4 - ((total + 1) % 4));
75cd5204 878
4b10dd6c 879 idx[elem + 1] = idx[elem] + total;
75cd5204 880 break;
19bc17a9
RM
881
882 CTYPE_DATA (_NL_CTYPE_WIDTH,
75cd5204 883 ctype->width, ctype->plane_size * ctype->plane_cnt);
19bc17a9 884
0200214b 885 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX,
4b10dd6c 886 &ctype->mb_cur_max, sizeof (uint32_t));
0200214b 887
ce7a5ef4
RM
888 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
889 total = strlen (ctype->codeset_name) + 1;
890 if (total % 4 == 0)
891 iov[2 + elem + offset].iov_base = (char *) ctype->codeset_name;
892 else
893 {
894 iov[2 + elem + offset].iov_base = alloca ((total + 3) & ~3);
9756dfe1
UD
895 memset (mempcpy (iov[2 + elem + offset].iov_base,
896 ctype->codeset_name, total),
897 '\0', 4 - (total & 3));
ce7a5ef4
RM
898 total = (total + 3) & ~3;
899 }
900 iov[2 + elem + offset].iov_len = total;
4b10dd6c
UD
901 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
902 break;
903
4a33c2f5 904 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
4b10dd6c
UD
905 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
906 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
4a33c2f5
UD
907 *(uint32_t *) iov[2 + elem + offset].iov_base =
908 ctype->mbdigits_act / 10;
a9c27b3e 909 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
4b10dd6c
UD
910 break;
911
4a33c2f5 912 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
4b10dd6c
UD
913 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
914 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
4a33c2f5
UD
915 *(uint32_t *) iov[2 + elem + offset].iov_base =
916 ctype->wcdigits_act / 10;
a9c27b3e 917 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
4b10dd6c
UD
918 break;
919
920 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
921 /* Compute the length of all possible characters. For INDIGITS
922 there might be more than one. We simply concatenate all of
923 them with a NUL byte following. The NUL byte wouldn't be
924 necessary but it makes it easier for the user. */
925 total = 0;
926 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
927 cnt < ctype->mbdigits_act; cnt += 10)
928 total += ctype->mbdigits[cnt]->nbytes + 1;
929 iov[2 + elem + offset].iov_base = (char *) alloca (total);
930 iov[2 + elem + offset].iov_len = total;
931
932 cp = iov[2 + elem + offset].iov_base;
933 for (cnt = elem - _NL_CTYPE_INDIGITS0_MB;
934 cnt < ctype->mbdigits_act; cnt += 10)
935 {
936 cp = mempcpy (cp, ctype->mbdigits[cnt]->bytes,
937 ctype->mbdigits[cnt]->nbytes);
938 *cp++ = '\0';
939 }
a9c27b3e 940 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
4b10dd6c
UD
941 break;
942
943 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
944 /* Compute the length of all possible characters. For INDIGITS
945 there might be more than one. We simply concatenate all of
946 them with a NUL byte following. The NUL byte wouldn't be
947 necessary but it makes it easier for the user. */
948 cnt = elem - _NL_CTYPE_OUTDIGIT0_MB;
949 total = ctype->mboutdigits[cnt]->nbytes + 1;
950 iov[2 + elem + offset].iov_base = (char *) alloca (total);
951 iov[2 + elem + offset].iov_len = total;
952
953 *(char *) mempcpy (iov[2 + elem + offset].iov_base,
954 ctype->mbdigits[cnt]->bytes,
955 ctype->mbdigits[cnt]->nbytes) = '\0';
a9c27b3e 956 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
4b10dd6c
UD
957 break;
958
4a33c2f5 959 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
4b10dd6c
UD
960 total = ctype->wcdigits_act / 10;
961
962 iov[2 + elem + offset].iov_base =
963 (uint32_t *) alloca (total * sizeof (uint32_t));
964 iov[2 + elem + offset].iov_len = total * sizeof (uint32_t);
965
4a33c2f5 966 for (cnt = elem - _NL_CTYPE_INDIGITS0_WC;
4b10dd6c
UD
967 cnt < ctype->wcdigits_act; cnt += 10)
968 ((uint32_t *) iov[2 + elem + offset].iov_base)[cnt / 10]
4a33c2f5 969 = ctype->wcdigits[cnt];
a9c27b3e 970 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
4b10dd6c
UD
971 break;
972
4a33c2f5
UD
973 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
974 cnt = elem - _NL_CTYPE_OUTDIGIT0_WC;
4b10dd6c
UD
975 iov[2 + elem + offset].iov_base = &ctype->wcoutdigits[cnt];
976 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
a9c27b3e 977 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
4b10dd6c
UD
978 break;
979
19bc17a9
RM
980 default:
981 assert (! "unknown CTYPE element");
982 }
983 else
984 {
985 /* Handle extra maps. */
5491da0d 986 size_t nr = (elem - _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE)) + 2;
19bc17a9 987
49f2be5b 988 iov[2 + elem + offset].iov_base = ctype->map32[nr];
75cd5204 989 iov[2 + elem + offset].iov_len = ((ctype->plane_size
f1d8b804 990 * ctype->plane_cnt)
4b10dd6c 991 * sizeof (uint32_t));
19bc17a9 992
4b10dd6c 993 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
75cd5204 994 }
19bc17a9 995 }
19bc17a9 996
75cd5204
RM
997 assert (2 + elem + offset == (nelems + ctype->nr_charclass
998 + ctype->map_collection_nr + 2));
19bc17a9 999
75cd5204 1000 write_locale_data (output_path, "LC_CTYPE", 2 + elem + offset, iov);
19bc17a9
RM
1001}
1002
1003
4b10dd6c
UD
1004/* Local functions. */
1005static void
1006ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
1007 const char *name)
19bc17a9 1008{
4b10dd6c 1009 size_t cnt;
19bc17a9 1010
4b10dd6c
UD
1011 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1012 if (strcmp (ctype->classnames[cnt], name) == 0)
1013 break;
19bc17a9 1014
4b10dd6c
UD
1015 if (cnt < ctype->nr_charclass)
1016 {
1017 lr_error (lr, _("character class `%s' already defined"), name);
1018 return;
1019 }
19bc17a9 1020
4b10dd6c
UD
1021 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
1022 /* Exit code 2 is prescribed in P1003.2b. */
1023 error (2, 0, _("\
dbbbaf53 1024implementation limit: no more than %zd character classes allowed"),
4b10dd6c 1025 MAX_NR_CHARCLASS);
19bc17a9 1026
4b10dd6c 1027 ctype->classnames[ctype->nr_charclass++] = name;
19bc17a9
RM
1028}
1029
1030
4b10dd6c
UD
1031static void
1032ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1033 const char *name, struct charmap_t *charmap)
19bc17a9 1034{
4b10dd6c 1035 size_t max_chars = 0;
ba1ffaa1 1036 size_t cnt;
19bc17a9 1037
4b10dd6c 1038 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
19bc17a9 1039 {
4b10dd6c
UD
1040 if (strcmp (ctype->mapnames[cnt], name) == 0)
1041 break;
1042
1043 if (max_chars < ctype->map_collection_max[cnt])
1044 max_chars = ctype->map_collection_max[cnt];
19bc17a9
RM
1045 }
1046
4b10dd6c
UD
1047 if (cnt < ctype->map_collection_nr)
1048 {
1049 lr_error (lr, _("character map `%s' already defined"), name);
1050 return;
1051 }
19bc17a9 1052
4b10dd6c
UD
1053 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1054 /* Exit code 2 is prescribed in P1003.2b. */
1055 error (2, 0, _("\
1056implementation limit: no more than %d character maps allowed"),
1057 MAX_NR_CHARMAP);
19bc17a9 1058
4b10dd6c
UD
1059 ctype->mapnames[cnt] = name;
1060
1061 if (max_chars == 0)
1062 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1063 else
1064 ctype->map_collection_max[cnt] = max_chars;
1065
1066 ctype->map_collection[cnt] = (uint32_t *)
1067 xmalloc (sizeof (uint32_t) * ctype->map_collection_max[cnt]);
1068 memset (ctype->map_collection[cnt], '\0',
1069 sizeof (uint32_t) * ctype->map_collection_max[cnt]);
1070 ctype->map_collection_act[cnt] = 256;
19bc17a9 1071
4b10dd6c 1072 ++ctype->map_collection_nr;
19bc17a9
RM
1073}
1074
1075
4b10dd6c 1076/* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
42d7c593 1077 is possible if we only want to extend the name array. */
4b10dd6c
UD
1078static uint32_t *
1079find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1080 size_t *act, uint32_t idx)
19bc17a9 1081{
4b10dd6c 1082 size_t cnt;
19bc17a9 1083
4b10dd6c
UD
1084 if (idx < 256)
1085 return table == NULL ? NULL : &(*table)[idx];
19bc17a9 1086
4b10dd6c
UD
1087 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1088 if (ctype->charnames[cnt] == idx)
1089 break;
19bc17a9 1090
4b10dd6c
UD
1091 /* We have to distinguish two cases: the name is found or not. */
1092 if (cnt == ctype->charnames_act)
1093 {
1094 /* Extend the name array. */
1095 if (ctype->charnames_act == ctype->charnames_max)
1096 {
1097 ctype->charnames_max *= 2;
1098 ctype->charnames = (unsigned int *)
1099 xrealloc (ctype->charnames,
1100 sizeof (unsigned int) * ctype->charnames_max);
1101 }
1102 ctype->charnames[ctype->charnames_act++] = idx;
1103 }
19bc17a9 1104
4b10dd6c
UD
1105 if (table == NULL)
1106 /* We have done everything we are asked to do. */
1107 return NULL;
19bc17a9 1108
4b10dd6c
UD
1109 if (cnt >= *act)
1110 {
1111 if (cnt >= *max)
1112 {
1113 size_t old_max = *max;
1114 do
1115 *max *= 2;
1116 while (*max <= cnt);
19bc17a9 1117
4b10dd6c
UD
1118 *table =
1119 (uint32_t *) xrealloc (*table, *max * sizeof (unsigned long int));
1120 memset (&(*table)[old_max], '\0',
1121 (*max - old_max) * sizeof (uint32_t));
1122 }
19bc17a9 1123
76e680a8 1124 *act = cnt + 1;
4b10dd6c 1125 }
19bc17a9 1126
4b10dd6c 1127 return &(*table)[cnt];
19bc17a9
RM
1128}
1129
1130
4b10dd6c
UD
1131static int
1132get_character (struct token *now, struct charmap_t *charmap,
1133 struct repertoire_t *repertoire,
1134 struct charseq **seqp, uint32_t *wchp)
19bc17a9 1135{
4b10dd6c
UD
1136 if (now->tok == tok_bsymbol)
1137 {
1138 /* This will hopefully be the normal case. */
1139 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1140 now->val.str.lenmb);
1141 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1142 now->val.str.lenmb);
1143 }
1144 else if (now->tok == tok_ucs4)
1145 {
1146 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
19bc17a9 1147
4b10dd6c
UD
1148 if (*seqp == NULL)
1149 {
1150 /* Compute the value in the charmap from the UCS value. */
1151 const char *symbol = repertoire_find_symbol (repertoire,
1152 now->val.ucs4);
19bc17a9 1153
4b10dd6c
UD
1154 if (symbol == NULL)
1155 *seqp = NULL;
1156 else
1157 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
19bc17a9 1158
4b10dd6c
UD
1159 if (*seqp == NULL)
1160 {
1161 /* Insert a negative entry. */
1162 static const struct charseq negative
1163 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1164 uint32_t *newp = obstack_alloc (&repertoire->mem_pool, 4);
1165 *newp = now->val.ucs4;
1166
1167 insert_entry (&repertoire->seq_table, newp, 4,
1168 (void *) &negative);
1169 }
1170 else
1171 (*seqp)->ucs4 = now->val.ucs4;
1172 }
1173 else if ((*seqp)->ucs4 != now->val.ucs4)
1174 *seqp = NULL;
19bc17a9 1175
4b10dd6c
UD
1176 *wchp = now->val.ucs4;
1177 }
1178 else if (now->tok == tok_charcode)
1179 {
1180 /* We must map from the byte code to UCS4. */
1181 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1182 now->val.str.lenmb);
19bc17a9 1183
4b10dd6c
UD
1184 if (*seqp == NULL)
1185 *wchp = ILLEGAL_CHAR_VALUE;
1186 else
1187 {
1188 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1189 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1190 strlen ((*seqp)->name));
1191 *wchp = (*seqp)->ucs4;
1192 }
1193 }
1194 else
1195 return 1;
19bc17a9
RM
1196
1197 return 0;
1198}
1199
1200
4b10dd6c
UD
1201/* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>'. */
1202static void
1203charclass_symbolic_ellipsis (struct linereader *ldfile,
1204 struct locale_ctype_t *ctype,
1205 struct charmap_t *charmap,
1206 struct repertoire_t *repertoire,
1207 struct token *now,
1208 const char *last_str,
1209 unsigned long int class256_bit,
1210 unsigned long int class_bit, int base,
1211 int ignore_content, int handle_digits)
19bc17a9 1212{
4b10dd6c
UD
1213 const char *nowstr = now->val.str.startmb;
1214 char tmp[now->val.str.lenmb + 1];
1215 const char *cp;
1216 char *endp;
1217 unsigned long int from;
1218 unsigned long int to;
19bc17a9 1219
4b10dd6c
UD
1220 /* We have to compute the ellipsis values using the symbolic names. */
1221 assert (last_str != NULL);
1222
1223 if (strlen (last_str) != now->val.str.lenmb)
19bc17a9 1224 {
4b10dd6c
UD
1225 invalid_range:
1226 lr_error (ldfile,
549b3c3a 1227 _("`%s' and `%.*s' are no valid names for symbolic range"),
f6ada7ad 1228 last_str, (int) now->val.str.lenmb, nowstr);
4b10dd6c 1229 return;
19bc17a9
RM
1230 }
1231
4b10dd6c
UD
1232 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1233 /* Nothing to do, the names are the same. */
1234 return;
19bc17a9 1235
4b10dd6c
UD
1236 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
1237 ;
19bc17a9 1238
4b10dd6c
UD
1239 errno = 0;
1240 from = strtoul (cp, &endp, base);
1241 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1242 goto invalid_range;
19bc17a9 1243
4b10dd6c 1244 to = strtoul (nowstr + (cp - last_str), &endp, base);
549b3c3a
UD
1245 if ((to == UINT_MAX && errno == ERANGE)
1246 || (endp - nowstr) != now->val.str.lenmb || from >= to)
4b10dd6c 1247 goto invalid_range;
19bc17a9 1248
4b10dd6c
UD
1249 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1250 if (!ignore_content)
1251 {
1252 now->val.str.startmb = tmp;
1253 while (++from <= to)
1254 {
1255 struct charseq *seq;
1256 uint32_t wch;
19bc17a9 1257
4b10dd6c
UD
1258 sprintf (tmp, (base == 10 ? "%.*s%0*d" : "%.*s%0*X"), cp - last_str,
1259 last_str, now->val.str.lenmb - (cp - last_str), from);
19bc17a9 1260
4b10dd6c
UD
1261 get_character (now, charmap, repertoire, &seq, &wch);
1262
1263 if (seq != NULL && seq->nbytes == 1)
1264 /* Yep, we can store information about this byte sequence. */
1265 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
19bc17a9 1266
4b10dd6c
UD
1267 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1268 /* We have the UCS4 position. */
1269 *find_idx (ctype, &ctype->class_collection,
1270 &ctype->class_collection_max,
1271 &ctype->class_collection_act, wch) |= class_bit;
19bc17a9 1272
4b10dd6c
UD
1273 if (handle_digits == 1)
1274 {
1275 /* We must store the digit values. */
1276 if (ctype->mbdigits_act == ctype->mbdigits_max)
1277 {
1278 ctype->mbdigits_max *= 2;
1279 ctype->mbdigits = xrealloc (ctype->mbdigits,
1280 (ctype->mbdigits_max
1281 * sizeof (char *)));
1282 ctype->wcdigits_max *= 2;
1283 ctype->wcdigits = xrealloc (ctype->wcdigits,
1284 (ctype->wcdigits_max
1285 * sizeof (uint32_t)));
1286 }
1287
1288 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1289 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1290 }
1291 else if (handle_digits == 2)
1292 {
1293 /* We must store the digit values. */
1294 if (ctype->outdigits_act >= 10)
1295 {
1296 lr_error (ldfile, _("\
1297%s: field `%s' does not contain exactly ten entries"),
1298 "LC_CTYPE", "outdigit");
1299 return;
1300 }
1301
1302 ctype->mboutdigits[ctype->outdigits_act] = seq;
1303 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1304 ++ctype->outdigits_act;
1305 }
1306 }
1307 }
19bc17a9
RM
1308}
1309
1310
4b10dd6c
UD
1311/* Ellipsis like in `<U1234>..<U2345>'. */
1312static void
1313charclass_ucs4_ellipsis (struct linereader *ldfile,
1314 struct locale_ctype_t *ctype,
1315 struct charmap_t *charmap,
1316 struct repertoire_t *repertoire,
1317 struct token *now, uint32_t last_wch,
1318 unsigned long int class256_bit,
1319 unsigned long int class_bit, int ignore_content,
1320 int handle_digits)
19bc17a9 1321{
4b10dd6c 1322 if (last_wch > now->val.ucs4)
19bc17a9 1323 {
4b10dd6c
UD
1324 lr_error (ldfile, _("\
1325to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1326 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1327 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
19bc17a9
RM
1328 return;
1329 }
1330
4b10dd6c
UD
1331 if (!ignore_content)
1332 while (++last_wch <= now->val.ucs4)
1333 {
1334 /* We have to find out whether there is a byte sequence corresponding
1335 to this UCS4 value. */
1336 struct charseq *seq = repertoire_find_seq (repertoire, last_wch);
19bc17a9 1337
4b10dd6c
UD
1338 /* If this is the first time we look for this sequence create a new
1339 entry. */
1340 if (seq == NULL)
1341 {
1342 /* Find the symbolic name for this UCS4 value. */
1343 const char *symbol = repertoire_find_symbol (repertoire, last_wch);
1344 uint32_t *newp = obstack_alloc (&repertoire->mem_pool, 4);
1345 *newp = last_wch;
19bc17a9 1346
4b10dd6c
UD
1347 if (symbol != NULL)
1348 /* We have a name, now search the multibyte value. */
1349 seq = charmap_find_value (charmap, symbol, strlen (symbol));
19bc17a9 1350
4b10dd6c
UD
1351 if (seq == NULL)
1352 {
1353 /* We have to create a fake entry. */
1354 static const struct charseq negative
1355 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1356 seq = (struct charseq *) &negative;
1357 }
1358 else
1359 seq->ucs4 = last_wch;
1360
1361 insert_entry (&repertoire->seq_table, newp, 4, seq);
1362 }
1363
1364 /* We have a name, now search the multibyte value. */
1365 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1366 /* Yep, we can store information about this byte sequence. */
1367 ctype->class256_collection[(size_t) seq->bytes[0]]
1368 |= class256_bit;
1369
1370 /* And of course we have the UCS4 position. */
1371 if (class_bit != 0 && class_bit != 0)
1372 *find_idx (ctype, &ctype->class_collection,
1373 &ctype->class_collection_max,
1374 &ctype->class_collection_act, last_wch) |= class_bit;
1375
1376 if (handle_digits == 1)
1377 {
1378 /* We must store the digit values. */
1379 if (ctype->mbdigits_act == ctype->mbdigits_max)
1380 {
1381 ctype->mbdigits_max *= 2;
1382 ctype->mbdigits = xrealloc (ctype->mbdigits,
1383 (ctype->mbdigits_max
1384 * sizeof (char *)));
1385 ctype->wcdigits_max *= 2;
1386 ctype->wcdigits = xrealloc (ctype->wcdigits,
1387 (ctype->wcdigits_max
1388 * sizeof (uint32_t)));
1389 }
1390
1391 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1392 ? seq : NULL);
1393 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1394 }
1395 else if (handle_digits == 2)
1396 {
1397 /* We must store the digit values. */
1398 if (ctype->outdigits_act >= 10)
1399 {
1400 lr_error (ldfile, _("\
1401%s: field `%s' does not contain exactly ten entries"),
1402 "LC_CTYPE", "outdigit");
1403 return;
1404 }
19bc17a9 1405
4b10dd6c
UD
1406 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1407 ? seq : NULL);
1408 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1409 ++ctype->outdigits_act;
1410 }
1411 }
19bc17a9
RM
1412}
1413
1414
4b10dd6c 1415/* Ellipsis as in `/xea/x12.../xea/x34'. */
19bc17a9 1416static void
4b10dd6c
UD
1417charclass_charcode_ellipsis (struct linereader *ldfile,
1418 struct locale_ctype_t *ctype,
1419 struct charmap_t *charmap,
1420 struct repertoire_t *repertoire,
1421 struct token *now, char *last_charcode,
1422 uint32_t last_charcode_len,
1423 unsigned long int class256_bit,
1424 unsigned long int class_bit, int ignore_content,
1425 int handle_digits)
19bc17a9 1426{
4b10dd6c
UD
1427 /* First check whether the to-value is larger. */
1428 if (now->val.charcode.nbytes != last_charcode_len)
1429 {
1430 lr_error (ldfile, _("\
1431start end end character sequence of range must have the same length"));
1432 return;
1433 }
19bc17a9 1434
4b10dd6c 1435 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
19bc17a9 1436 {
4b10dd6c
UD
1437 lr_error (ldfile, _("\
1438to-value character sequence is smaller than from-value sequence"));
19bc17a9
RM
1439 return;
1440 }
1441
4b10dd6c
UD
1442 if (!ignore_content)
1443 {
1444 do
1445 {
1446 /* Increment the byte sequence value. */
1447 struct charseq *seq;
1448 uint32_t wch;
1449 int i;
1450
1451 for (i = last_charcode_len - 1; i >= 0; --i)
1452 if (++last_charcode[i] != 0)
1453 break;
1454
1455 if (last_charcode_len == 1)
1456 /* Of course we have the charcode value. */
1457 ctype->class256_collection[(size_t) last_charcode[0]]
1458 |= class256_bit;
1459
1460 /* Find the symbolic name. */
1461 seq = charmap_find_symbol (charmap, last_charcode,
1462 last_charcode_len);
1463 if (seq != NULL)
1464 {
1465 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1466 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1467 strlen (seq->name));
1468 wch = seq->ucs4;
1469
1470 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1471 *find_idx (ctype, &ctype->class_collection,
1472 &ctype->class_collection_max,
1473 &ctype->class_collection_act, wch) |= class_bit;
1474 }
1475 else
1476 wch = ILLEGAL_CHAR_VALUE;
19bc17a9 1477
4b10dd6c
UD
1478 if (handle_digits == 1)
1479 {
1480 /* We must store the digit values. */
1481 if (ctype->mbdigits_act == ctype->mbdigits_max)
1482 {
1483 ctype->mbdigits_max *= 2;
1484 ctype->mbdigits = xrealloc (ctype->mbdigits,
1485 (ctype->mbdigits_max
1486 * sizeof (char *)));
1487 ctype->wcdigits_max *= 2;
1488 ctype->wcdigits = xrealloc (ctype->wcdigits,
1489 (ctype->wcdigits_max
1490 * sizeof (uint32_t)));
1491 }
1492
1493 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1494 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1495 seq->nbytes = last_charcode_len;
1496
1497 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1498 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1499 }
1500 else if (handle_digits == 2)
1501 {
1502 struct charseq *seq;
1503 /* We must store the digit values. */
1504 if (ctype->outdigits_act >= 10)
1505 {
1506 lr_error (ldfile, _("\
1507%s: field `%s' does not contain exactly ten entries"),
1508 "LC_CTYPE", "outdigit");
1509 return;
1510 }
1511
1512 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1513 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1514 seq->nbytes = last_charcode_len;
1515
1516 ctype->mboutdigits[ctype->outdigits_act] = seq;
1517 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1518 ++ctype->outdigits_act;
1519 }
1520 }
1521 while (memcmp (last_charcode, now->val.charcode.bytes,
1522 last_charcode_len) != 0);
1523 }
19bc17a9
RM
1524}
1525
1526
4b10dd6c
UD
1527/* Read one transliteration entry. */
1528static uint32_t *
1529read_widestring (struct linereader *ldfile, struct token *now,
1530 struct charmap_t *charmap, struct repertoire_t *repertoire)
19bc17a9 1531{
4b10dd6c 1532 uint32_t *wstr;
19bc17a9 1533
4b10dd6c
UD
1534 if (now->tok == tok_default_missing)
1535 /* The special name "" will denote this case. */
1536 wstr = (uint32_t *) L"";
1537 else if (now->tok == tok_bsymbol)
19bc17a9 1538 {
4b10dd6c
UD
1539 /* Get the value from the repertoire. */
1540 wstr = xmalloc (2 * sizeof (uint32_t));
1541 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1542 now->val.str.lenmb);
1543 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1544 /* We cannot proceed, we don't know the UCS4 value. */
1545 return NULL;
1546
1547 wstr[1] = 0;
19bc17a9 1548 }
4b10dd6c 1549 else if (now->tok == tok_ucs4)
19bc17a9 1550 {
4b10dd6c
UD
1551 wstr = xmalloc (2 * sizeof (uint32_t));
1552 wstr[0] = now->val.ucs4;
1553 wstr[1] = 0;
1554 }
1555 else if (now->tok == tok_charcode)
1556 {
1557 /* Argh, we have to convert to the symbol name first and then to the
1558 UCS4 value. */
1559 struct charseq *seq = charmap_find_symbol (charmap,
1560 now->val.str.startmb,
1561 now->val.str.lenmb);
1562 if (seq == NULL)
1563 /* Cannot find the UCS4 value. */
1564 return NULL;
1565
1566 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1567 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1568 strlen (seq->name));
1569 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1570 /* We cannot proceed, we don't know the UCS4 value. */
1571 return NULL;
1572
1573 wstr = xmalloc (2 * sizeof (uint32_t));
1574 wstr[0] = seq->ucs4;
1575 wstr[1] = 0;
1576 }
1577 else if (now->tok == tok_string)
1578 {
1579 wstr = now->val.str.startwc;
1580 if (wstr[0] == 0)
1581 return NULL;
1582 }
1583 else
1584 {
1585 if (now->tok != tok_eol && now->tok != tok_eof)
1586 lr_ignore_rest (ldfile, 0);
1587 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1588 return (uint32_t *) -1l;
19bc17a9
RM
1589 }
1590
4b10dd6c
UD
1591 return wstr;
1592}
19bc17a9 1593
19bc17a9 1594
4b10dd6c
UD
1595static void
1596read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1597 struct token *now, struct charmap_t *charmap,
1598 struct repertoire_t *repertoire)
1599{
1600 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1601 struct translit_t *result;
1602 struct translit_to_t **top;
1603 struct obstack *ob = &ctype->mem_pool;
1604 int first;
1605 int ignore;
1606
1607 if (from_wstr == NULL)
1608 /* There is no valid from string. */
1609 return;
19bc17a9 1610
4b10dd6c
UD
1611 result = (struct translit_t *) obstack_alloc (ob,
1612 sizeof (struct translit_t));
1613 result->from = from_wstr;
1614 result->next = NULL;
1615 result->to = NULL;
1616 top = &result->to;
1617 first = 1;
1618 ignore = 0;
1619
1620 while (1)
1621 {
1622 uint32_t *to_wstr;
1623
1624 /* Next we have one or more transliterations. They are
1625 separated by semicolons. */
1626 now = lr_token (ldfile, charmap, repertoire);
1627
1628 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1629 {
1630 /* One string read. */
1631 const uint32_t zero = 0;
1632
1633 if (!ignore)
1634 {
1635 obstack_grow (ob, &zero, 4);
1636 to_wstr = obstack_finish (ob);
1637
1638 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1639 (*top)->str = to_wstr;
1640 (*top)->next = NULL;
1641 }
1642
1643 if (now->tok == tok_eol)
1644 {
1645 result->next = ctype->translit;
1646 ctype->translit = result;
1647 return;
1648 }
1649
1650 if (!ignore)
1651 top = &(*top)->next;
1652 ignore = 0;
1653 }
1654 else
1655 {
1656 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1657 if (to_wstr == (uint32_t *) -1l)
1658 {
1659 /* An error occurred. */
1660 obstack_free (ob, result);
1661 return;
1662 }
1663
1664 if (to_wstr == NULL)
1665 ignore = 1;
1666 else
1667 /* This value is usable. */
1668 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
19bc17a9 1669
4b10dd6c
UD
1670 first = 0;
1671 }
1672 }
19bc17a9
RM
1673}
1674
1675
4b10dd6c
UD
1676/* The parser for the LC_CTYPE section of the locale definition. */
1677void
1678ctype_read (struct linereader *ldfile, struct localedef_t *result,
1679 struct charmap_t *charmap, const char *repertoire_name,
1680 int ignore_content)
19bc17a9 1681{
4b10dd6c
UD
1682 struct repertoire_t *repertoire = NULL;
1683 struct locale_ctype_t *ctype;
1684 struct token *now;
1685 enum token_t nowtok;
19bc17a9 1686 size_t cnt;
4b10dd6c
UD
1687 struct charseq *last_seq;
1688 uint32_t last_wch = 0;
1689 enum token_t last_token;
1690 enum token_t ellipsis_token;
1691 char last_charcode[16];
1692 size_t last_charcode_len = 0;
1693 const char *last_str = NULL;
1694 int mapidx;
19bc17a9 1695
4b10dd6c
UD
1696 /* Get the repertoire we have to use. */
1697 if (repertoire_name != NULL)
1698 repertoire = repertoire_read (repertoire_name);
19bc17a9 1699
4b10dd6c
UD
1700 /* The rest of the line containing `LC_CTYPE' must be free. */
1701 lr_ignore_rest (ldfile, 1);
19bc17a9 1702
4b10dd6c
UD
1703
1704 do
19bc17a9 1705 {
4b10dd6c
UD
1706 now = lr_token (ldfile, charmap, NULL);
1707 nowtok = now->tok;
19bc17a9 1708 }
4b10dd6c 1709 while (nowtok == tok_eol);
19bc17a9 1710
4b10dd6c
UD
1711 /* If we see `copy' now we are almost done. */
1712 if (nowtok == tok_copy)
1713 {
b9eb05d6 1714 handle_copy (ldfile, charmap, repertoire, result, tok_lc_ctype, LC_CTYPE,
4b10dd6c
UD
1715 "LC_CTYPE", ignore_content);
1716 return;
1717 }
75cd5204 1718
4b10dd6c
UD
1719 /* Prepare the data structures. */
1720 ctype_startup (ldfile, result, charmap, ignore_content);
1721 ctype = result->categories[LC_CTYPE].ctype;
1722
1723 /* Remember the repertoire we use. */
1724 if (!ignore_content)
1725 ctype->repertoire = repertoire;
1726
1727 while (1)
19bc17a9 1728 {
4b10dd6c
UD
1729 unsigned long int class_bit = 0;
1730 unsigned long int class256_bit = 0;
1731 int handle_digits = 0;
1732
1733 /* Of course we don't proceed beyond the end of file. */
1734 if (nowtok == tok_eof)
1735 break;
1736
1737 /* Ignore empty lines. */
1738 if (nowtok == tok_eol)
19bc17a9 1739 {
4b10dd6c
UD
1740 now = lr_token (ldfile, charmap, NULL);
1741 nowtok = now->tok;
1742 continue;
1743 }
19bc17a9 1744
4b10dd6c
UD
1745 switch (nowtok)
1746 {
5491da0d
UD
1747 case tok_charclass:
1748 now = lr_token (ldfile, charmap, NULL);
1749 while (now->tok == tok_ident || now->tok == tok_string)
1750 {
1751 ctype_class_new (ldfile, ctype, now->val.str.startmb);
1752 now = lr_token (ldfile, charmap, NULL);
1753 if (now->tok != tok_semicolon)
1754 break;
1755 now = lr_token (ldfile, charmap, NULL);
1756 }
1757 if (now->tok != tok_eol)
1758 SYNTAX_ERROR (_("\
1759%s: syntax error in definition of new character class"), "LC_CTYPE");
1760 break;
1761
1762 case tok_charconv:
1763 now = lr_token (ldfile, charmap, NULL);
1764 while (now->tok == tok_ident || now->tok == tok_string)
1765 {
1766 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
1767 now = lr_token (ldfile, charmap, NULL);
1768 if (now->tok != tok_semicolon)
1769 break;
1770 now = lr_token (ldfile, charmap, NULL);
1771 }
1772 if (now->tok != tok_eol)
1773 SYNTAX_ERROR (_("\
1774%s: syntax error in definition of new character map"), "LC_CTYPE");
1775 break;
1776
4b10dd6c 1777 case tok_class:
b9eb05d6
UD
1778 /* Ignore the rest of the line if we don't need the input of
1779 this line. */
1780 if (ignore_content)
1781 {
1782 lr_ignore_rest (ldfile, 0);
1783 break;
1784 }
1785
4b10dd6c
UD
1786 /* We simply forget the `class' keyword and use the following
1787 operand to determine the bit. */
1788 now = lr_token (ldfile, charmap, NULL);
1789 if (now->tok == tok_ident || now->tok == tok_string)
1790 {
87372aa9 1791 /* Must can be one of the predefined class names. */
4b10dd6c
UD
1792 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1793 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
1794 break;
1795 if (cnt >= ctype->nr_charclass)
1796 {
011ebfab 1797#ifdef PREDEFINED_CLASSES
4b10dd6c
UD
1798 if (now->val.str.lenmb == 8
1799 && memcmp ("special1", now->val.str.startmb, 8) == 0)
1800 class_bit = _ISwspecial1;
1801 else if (now->val.str.lenmb == 8
1802 && memcmp ("special2", now->val.str.startmb, 8) == 0)
1803 class_bit = _ISwspecial2;
1804 else if (now->val.str.lenmb == 8
1805 && memcmp ("special3", now->val.str.startmb, 8) == 0)
1806 class_bit = _ISwspecial3;
1807 else
011ebfab 1808#endif
4b10dd6c 1809 {
87372aa9
UD
1810 /* OK, it's a new class. */
1811 ctype_class_new (ldfile, ctype, now->val.str.startmb);
4b10dd6c 1812
87372aa9 1813 class_bit = _ISwbit (ctype->nr_charclass - 1);
4b10dd6c
UD
1814 }
1815 }
1816 else
7f653277
UD
1817 {
1818 class_bit = _ISwbit (cnt);
4b10dd6c 1819
7f653277
UD
1820 free (now->val.str.startmb);
1821 }
4b10dd6c
UD
1822 }
1823 else if (now->tok == tok_digit)
1824 goto handle_tok_digit;
1825 else if (now->tok < tok_upper || now->tok > tok_blank)
1826 goto err_label;
1827 else
1828 {
1829 class_bit = BITw (now->tok);
1830 class256_bit = BIT (now->tok);
1831 }
1832
1833 /* The next character must be a semicolon. */
1834 now = lr_token (ldfile, charmap, NULL);
1835 if (now->tok != tok_semicolon)
1836 goto err_label;
1837 goto read_charclass;
1838
1839 case tok_upper:
1840 case tok_lower:
1841 case tok_alpha:
1842 case tok_alnum:
1843 case tok_space:
1844 case tok_cntrl:
1845 case tok_punct:
1846 case tok_graph:
1847 case tok_print:
1848 case tok_xdigit:
1849 case tok_blank:
b9eb05d6
UD
1850 /* Ignore the rest of the line if we don't need the input of
1851 this line. */
1852 if (ignore_content)
1853 {
1854 lr_ignore_rest (ldfile, 0);
1855 break;
1856 }
1857
4b10dd6c
UD
1858 class_bit = BITw (now->tok);
1859 class256_bit = BIT (now->tok);
1860 handle_digits = 0;
1861 read_charclass:
1862 ctype->class_done |= class_bit;
1863 last_token = tok_none;
1864 ellipsis_token = tok_none;
1865 now = lr_token (ldfile, charmap, NULL);
1866 while (now->tok != tok_eol && now->tok != tok_eof)
1867 {
1868 uint32_t wch;
1869 struct charseq *seq;
1870
1871 if (ellipsis_token == tok_none)
1872 {
1873 if (get_character (now, charmap, repertoire, &seq, &wch))
1874 goto err_label;
1875
1876 if (!ignore_content && seq != NULL && seq->nbytes == 1)
1877 /* Yep, we can store information about this byte
1878 sequence. */
1879 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1880
1881 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
1882 && class_bit != 0)
1883 /* We have the UCS4 position. */
1884 *find_idx (ctype, &ctype->class_collection,
1885 &ctype->class_collection_max,
1886 &ctype->class_collection_act, wch) |= class_bit;
1887
1888 last_token = now->tok;
549b3c3a 1889 /* Terminate the string. */
9e2b7438
UD
1890 if (last_token == tok_bsymbol)
1891 {
1892 now->val.str.startmb[now->val.str.lenmb] = '\0';
1893 last_str = now->val.str.startmb;
1894 }
1895 else
1896 last_str = NULL;
4b10dd6c
UD
1897 last_seq = seq;
1898 last_wch = wch;
1899 memcpy (last_charcode, now->val.charcode.bytes, 16);
1900 last_charcode_len = now->val.charcode.nbytes;
1901
1902 if (!ignore_content && handle_digits == 1)
1903 {
1904 /* We must store the digit values. */
1905 if (ctype->mbdigits_act == ctype->mbdigits_max)
1906 {
b9eb05d6 1907 ctype->mbdigits_max += 10;
4b10dd6c
UD
1908 ctype->mbdigits = xrealloc (ctype->mbdigits,
1909 (ctype->mbdigits_max
1910 * sizeof (char *)));
b9eb05d6 1911 ctype->wcdigits_max += 10;
4b10dd6c
UD
1912 ctype->wcdigits = xrealloc (ctype->wcdigits,
1913 (ctype->wcdigits_max
1914 * sizeof (uint32_t)));
1915 }
1916
1917 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1918 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1919 }
1920 else if (!ignore_content && handle_digits == 2)
1921 {
1922 /* We must store the digit values. */
1923 if (ctype->outdigits_act >= 10)
1924 {
1925 lr_error (ldfile, _("\
1926%s: field `%s' does not contain exactly ten entries"),
1927 "LC_CTYPE", "outdigit");
1928 goto err_label;
1929 }
1930
1931 ctype->mboutdigits[ctype->outdigits_act] = seq;
1932 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1933 ++ctype->outdigits_act;
1934 }
1935 }
1936 else
1937 {
1938 /* Now it gets complicated. We have to resolve the
1939 ellipsis problem. First we must distinguish between
1940 the different kind of ellipsis and this must match the
1941 tokens we have seen. */
1942 assert (last_token != tok_none);
1943
1944 if (last_token != now->tok)
1945 {
1946 lr_error (ldfile, _("\
1947ellipsis range must be marked by two operands of same type"));
1948 lr_ignore_rest (ldfile, 0);
1949 break;
1950 }
1951
1952 if (last_token == tok_bsymbol)
1953 {
1954 if (ellipsis_token == tok_ellipsis3)
1955 lr_error (ldfile, _("with symbolic name range values \
1956the absolute ellipsis `...' must not be used"));
1957
1958 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
1959 repertoire, now, last_str,
1960 class256_bit, class_bit,
1961 (ellipsis_token
1962 == tok_ellipsis4
1963 ? 10 : 16),
1964 ignore_content,
1965 handle_digits);
1966 }
1967 else if (last_token == tok_ucs4)
1968 {
1969 if (ellipsis_token != tok_ellipsis2)
1970 lr_error (ldfile, _("\
1971with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
1972
1973 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
1974 repertoire, now, last_wch,
1975 class256_bit, class_bit,
1976 ignore_content, handle_digits);
1977 }
1978 else
1979 {
1980 assert (last_token == tok_charcode);
1981
1982 if (ellipsis_token != tok_ellipsis3)
1983 lr_error (ldfile, _("\
1984with character code range values one must use the absolute ellipsis `...'"));
1985
1986 charclass_charcode_ellipsis (ldfile, ctype, charmap,
1987 repertoire, now,
1988 last_charcode,
1989 last_charcode_len,
1990 class256_bit, class_bit,
1991 ignore_content,
1992 handle_digits);
1993 }
1994
1995 /* Now we have used the last value. */
1996 last_token = tok_none;
1997 }
1998
1999 /* Next we expect a semicolon or the end of the line. */
2000 now = lr_token (ldfile, charmap, NULL);
2001 if (now->tok == tok_eol || now->tok == tok_eof)
2002 break;
2003
2004 if (last_token != tok_none
2005 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4)
2006 {
2007 ellipsis_token = now->tok;
2008 now = lr_token (ldfile, charmap, NULL);
2009 continue;
2010 }
2011
2012 if (now->tok != tok_semicolon)
2013 goto err_label;
2014
2015 /* And get the next character. */
2016 now = lr_token (ldfile, charmap, NULL);
2017
2018 ellipsis_token = tok_none;
2019 }
2020 break;
2021
2022 case tok_digit:
b9eb05d6
UD
2023 /* Ignore the rest of the line if we don't need the input of
2024 this line. */
2025 if (ignore_content)
42d7c593
UD
2026 {
2027 lr_ignore_rest (ldfile, 0);
2028 break;
2029 }
b9eb05d6 2030
4b10dd6c
UD
2031 handle_tok_digit:
2032 class_bit = _ISwdigit;
2033 class256_bit = _ISdigit;
2034 handle_digits = 1;
2035 goto read_charclass;
2036
2037 case tok_outdigit:
b9eb05d6
UD
2038 /* Ignore the rest of the line if we don't need the input of
2039 this line. */
2040 if (ignore_content)
2041 {
2042 lr_ignore_rest (ldfile, 0);
2043 break;
2044 }
2045
4b10dd6c
UD
2046 if (ctype->outdigits_act != 0)
2047 lr_error (ldfile, _("\
2048%s: field `%s' declared more than once"),
2049 "LC_CTYPE", "outdigit");
2050 class_bit = 0;
2051 class256_bit = 0;
2052 handle_digits = 2;
2053 goto read_charclass;
2054
2055 case tok_toupper:
b9eb05d6
UD
2056 /* Ignore the rest of the line if we don't need the input of
2057 this line. */
2058 if (ignore_content)
2059 {
2060 lr_ignore_rest (ldfile, 0);
2061 break;
2062 }
2063
4b10dd6c
UD
2064 mapidx = 0;
2065 goto read_mapping;
2066
2067 case tok_tolower:
b9eb05d6
UD
2068 /* Ignore the rest of the line if we don't need the input of
2069 this line. */
2070 if (ignore_content)
2071 {
2072 lr_ignore_rest (ldfile, 0);
2073 break;
2074 }
2075
4b10dd6c
UD
2076 mapidx = 1;
2077 goto read_mapping;
2078
2079 case tok_map:
b9eb05d6
UD
2080 /* Ignore the rest of the line if we don't need the input of
2081 this line. */
2082 if (ignore_content)
2083 {
2084 lr_ignore_rest (ldfile, 0);
2085 break;
2086 }
2087
4b10dd6c
UD
2088 /* We simply forget the `map' keyword and use the following
2089 operand to determine the mapping. */
2090 now = lr_token (ldfile, charmap, NULL);
2091 if (now->tok == tok_ident || now->tok == tok_string)
2092 {
2093 size_t cnt;
2094
2095 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2096 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2097 break;
2098
7f653277
UD
2099 if (cnt < ctype->map_collection_nr)
2100 free (now->val.str.startmb);
2101 else
87372aa9
UD
2102 /* OK, it's a new map. */
2103 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2104
2105 mapidx = cnt;
4b10dd6c
UD
2106 }
2107 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2108 goto err_label;
2109 else
2110 mapidx = now->tok - tok_toupper;
2111
2112 now = lr_token (ldfile, charmap, NULL);
2113 /* This better should be a semicolon. */
2114 if (now->tok != tok_semicolon)
2115 goto err_label;
2116
2117 read_mapping:
2118 /* Test whether this mapping was already defined. */
2119 if (ctype->tomap_done[mapidx])
2120 {
2121 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2122 ctype->mapnames[mapidx]);
2123 lr_ignore_rest (ldfile, 0);
2124 break;
2125 }
2126 ctype->tomap_done[mapidx] = 1;
2127
2128 now = lr_token (ldfile, charmap, NULL);
2129 while (now->tok != tok_eol && now->tok != tok_eof)
2130 {
2131 struct charseq *from_seq;
2132 uint32_t from_wch;
2133 struct charseq *to_seq;
2134 uint32_t to_wch;
2135
2136 /* Every pair starts with an opening brace. */
2137 if (now->tok != tok_open_brace)
2138 goto err_label;
2139
2140 /* Next comes the from-value. */
2141 now = lr_token (ldfile, charmap, NULL);
2142 if (get_character (now, charmap, repertoire, &from_seq,
2143 &from_wch) != 0)
2144 goto err_label;
2145
2146 /* The next is a comma. */
2147 now = lr_token (ldfile, charmap, NULL);
2148 if (now->tok != tok_comma)
2149 goto err_label;
2150
2151 /* And the other value. */
2152 now = lr_token (ldfile, charmap, NULL);
2153 if (get_character (now, charmap, repertoire, &to_seq,
2154 &to_wch) != 0)
2155 goto err_label;
2156
2157 /* And the last thing is the closing brace. */
2158 now = lr_token (ldfile, charmap, NULL);
2159 if (now->tok != tok_close_brace)
2160 goto err_label;
2161
2162 if (!ignore_content)
2163 {
2164 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2165 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2166 /* We can use this value. */
2167 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2168 = to_seq->bytes[0];
2169
2170 if (from_wch != ILLEGAL_CHAR_VALUE
2171 && to_wch != ILLEGAL_CHAR_VALUE)
2172 /* Both correct values. */
2173 *find_idx (ctype, &ctype->map_collection[mapidx],
2174 &ctype->map_collection_max[mapidx],
2175 &ctype->map_collection_act[mapidx],
2176 from_wch) = to_wch;
2177 }
2178
2179 /* Now comes a semicolon or the end of the line/file. */
2180 now = lr_token (ldfile, charmap, NULL);
2181 if (now->tok == tok_semicolon)
2182 now = lr_token (ldfile, charmap, NULL);
2183 }
2184 break;
2185
2186 case tok_translit_start:
b9eb05d6
UD
2187 /* Ignore the rest of the line if we don't need the input of
2188 this line. */
2189 if (ignore_content)
2190 {
2191 lr_ignore_rest (ldfile, 0);
2192 break;
2193 }
2194
4b10dd6c
UD
2195 /* The rest of the line better should be empty. */
2196 lr_ignore_rest (ldfile, 1);
2197
2198 /* We count here the number of allocated entries in the `translit'
2199 array. */
2200 cnt = 0;
2201
2202 /* We proceed until we see the `translit_end' token. */
2203 while (now = lr_token (ldfile, charmap, repertoire),
2204 now->tok != tok_translit_end && now->tok != tok_eof)
2205 {
2206 if (now->tok == tok_eol)
2207 /* Ignore empty lines. */
2208 continue;
2209
2210 if (now->tok == tok_translit_end)
2211 {
2212 lr_ignore_rest (ldfile, 0);
2213 break;
2214 }
2215
2216 if (now->tok == tok_include)
2217 {
2218 /* We have to include locale. */
2219 const char *locale_name;
2220 const char *repertoire_name;
2221
2222 now = lr_token (ldfile, charmap, NULL);
2223 /* This should be a string or an identifier. In any
2224 case something to name a locale. */
2225 if (now->tok != tok_string && now->tok != tok_ident)
2226 {
2227 translit_syntax:
2228 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2229 lr_ignore_rest (ldfile, 0);
2230 continue;
2231 }
2232 locale_name = now->val.str.startmb;
2233
2234 /* Next should be a semicolon. */
2235 now = lr_token (ldfile, charmap, NULL);
2236 if (now->tok != tok_semicolon)
2237 goto translit_syntax;
2238
2239 /* Now the repertoire name. */
2240 now = lr_token (ldfile, charmap, NULL);
2241 if ((now->tok != tok_string && now->tok != tok_ident)
2242 || now->val.str.startmb == NULL)
2243 goto translit_syntax;
2244 repertoire_name = now->val.str.startmb;
2245
2246 /* We must not have more than one `include'. */
2247 if (ctype->translit_copy_locale != NULL)
2248 {
2249 lr_error (ldfile, _("\
2250%s: only one `include' instruction allowed"), "LC_CTYPE");
2251 lr_ignore_rest (ldfile, 0);
2252 continue;
2253 }
2254
2255 ctype->translit_copy_locale = locale_name;
2256 ctype->translit_copy_repertoire = repertoire_name;
2257
2258 /* The rest of the line must be empty. */
2259 lr_ignore_rest (ldfile, 1);
2260 continue;
2261 }
2262
2263 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2264 }
2265 break;
2266
2267 case tok_ident:
b9eb05d6
UD
2268 /* Ignore the rest of the line if we don't need the input of
2269 this line. */
2270 if (ignore_content)
2271 {
2272 lr_ignore_rest (ldfile, 0);
2273 break;
2274 }
2275
4b10dd6c
UD
2276 /* This could mean one of several things. First test whether
2277 it's a character class name. */
2278 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2279 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2280 break;
2281 if (cnt < ctype->nr_charclass)
2282 {
2283 class_bit = _ISwbit (cnt);
2284 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2285 free (now->val.str.startmb);
2286 goto read_charclass;
2287 }
5491da0d
UD
2288 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2289 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2290 break;
2291 if (cnt < ctype->map_collection_nr)
2292 {
2293 mapidx = cnt;
2294 free (now->val.str.startmb);
2295 goto read_mapping;
2296 }
011ebfab 2297#ifdef PREDEFINED_CLASSES
4b10dd6c
UD
2298 if (strcmp (now->val.str.startmb, "special1") == 0)
2299 {
2300 class_bit = _ISwspecial1;
2301 free (now->val.str.startmb);
2302 goto read_charclass;
2303 }
2304 if (strcmp (now->val.str.startmb, "special2") == 0)
2305 {
2306 class_bit = _ISwspecial2;
2307 free (now->val.str.startmb);
2308 goto read_charclass;
2309 }
2310 if (strcmp (now->val.str.startmb, "special3") == 0)
2311 {
2312 class_bit = _ISwspecial3;
2313 free (now->val.str.startmb);
2314 goto read_charclass;
2315 }
2316 if (strcmp (now->val.str.startmb, "tosymmetric") == 0)
2317 {
2318 mapidx = 2;
2319 goto read_mapping;
2320 }
011ebfab 2321#endif
4b10dd6c
UD
2322 break;
2323
2324 case tok_end:
2325 /* Next we assume `LC_CTYPE'. */
2326 now = lr_token (ldfile, charmap, NULL);
2327 if (now->tok == tok_eof)
2328 break;
2329 if (now->tok == tok_eol)
2330 lr_error (ldfile, _("%s: incomplete `END' line"),
2331 "LC_CTYPE");
2332 else if (now->tok != tok_lc_ctype)
2333 lr_error (ldfile, _("\
2334%1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2335 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2336 return;
2337
2338 default:
2339 err_label:
2340 if (now->tok != tok_eof)
2341 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
19bc17a9
RM
2342 }
2343
4b10dd6c
UD
2344 /* Prepare for the next round. */
2345 now = lr_token (ldfile, charmap, NULL);
2346 nowtok = now->tok;
19bc17a9
RM
2347 }
2348
4b10dd6c
UD
2349 /* When we come here we reached the end of the file. */
2350 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
19bc17a9
RM
2351}
2352
2353
2354static void
4b10dd6c
UD
2355set_class_defaults (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2356 struct repertoire_t *repertoire)
19bc17a9 2357{
4b10dd6c
UD
2358 size_t cnt;
2359
19bc17a9
RM
2360 /* These function defines the default values for the classes and conversions
2361 according to POSIX.2 2.5.2.1.
2362 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2363 Don't move them unless you know what you do! */
2364
4b10dd6c 2365 void set_default (int bitpos, int from, int to)
19bc17a9
RM
2366 {
2367 char tmp[2];
2368 int ch;
4b10dd6c
UD
2369 int bit = _ISbit (bitpos);
2370 int bitw = _ISwbit (bitpos);
19bc17a9
RM
2371 /* Define string. */
2372 strcpy (tmp, "?");
2373
2374 for (ch = from; ch <= to; ++ch)
2375 {
4b10dd6c
UD
2376 uint32_t value;
2377 struct charseq *seq;
19bc17a9
RM
2378 tmp[0] = ch;
2379
4b10dd6c
UD
2380 value = repertoire_find_value (repertoire, tmp, 1);
2381 if (value == ILLEGAL_CHAR_VALUE)
19bc17a9 2382 {
880f421f
UD
2383 if (!be_quiet)
2384 error (0, 0, _("\
4b10dd6c
UD
2385%s: character `%s' not defined in repertoire while needed as default value"),
2386 "LC_CTYPE", tmp);
2387 }
2388 else
2389 ELEM (ctype, class_collection, , value) |= bitw;
2390
2391 seq = charmap_find_value (charmap, tmp, 1);
2392 if (seq == NULL)
2393 {
2394 if (!be_quiet)
2395 error (0, 0, _("\
2396%s: character `%s' not defined in charmap while needed as default value"),
2397 "LC_CTYPE", tmp);
19bc17a9 2398 }
4b10dd6c
UD
2399 else if (seq->nbytes != 1)
2400 error (0, 0, _("\
2401%s: character `%s' in charmap not representable with one byte"),
2402 "LC_CTYPE", tmp);
19bc17a9 2403 else
4b10dd6c 2404 ctype->class256_collection[seq->bytes[0]] |= bit;
19bc17a9
RM
2405 }
2406 }
2407
2408 /* Set default values if keyword was not present. */
4b10dd6c 2409 if ((ctype->class_done & BITw (tok_upper)) == 0)
19bc17a9
RM
2410 /* "If this keyword [lower] is not specified, the lowercase letters
2411 `A' through `Z', ..., shall automatically belong to this class,
2412 with implementation defined character values." [P1003.2, 2.5.2.1] */
4b10dd6c 2413 set_default (BITPOS (tok_upper), 'A', 'Z');
19bc17a9 2414
4b10dd6c 2415 if ((ctype->class_done & BITw (tok_lower)) == 0)
19bc17a9
RM
2416 /* "If this keyword [lower] is not specified, the lowercase letters
2417 `a' through `z', ..., shall automatically belong to this class,
2418 with implementation defined character values." [P1003.2, 2.5.2.1] */
4b10dd6c 2419 set_default (BITPOS (tok_lower), 'a', 'z');
19bc17a9 2420
4b10dd6c 2421 if ((ctype->class_done & BITw (tok_alpha)) == 0)
19bc17a9
RM
2422 {
2423 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2424 class `lower' *must* be in class `alpha'. */
2425 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
96f0d1f5
UD
2426 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
2427
2428 for (cnt = 0; cnt < 256; ++cnt)
2429 if ((ctype->class256_collection[cnt] & mask) != 0)
2430 ctype->class256_collection[cnt] |= BIT (tok_alpha);
19bc17a9
RM
2431
2432 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
96f0d1f5
UD
2433 if ((ctype->class_collection[cnt] & maskw) != 0)
2434 ctype->class_collection[cnt] |= BITw (tok_alpha);
19bc17a9
RM
2435 }
2436
4b10dd6c 2437 if ((ctype->class_done & BITw (tok_digit)) == 0)
19bc17a9
RM
2438 /* "If this keyword [digit] is not specified, the digits `0' through
2439 `9', ..., shall automatically belong to this class, with
2440 implementation-defined character values." [P1003.2, 2.5.2.1] */
4b10dd6c 2441 set_default (BITPOS (tok_digit), '0', '9');
19bc17a9
RM
2442
2443 /* "Only characters specified for the `alpha' and `digit' keyword
2444 shall be specified. Characters specified for the keyword `alpha'
2445 and `digit' are automatically included in this class. */
2446 {
2447 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
96f0d1f5
UD
2448 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
2449
2450 for (cnt = 0; cnt < 256; ++cnt)
2451 if ((ctype->class256_collection[cnt] & mask) != 0)
2452 ctype->class256_collection[cnt] |= BIT (tok_alnum);
19bc17a9
RM
2453
2454 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
96f0d1f5
UD
2455 if ((ctype->class_collection[cnt] & maskw) != 0)
2456 ctype->class_collection[cnt] |= BITw (tok_alnum);
19bc17a9
RM
2457 }
2458
4b10dd6c 2459 if ((ctype->class_done & BITw (tok_space)) == 0)
19bc17a9
RM
2460 /* "If this keyword [space] is not specified, the characters <space>,
2461 <form-feed>, <newline>, <carriage-return>, <tab>, and
2462 <vertical-tab>, ..., shall automatically belong to this class,
2463 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2464 {
4b10dd6c
UD
2465 uint32_t value;
2466 struct charseq *seq;
19bc17a9 2467
4b10dd6c
UD
2468 value = repertoire_find_value (repertoire, "space", 5);
2469 if (value == ILLEGAL_CHAR_VALUE)
880f421f
UD
2470 {
2471 if (!be_quiet)
2472 error (0, 0, _("\
4b10dd6c
UD
2473%s: character `%s' not defined while needed as default value"),
2474 "LC_CTYPE", "<space>");
880f421f 2475 }
19bc17a9
RM
2476 else
2477 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2478
4b10dd6c
UD
2479 seq = charmap_find_value (charmap, "space", 5);
2480 if (seq == NULL)
880f421f
UD
2481 {
2482 if (!be_quiet)
2483 error (0, 0, _("\
4b10dd6c
UD
2484%s: character `%s' not defined while needed as default value"),
2485 "LC_CTYPE", "<space>");
2486 }
2487 else if (seq->nbytes != 1)
2488 error (0, 0, _("\
2489%s: character `%s' in charmap not representable with one byte"),
2490 "LC_CTYPE", "<space>");
2491 else
2492 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2493
2494
2495 value = repertoire_find_value (repertoire, "form-feed", 9);
2496 if (value == ILLEGAL_CHAR_VALUE)
2497 {
2498 if (!be_quiet)
2499 error (0, 0, _("\
2500%s: character `%s' not defined while needed as default value"),
2501 "LC_CTYPE", "<form-feed>");
880f421f 2502 }
19bc17a9
RM
2503 else
2504 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2505
4b10dd6c
UD
2506 seq = charmap_find_value (charmap, "form-feed", 9);
2507 if (seq == NULL)
880f421f
UD
2508 {
2509 if (!be_quiet)
2510 error (0, 0, _("\
4b10dd6c
UD
2511%s: character `%s' not defined while needed as default value"),
2512 "LC_CTYPE", "<form-feed>");
2513 }
2514 else if (seq->nbytes != 1)
2515 error (0, 0, _("\
2516%s: character `%s' in charmap not representable with one byte"),
2517 "LC_CTYPE", "<form-feed>");
2518 else
2519 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2520
2521
2522 value = repertoire_find_value (repertoire, "newline", 7);
2523 if (value == ILLEGAL_CHAR_VALUE)
2524 {
2525 if (!be_quiet)
2526 error (0, 0, _("\
2527%s: character `%s' not defined while needed as default value"),
2528 "LC_CTYPE", "<newline>");
880f421f 2529 }
19bc17a9
RM
2530 else
2531 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2532
4b10dd6c
UD
2533 seq = charmap_find_value (charmap, "newline", 7);
2534 if (seq == NULL)
880f421f
UD
2535 {
2536 if (!be_quiet)
2537 error (0, 0, _("\
19bc17a9 2538character `%s' not defined while needed as default value"),
4b10dd6c
UD
2539 "<newline>");
2540 }
2541 else if (seq->nbytes != 1)
2542 error (0, 0, _("\
2543%s: character `%s' in charmap not representable with one byte"),
2544 "LC_CTYPE", "<newline>");
2545 else
2546 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2547
2548
2549 value = repertoire_find_value (repertoire, "carriage-return", 15);
2550 if (value == ILLEGAL_CHAR_VALUE)
2551 {
2552 if (!be_quiet)
2553 error (0, 0, _("\
2554%s: character `%s' not defined while needed as default value"),
2555 "LC_CTYPE", "<carriage-return>");
880f421f 2556 }
19bc17a9
RM
2557 else
2558 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2559
4b10dd6c
UD
2560 seq = charmap_find_value (charmap, "carriage-return", 15);
2561 if (seq == NULL)
880f421f
UD
2562 {
2563 if (!be_quiet)
2564 error (0, 0, _("\
4b10dd6c
UD
2565%s: character `%s' not defined while needed as default value"),
2566 "LC_CTYPE", "<carriage-return>");
2567 }
2568 else if (seq->nbytes != 1)
2569 error (0, 0, _("\
2570%s: character `%s' in charmap not representable with one byte"),
2571 "LC_CTYPE", "<carriage-return>");
2572 else
2573 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2574
2575
2576 value = repertoire_find_value (repertoire, "tab", 3);
2577 if (value == ILLEGAL_CHAR_VALUE)
2578 {
2579 if (!be_quiet)
2580 error (0, 0, _("\
2581%s: character `%s' not defined while needed as default value"),
2582 "LC_CTYPE", "<tab>");
880f421f 2583 }
19bc17a9
RM
2584 else
2585 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
2586
4b10dd6c
UD
2587 seq = charmap_find_value (charmap, "tab", 3);
2588 if (seq == NULL)
880f421f
UD
2589 {
2590 if (!be_quiet)
2591 error (0, 0, _("\
4b10dd6c
UD
2592%s: character `%s' not defined while needed as default value"),
2593 "LC_CTYPE", "<tab>");
2594 }
2595 else if (seq->nbytes != 1)
2596 error (0, 0, _("\
2597%s: character `%s' in charmap not representable with one byte"),
2598 "LC_CTYPE", "<tab>");
2599 else
2600 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
2601
2602
2603 value = repertoire_find_value (repertoire, "vertical-tab", 12);
2604 if (value == ILLEGAL_CHAR_VALUE)
2605 {
2606 if (!be_quiet)
2607 error (0, 0, _("\
2608%s: character `%s' not defined while needed as default value"),
2609 "LC_CTYPE", "<vertical-tab>");
880f421f 2610 }
19bc17a9
RM
2611 else
2612 ELEM (ctype, class_collection, , value) |= BIT (tok_space);
4b10dd6c
UD
2613
2614 seq = charmap_find_value (charmap, "vertical-tab", 12);
2615 if (seq == NULL)
2616 {
2617 if (!be_quiet)
2618 error (0, 0, _("\
2619%s: character `%s' not defined while needed as default value"),
2620 "LC_CTYPE", "<vertical-tab>");
2621 }
2622 else if (seq->nbytes != 1)
2623 error (0, 0, _("\
2624%s: character `%s' in charmap not representable with one byte"),
2625 "LC_CTYPE", "<vertical-tab>");
2626 else
2627 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
19bc17a9
RM
2628 }
2629
4b10dd6c 2630 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
19bc17a9
RM
2631 /* "If this keyword is not specified, the digits `0' to `9', the
2632 uppercase letters `A' through `F', and the lowercase letters `a'
2633 through `f', ..., shell automatically belong to this class, with
2634 implementation defined character values." [P1003.2, 2.5.2.1] */
2635 {
4b10dd6c
UD
2636 set_default (BITPOS (tok_xdigit), '0', '9');
2637 set_default (BITPOS (tok_xdigit), 'A', 'F');
2638 set_default (BITPOS (tok_xdigit), 'a', 'f');
19bc17a9
RM
2639 }
2640
4b10dd6c 2641 if ((ctype->class_done & BITw (tok_blank)) == 0)
19bc17a9
RM
2642 /* "If this keyword [blank] is unspecified, the characters <space> and
2643 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
2644 {
4b10dd6c
UD
2645 uint32_t value;
2646 struct charseq *seq;
19bc17a9 2647
4b10dd6c
UD
2648 value = repertoire_find_value (repertoire, "space", 5);
2649 if (value == ILLEGAL_CHAR_VALUE)
880f421f
UD
2650 {
2651 if (!be_quiet)
2652 error (0, 0, _("\
4b10dd6c
UD
2653%s: character `%s' not defined while needed as default value"),
2654 "LC_CTYPE", "<space>");
880f421f 2655 }
19bc17a9
RM
2656 else
2657 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
2658
4b10dd6c
UD
2659 seq = charmap_find_value (charmap, "space", 5);
2660 if (seq == NULL)
880f421f
UD
2661 {
2662 if (!be_quiet)
2663 error (0, 0, _("\
4b10dd6c
UD
2664%s: character `%s' not defined while needed as default value"),
2665 "LC_CTYPE", "<space>");
2666 }
2667 else if (seq->nbytes != 1)
2668 error (0, 0, _("\
2669%s: character `%s' in charmap not representable with one byte"),
2670 "LC_CTYPE", "<space>");
2671 else
2672 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
2673
2674
2675 value = repertoire_find_value (repertoire, "tab", 3);
2676 if (value == ILLEGAL_CHAR_VALUE)
2677 {
2678 if (!be_quiet)
2679 error (0, 0, _("\
2680%s: character `%s' not defined while needed as default value"),
2681 "LC_CTYPE", "<tab>");
880f421f 2682 }
19bc17a9
RM
2683 else
2684 ELEM (ctype, class_collection, , value) |= BIT (tok_blank);
4b10dd6c
UD
2685
2686 seq = charmap_find_value (charmap, "tab", 3);
2687 if (seq == NULL)
2688 {
2689 if (!be_quiet)
2690 error (0, 0, _("\
2691%s: character `%s' not defined while needed as default value"),
2692 "LC_CTYPE", "<tab>");
2693 }
2694 else if (seq->nbytes != 1)
2695 error (0, 0, _("\
2696%s: character `%s' in charmap not representable with one byte"),
2697 "LC_CTYPE", "<tab>");
2698 else
2699 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
19bc17a9
RM
2700 }
2701
4b10dd6c 2702 if ((ctype->class_done & BITw (tok_graph)) == 0)
19bc17a9
RM
2703 /* "If this keyword [graph] is not specified, characters specified for
2704 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
2705 shall belong to this character class." [P1003.2, 2.5.2.1] */
2706 {
2707 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2708 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2709 size_t cnt;
2710
2711 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2712 if ((ctype->class_collection[cnt] & mask) != 0)
2713 ctype->class_collection[cnt] |= BIT (tok_graph);
4b10dd6c
UD
2714
2715 for (cnt = 0; cnt < 256; ++cnt)
2716 if ((ctype->class256_collection[cnt] & mask) != 0)
2717 ctype->class256_collection[cnt] |= BIT (tok_graph);
19bc17a9
RM
2718 }
2719
4b10dd6c 2720 if ((ctype->class_done & BITw (tok_print)) == 0)
19bc17a9
RM
2721 /* "If this keyword [print] is not provided, characters specified for
2722 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
2723 and the <space> character shall belong to this character class."
2724 [P1003.2, 2.5.2.1] */
2725 {
2726 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
2727 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
2728 size_t cnt;
4b10dd6c
UD
2729 uint32_t space;
2730 struct charseq *seq;
19bc17a9
RM
2731
2732 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2733 if ((ctype->class_collection[cnt] & mask) != 0)
2734 ctype->class_collection[cnt] |= BIT (tok_print);
2735
4b10dd6c
UD
2736 for (cnt = 0; cnt < 256; ++cnt)
2737 if ((ctype->class256_collection[cnt] & mask) != 0)
2738 ctype->class256_collection[cnt] |= BIT (tok_print);
2739
2740
2741 space = repertoire_find_value (repertoire, "space", 5);
880f421f
UD
2742 if (space == ILLEGAL_CHAR_VALUE)
2743 {
2744 if (!be_quiet)
2745 error (0, 0, _("\
4b10dd6c
UD
2746%s: character `%s' not defined while needed as default value"),
2747 "LC_CTYPE", "<space>");
880f421f 2748 }
19bc17a9
RM
2749 else
2750 ELEM (ctype, class_collection, , space) |= BIT (tok_print);
4b10dd6c
UD
2751
2752 seq = charmap_find_value (charmap, "space", 5);
2753 if (seq == NULL)
2754 {
2755 if (!be_quiet)
2756 error (0, 0, _("\
2757%s: character `%s' not defined while needed as default value"),
2758 "LC_CTYPE", "<space>");
2759 }
2760 else if (seq->nbytes != 1)
2761 error (0, 0, _("\
2762%s: character `%s' in charmap not representable with one byte"),
2763 "LC_CTYPE", "<space>");
2764 else
2765 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_print);
19bc17a9
RM
2766 }
2767
4b10dd6c 2768 if (ctype->tomap_done[0] == 0)
6d52618b 2769 /* "If this keyword [toupper] is not specified, the lowercase letters
19bc17a9
RM
2770 `a' through `z', and their corresponding uppercase letters `A' to
2771 `Z', ..., shall automatically be included, with implementation-
2772 defined character values." [P1003.2, 2.5.2.1] */
2773 {
2774 char tmp[4];
2775 int ch;
2776
2777 strcpy (tmp, "<?>");
2778
2779 for (ch = 'a'; ch <= 'z'; ++ch)
2780 {
4b10dd6c
UD
2781 uint32_t value_from, value_to;
2782 struct charseq *seq_from, *seq_to;
19bc17a9
RM
2783
2784 tmp[1] = (char) ch;
2785
4b10dd6c
UD
2786 value_from = repertoire_find_value (repertoire, &tmp[1], 1);
2787 if (value_from == ILLEGAL_CHAR_VALUE)
19bc17a9 2788 {
880f421f
UD
2789 if (!be_quiet)
2790 error (0, 0, _("\
4b10dd6c
UD
2791%s: character `%s' not defined while needed as default value"),
2792 "LC_CTYPE", tmp);
2793 }
2794 else
2795 {
2796 /* This conversion is implementation defined. */
2797 tmp[1] = (char) (ch + ('A' - 'a'));
2798 value_to = repertoire_find_value (repertoire, &tmp[1], 1);
2799 if (value_to == ILLEGAL_CHAR_VALUE)
2800 {
2801 if (!be_quiet)
2802 error (0, 0, _("\
2803%s: character `%s' not defined while needed as default value"),
2804 "LC_CTYPE", tmp);
2805 }
2806 else
2807 /* The index [0] is determined by the order of the
2808 `ctype_map_newP' calls in `ctype_startup'. */
2809 ELEM (ctype, map_collection, [0], value_from) = value_to;
19bc17a9
RM
2810 }
2811
4b10dd6c
UD
2812 seq_from = charmap_find_value (charmap, &tmp[1], 1);
2813 if (seq_from == NULL)
19bc17a9 2814 {
880f421f
UD
2815 if (!be_quiet)
2816 error (0, 0, _("\
4b10dd6c
UD
2817%s: character `%s' not defined while needed as default value"),
2818 "LC_CTYPE", tmp);
2819 }
2820 else if (seq_from->nbytes != 1)
2821 {
2822 if (!be_quiet)
2823 error (0, 0, _("\
2824%s: character `%s' needed as default value not representable with one byte"),
2825 "LC_CTYPE", tmp);
2826 }
2827 else
2828 {
2829 /* This conversion is implementation defined. */
2830 tmp[1] = (char) (ch + ('A' - 'a'));
2831 seq_to = charmap_find_value (charmap, &tmp[1], 1);
2832 if (seq_to == NULL)
2833 {
2834 if (!be_quiet)
2835 error (0, 0, _("\
2836%s: character `%s' not defined while needed as default value"),
2837 "LC_CTYPE", tmp);
2838 }
2839 else if (seq_to->nbytes != 1)
2840 {
2841 if (!be_quiet)
2842 error (0, 0, _("\
2843%s: character `%s' needed as default value not representable with one byte"),
2844 "LC_CTYPE", tmp);
2845 }
2846 else
2847 /* The index [0] is determined by the order of the
2848 `ctype_map_newP' calls in `ctype_startup'. */
2849 ctype->map256_collection[0][seq_from->bytes[0]]
2850 = seq_to->bytes[0];
19bc17a9 2851 }
19bc17a9
RM
2852 }
2853 }
2854
4b10dd6c 2855 if (ctype->tomap_done[1] == 0)
19bc17a9
RM
2856 /* "If this keyword [tolower] is not specified, the mapping shall be
2857 the reverse mapping of the one specified to `toupper'." [P1003.2] */
2858 {
19bc17a9
RM
2859 for (cnt = 0; cnt < ctype->map_collection_act[0]; ++cnt)
2860 if (ctype->map_collection[0][cnt] != 0)
2861 ELEM (ctype, map_collection, [1],
2862 ctype->map_collection[0][cnt])
2863 = ctype->charnames[cnt];
4b10dd6c
UD
2864
2865 for (cnt = 0; cnt < 256; ++cnt)
2866 if (ctype->map256_collection[0][cnt] != 0)
85cb60ff 2867 ctype->map256_collection[1][ctype->map256_collection[0][cnt]] = cnt;
4b10dd6c
UD
2868 }
2869
2870 if (ctype->outdigits_act == 0)
2871 {
2872 for (cnt = 0; cnt < 10; ++cnt)
2873 {
2874 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
2875 digits + cnt, 1);
2876
2877 if (ctype->mboutdigits[cnt] == NULL)
2878 {
2879 ctype->mboutdigits[cnt] = charmap_find_symbol (charmap,
2880 longnames[cnt],
2881 strlen (longnames[cnt]));
2882
2883 if (ctype->mboutdigits[cnt] == NULL)
2884 {
2885 /* Provide a replacement. */
2886 error (0, 0, _("\
2887no output digits defined and none of the standard names in the charmap"));
2888
2889 ctype->mboutdigits[cnt] = obstack_alloc (&charmap->mem_pool,
2890 sizeof (struct charseq) + 1);
2891
2892 /* This is better than nothing. */
2893 ctype->mboutdigits[cnt]->bytes[0] = digits[cnt];
2894 ctype->mboutdigits[cnt]->nbytes = 1;
2895 }
2896 }
b9eb05d6
UD
2897
2898 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
2899 digits + cnt, 1);
2900
2901 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
2902 {
2903 ctype->wcoutdigits[cnt] = repertoire_find_value (repertoire,
2904 longnames[cnt],
2905 strlen (longnames[cnt]));
2906
2907 if (ctype->wcoutdigits[cnt] == ILLEGAL_CHAR_VALUE)
2908 {
2909 /* Provide a replacement. */
2910 error (0, 0, _("\
2911no output digits defined and none of the standard names in the repertoire"));
2912
2913 /* This is better than nothing. */
2914 ctype->wcoutdigits[cnt] = (uint32_t) digits[cnt];
2915 }
2916 }
4b10dd6c
UD
2917 }
2918
2919 ctype->outdigits_act = 10;
19bc17a9
RM
2920 }
2921}
2922
2923
2924static void
4b10dd6c
UD
2925allocate_arrays (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2926 struct repertoire_t *repertoire)
19bc17a9
RM
2927{
2928 size_t idx;
2929
6d52618b
UD
2930 /* First we have to decide how we organize the arrays. It is easy
2931 for a one-byte character set. But multi-byte character set
2932 cannot be stored flat because the chars might be sparsely used.
2933 So we determine an optimal hashing function for the used
2934 characters.
2935
2936 We use a very trivial hashing function to store the sparse
2937 table. CH % TABSIZE is used as an index. To solve multiple hits
2938 we have N planes. This guarantees a fixed search time for a
42d7c593 2939 character [N / 2]. In the following code we determine the minimum
66ac0abe
UD
2940 value for TABSIZE * N, where TABSIZE >= 256.
2941
2942 Some people complained that this algorithm takes too long. Well,
2943 go on, improve it. But changing the step size is *not* an
2944 option. Some people changed this to use only sizes of prime
2945 numbers. Think again, do some math. We are looking for the
2946 optimal solution, not something which works in general. Unless
2947 somebody can provide a dynamic programming solution I think this
2948 implementation is as good as it can get. */
19bc17a9
RM
2949 size_t min_total = UINT_MAX;
2950 size_t act_size = 256;
2951
66ac0abe 2952 if (!be_quiet && ctype->charnames_act > 512)
c84142e8 2953 fputs (_("\
19bc17a9 2954Computing table size for character classes might take a while..."),
c84142e8 2955 stderr);
19bc17a9 2956
66ac0abe
UD
2957 /* While we want to have a small total size we are willing to use a
2958 little bit larger table if this reduces the number of layers.
2959 Therefore we add a little penalty to the number of planes.
2960 Maybe this constant has to be adjusted a bit. */
2961#define PENALTY 128
2962 do
19bc17a9
RM
2963 {
2964 size_t cnt[act_size];
2965 size_t act_planes = 1;
2966
2967 memset (cnt, '\0', sizeof cnt);
2968
2969 for (idx = 0; idx < 256; ++idx)
2970 cnt[idx] = 1;
2971
2972 for (idx = 0; idx < ctype->charnames_act; ++idx)
2973 if (ctype->charnames[idx] >= 256)
2974 {
2975 size_t nr = ctype->charnames[idx] % act_size;
2976
2977 if (++cnt[nr] > act_planes)
2978 {
2979 act_planes = cnt[nr];
66ac0abe 2980 if ((act_size + PENALTY) * act_planes >= min_total)
19bc17a9
RM
2981 break;
2982 }
2983 }
2984
66ac0abe 2985 if ((act_size + PENALTY) * act_planes < min_total)
19bc17a9 2986 {
66ac0abe 2987 min_total = (act_size + PENALTY) * act_planes;
19bc17a9
RM
2988 ctype->plane_size = act_size;
2989 ctype->plane_cnt = act_planes;
2990 }
2991
2992 ++act_size;
2993 }
66ac0abe 2994 while (act_size < min_total);
19bc17a9 2995
66ac0abe 2996 if (!be_quiet && ctype->charnames_act > 512)
c84142e8 2997 fputs (_(" done\n"), stderr);
19bc17a9 2998
75cd5204 2999
4a33c2f5
UD
3000 ctype->names = (uint32_t *) xcalloc (ctype->plane_size
3001 * ctype->plane_cnt,
3002 sizeof (uint32_t));
19bc17a9
RM
3003
3004 for (idx = 1; idx < 256; ++idx)
4a33c2f5 3005 ctype->names[idx] = idx;
19bc17a9
RM
3006
3007 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
4a33c2f5 3008 ctype->names[0] = 1;
19bc17a9
RM
3009
3010 for (idx = 256; idx < ctype->charnames_act; ++idx)
3011 {
3012 size_t nr = (ctype->charnames[idx] % ctype->plane_size);
3013 size_t depth = 0;
3014
4a33c2f5 3015 while (ctype->names[nr + depth * ctype->plane_size])
19bc17a9
RM
3016 ++depth;
3017 assert (depth < ctype->plane_cnt);
3018
4a33c2f5 3019 ctype->names[nr + depth * ctype->plane_size] = ctype->charnames[idx];
19bc17a9
RM
3020
3021 /* Now for faster access remember the index in the NAMES_B array. */
3022 ctype->charnames[idx] = nr + depth * ctype->plane_size;
3023 }
4a33c2f5 3024 ctype->names[0] = 0;
19bc17a9
RM
3025
3026
3027 /* You wonder about this amount of memory? This is only because some
3028 users do not manage to address the array with unsigned values or
3029 data types with range >= 256. '\200' would result in the array
3030 index -128. To help these poor people we duplicate the entries for
3031 128 up to 255 below the entry for \0. */
3032 ctype->ctype_b = (char_class_t *) xcalloc (256 + 128,
3033 sizeof (char_class_t));
3034 ctype->ctype32_b = (char_class32_t *) xcalloc (ctype->plane_size
3035 * ctype->plane_cnt,
3036 sizeof (char_class32_t));
3037
4a33c2f5 3038 /* This is the array accessed using the multibyte string elements. */
4b10dd6c 3039 for (idx = 0; idx < 256; ++idx)
4a33c2f5 3040 ctype->ctype_b[128 + idx] = ctype->class256_collection[idx];
19bc17a9 3041
75cd5204
RM
3042 /* Mirror first 127 entries. We must take care that entry -1 is not
3043 mirrored because EOF == -1. */
3044 for (idx = 0; idx < 127; ++idx)
19bc17a9
RM
3045 ctype->ctype_b[idx] = ctype->ctype_b[256 + idx];
3046
3047 /* The 32 bit array contains all characters. */
3048 for (idx = 0; idx < ctype->class_collection_act; ++idx)
4a33c2f5 3049 ctype->ctype32_b[ctype->charnames[idx]] = ctype->class_collection[idx];
19bc17a9
RM
3050
3051 /* Room for table of mappings. */
49f2be5b
UD
3052 ctype->map = (uint32_t **) xmalloc (2 * sizeof (uint32_t *));
3053 ctype->map32 = (uint32_t **) xmalloc (ctype->map_collection_nr
4a33c2f5 3054 * sizeof (uint32_t *));
19bc17a9
RM
3055
3056 /* Fill in all mappings. */
49f2be5b 3057 for (idx = 0; idx < 2; ++idx)
19bc17a9
RM
3058 {
3059 unsigned int idx2;
3060
3061 /* Allocate table. */
49f2be5b 3062 ctype->map[idx] = (uint32_t *) xmalloc ((256 + 128) * sizeof (uint32_t));
19bc17a9
RM
3063
3064 /* Copy values from collection. */
4b10dd6c 3065 for (idx2 = 0; idx2 < 256; ++idx2)
4a33c2f5 3066 ctype->map[idx][128 + idx2] = ctype->map256_collection[idx][idx2];
19bc17a9 3067
75cd5204
RM
3068 /* Mirror first 127 entries. We must take care not to map entry
3069 -1 because EOF == -1. */
3070 for (idx2 = 0; idx2 < 127; ++idx2)
4a33c2f5 3071 ctype->map[idx][idx2] = ctype->map[idx][256 + idx2];
19bc17a9 3072
75cd5204 3073 /* EOF must map to EOF. */
4a33c2f5 3074 ctype->map[idx][127] = EOF;
49f2be5b 3075 }
a9c27b3e 3076
49f2be5b
UD
3077 for (idx = 0; idx < ctype->map_collection_nr; ++idx)
3078 {
3079 unsigned int idx2;
3080
3081 /* Allocate table. */
f1d8b804
UD
3082 ctype->map32[idx] = (uint32_t *) xmalloc (ctype->plane_size
3083 * ctype->plane_cnt
3084 * sizeof (uint32_t));
49f2be5b
UD
3085
3086 /* Copy default value (identity mapping). */
f1d8b804 3087 memcpy (ctype->map32[idx], ctype->names,
49f2be5b
UD
3088 ctype->plane_size * ctype->plane_cnt * sizeof (uint32_t));
3089
3090 /* Copy values from collection. */
3091 for (idx2 = 0; idx2 < 256; ++idx2)
a9c27b3e 3092 if (ctype->map_collection[idx][idx2] != 0)
f1d8b804
UD
3093 ctype->map32[idx][idx2] = ctype->map_collection[idx][idx2];
3094
3095 while (idx2 < ctype->map_collection_act[idx])
b06c53e7
UD
3096 {
3097 if (ctype->map_collection[idx][idx2] != 0)
450bf66e
UD
3098 ctype->map32[idx][ctype->charnames[idx2]] =
3099 ctype->map_collection[idx][idx2];
b06c53e7
UD
3100 ++idx2;
3101 }
19bc17a9
RM
3102 }
3103
3104 /* Extra array for class and map names. */
4b10dd6c
UD
3105 ctype->class_name_ptr = (uint32_t *) xmalloc (ctype->nr_charclass
3106 * sizeof (uint32_t));
3107 ctype->map_name_ptr = (uint32_t *) xmalloc (ctype->map_collection_nr
3108 * sizeof (uint32_t));
75cd5204
RM
3109
3110 /* Array for width information. Because the expected width are very
3111 small we use only one single byte. This save space and we need
3112 not provide the information twice with both endianesses. */
3113 ctype->width = (unsigned char *) xmalloc (ctype->plane_size
3114 * ctype->plane_cnt);
3115 /* Initialize with default width value. */
4b10dd6c 3116 memset (ctype->width, charmap->width_default,
75cd5204 3117 ctype->plane_size * ctype->plane_cnt);
4b10dd6c 3118 if (charmap->width_rules != NULL)
75cd5204
RM
3119 {
3120 size_t cnt;
3121
4b10dd6c 3122 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
827ff758
UD
3123 {
3124 unsigned char bytes[charmap->mb_cur_max];
3125 int nbytes = charmap->width_rules[cnt].from->nbytes;
3126
3127 /* We have the range of character for which the width is
3128 specified described using byte sequences of the multibyte
3129 charset. We have to convert this to UCS4 now. And we
3130 cannot simply convert the beginning and the end of the
3131 sequence, we have to iterate over the byte sequence and
3132 convert it for every single character. */
3133 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
3134
3135 while (nbytes < charmap->width_rules[cnt].to->nbytes
3136 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
3137 nbytes) <= 0)
75cd5204 3138 {
827ff758 3139 /* Find the UCS value for `bytes'. */
827ff758 3140 int inner;
76e680a8
UD
3141 uint32_t wch;
3142 struct charseq *seq =
3143 charmap_find_symbol (charmap, bytes, nbytes);
3144
3145 if (seq == NULL)
3146 wch = ILLEGAL_CHAR_VALUE;
3147 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
3148 wch = seq->ucs4;
3149 else
3150 wch = repertoire_find_value (ctype->repertoire, seq->name,
3151 strlen (seq->name));
827ff758
UD
3152
3153 if (wch != ILLEGAL_CHAR_VALUE)
3154 {
3155 /* Store the value. */
b1c9ad82 3156 size_t nr = wch % ctype->plane_size;
827ff758
UD
3157 size_t depth = 0;
3158
b1c9ad82 3159 while (ctype->names[nr + depth * ctype->plane_size] != wch)
827ff758
UD
3160 ++depth;
3161 assert (depth < ctype->plane_cnt);
3162
3163 ctype->width[nr + depth * ctype->plane_size]
3164 = charmap->width_rules[cnt].width;
3165 }
3166
3167 /* "Increment" the bytes sequence. */
3168 inner = nbytes - 1;
3169 while (inner >= 0 && bytes[inner] == 0xff)
3170 --inner;
75cd5204 3171
827ff758
UD
3172 if (inner < 0)
3173 {
3174 /* We have to extend the byte sequence. */
3175 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
3176 break;
75cd5204 3177
827ff758
UD
3178 bytes[0] = 1;
3179 memset (&bytes[1], 0, nbytes);
3180 ++nbytes;
3181 }
3182 else
3183 {
3184 ++bytes[inner];
3185 while (++inner < nbytes)
3186 bytes[inner] = 0;
3187 }
75cd5204 3188 }
827ff758 3189 }
75cd5204 3190 }
0200214b 3191
4b10dd6c
UD
3192 /* Set MB_CUR_MAX. */
3193 ctype->mb_cur_max = charmap->mb_cur_max;
6990326c
RM
3194
3195 /* We need the name of the currently used 8-bit character set to
3196 make correct conversion between this 8-bit representation and the
3197 ISO 10646 character set used internally for wide characters. */
4b10dd6c
UD
3198 ctype->codeset_name = charmap->code_set_name;
3199
3200 /* Now determine the table for the transliteration information.
3201
3202 XXX It is not yet clear to me whether it is worth implementing a
3203 complicated algorithm which uses a hash table to locate the entries.
3204 For now I'll use a simple array which can be searching using binary
3205 search. */
3206 if (ctype->translit_copy_locale != NULL)
3207 {
3208 /* Fold in the transliteration information from the locale mentioned
3209 in the `include' statement. */
3210 struct locale_ctype_t *here = ctype;
3211
3212 do
3213 {
3214 struct localedef_t *other = find_locale (LC_CTYPE,
3215 here->translit_copy_locale,
3216 repertoire->name, charmap);
3217
3218 if (other == NULL)
3219 {
3220 error (0, 0, _("\
3221%s: transliteration data from locale `%s' not available"),
3222 "LC_CTYPE", here->translit_copy_locale);
3223 break;
3224 }
3225
3226 here = other->categories[LC_CTYPE].ctype;
3227
3228 /* Enqueue the information if necessary. */
3229 if (here->translit != NULL)
3230 {
3231 struct translit_t *endp = here->translit;
3232 while (endp->next != NULL)
3233 endp = endp->next;
3234
3235 endp->next = ctype->translit;
3236 ctype->translit = here->translit;
3237 }
3238 }
3239 while (here->translit_copy_locale != NULL);
3240 }
3241
3242 if (ctype->translit != NULL)
3243 {
3244 /* First count how many entries we have. This is the upper limit
3245 since some entries from the included files might be overwritten. */
3246 size_t number = 0;
3247 size_t cnt;
3248 struct translit_t *runp = ctype->translit;
3249 struct translit_t **sorted;
3250 size_t from_len, to_len;
3251
3252 while (runp != NULL)
3253 {
3254 ++number;
3255 runp = runp->next;
3256 }
3257
3258 /* Next we allocate an array large enough and fill in the values. */
a9c27b3e
UD
3259 sorted = (struct translit_t **) alloca (number
3260 * sizeof (struct translit_t **));
4b10dd6c
UD
3261 runp = ctype->translit;
3262 number = 0;
3263 do
3264 {
3265 /* Search for the place where to insert this string.
3266 XXX Better use a real sorting algorithm later. */
3267 size_t idx = 0;
3268 int replace = 0;
3269
3270 while (idx < number)
3271 {
3272 int res = wcscmp ((const wchar_t *) sorted[idx]->from,
3273 (const wchar_t *) runp->from);
3274 if (res == 0)
3275 {
3276 replace = 1;
3277 break;
3278 }
3279 if (res > 0)
3280 break;
3281 ++idx;
3282 }
3283
3284 if (replace)
3285 sorted[idx] = runp;
3286 else
3287 {
3288 memmove (&sorted[idx + 1], &sorted[idx],
3289 (number - idx) * sizeof (struct translit_t *));
3290 sorted[idx] = runp;
3291 ++number;
3292 }
3293
3294 runp = runp->next;
3295 }
3296 while (runp != NULL);
3297
3298 /* The next step is putting all the possible transliteration
3299 strings in one memory block so that we can write it out.
3300 We need several different blocks:
3301 - index to the tfromstring array
3302 - from-string array
3303 - index to the to-string array
3304 - to-string array.
3305 And this all must be available for both endianes variants.
3306 */
3307 from_len = to_len = 0;
3308 for (cnt = 0; cnt < number; ++cnt)
3309 {
3310 struct translit_to_t *srunp;
3311 from_len += wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
3312 srunp = sorted[cnt]->to;
3313 while (srunp != NULL)
3314 {
3315 to_len += wcslen ((const wchar_t *) srunp->str) + 1;
3316 srunp = srunp->next;
3317 }
3318 /* Plus one for the extra NUL character marking the end of
3319 the list for the current entry. */
3320 ++to_len;
3321 }
3322
3323 /* We can allocate the arrays for the results. */
4a33c2f5
UD
3324 ctype->translit_from_idx = xmalloc (number * sizeof (uint32_t));
3325 ctype->translit_from_tbl = xmalloc (from_len * sizeof (uint32_t));
3326 ctype->translit_to_idx = xmalloc (number * sizeof (uint32_t));
3327 ctype->translit_to_tbl = xmalloc (to_len * sizeof (uint32_t));
4b10dd6c
UD
3328
3329 from_len = 0;
3330 to_len = 0;
3331 for (cnt = 0; cnt < number; ++cnt)
3332 {
3333 size_t len;
3334 struct translit_to_t *srunp;
3335
4a33c2f5
UD
3336 ctype->translit_from_idx[cnt] = from_len;
3337 ctype->translit_to_idx[cnt] = to_len;
4b10dd6c
UD
3338
3339 len = wcslen ((const wchar_t *) sorted[cnt]->from) + 1;
4a33c2f5 3340 wmemcpy ((wchar_t *) &ctype->translit_from_tbl[from_len],
4b10dd6c
UD
3341 (const wchar_t *) sorted[cnt]->from, len);
3342 from_len += len;
3343
4a33c2f5 3344 ctype->translit_to_idx[cnt] = to_len;
4b10dd6c
UD
3345 srunp = sorted[cnt]->to;
3346 while (srunp != NULL)
3347 {
3348 len = wcslen ((const wchar_t *) srunp->str) + 1;
4a33c2f5 3349 wmemcpy ((wchar_t *) &ctype->translit_to_tbl[to_len],
4b10dd6c
UD
3350 (const wchar_t *) srunp->str, len);
3351 to_len += len;
3352 srunp = srunp->next;
3353 }
4a33c2f5 3354 ctype->translit_to_tbl[to_len++] = L'\0';
4b10dd6c 3355 }
4b10dd6c
UD
3356
3357 /* Store the information about the length. */
3358 ctype->translit_idx_size = number * sizeof (uint32_t);
3359 ctype->translit_from_tbl_size = from_len * sizeof (uint32_t);
3360 ctype->translit_to_tbl_size = to_len * sizeof (uint32_t);
3361 }
3362 else
3363 {
3364 /* Provide some dummy pointers since we have nothing to write out. */
3365 static uint32_t no_str = { 0 };
3366
4a33c2f5
UD
3367 ctype->translit_from_idx = &no_str;
3368 ctype->translit_from_tbl = &no_str;
3369 ctype->translit_to_tbl = &no_str;
4b10dd6c
UD
3370 ctype->translit_idx_size = 0;
3371 ctype->translit_from_tbl_size = 0;
3372 ctype->translit_to_tbl_size = 0;
3373 }
19bc17a9 3374}