1 /* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If
17 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
30 #include "localeinfo.h"
32 #include "locfile-token.h"
33 #include "stringtrans.h"
35 /* Uncomment the following line in the production version. */
40 void *xmalloc (size_t __n
);
41 void *xcalloc (size_t __n
, size_t __s
);
42 void *xrealloc (void *__ptr
, size_t __n
);
45 /* The bit used for representing a special class. */
46 #define BITPOS(class) ((class) - tok_upper)
47 #define BIT(class) (1 << BITPOS (class))
/* Yield an lvalue for the slot that belongs to character VALUE in the
   member COLLECTION of CTYPE.  IDX is pasted directly after the member
   names: it is left empty for `class_collection' and is a subscript
   such as `[0]' for `map_collection'.  The slot itself is located by
   `find_idx', which is handed the addresses of the table and of its
   `_max' and `_act' counters.  */
49 #define ELEM(ctype, collection, idx, value) \
50 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
51 &ctype->collection##_act idx, value)
54 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
57 ((((w) >> 8) & 0xff) | (((w) & 0xff) << 8))
60 /* To be compatible with former implementations we for now restrict
61 the number of bits for character classes to 16. When compatibility
62 is not necessary anymore increase the number to 32. */
63 #define char_class_t u_int16_t
64 #define CHAR_CLASS_TRANS SWAPU16
65 #define char_class32_t u_int32_t
66 #define CHAR_CLASS32_TRANS SWAPU32
69 /* The real definition of the struct for the LC_CTYPE locale. */
72 unsigned int *charnames
;
76 /* We will allow up to 8 * sizeof(u_int32_t) - 1 character classes. */
77 #define MAX_NR_CHARCLASS (8 * sizeof (u_int32_t) - 1)
79 const char *classnames
[MAX_NR_CHARCLASS
];
80 unsigned long int current_class_mask
;
81 unsigned int last_class_char
;
82 u_int32_t
*class_collection
;
83 size_t class_collection_max
;
84 size_t class_collection_act
;
85 unsigned long int class_done
;
87 /* If the following number ever turns out to be too small simply
88 increase it. But I doubt it will. --drepper@gnu */
89 #define MAX_NR_CHARMAP 16
90 const char *mapnames
[MAX_NR_CHARMAP
];
91 u_int32_t
*map_collection
[MAX_NR_CHARMAP
];
92 /* `find_idx' is handed the addresses of these counters as `size_t *'
(see the ELEM macro and `ctype_map_to'), so they must really be
`size_t' objects.  Declared as `u_int32_t' the accesses are mis-sized
wherever `size_t' is wider than 32 bits.  */
size_t map_collection_max[MAX_NR_CHARMAP];
93 size_t map_collection_act[MAX_NR_CHARMAP];
94 size_t map_collection_nr
;
96 unsigned int from_map_char
;
100 /* The arrays for the binary representation. */
101 u_int32_t plane_size
;
103 char_class_t
*ctype_b
;
104 char_class32_t
*ctype32_b
;
109 u_int32_t
*class_name_ptr
;
110 u_int32_t
*map_name_ptr
;
111 unsigned char *width
;
112 u_int32_t mb_cur_max
;
113 const char *codeset_name
;
117 /* Prototypes for local functions. */
/* Register NAME as a further character class of CTYPE.  The definition
   reports a duplicate name through `lr_error' on LR and enforces the
   MAX_NR_CHARCLASS limit.  */
118 static void ctype_class_newP (struct linereader
*lr
,
119 struct locale_ctype_t
*ctype
, const char *name
);
/* Register NAME as a further character map of CTYPE and allocate its
   zero-filled table, whose first 256 entries count as in use.  The
   definition reports a duplicate name through `lr_error' on LR and
   enforces the MAX_NR_CHARMAP limit.  */
120 static void ctype_map_newP (struct linereader
*lr
,
121 struct locale_ctype_t
*ctype
,
122 const char *name
, struct charset_t
*charset
);
/* Return the address of the entry for character IDX in *TABLE, adding
   IDX to the name array of CTYPE when it is not yet known; *TABLE may be
   reallocated.  TABLE, MAX and ACT may all be NULL when only the name
   array is to be extended.  NOTE(review): the visible code returns NULL
   for a NULL TABLE on at least one path -- confirm for the others.  */
123 static u_int32_t
*find_idx (struct locale_ctype_t
*ctype
, u_int32_t
**table
,
124 size_t *max
, size_t *act
, unsigned int idx
);
/* Install the default values for the classes and conversions which the
   locale definition did not specify (POSIX.2 2.5.2.1).  */
125 static void set_class_defaults (struct locale_ctype_t
*ctype
,
126 struct charset_t
*charset
);
/* Determine `plane_size' and `plane_cnt' of CTYPE and allocate the
   arrays for the binary representation; called from `ctype_output'
   before the output vector is filled.  */
127 static void allocate_arrays (struct locale_ctype_t
*ctype
,
128 struct charset_t
*charset
);
132 ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
133 struct charset_t
*charset
)
136 struct locale_ctype_t
*ctype
;
138 /* It is important that we always use UCS1 encoding for strings now. */
139 encoding_method
= ENC_UCS1
;
141 /* Allocate the needed room. */
142 locale
->categories
[LC_CTYPE
].ctype
= ctype
=
143 (struct locale_ctype_t
*) xmalloc (sizeof (struct locale_ctype_t
));
145 /* We have no names seen yet. */
146 ctype
->charnames_max
= charset
->mb_cur_max
== 1 ? 256 : 512;
148 (unsigned int *) xmalloc (ctype
->charnames_max
* sizeof (unsigned int));
149 for (cnt
= 0; cnt
< 256; ++cnt
)
150 ctype
->charnames
[cnt
] = cnt
;
151 ctype
->charnames_act
= 256;
153 /* Fill character class information. */
154 ctype
->nr_charclass
= 0;
155 ctype
->current_class_mask
= 0;
156 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
157 /* The order of the following instructions determines the bit positions of the classes. */
159 ctype_class_newP (lr
, ctype
, "upper");
160 ctype_class_newP (lr
, ctype
, "lower");
161 ctype_class_newP (lr
, ctype
, "alpha");
162 ctype_class_newP (lr
, ctype
, "digit");
163 ctype_class_newP (lr
, ctype
, "xdigit");
164 ctype_class_newP (lr
, ctype
, "space");
165 ctype_class_newP (lr
, ctype
, "print");
166 ctype_class_newP (lr
, ctype
, "graph");
167 ctype_class_newP (lr
, ctype
, "blank");
168 ctype_class_newP (lr
, ctype
, "cntrl");
169 ctype_class_newP (lr
, ctype
, "punct");
170 ctype_class_newP (lr
, ctype
, "alnum");
172 ctype
->class_collection_max
= charset
->mb_cur_max
== 1 ? 256 : 512;
173 /* One zero-initialised `u_int32_t' slot per character.  Size the buffer
by its element type; `unsigned long int' may be wider than the
elements and would only waste memory.  */
ctype->class_collection
174 = (u_int32_t *) xmalloc (sizeof (u_int32_t)
175 * ctype->class_collection_max);
176 memset (ctype->class_collection, '\0',
177 sizeof (u_int32_t) * ctype->class_collection_max);
178 ctype
->class_collection_act
= 256;
180 /* Fill character map information. */
181 ctype
->map_collection_nr
= 0;
182 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
183 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
184 ctype_map_newP (lr
, ctype
, "toupper", charset
);
185 ctype_map_newP (lr
, ctype
, "tolower", charset
);
187 /* Fill first 256 entries in `toupper' and `tolower' arrays. */
188 for (cnt
= 0; cnt
< 256; ++cnt
)
190 ctype
->map_collection
[0][cnt
] = cnt
;
191 ctype
->map_collection
[1][cnt
] = cnt
;
197 ctype_finish (struct localedef_t
*locale
, struct charset_t
*charset
)
199 /* See POSIX.2, table 2-6 for the meaning of the following table. */
204 const char allow
[NCLASS
];
206 valid_table
[NCLASS
] =
208 /* The order is important. See token.h for more information.
209 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
210 { "upper", "--MX-XDDXXX-" },
211 { "lower", "--MX-XDDXXX-" },
212 { "alpha", "---X-XDDXXX-" },
213 { "digit", "XXX--XDDXXX-" },
214 { "xdigit", "-----XDDXXX-" },
215 { "space", "XXXXX------X" },
216 { "print", "---------X--" },
217 { "graph", "---------X--" },
218 { "blank", "XXXXXM-----X" },
219 { "cntrl", "XXXXX-XX--XX" },
220 { "punct", "XXXXX-DD-X-X" },
221 { "alnum", "-----XDDXXX-" }
225 unsigned int space_value
;
226 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
228 /* Set default value for classes not specified. */
229 set_class_defaults (ctype
, charset
);
231 /* Check according to table. */
232 for (cnt
= 0; cnt
< ctype
->class_collection_max
; ++cnt
)
234 unsigned long int tmp
;
236 tmp
= ctype
->class_collection
[cnt
];
240 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
241 if ((tmp
& (1 << cls1
)) != 0)
242 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
243 if (valid_table
[cls1
].allow
[cls2
] != '-')
245 int eq
= (tmp
& (1 << cls2
)) != 0;
246 switch (valid_table
[cls1
].allow
[cls2
])
255 value
= ctype
->charnames
[cnt
];
257 if ((value
& 0xff000000) != 0)
258 cp
+= sprintf (cp
, "\\%o", (value
>> 24) & 0xff);
259 if ((value
& 0xffff0000) != 0)
260 cp
+= sprintf (cp
, "\\%o", (value
>> 16) & 0xff);
261 if ((value
& 0xffffff00) != 0)
262 cp
+= sprintf (cp
, "\\%o", (value
>> 8) & 0xff);
263 sprintf (cp
, "\\%o", value
& 0xff);
266 character %s'%s' in class `%s' must be in class `%s'"), value
> 256 ? "L" : "",
267 cp
, valid_table
[cls1
].name
,
268 valid_table
[cls2
].name
);
279 value
= ctype
->charnames
[cnt
];
281 if ((value
& 0xff000000) != 0)
282 cp
+= sprintf (cp
, "\\%o", value
>> 24);
283 if ((value
& 0xffff0000) != 0)
284 cp
+= sprintf (cp
, "\\%o", (value
>> 16) & 0xff);
285 if ((value
& 0xffffff00) != 0)
286 cp
+= sprintf (cp
, "\\%o", (value
>> 8) & 0xff);
287 sprintf (cp
, "\\%o", value
& 0xff);
290 character %s'%s' in class `%s' must not be in class `%s'"),
291 value
> 256 ? "L" : "", cp
,
292 valid_table
[cls1
].name
, valid_table
[cls2
].name
);
297 ctype
->class_collection
[cnt
] |= 1 << cls2
;
301 error (5, 0, _("internal error in %s, line %u"),
302 __FUNCTION__
, __LINE__
);
307 /* ... and now test <SP> as a special case. */
308 space_value
= charset_find_value (charset
, "SP", 2);
309 if (space_value
== ILLEGAL_CHAR_VALUE
)
310 error (0, 0, _("character <SP> not defined in character map"));
311 else if ((cnt
= BITPOS (tok_space
),
312 (ELEM (ctype
, class_collection
, , space_value
)
313 & BIT (tok_space
)) == 0)
314 || (cnt
= BITPOS (tok_blank
),
315 (ELEM (ctype
, class_collection
, , space_value
)
316 & BIT (tok_blank
)) == 0))
317 error (0, 0, _("<SP> character not in class `%s'"),
318 valid_table
[cnt
].name
);
319 else if ((cnt
= BITPOS (tok_punct
),
320 (ELEM (ctype
, class_collection
, , space_value
)
321 & BIT (tok_punct
)) != 0)
322 || (cnt
= BITPOS (tok_graph
),
323 (ELEM (ctype
, class_collection
, , space_value
)
326 error (0, 0, _("<SP> character must not be in class `%s'"),
327 valid_table
[cnt
].name
);
329 ELEM (ctype
, class_collection
, , space_value
) |= BIT (tok_print
);
331 /* Now that the tests are done make sure the name array contains all
332 characters which are handled in the WIDTH section of the
333 character set definition file. */
334 if (charset
->width_rules
!= NULL
)
335 for (cnt
= 0; cnt
< charset
->nwidth_rules
; ++cnt
)
338 for (inner
= charset
->width_rules
[cnt
].from
;
339 inner
<= charset
->width_rules
[cnt
].to
; ++inner
)
340 (void) find_idx (ctype
, NULL
, NULL
, NULL
, inner
);
346 ctype_output (struct localedef_t
*locale
, struct charset_t
*charset
,
347 const char *output_path
)
349 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
350 const size_t nelems
= (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)
351 + 2 * (ctype
->map_collection_nr
- 2));
352 struct iovec iov
[2 + nelems
+ ctype
->nr_charclass
353 + ctype
->map_collection_nr
];
354 struct locale_file data
;
355 u_int32_t idx
[nelems
];
356 size_t elem
, cnt
, offset
, total
;
359 if ((locale
->binary
& (1 << LC_CTYPE
)) != 0)
361 iov
[0].iov_base
= ctype
;
362 iov
[0].iov_len
= locale
->len
[LC_CTYPE
];
364 write_locale_data (output_path
, "LC_CTYPE", 1, iov
);
370 /* Now prepare the output: Find the sizes of the table we can use. */
371 allocate_arrays (ctype
, charset
);
373 data
.magic
= LIMAGIC (LC_CTYPE
);
375 iov
[0].iov_base
= (void *) &data
;
376 iov
[0].iov_len
= sizeof (data
);
378 iov
[1].iov_base
= (void *) idx
;
379 iov
[1].iov_len
= sizeof (idx
);
381 idx
[0] = iov
[0].iov_len
+ iov
[1].iov_len
;
384 for (elem
= 0; elem
< nelems
; ++elem
)
386 if (elem
< _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
))
389 #define CTYPE_DATA(name, base, len) \
390 case _NL_ITEM_INDEX (name): \
391 iov[2 + elem + offset].iov_base = (base); \
392 iov[2 + elem + offset].iov_len = (len); \
393 if (elem + 1 < nelems) \
394 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
397 CTYPE_DATA (_NL_CTYPE_CLASS
,
399 (256 + 128) * sizeof (char_class_t
));
401 CTYPE_DATA (_NL_CTYPE_TOUPPER_EB
,
403 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
404 * sizeof (u_int32_t
));
405 CTYPE_DATA (_NL_CTYPE_TOLOWER_EB
,
407 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
408 * sizeof (u_int32_t
));
410 CTYPE_DATA (_NL_CTYPE_TOUPPER_EL
,
412 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
413 * sizeof (u_int32_t
));
414 CTYPE_DATA (_NL_CTYPE_TOLOWER_EL
,
416 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
417 * sizeof (u_int32_t
));
419 CTYPE_DATA (_NL_CTYPE_CLASS32
,
421 (ctype
->plane_size
* ctype
->plane_cnt
422 * sizeof (char_class32_t
)));
424 CTYPE_DATA (_NL_CTYPE_NAMES_EB
,
425 ctype
->names_eb
, (ctype
->plane_size
* ctype
->plane_cnt
426 * sizeof (u_int32_t
)));
427 CTYPE_DATA (_NL_CTYPE_NAMES_EL
,
428 ctype
->names_el
, (ctype
->plane_size
* ctype
->plane_cnt
429 * sizeof (u_int32_t
)));
431 CTYPE_DATA (_NL_CTYPE_HASH_SIZE
,
432 &ctype
->plane_size
, sizeof (u_int32_t
));
433 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS
,
434 &ctype
->plane_cnt
, sizeof (u_int32_t
));
436 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES
):
437 /* The class name array. */
439 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
, ++offset
)
441 iov
[2 + elem
+ offset
].iov_base
442 = (void *) ctype
->classnames
[cnt
];
443 iov
[2 + elem
+ offset
].iov_len
444 = strlen (ctype
->classnames
[cnt
]) + 1;
445 total
+= iov
[2 + elem
+ offset
].iov_len
;
447 /* NUL bytes appended after the last class name.  The length computed
on the next line ranges from 2 to 5 bytes, so the literal has to
provide five NUL bytes (four written out plus the terminator);
`"\0\0\0"' is one byte too short for the longest case.  */
iov[2 + elem + offset].iov_base = (void *) "\0\0\0\0";
448 iov
[2 + elem
+ offset
].iov_len
= 1 + (4 - ((total
+ 1) % 4));
449 total
+= 1 + (4 - ((total
+ 1) % 4));
451 if (elem
+ 1 < nelems
)
452 idx
[elem
+ 1] = idx
[elem
] + total
;
455 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES
):
456 /* The map name array. */
458 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
, ++offset
)
460 iov
[2 + elem
+ offset
].iov_base
461 = (void *) ctype
->mapnames
[cnt
];
462 iov
[2 + elem
+ offset
].iov_len
463 = strlen (ctype
->mapnames
[cnt
]) + 1;
464 total
+= iov
[2 + elem
+ offset
].iov_len
;
466 /* NUL bytes appended after the last map name.  The length computed
on the next line ranges from 2 to 5 bytes, so the literal has to
provide five NUL bytes (four written out plus the terminator);
`"\0\0\0"' is one byte too short for the longest case.  */
iov[2 + elem + offset].iov_base = (void *) "\0\0\0\0";
467 iov
[2 + elem
+ offset
].iov_len
= 1 + (4 - ((total
+ 1) % 4));
468 total
+= 1 + (4 - ((total
+ 1) % 4));
470 if (elem
+ 1 < nelems
)
471 idx
[elem
+ 1] = idx
[elem
] + total
;
474 CTYPE_DATA (_NL_CTYPE_WIDTH
,
475 ctype
->width
, ctype
->plane_size
* ctype
->plane_cnt
);
477 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX
,
478 &ctype
->mb_cur_max
, sizeof (u_int32_t
));
480 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME
):
481 total
= strlen (ctype
->codeset_name
) + 1;
483 iov
[2 + elem
+ offset
].iov_base
= (char *) ctype
->codeset_name
;
486 iov
[2 + elem
+ offset
].iov_base
= alloca ((total
+ 3) & ~3);
487 memcpy (iov
[2 + elem
+ offset
].iov_base
, ctype
->codeset_name
,
489 total
= (total
+ 3) & ~3;
491 iov
[2 + elem
+ offset
].iov_len
= total
;
492 if (elem
+ 1 < nelems
)
493 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
497 assert (! "unknown CTYPE element");
501 /* Handle extra maps. */
502 size_t nr
= (elem
- _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)) >> 1;
504 if (((elem
- _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)) & 1) == 0)
505 iov
[2 + elem
+ offset
].iov_base
= ctype
->map_eb
[nr
];
507 iov
[2 + elem
+ offset
].iov_base
= ctype
->map_el
[nr
];
509 iov
[2 + elem
+ offset
].iov_len
= ((ctype
->plane_size
510 * ctype
->plane_cnt
+ 128)
511 * sizeof (u_int32_t
));
513 if (elem
+ 1 < nelems
)
514 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
518 assert (2 + elem
+ offset
== (nelems
+ ctype
->nr_charclass
519 + ctype
->map_collection_nr
+ 2));
521 write_locale_data (output_path
, "LC_CTYPE", 2 + elem
+ offset
, iov
);
525 /* Character class handling. */
527 ctype_class_new (struct linereader
*lr
, struct localedef_t
*locale
,
528 enum token_t tok
, struct token
*code
,
529 struct charset_t
*charset
)
531 ctype_class_newP (lr
, locale
->categories
[LC_CTYPE
].ctype
,
532 code
->val
.str
.start
);
537 ctype_is_charclass (struct linereader
*lr
, struct localedef_t
*locale
,
542 for (cnt
= 0; cnt
< locale
->categories
[LC_CTYPE
].ctype
->nr_charclass
; ++cnt
)
543 if (strcmp (name
, locale
->categories
[LC_CTYPE
].ctype
->classnames
[cnt
])
552 ctype_class_start (struct linereader
*lr
, struct localedef_t
*locale
,
553 enum token_t tok
, const char *str
,
554 struct charset_t
*charset
)
556 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
600 assert (! "illegal token as class name: should not happen");
603 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
604 if (strcmp (str
, ctype
->classnames
[cnt
]) == 0)
607 if (cnt
>= ctype
->nr_charclass
)
608 assert (! "unknown class in class definition: should not happen");
610 ctype
->class_done
|= BIT (tok
);
612 ctype
->current_class_mask
= 1 << cnt
;
613 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
618 ctype_class_from (struct linereader
*lr
, struct localedef_t
*locale
,
619 struct token
*code
, struct charset_t
*charset
)
621 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
624 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
626 ctype
->last_class_char
= value
;
628 if (value
== ILLEGAL_CHAR_VALUE
)
629 /* In the LC_CTYPE category it is no error when a character is
630 not found. This has to be ignored silently. */
633 *find_idx (ctype
, &ctype
->class_collection
, &ctype
->class_collection_max
,
634 &ctype
->class_collection_act
, value
)
635 |= ctype
->current_class_mask
;
640 ctype_class_to (struct linereader
*lr
, struct localedef_t
*locale
,
641 struct token
*code
, struct charset_t
*charset
)
643 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
644 unsigned int value
, cnt
;
646 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
648 assert (value
>= ctype
->last_class_char
);
650 for (cnt
= ctype
->last_class_char
+ 1; cnt
<= value
; ++cnt
)
651 *find_idx (ctype
, &ctype
->class_collection
, &ctype
->class_collection_max
,
652 &ctype
->class_collection_act
, cnt
)
653 |= ctype
->current_class_mask
;
655 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
660 ctype_class_end (struct linereader
*lr
, struct localedef_t
*locale
)
662 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
664 /* We have no special actions to perform here. */
665 ctype
->current_class_mask
= 0;
666 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
670 /* Character map handling. */
672 ctype_map_new (struct linereader
*lr
, struct localedef_t
*locale
,
673 enum token_t tok
, struct token
*code
,
674 struct charset_t
*charset
)
676 ctype_map_newP (lr
, locale
->categories
[LC_CTYPE
].ctype
,
677 code
->val
.str
.start
, charset
);
682 ctype_is_charconv (struct linereader
*lr
, struct localedef_t
*locale
,
685 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
688 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
689 if (strcmp (name
, ctype
->mapnames
[cnt
]) == 0)
697 ctype_map_start (struct linereader
*lr
, struct localedef_t
*locale
,
698 enum token_t tok
, const char *name
, struct charset_t
*charset
)
700 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
706 ctype
->toupper_done
= 1;
710 ctype
->tolower_done
= 1;
716 assert (! "unknown token in category `LC_CTYPE' should not happen");
719 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
720 if (strcmp (name
, ctype
->mapnames
[cnt
]) == 0)
723 if (cnt
== ctype
->map_collection_nr
)
724 assert (! "unknown token in category `LC_CTYPE' should not happen");
726 ctype
->last_map_idx
= cnt
;
727 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
732 ctype_map_from (struct linereader
*lr
, struct localedef_t
*locale
,
733 struct token
*code
, struct charset_t
*charset
)
735 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
738 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
740 if (value
== ILLEGAL_CHAR_VALUE
)
741 /* In the LC_CTYPE category it is no error when a character is
742 not found. This has to be ignored silently. */
745 assert (ctype
->last_map_idx
< ctype
->map_collection_nr
);
747 ctype
->from_map_char
= value
;
752 ctype_map_to (struct linereader
*lr
, struct localedef_t
*locale
,
753 struct token
*code
, struct charset_t
*charset
)
755 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
758 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
760 if (ctype
->from_map_char
== ILLEGAL_CHAR_VALUE
761 || value
== ILLEGAL_CHAR_VALUE
)
763 /* In the LC_CTYPE category it is no error when a character is
764 not found. This has to be ignored silently. */
765 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
769 *find_idx (ctype
, &ctype
->map_collection
[ctype
->last_map_idx
],
770 &ctype
->map_collection_max
[ctype
->last_map_idx
],
771 &ctype
->map_collection_act
[ctype
->last_map_idx
],
772 ctype
->from_map_char
) = value
;
774 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
779 ctype_map_end (struct linereader
*lr
, struct localedef_t
*locale
)
781 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
783 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
784 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
788 /* Local functions. */
790 ctype_class_newP (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
795 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
796 if (strcmp (ctype
->classnames
[cnt
], name
) == 0)
799 if (cnt
< ctype
->nr_charclass
)
801 /* NAME supplies the `%s' conversion of the message; without it the
call reads a nonexistent argument.  */
lr_error (lr, _("character class `%s' already defined"), name);
805 if (ctype
->nr_charclass
== MAX_NR_CHARCLASS
)
806 /* Exit code 2 is prescribed in P1003.2b. */
808 implementation limit: no more than %d character classes allowed"),
811 ctype
->classnames
[ctype
->nr_charclass
++] = name
;
816 ctype_map_newP (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
817 const char *name
, struct charset_t
*charset
)
819 size_t max_chars
= 0;
822 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
824 if (strcmp (ctype
->mapnames
[cnt
], name
) == 0)
827 if (max_chars
< ctype
->map_collection_max
[cnt
])
828 max_chars
= ctype
->map_collection_max
[cnt
];
831 if (cnt
< ctype
->map_collection_nr
)
833 /* NAME supplies the `%s' conversion of the message; without it the
call reads a nonexistent argument.  */
lr_error (lr, _("character map `%s' already defined"), name);
837 if (ctype
->map_collection_nr
== MAX_NR_CHARMAP
)
838 /* Exit code 2 is prescribed in P1003.2b. */
840 implementation limit: no more than %d character maps allowed"),
843 ctype
->mapnames
[cnt
] = name
;
846 ctype
->map_collection_max
[cnt
] = charset
->mb_cur_max
== 1 ? 256 : 512;
848 ctype
->map_collection_max
[cnt
] = max_chars
;
850 ctype
->map_collection
[cnt
] = (u_int32_t
*)
851 xmalloc (sizeof (u_int32_t
) * ctype
->map_collection_max
[cnt
]);
852 memset (ctype
->map_collection
[cnt
], '\0',
853 sizeof (u_int32_t
) * ctype
->map_collection_max
[cnt
]);
854 ctype
->map_collection_act
[cnt
] = 256;
856 ++ctype
->map_collection_nr
;
860 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
861 is possible if we only want to extend the name array. */
863 find_idx (struct locale_ctype_t
*ctype
, u_int32_t
**table
, size_t *max
,
864 size_t *act
, unsigned int idx
)
869 return table
== NULL
? NULL
: &(*table
)[idx
];
871 for (cnt
= 256; cnt
< ctype
->charnames_act
; ++cnt
)
872 if (ctype
->charnames
[cnt
] == idx
)
875 /* We have to distinguish two cases: the name is found or not. */
876 if (cnt
== ctype
->charnames_act
)
878 /* Extend the name array. */
879 if (ctype
->charnames_act
== ctype
->charnames_max
)
881 ctype
->charnames_max
*= 2;
882 ctype
->charnames
= (unsigned int *)
883 xrealloc (ctype
->charnames
,
884 sizeof (unsigned int) * ctype
->charnames_max
);
886 ctype
->charnames
[ctype
->charnames_act
++] = idx
;
890 /* We have done everything we are asked to do. */
897 size_t old_max
= *max
;
903 (u_int32_t
*) xrealloc (*table
, *max
* sizeof (unsigned long int));
904 memset (&(*table
)[old_max
], '\0',
905 (*max
- old_max
) * sizeof (u_int32_t
));
912 return &(*table
)[cnt
];
917 set_class_defaults (struct locale_ctype_t
*ctype
, struct charset_t
*charset
)
919 /* This function defines the default values for the classes and conversions
920 according to POSIX.2 2.5.2.1.
921 It may seem that the order of these if-blocks is arbitrary but it is NOT.
922 Don't move them unless you know what you are doing! */
924 void set_default (int bit
, int from
, int to
)
931 for (ch
= from
; ch
<= to
; ++ch
)
936 value
= charset_find_value (charset
, tmp
, 1);
937 if (value
== ILLEGAL_CHAR_VALUE
)
940 character `%s' not defined while needed as default value"),
945 ELEM (ctype
, class_collection
, , value
) |= bit
;
949 /* Set default values if keyword was not present. */
950 if ((ctype
->class_done
& BIT (tok_upper
)) == 0)
951 /* "If this keyword [upper] is not specified, the uppercase letters
952 `A' through `Z', ..., shall automatically belong to this class,
953 with implementation defined character values." [P1003.2, 2.5.2.1] */
954 set_default (BIT (tok_upper
), 'A', 'Z');
956 if ((ctype
->class_done
& BIT (tok_lower
)) == 0)
957 /* "If this keyword [lower] is not specified, the lowercase letters
958 `a' through `z', ..., shall automatically belong to this class,
959 with implementation defined character values." [P1003.2, 2.5.2.1] */
960 set_default (BIT (tok_lower
), 'a', 'z');
962 if ((ctype
->class_done
& BIT (tok_alpha
)) == 0)
964 /* Table 2-6 in P1003.2 says that characters in class `upper' or
965 class `lower' *must* be in class `alpha'. */
966 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
);
969 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
970 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
971 ctype
->class_collection
[cnt
] |= BIT (tok_alpha
);
974 if ((ctype
->class_done
& BIT (tok_digit
)) == 0)
975 /* "If this keyword [digit] is not specified, the digits `0' through
976 `9', ..., shall automatically belong to this class, with
977 implementation-defined character values." [P1003.2, 2.5.2.1] */
978 set_default (BIT (tok_digit
), '0', '9');
980 /* "Only characters specified for the `alpha' and `digit' keywords
981 shall be specified. Characters specified for the keywords `alpha'
982 and `digit' are automatically included in this class [alnum]." */
984 unsigned long int mask
= BIT (tok_alpha
) | BIT (tok_digit
);
987 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
988 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
989 ctype
->class_collection
[cnt
] |= BIT (tok_alnum
);
992 if ((ctype
->class_done
& BIT (tok_space
)) == 0)
993 /* "If this keyword [space] is not specified, the characters <space>,
994 <form-feed>, <newline>, <carriage-return>, <tab>, and
995 <vertical-tab>, ..., shall automatically belong to this class,
996 with implementation-defined character values." [P1003.2, 2.5.2.1] */
1000 value
= charset_find_value (charset
, "space", 5);
1001 if (value
== ILLEGAL_CHAR_VALUE
)
1003 character `%s' not defined while needed as default value"),
1006 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1008 value
= charset_find_value (charset
, "form-feed", 9);
1009 if (value
== ILLEGAL_CHAR_VALUE
)
1011 character `%s' not defined while needed as default value"),
1014 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1016 value
= charset_find_value (charset
, "newline", 7);
1017 if (value
== ILLEGAL_CHAR_VALUE
)
1019 character `%s' not defined while needed as default value"),
1022 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1024 value
= charset_find_value (charset
, "carriage-return", 15);
1025 if (value
== ILLEGAL_CHAR_VALUE
)
1027 character `%s' not defined while needed as default value"),
1028 "<carriage-return>");
1030 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1032 value
= charset_find_value (charset
, "tab", 3);
1033 if (value
== ILLEGAL_CHAR_VALUE
)
1035 character `%s' not defined while needed as default value"),
1038 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1040 value
= charset_find_value (charset
, "vertical-tab", 12);
1041 if (value
== ILLEGAL_CHAR_VALUE
)
1043 character `%s' not defined while needed as default value"),
1046 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1049 if ((ctype
->class_done
& BIT (tok_xdigit
)) == 0)
1050 /* "If this keyword [xdigit] is not specified, the digits `0' to `9', the
1051 uppercase letters `A' through `F', and the lowercase letters `a'
1052 through `f', ..., shall automatically belong to this class, with
1053 implementation defined character values." [P1003.2, 2.5.2.1] */
1055 set_default (BIT (tok_xdigit
), '0', '9');
1056 set_default (BIT (tok_xdigit
), 'A', 'F');
1057 set_default (BIT (tok_xdigit
), 'a', 'f');
1060 if ((ctype
->class_done
& BIT (tok_blank
)) == 0)
1061 /* "If this keyword [blank] is unspecified, the characters <space> and
1062 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
1066 value
= charset_find_value (charset
, "space", 5);
1067 if (value
== ILLEGAL_CHAR_VALUE
)
1069 character `%s' not defined while needed as default value"),
1072 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
1074 value
= charset_find_value (charset
, "tab", 3);
1075 if (value
== ILLEGAL_CHAR_VALUE
)
1077 character `%s' not defined while needed as default value"),
1080 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
1083 if ((ctype
->class_done
& BIT (tok_graph
)) == 0)
1084 /* "If this keyword [graph] is not specified, characters specified for
1085 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
1086 shall belong to this character class." [P1003.2, 2.5.2.1] */
1088 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
1089 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
1092 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
1093 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
1094 ctype
->class_collection
[cnt
] |= BIT (tok_graph
);
1097 if ((ctype
->class_done
& BIT (tok_print
)) == 0)
1098 /* "If this keyword [print] is not provided, characters specified for
1099 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
1100 and the <space> character shall belong to this character class."
1101 [P1003.2, 2.5.2.1] */
1103 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
1104 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
1108 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
1109 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
1110 ctype
->class_collection
[cnt
] |= BIT (tok_print
);
1112 space
= charset_find_value (charset
, "space", 5);
1113 if (space
== ILLEGAL_CHAR_VALUE
)
1115 character `%s' not defined while needed as default value"),
1118 ELEM (ctype
, class_collection
, , space
) |= BIT (tok_print
);
1121 if (ctype
->toupper_done
== 0)
1122 /* "If this keyword [toupper] is not specified, the lowercase letters
1123 `a' through `z', and their corresponding uppercase letters `A' to
1124 `Z', ..., shall automatically be included, with implementation-
1125 defined character values." [P1003.2, 2.5.2.1] */
1130 strcpy (tmp
, "<?>");
1132 for (ch
= 'a'; ch
<= 'z'; ++ch
)
1134 unsigned int value_from
, value_to
;
1138 value_from
= charset_find_value (charset
, &tmp
[1], 1);
1139 if (value_from
== ILLEGAL_CHAR_VALUE
)
1142 character `%c' not defined while needed as default value"),
1147 /* This conversion is implementation defined. */
1148 tmp
[1] = (char) (ch
+ ('A' - 'a'));
1149 value_to
= charset_find_value (charset
, &tmp
[1], 1);
1153 character `%s' not defined while needed as default value"),
1158 /* The index [0] is determined by the order of the
1159 `ctype_map_newP' calls in `ctype_startup'. */
1160 ELEM (ctype
, map_collection
, [0], value_from
) = value_to
;
1164 if (ctype
->tolower_done
== 0)
1165 /* "If this keyword [tolower] is not specified, the mapping shall be
1166 the reverse mapping of the one specified to `toupper'." [P1003.2] */
1170 for (cnt
= 0; cnt
< ctype
->map_collection_act
[0]; ++cnt
)
1171 if (ctype
->map_collection
[0][cnt
] != 0)
1172 ELEM (ctype
, map_collection
, [1],
1173 ctype
->map_collection
[0][cnt
])
1174 = ctype
->charnames
[cnt
];
1180 allocate_arrays (struct locale_ctype_t
*ctype
, struct charset_t
*charset
)
1184 /* First we have to decide how we organize the arrays. It is easy for
1185 a one-byte character set. But multi-byte character sets cannot be
1186 stored flat because they might be sparsely used. So we determine an
1187 optimal hashing function for the used characters.
1189 We use a very trivial hashing function to store the sparse table.
1190 CH % TABSIZE is used as an index. To solve multiple hits we have
1191 N planes. This guarantees a fixed search time for a character [N
1192 / 2]. In the following code we determine the minimum value for
1193 TABSIZE * N, where TABSIZE >= 256. */
1194 size_t min_total
= UINT_MAX
;
1195 size_t act_size
= 256;
1198 Computing table size for character classes might take a while..."),
1201 while (act_size
< min_total
)
1203 size_t cnt
[act_size
];
1204 size_t act_planes
= 1;
1206 memset (cnt
, '\0', sizeof cnt
);
1208 for (idx
= 0; idx
< 256; ++idx
)
1211 for (idx
= 0; idx
< ctype
->charnames_act
; ++idx
)
1212 if (ctype
->charnames
[idx
] >= 256)
1214 size_t nr
= ctype
->charnames
[idx
] % act_size
;
1216 if (++cnt
[nr
] > act_planes
)
1218 act_planes
= cnt
[nr
];
1219 if (act_size
* act_planes
>= min_total
)
1224 if (act_size
* act_planes
< min_total
)
1226 min_total
= act_size
* act_planes
;
1227 ctype
->plane_size
= act_size
;
1228 ctype
->plane_cnt
= act_planes
;
1234 fprintf (stderr
, _(" done\n"));
1237 #if __BYTE_ORDER == __LITTLE_ENDIAN
1238 # define NAMES_B1 ctype->names_el
1239 # define NAMES_B2 ctype->names_eb
1241 # define NAMES_B1 ctype->names_eb
1242 # define NAMES_B2 ctype->names_el
1245 ctype
->names_eb
= (u_int32_t
*) xcalloc (ctype
->plane_size
1247 sizeof (u_int32_t
));
1248 ctype
->names_el
= (u_int32_t
*) xcalloc (ctype
->plane_size
1250 sizeof (u_int32_t
));
1252 for (idx
= 1; idx
< 256; ++idx
)
1253 NAMES_B1
[idx
] = idx
;
1255 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
1258 for (idx
= 256; idx
< ctype
->charnames_act
; ++idx
)
1260 size_t nr
= (ctype
->charnames
[idx
] % ctype
->plane_size
);
1263 while (NAMES_B1
[nr
+ depth
* ctype
->plane_size
])
1265 assert (depth
< ctype
->plane_cnt
);
1267 NAMES_B1
[nr
+ depth
* ctype
->plane_size
] = ctype
->charnames
[idx
];
1269 /* Now for faster access remember the index in the NAMES_B array. */
1270 ctype
->charnames
[idx
] = nr
+ depth
* ctype
->plane_size
;
1274 for (idx
= 0; idx
< ctype
->plane_size
* ctype
->plane_cnt
; ++idx
)
1275 NAMES_B2
[idx
] = SWAPU32 (NAMES_B1
[idx
]);
1278 /* You wonder about this amount of memory? This is only because some
1279 users do not manage to address the array with unsigned values or
1280 data types with range >= 256. '\200' would result in the array
1281 index -128. To help these poor people we duplicate the entries for
1282 128 up to 255 below the entry for \0. */
1283 ctype
->ctype_b
= (char_class_t
*) xcalloc (256 + 128,
1284 sizeof (char_class_t
));
1285 ctype
->ctype32_b
= (char_class32_t
*) xcalloc (ctype
->plane_size
1287 sizeof (char_class32_t
));
1289 /* Fill in the character class information. */
1290 #if __BYTE_ORDER == __LITTLE_ENDIAN
1291 # define TRANS(w) CHAR_CLASS_TRANS (w)
1292 # define TRANS32(w) CHAR_CLASS32_TRANS (w)
1294 # define TRANS(w) (w)
1295 # define TRANS32(w) (w)
1298 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
1299 if (ctype
->charnames
[idx
] < 256)
1300 ctype
->ctype_b
[128 + ctype
->charnames
[idx
]]
1301 = TRANS (ctype
->class_collection
[idx
]);
1303 /* Mirror first 127 entries. We must take care that entry -1 is not
1304 mirrored because EOF == -1. */
1305 for (idx
= 0; idx
< 127; ++idx
)
1306 ctype
->ctype_b
[idx
] = ctype
->ctype_b
[256 + idx
];
1308 /* The 32 bit array contains all characters. */
1309 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
1310 ctype
->ctype32_b
[ctype
->charnames
[idx
]]
1311 = TRANS32 (ctype
->class_collection
[idx
]);
1313 /* Room for table of mappings. */
1314 ctype
->map_eb
= (u_int32_t
**) xmalloc (ctype
->map_collection_nr
1315 * sizeof (u_int32_t
*));
1316 ctype
->map_el
= (u_int32_t
**) xmalloc (ctype
->map_collection_nr
1317 * sizeof (u_int32_t
*));
1319 /* Fill in all mappings. */
1320 for (idx
= 0; idx
< ctype
->map_collection_nr
; ++idx
)
1324 /* Allocate table. */
1325 ctype
->map_eb
[idx
] = (u_int32_t
*) xmalloc ((ctype
->plane_size
1326 * ctype
->plane_cnt
+ 128)
1327 * sizeof (u_int32_t
));
1328 ctype
->map_el
[idx
] = (u_int32_t
*) xmalloc ((ctype
->plane_size
1329 * ctype
->plane_cnt
+ 128)
1330 * sizeof (u_int32_t
));
1332 #if __BYTE_ORDER == __LITTLE_ENDIAN
1333 # define MAP_B1 ctype->map_el
1334 # define MAP_B2 ctype->map_eb
1336 # define MAP_B1 ctype->map_eb
1337 # define MAP_B2 ctype->map_el
1340 /* Copy default value (identity mapping). */
1341 memcpy (&MAP_B1
[idx
][128], NAMES_B1
,
1342 ctype
->plane_size
* ctype
->plane_cnt
* sizeof (u_int32_t
));
1344 /* Copy values from collection. */
1345 for (idx2
= 0; idx2
< ctype
->map_collection_act
[idx
]; ++idx2
)
1346 if (ctype
->map_collection
[idx
][idx2
] != 0)
1347 MAP_B1
[idx
][128 + ctype
->charnames
[idx2
]] =
1348 ctype
->map_collection
[idx
][idx2
];
1350 /* Mirror first 127 entries. We must take care not to map entry
1351 -1 because EOF == -1. */
1352 for (idx2
= 0; idx2
< 127; ++idx2
)
1353 MAP_B1
[idx
][idx2
] = MAP_B1
[idx
][256 + idx2
];
1355 /* EOF must map to EOF. */
1356 MAP_B1
[idx
][127] = EOF
;
1358 /* And now the other byte order. */
1359 for (idx2
= 0; idx2
< ctype
->plane_size
* ctype
->plane_cnt
+ 128; ++idx2
)
1360 MAP_B2
[idx
][idx2
] = SWAPU32 (MAP_B1
[idx
][idx2
]);
1363 /* Extra array for class and map names. */
1364 ctype
->class_name_ptr
= (u_int32_t
*) xmalloc (ctype
->nr_charclass
1365 * sizeof (u_int32_t
));
1366 ctype
->map_name_ptr
= (u_int32_t
*) xmalloc (ctype
->map_collection_nr
1367 * sizeof (u_int32_t
));
1369 /* Array for width information. Because the expected width are very
1370 small we use only one single byte. This save space and we need
1371 not provide the information twice with both endianesses. */
1372 ctype
->width
= (unsigned char *) xmalloc (ctype
->plane_size
1373 * ctype
->plane_cnt
);
1374 /* Initialize with default width value. */
1375 memset (ctype
->width
, charset
->width_default
,
1376 ctype
->plane_size
* ctype
->plane_cnt
);
1377 if (charset
->width_rules
!= NULL
)
1381 for (cnt
= 0; cnt
< charset
->nwidth_rules
; ++cnt
)
1382 if (charset
->width_rules
[cnt
].width
!= charset
->width_default
)
1383 for (idx
= charset
->width_rules
[cnt
].from
;
1384 idx
<= charset
->width_rules
[cnt
].to
; ++idx
)
1386 size_t nr
= idx
% ctype
->plane_size
;
1389 while (NAMES_B1
[nr
+ depth
* ctype
->plane_size
] != nr
)
1391 assert (depth
< ctype
->plane_cnt
);
1393 ctype
->width
[nr
+ depth
* ctype
->plane_size
]
1394 = charset
->width_rules
[cnt
].width
;
1398 /* Compute MB_CUR_MAX. Please note the value mb_cur_max in the
1399 character set definition gives the number of bytes in the wide
1400 character representation. We compute the number of bytes used
1401 for the UTF-8 encoded form. */
1402 ctype
->mb_cur_max
= ((int []) { 2, 3, 5, 6 }) [charset
->mb_cur_max
- 1];
1404 /* We need the name of the currently used 8-bit character set to
1405 make correct conversion between this 8-bit representation and the
1406 ISO 10646 character set used internally for wide characters. */
1407 ctype
->codeset_name
= charset
->code_set_name
;