1 /* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If
17 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
29 #include "localeinfo.h"
31 #include "locfile-token.h"
32 #include "stringtrans.h"
34 /* Uncomment the following line in the production version. */
39 void *xmalloc (size_t __n
);
40 void *xcalloc (size_t __n
, size_t __s
);
41 void *xrealloc (void *__ptr
, size_t __n
);
44 /* The bit used for representing a special class. */
45 #define BITPOS(class) ((class) - tok_upper)
46 #define BIT(class) (1 << BITPOS (class))
48 #define ELEM(ctype, collection, idx, value) \
49 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
50 &ctype->collection##_act idx, value)
53 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
56 ((((w) >> 8) & 0xff) | (((w) & 0xff) << 8))
59 /* To be compatible with former implementations we for now restrict
60 the number of bits for character classes to 16. When compatibility
61 is not necessary anymore increase the number to 32. */
62 #define char_class_t u_int16_t
63 #define CHAR_CLASS_TRANS SWAPU16
64 #define char_class32_t u_int32_t
65 #define CHAR_CLASS32_TRANS SWAPU32
68 /* The real definition of the struct for the LC_CTYPE locale. */
71 unsigned int *charnames
;
75 /* We will allow up to 8 * sizeof(u_int32_t) - 1 character classes. */
76 #define MAX_NR_CHARCLASS (8 * sizeof (u_int32_t) - 1)
78 const char *classnames
[MAX_NR_CHARCLASS
];
79 unsigned long int current_class_mask
;
80 unsigned int last_class_char
;
81 u_int32_t
*class_collection
;
82 size_t class_collection_max
;
83 size_t class_collection_act
;
84 unsigned long int class_done
;
86 /* If the following number ever turns out to be too small simply
87 increase it. But I doubt it will. --drepper@gnu */
88 #define MAX_NR_CHARMAP 16
89 const char *mapnames
[MAX_NR_CHARMAP
];
90 u_int32_t
*map_collection
[MAX_NR_CHARMAP
];
91 u_int32_t map_collection_max
[MAX_NR_CHARMAP
];
92 u_int32_t map_collection_act
[MAX_NR_CHARMAP
];
93 size_t map_collection_nr
;
95 unsigned int from_map_char
;
99 /* The arrays for the binary representation. */
100 u_int32_t plane_size
;
102 char_class_t
*ctype_b
;
103 char_class32_t
*ctype32_b
;
108 u_int32_t
*class_name_ptr
;
109 u_int32_t
*map_name_ptr
;
110 unsigned char *width
;
111 u_int32_t mb_cur_max
;
112 const char *codeset_name
;
116 /* Prototypes for local functions. */
117 static void ctype_class_newP (struct linereader
*lr
,
118 struct locale_ctype_t
*ctype
, const char *name
);
119 static void ctype_map_newP (struct linereader
*lr
,
120 struct locale_ctype_t
*ctype
,
121 const char *name
, struct charset_t
*charset
);
122 static u_int32_t
*find_idx (struct locale_ctype_t
*ctype
, u_int32_t
**table
,
123 size_t *max
, size_t *act
, unsigned int idx
);
124 static void set_class_defaults (struct locale_ctype_t
*ctype
,
125 struct charset_t
*charset
);
126 static void allocate_arrays (struct locale_ctype_t
*ctype
,
127 struct charset_t
*charset
);
131 ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
132 struct charset_t
*charset
)
135 struct locale_ctype_t
*ctype
;
137 /* It is important that we always use UCS1 encoding for strings now. */
138 encoding_method
= ENC_UCS1
;
140 /* Allocate the needed room. */
141 locale
->categories
[LC_CTYPE
].ctype
= ctype
=
142 (struct locale_ctype_t
*) xmalloc (sizeof (struct locale_ctype_t
));
144 /* We have no names seen yet. */
145 ctype
->charnames_max
= charset
->mb_cur_max
== 1 ? 256 : 512;
147 (unsigned int *) xmalloc (ctype
->charnames_max
* sizeof (unsigned int));
148 for (cnt
= 0; cnt
< 256; ++cnt
)
149 ctype
->charnames
[cnt
] = cnt
;
150 ctype
->charnames_act
= 256;
152 /* Fill character class information. */
153 ctype
->nr_charclass
= 0;
154 ctype
->current_class_mask
= 0;
155 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
156 /* The order of the following instructions determines the bit positions! */
158 ctype_class_newP (lr
, ctype
, "upper");
159 ctype_class_newP (lr
, ctype
, "lower");
160 ctype_class_newP (lr
, ctype
, "alpha");
161 ctype_class_newP (lr
, ctype
, "digit");
162 ctype_class_newP (lr
, ctype
, "xdigit");
163 ctype_class_newP (lr
, ctype
, "space");
164 ctype_class_newP (lr
, ctype
, "print");
165 ctype_class_newP (lr
, ctype
, "graph");
166 ctype_class_newP (lr
, ctype
, "blank");
167 ctype_class_newP (lr
, ctype
, "cntrl");
168 ctype_class_newP (lr
, ctype
, "punct");
169 ctype_class_newP (lr
, ctype
, "alnum");
171 ctype
->class_collection_max
= charset
->mb_cur_max
== 1 ? 256 : 512;
172 ctype
->class_collection
173 = (u_int32_t
*) xmalloc (sizeof (unsigned long int)
174 * ctype
->class_collection_max
);
175 memset (ctype
->class_collection
, '\0',
176 sizeof (unsigned long int) * ctype
->class_collection_max
);
177 ctype
->class_collection_act
= 256;
179 /* Fill character map information. */
180 ctype
->map_collection_nr
= 0;
181 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
182 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
183 ctype_map_newP (lr
, ctype
, "toupper", charset
);
184 ctype_map_newP (lr
, ctype
, "tolower", charset
);
186 /* Fill first 256 entries in `toupper' and `tolower' arrays. */
187 for (cnt
= 0; cnt
< 256; ++cnt
)
189 ctype
->map_collection
[0][cnt
] = cnt
;
190 ctype
->map_collection
[1][cnt
] = cnt
;
196 ctype_finish (struct localedef_t
*locale
, struct charset_t
*charset
)
198 /* See POSIX.2, table 2-6 for the meaning of the following table. */
203 const char allow
[NCLASS
];
205 valid_table
[NCLASS
] =
207 /* The order is important. See token.h for more information.
208 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
209 { "upper", "--MX-XDDXXX-" },
210 { "lower", "--MX-XDDXXX-" },
211 { "alpha", "---X-XDDXXX-" },
212 { "digit", "XXX--XDDXXX-" },
213 { "xdigit", "-----XDDXXX-" },
214 { "space", "XXXXX------X" },
215 { "print", "---------X--" },
216 { "graph", "---------X--" },
217 { "blank", "XXXXXM-----X" },
218 { "cntrl", "XXXXX-XX--XX" },
219 { "punct", "XXXXX-DD-X-X" },
220 { "alnum", "-----XDDXXX-" }
224 unsigned int space_value
;
225 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
227 /* Set default value for classes not specified. */
228 set_class_defaults (ctype
, charset
);
230 /* Check according to table. */
231 for (cnt
= 0; cnt
< ctype
->class_collection_max
; ++cnt
)
233 unsigned long int tmp
;
235 tmp
= ctype
->class_collection
[cnt
];
239 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
240 if ((tmp
& (1 << cls1
)) != 0)
241 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
242 if (valid_table
[cls1
].allow
[cls2
] != '-')
244 int eq
= (tmp
& (1 << cls2
)) != 0;
245 switch (valid_table
[cls1
].allow
[cls2
])
254 value
= ctype
->charnames
[cnt
];
256 if ((value
& 0xff000000) != 0)
257 cp
+= sprintf (cp
, "\\%o", (value
>> 24) & 0xff);
258 if ((value
& 0xffff0000) != 0)
259 cp
+= sprintf (cp
, "\\%o", (value
>> 16) & 0xff);
260 if ((value
& 0xffffff00) != 0)
261 cp
+= sprintf (cp
, "\\%o", (value
>> 8) & 0xff);
262 sprintf (cp
, "\\%o", value
& 0xff);
265 character %s'%s' in class `%s' must be in class `%s'"), value
> 256 ? "L" : "",
266 cp
, valid_table
[cls1
].name
,
267 valid_table
[cls2
].name
);
278 value
= ctype
->charnames
[cnt
];
280 if ((value
& 0xff000000) != 0)
281 cp
+= sprintf (cp
, "\\%o", value
>> 24);
282 if ((value
& 0xffff0000) != 0)
283 cp
+= sprintf (cp
, "\\%o", (value
>> 16) & 0xff);
284 if ((value
& 0xffffff00) != 0)
285 cp
+= sprintf (cp
, "\\%o", (value
>> 8) & 0xff);
286 sprintf (cp
, "\\%o", value
& 0xff);
289 character %s'%s' in class `%s' must not be in class `%s'"),
290 value
> 256 ? "L" : "", cp
,
291 valid_table
[cls1
].name
, valid_table
[cls2
].name
);
296 ctype
->class_collection
[cnt
] |= 1 << cls2
;
300 error (5, 0, _("internal error in %s, line %u"),
301 __FUNCTION__
, __LINE__
);
306 /* ... and now test <SP> as a special case. */
307 space_value
= charset_find_value (charset
, "SP", 2);
308 if (space_value
== ILLEGAL_CHAR_VALUE
)
309 error (0, 0, _("character <SP> not defined in character map"));
310 else if ((cnt
= BITPOS (tok_space
),
311 (ELEM (ctype
, class_collection
, , space_value
)
312 & BIT (tok_space
)) == 0)
313 || (cnt
= BITPOS (tok_blank
),
314 (ELEM (ctype
, class_collection
, , space_value
)
315 & BIT (tok_blank
)) == 0))
316 error (0, 0, _("<SP> character not in class `%s'"),
317 valid_table
[cnt
].name
);
318 else if ((cnt
= BITPOS (tok_punct
),
319 (ELEM (ctype
, class_collection
, , space_value
)
320 & BIT (tok_punct
)) != 0)
321 || (cnt
= BITPOS (tok_graph
),
322 (ELEM (ctype
, class_collection
, , space_value
)
325 error (0, 0, _("<SP> character must not be in class `%s'"),
326 valid_table
[cnt
].name
);
328 ELEM (ctype
, class_collection
, , space_value
) |= BIT (tok_print
);
330 /* Now that the tests are done make sure the name array contains all
331 characters which are handled in the WIDTH section of the
332 character set definition file. */
333 if (charset
->width_rules
!= NULL
)
334 for (cnt
= 0; cnt
< charset
->nwidth_rules
; ++cnt
)
337 for (inner
= charset
->width_rules
[cnt
].from
;
338 inner
<= charset
->width_rules
[cnt
].to
; ++inner
)
339 (void) find_idx (ctype
, NULL
, NULL
, NULL
, inner
);
345 ctype_output (struct localedef_t
*locale
, struct charset_t
*charset
,
346 const char *output_path
)
348 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
349 const size_t nelems
= (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)
350 + 2 * (ctype
->map_collection_nr
- 2));
351 struct iovec iov
[2 + nelems
+ ctype
->nr_charclass
352 + ctype
->map_collection_nr
];
353 struct locale_file data
;
354 u_int32_t idx
[nelems
];
355 size_t elem
, cnt
, offset
, total
;
358 if ((locale
->binary
& (1 << LC_CTYPE
)) != 0)
360 iov
[0].iov_base
= ctype
;
361 iov
[0].iov_len
= locale
->len
[LC_CTYPE
];
363 write_locale_data (output_path
, "LC_CTYPE", 1, iov
);
369 /* Now prepare the output: Find the sizes of the table we can use. */
370 allocate_arrays (ctype
, charset
);
372 data
.magic
= LIMAGIC (LC_CTYPE
);
374 iov
[0].iov_base
= (void *) &data
;
375 iov
[0].iov_len
= sizeof (data
);
377 iov
[1].iov_base
= (void *) idx
;
378 iov
[1].iov_len
= sizeof (idx
);
380 idx
[0] = iov
[0].iov_len
+ iov
[1].iov_len
;
383 for (elem
= 0; elem
< nelems
; ++elem
)
385 if (elem
< _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
))
388 #define CTYPE_DATA(name, base, len) \
389 case _NL_ITEM_INDEX (name): \
390 iov[2 + elem + offset].iov_base = base; \
391 iov[2 + elem + offset].iov_len = len; \
392 if (elem + 1 < nelems) \
393 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
396 CTYPE_DATA (_NL_CTYPE_CLASS
,
398 (256 + 128) * sizeof (char_class_t
));
400 CTYPE_DATA (_NL_CTYPE_TOUPPER_EB
,
402 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
403 * sizeof (u_int32_t
));
404 CTYPE_DATA (_NL_CTYPE_TOLOWER_EB
,
406 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
407 * sizeof (u_int32_t
));
409 CTYPE_DATA (_NL_CTYPE_TOUPPER_EL
,
411 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
412 * sizeof (u_int32_t
));
413 CTYPE_DATA (_NL_CTYPE_TOLOWER_EL
,
415 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
416 * sizeof (u_int32_t
));
418 CTYPE_DATA (_NL_CTYPE_CLASS32
,
420 (ctype
->plane_size
* ctype
->plane_cnt
421 * sizeof (char_class32_t
)));
423 CTYPE_DATA (_NL_CTYPE_NAMES_EB
,
424 ctype
->names_eb
, (ctype
->plane_size
* ctype
->plane_cnt
425 * sizeof (u_int32_t
)));
426 CTYPE_DATA (_NL_CTYPE_NAMES_EL
,
427 ctype
->names_el
, (ctype
->plane_size
* ctype
->plane_cnt
428 * sizeof (u_int32_t
)));
430 CTYPE_DATA (_NL_CTYPE_HASH_SIZE
,
431 &ctype
->plane_size
, sizeof (u_int32_t
));
432 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS
,
433 &ctype
->plane_cnt
, sizeof (u_int32_t
));
435 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES
):
436 /* The class name array. */
438 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
, ++offset
)
440 iov
[2 + elem
+ offset
].iov_base
441 = (void *) ctype
->classnames
[cnt
];
442 iov
[2 + elem
+ offset
].iov_len
443 = strlen (ctype
->classnames
[cnt
]) + 1;
444 total
+= iov
[2 + elem
+ offset
].iov_len
;
446 iov
[2 + elem
+ offset
].iov_base
= (void *) "";
447 iov
[2 + elem
+ offset
].iov_len
= 1;
450 if (elem
+ 1 < nelems
)
451 idx
[elem
+ 1] = idx
[elem
] + total
;
454 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES
):
455 /* The map name array. */
457 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
, ++offset
)
459 iov
[2 + elem
+ offset
].iov_base
460 = (void *) ctype
->mapnames
[cnt
];
461 iov
[2 + elem
+ offset
].iov_len
462 = strlen (ctype
->mapnames
[cnt
]) + 1;
463 total
+= iov
[2 + elem
+ offset
].iov_len
;
465 iov
[2 + elem
+ offset
].iov_base
= (void *) "";
466 iov
[2 + elem
+ offset
].iov_len
= 1;
469 if (elem
+ 1 < nelems
)
470 idx
[elem
+ 1] = idx
[elem
] + total
;
473 CTYPE_DATA (_NL_CTYPE_WIDTH
,
474 ctype
->width
, ctype
->plane_size
* ctype
->plane_cnt
);
476 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX
,
477 &ctype
->mb_cur_max
, sizeof (u_int32_t
));
479 CTYPE_DATA (_NL_CTYPE_CODESET_NAME
,
480 ctype
->codeset_name
, strlen (ctype
->codeset_name
) + 1);
483 assert (! "unknown CTYPE element");
487 /* Handle extra maps. */
488 size_t nr
= (elem
- _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)) >> 1;
490 if (((elem
- _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)) & 1) == 0)
491 iov
[2 + elem
+ offset
].iov_base
= ctype
->map_eb
[nr
];
493 iov
[2 + elem
+ offset
].iov_base
= ctype
->map_el
[nr
];
495 iov
[2 + elem
+ offset
].iov_len
= ((ctype
->plane_size
496 * ctype
->plane_cnt
+ 128)
497 * sizeof (u_int32_t
));
499 if (elem
+ 1 < nelems
)
500 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
+ offset
].iov_len
;
504 assert (2 + elem
+ offset
== (nelems
+ ctype
->nr_charclass
505 + ctype
->map_collection_nr
+ 2));
507 write_locale_data (output_path
, "LC_CTYPE", 2 + elem
+ offset
, iov
);
511 /* Character class handling. */
513 ctype_class_new (struct linereader
*lr
, struct localedef_t
*locale
,
514 enum token_t tok
, struct token
*code
,
515 struct charset_t
*charset
)
517 ctype_class_newP (lr
, locale
->categories
[LC_CTYPE
].ctype
,
518 code
->val
.str
.start
);
523 ctype_is_charclass (struct linereader
*lr
, struct localedef_t
*locale
,
528 for (cnt
= 0; cnt
< locale
->categories
[LC_CTYPE
].ctype
->nr_charclass
; ++cnt
)
529 if (strcmp (name
, locale
->categories
[LC_CTYPE
].ctype
->classnames
[cnt
])
538 ctype_class_start (struct linereader
*lr
, struct localedef_t
*locale
,
539 enum token_t tok
, const char *str
,
540 struct charset_t
*charset
)
542 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
586 assert (! "illegal token as class name: should not happen");
589 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
590 if (strcmp (str
, ctype
->classnames
[cnt
]) == 0)
593 if (cnt
>= ctype
->nr_charclass
)
594 assert (! "unknown class in class definition: should not happen");
596 ctype
->class_done
|= BIT (tok
);
598 ctype
->current_class_mask
= 1 << cnt
;
599 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
604 ctype_class_from (struct linereader
*lr
, struct localedef_t
*locale
,
605 struct token
*code
, struct charset_t
*charset
)
607 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
610 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
612 ctype
->last_class_char
= value
;
614 if (value
== ILLEGAL_CHAR_VALUE
)
615 /* In the LC_CTYPE category it is no error when a character is
616 not found. This has to be ignored silently. */
619 *find_idx (ctype
, &ctype
->class_collection
, &ctype
->class_collection_max
,
620 &ctype
->class_collection_act
, value
)
621 |= ctype
->current_class_mask
;
626 ctype_class_to (struct linereader
*lr
, struct localedef_t
*locale
,
627 struct token
*code
, struct charset_t
*charset
)
629 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
630 unsigned int value
, cnt
;
632 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
634 assert (value
>= ctype
->last_class_char
);
636 for (cnt
= ctype
->last_class_char
+ 1; cnt
<= value
; ++cnt
)
637 *find_idx (ctype
, &ctype
->class_collection
, &ctype
->class_collection_max
,
638 &ctype
->class_collection_act
, cnt
)
639 |= ctype
->current_class_mask
;
641 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
646 ctype_class_end (struct linereader
*lr
, struct localedef_t
*locale
)
648 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
650 /* We have no special actions to perform here. */
651 ctype
->current_class_mask
= 0;
652 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
656 /* Character map handling. */
658 ctype_map_new (struct linereader
*lr
, struct localedef_t
*locale
,
659 enum token_t tok
, struct token
*code
,
660 struct charset_t
*charset
)
662 ctype_map_newP (lr
, locale
->categories
[LC_CTYPE
].ctype
,
663 code
->val
.str
.start
, charset
);
668 ctype_is_charconv (struct linereader
*lr
, struct localedef_t
*locale
,
671 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
674 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
675 if (strcmp (name
, ctype
->mapnames
[cnt
]) == 0)
683 ctype_map_start (struct linereader
*lr
, struct localedef_t
*locale
,
684 enum token_t tok
, const char *name
, struct charset_t
*charset
)
686 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
692 ctype
->toupper_done
= 1;
696 ctype
->tolower_done
= 1;
702 assert (! "unknown token in category `LC_CTYPE' should not happen");
705 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
706 if (strcmp (name
, ctype
->mapnames
[cnt
]) == 0)
709 if (cnt
== ctype
->map_collection_nr
)
710 assert (! "unknown token in category `LC_CTYPE' should not happen");
712 ctype
->last_map_idx
= cnt
;
713 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
718 ctype_map_from (struct linereader
*lr
, struct localedef_t
*locale
,
719 struct token
*code
, struct charset_t
*charset
)
721 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
724 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
726 if (value
== ILLEGAL_CHAR_VALUE
)
727 /* In the LC_CTYPE category it is no error when a character is
728 not found. This has to be ignored silently. */
731 assert (ctype
->last_map_idx
< ctype
->map_collection_nr
);
733 ctype
->from_map_char
= value
;
738 ctype_map_to (struct linereader
*lr
, struct localedef_t
*locale
,
739 struct token
*code
, struct charset_t
*charset
)
741 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
744 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
746 if (ctype
->from_map_char
== ILLEGAL_CHAR_VALUE
747 || value
== ILLEGAL_CHAR_VALUE
)
749 /* In the LC_CTYPE category it is no error when a character is
750 not found. This has to be ignored silently. */
751 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
755 *find_idx (ctype
, &ctype
->map_collection
[ctype
->last_map_idx
],
756 &ctype
->map_collection_max
[ctype
->last_map_idx
],
757 &ctype
->map_collection_act
[ctype
->last_map_idx
],
758 ctype
->from_map_char
) = value
;
760 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
765 ctype_map_end (struct linereader
*lr
, struct localedef_t
*locale
)
767 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
769 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
770 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
774 /* Local functions. */
776 ctype_class_newP (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
781 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
782 if (strcmp (ctype
->classnames
[cnt
], name
) == 0)
785 if (cnt
< ctype
->nr_charclass
)
787 lr_error (lr
, _("character class `%s' already defined"));
791 if (ctype
->nr_charclass
== MAX_NR_CHARCLASS
)
792 /* Exit code 2 is prescribed in P1003.2b. */
794 implementation limit: no more than %d character classes allowed"),
797 ctype
->classnames
[ctype
->nr_charclass
++] = name
;
802 ctype_map_newP (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
803 const char *name
, struct charset_t
*charset
)
805 size_t max_chars
= 0;
808 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
810 if (strcmp (ctype
->mapnames
[cnt
], name
) == 0)
813 if (max_chars
< ctype
->map_collection_max
[cnt
])
814 max_chars
= ctype
->map_collection_max
[cnt
];
817 if (cnt
< ctype
->map_collection_nr
)
819 lr_error (lr
, _("character map `%s' already defined"));
823 if (ctype
->map_collection_nr
== MAX_NR_CHARMAP
)
824 /* Exit code 2 is prescribed in P1003.2b. */
826 implementation limit: no more than %d character maps allowed"),
829 ctype
->mapnames
[cnt
] = name
;
832 ctype
->map_collection_max
[cnt
] = charset
->mb_cur_max
== 1 ? 256 : 512;
834 ctype
->map_collection_max
[cnt
] = max_chars
;
836 ctype
->map_collection
[cnt
] = (u_int32_t
*)
837 xmalloc (sizeof (u_int32_t
) * ctype
->map_collection_max
[cnt
]);
838 memset (ctype
->map_collection
[cnt
], '\0',
839 sizeof (u_int32_t
) * ctype
->map_collection_max
[cnt
]);
840 ctype
->map_collection_act
[cnt
] = 256;
842 ++ctype
->map_collection_nr
;
846 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
847 is possible if we only want to extend the name array. */
849 find_idx (struct locale_ctype_t
*ctype
, u_int32_t
**table
, size_t *max
,
850 size_t *act
, unsigned int idx
)
855 return table
== NULL
? NULL
: &(*table
)[idx
];
857 for (cnt
= 256; cnt
< ctype
->charnames_act
; ++cnt
)
858 if (ctype
->charnames
[cnt
] == idx
)
861 /* We have to distinguish two cases: the name is found or not. */
862 if (cnt
== ctype
->charnames_act
)
864 /* Extend the name array. */
865 if (ctype
->charnames_act
== ctype
->charnames_max
)
867 ctype
->charnames_max
*= 2;
868 ctype
->charnames
= (unsigned int *)
869 xrealloc (ctype
->charnames
,
870 sizeof (unsigned int) * ctype
->charnames_max
);
872 ctype
->charnames
[ctype
->charnames_act
++] = idx
;
876 /* We have done everything we are asked to do. */
883 size_t old_max
= *max
;
889 (u_int32_t
*) xrealloc (*table
, *max
* sizeof (unsigned long int));
890 memset (&(*table
)[old_max
], '\0',
891 (*max
- old_max
) * sizeof (u_int32_t
));
898 return &(*table
)[cnt
];
903 set_class_defaults (struct locale_ctype_t
*ctype
, struct charset_t
*charset
)
905 /* This function defines the default values for the classes and conversions
906 according to POSIX.2 2.5.2.1.
907 It may seem that the order of these if-blocks is arbitrary but it is NOT.
908 Don't move them unless you know what you do! */
910 void set_default (int bit
, int from
, int to
)
917 for (ch
= from
; ch
<= to
; ++ch
)
922 value
= charset_find_value (charset
, tmp
, 1);
923 if (value
== ILLEGAL_CHAR_VALUE
)
926 character `%s' not defined while needed as default value"),
931 ELEM (ctype
, class_collection
, , value
) |= bit
;
935 /* Set default values if keyword was not present. */
936 if ((ctype
->class_done
& BIT (tok_upper
)) == 0)
937 /* "If this keyword [upper] is not specified, the uppercase letters
938 `A' through `Z', ..., shall automatically belong to this class,
939 with implementation defined character values." [P1003.2, 2.5.2.1] */
940 set_default (BIT (tok_upper
), 'A', 'Z');
942 if ((ctype
->class_done
& BIT (tok_lower
)) == 0)
943 /* "If this keyword [lower] is not specified, the lowercase letters
944 `a' through `z', ..., shall automatically belong to this class,
945 with implementation defined character values." [P1003.2, 2.5.2.1] */
946 set_default (BIT (tok_lower
), 'a', 'z');
948 if ((ctype
->class_done
& BIT (tok_alpha
)) == 0)
950 /* Table 2-6 in P1003.2 says that characters in class `upper' or
951 class `lower' *must* be in class `alpha'. */
952 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
);
955 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
956 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
957 ctype
->class_collection
[cnt
] |= BIT (tok_alpha
);
960 if ((ctype
->class_done
& BIT (tok_digit
)) == 0)
961 /* "If this keyword [digit] is not specified, the digits `0' through
962 `9', ..., shall automatically belong to this class, with
963 implementation-defined character values." [P1003.2, 2.5.2.1] */
964 set_default (BIT (tok_digit
), '0', '9');
966 /* "Only characters specified for the `alpha' and `digit' keyword
967 shall be specified. Characters specified for the keyword `alpha'
968 and `digit' are automatically included in this class." */
970 unsigned long int mask
= BIT (tok_alpha
) | BIT (tok_digit
);
973 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
974 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
975 ctype
->class_collection
[cnt
] |= BIT (tok_alnum
);
978 if ((ctype
->class_done
& BIT (tok_space
)) == 0)
979 /* "If this keyword [space] is not specified, the characters <space>,
980 <form-feed>, <newline>, <carriage-return>, <tab>, and
981 <vertical-tab>, ..., shall automatically belong to this class,
982 with implementation-defined character values." [P1003.2, 2.5.2.1] */
986 value
= charset_find_value (charset
, "space", 5);
987 if (value
== ILLEGAL_CHAR_VALUE
)
989 character `%s' not defined while needed as default value"),
992 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
994 value
= charset_find_value (charset
, "form-feed", 9);
995 if (value
== ILLEGAL_CHAR_VALUE
)
997 character `%s' not defined while needed as default value"),
1000 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1002 value
= charset_find_value (charset
, "newline", 7);
1003 if (value
== ILLEGAL_CHAR_VALUE
)
1005 character `%s' not defined while needed as default value"),
1008 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1010 value
= charset_find_value (charset
, "carriage-return", 15);
1011 if (value
== ILLEGAL_CHAR_VALUE
)
1013 character `%s' not defined while needed as default value"),
1014 "<carriage-return>");
1016 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1018 value
= charset_find_value (charset
, "tab", 3);
1019 if (value
== ILLEGAL_CHAR_VALUE
)
1021 character `%s' not defined while needed as default value"),
1024 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1026 value
= charset_find_value (charset
, "vertical-tab", 12);
1027 if (value
== ILLEGAL_CHAR_VALUE
)
1029 character `%s' not defined while needed as default value"),
1032 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
1035 if ((ctype
->class_done
& BIT (tok_xdigit
)) == 0)
1036 /* "If this keyword is not specified, the digits `0' to `9', the
1037 uppercase letters `A' through `F', and the lowercase letters `a'
1038 through `f', ..., shall automatically belong to this class, with
1039 implementation defined character values." [P1003.2, 2.5.2.1] */
1041 set_default (BIT (tok_xdigit
), '0', '9');
1042 set_default (BIT (tok_xdigit
), 'A', 'F');
1043 set_default (BIT (tok_xdigit
), 'a', 'f');
1046 if ((ctype
->class_done
& BIT (tok_blank
)) == 0)
1047 /* "If this keyword [blank] is unspecified, the characters <space> and
1048 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
1052 value
= charset_find_value (charset
, "space", 5);
1053 if (value
== ILLEGAL_CHAR_VALUE
)
1055 character `%s' not defined while needed as default value"),
1058 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
1060 value
= charset_find_value (charset
, "tab", 3);
1061 if (value
== ILLEGAL_CHAR_VALUE
)
1063 character `%s' not defined while needed as default value"),
1066 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
1069 if ((ctype
->class_done
& BIT (tok_graph
)) == 0)
1070 /* "If this keyword [graph] is not specified, characters specified for
1071 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
1072 shall belong to this character class." [P1003.2, 2.5.2.1] */
1074 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
1075 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
1078 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
1079 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
1080 ctype
->class_collection
[cnt
] |= BIT (tok_graph
);
1083 if ((ctype
->class_done
& BIT (tok_print
)) == 0)
1084 /* "If this keyword [print] is not provided, characters specified for
1085 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
1086 and the <space> character shall belong to this character class."
1087 [P1003.2, 2.5.2.1] */
1089 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
1090 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
1094 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
1095 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
1096 ctype
->class_collection
[cnt
] |= BIT (tok_print
);
1098 space
= charset_find_value (charset
, "space", 5);
1099 if (space
== ILLEGAL_CHAR_VALUE
)
1101 character `%s' not defined while needed as default value"),
1104 ELEM (ctype
, class_collection
, , space
) |= BIT (tok_print
);
1107 if (ctype
->toupper_done
== 0)
1108 /* "If this keyword [toupper] is not specified, the lowercase letters
1109 `a' through `z', and their corresponding uppercase letters `A' to
1110 `Z', ..., shall automatically be included, with implementation-
1111 defined character values." [P1003.2, 2.5.2.1] */
1116 strcpy (tmp
, "<?>");
1118 for (ch
= 'a'; ch
<= 'z'; ++ch
)
1120 unsigned int value_from
, value_to
;
1124 value_from
= charset_find_value (charset
, &tmp
[1], 1);
1125 if (value_from
== ILLEGAL_CHAR_VALUE
)
1128 character `%c' not defined while needed as default value"),
1133 /* This conversion is implementation defined. */
1134 tmp
[1] = (char) (ch
+ ('A' - 'a'));
1135 value_to
= charset_find_value (charset
, &tmp
[1], 1);
1139 character `%s' not defined while needed as default value"),
1144 /* The index [0] is determined by the order of the
1145 `ctype_map_newP' calls in `ctype_startup'. */
1146 ELEM (ctype
, map_collection
, [0], value_from
) = value_to
;
1150 if (ctype
->tolower_done
== 0)
1151 /* "If this keyword [tolower] is not specified, the mapping shall be
1152 the reverse mapping of the one specified to `toupper'." [P1003.2] */
1156 for (cnt
= 0; cnt
< ctype
->map_collection_act
[0]; ++cnt
)
1157 if (ctype
->map_collection
[0][cnt
] != 0)
1158 ELEM (ctype
, map_collection
, [1],
1159 ctype
->map_collection
[0][cnt
])
1160 = ctype
->charnames
[cnt
];
1166 allocate_arrays (struct locale_ctype_t
*ctype
, struct charset_t
*charset
)
1170 /* First we have to decide how we organize the arrays. It is easy for
1171 a one-byte character set. But a multi-byte character set cannot be
1172 stored flat because it might be sparsely used. So we determine an
1173 optimal hashing function for the used characters.
1175 We use a very trivial hashing function to store the sparse table.
1176 CH % TABSIZE is used as an index. To solve multiple hits we have
1177 N planes. This guarantees a fixed search time for a character [N
1178 / 2]. In the following code we determine the minimum value for
1179 TABSIZE * N, where TABSIZE >= 256. */
1180 size_t min_total
= UINT_MAX
;
1181 size_t act_size
= 256;
1184 Computing table size for character classes might take a while..."),
1187 while (act_size
< min_total
)
1189 size_t cnt
[act_size
];
1190 size_t act_planes
= 1;
1192 memset (cnt
, '\0', sizeof cnt
);
1194 for (idx
= 0; idx
< 256; ++idx
)
1197 for (idx
= 0; idx
< ctype
->charnames_act
; ++idx
)
1198 if (ctype
->charnames
[idx
] >= 256)
1200 size_t nr
= ctype
->charnames
[idx
] % act_size
;
1202 if (++cnt
[nr
] > act_planes
)
1204 act_planes
= cnt
[nr
];
1205 if (act_size
* act_planes
>= min_total
)
1210 if (act_size
* act_planes
< min_total
)
1212 min_total
= act_size
* act_planes
;
1213 ctype
->plane_size
= act_size
;
1214 ctype
->plane_cnt
= act_planes
;
1220 fprintf (stderr
, _(" done\n"));
1223 #if __BYTE_ORDER == __LITTLE_ENDIAN
1224 # define NAMES_B1 ctype->names_el
1225 # define NAMES_B2 ctype->names_eb
1227 # define NAMES_B1 ctype->names_eb
1228 # define NAMES_B2 ctype->names_el
1231 ctype
->names_eb
= (u_int32_t
*) xcalloc (ctype
->plane_size
1233 sizeof (u_int32_t
));
1234 ctype
->names_el
= (u_int32_t
*) xcalloc (ctype
->plane_size
1236 sizeof (u_int32_t
));
1238 for (idx
= 1; idx
< 256; ++idx
)
1239 NAMES_B1
[idx
] = idx
;
1241 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
1244 for (idx
= 256; idx
< ctype
->charnames_act
; ++idx
)
1246 size_t nr
= (ctype
->charnames
[idx
] % ctype
->plane_size
);
1249 while (NAMES_B1
[nr
+ depth
* ctype
->plane_size
])
1251 assert (depth
< ctype
->plane_cnt
);
1253 NAMES_B1
[nr
+ depth
* ctype
->plane_size
] = ctype
->charnames
[idx
];
1255 /* Now for faster access remember the index in the NAMES_B array. */
1256 ctype
->charnames
[idx
] = nr
+ depth
* ctype
->plane_size
;
1260 for (idx
= 0; idx
< ctype
->plane_size
* ctype
->plane_cnt
; ++idx
)
1261 NAMES_B2
[idx
] = SWAPU32 (NAMES_B1
[idx
]);
1264 /* You wonder about this amount of memory? This is only because some
1265 users do not manage to address the array with unsigned values or
1266 data types with range >= 256. '\200' would result in the array
1267 index -128. To help these poor people we duplicate the entries for
1268 128 up to 255 below the entry for \0. */
1269 ctype
->ctype_b
= (char_class_t
*) xcalloc (256 + 128,
1270 sizeof (char_class_t
));
1271 ctype
->ctype32_b
= (char_class32_t
*) xcalloc (ctype
->plane_size
1273 sizeof (char_class32_t
));
1275 /* Fill in the character class information. */
1276 #if __BYTE_ORDER == __LITTLE_ENDIAN
1277 # define TRANS(w) CHAR_CLASS_TRANS (w)
1278 # define TRANS32(w) CHAR_CLASS32_TRANS (w)
1280 # define TRANS(w) (w)
1281 # define TRANS32(w) (w)
1284 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
1285 if (ctype
->charnames
[idx
] < 256)
1286 ctype
->ctype_b
[128 + ctype
->charnames
[idx
]]
1287 = TRANS (ctype
->class_collection
[idx
]);
1289 /* Mirror first 127 entries. We must take care that entry -1 is not
1290 mirrored because EOF == -1. */
1291 for (idx
= 0; idx
< 127; ++idx
)
1292 ctype
->ctype_b
[idx
] = ctype
->ctype_b
[256 + idx
];
1294 /* The 32 bit array contains all characters. */
1295 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
1296 ctype
->ctype32_b
[ctype
->charnames
[idx
]]
1297 = TRANS32 (ctype
->class_collection
[idx
]);
1299 /* Room for table of mappings. */
1300 ctype
->map_eb
= (u_int32_t
**) xmalloc (ctype
->map_collection_nr
1301 * sizeof (u_int32_t
*));
1302 ctype
->map_el
= (u_int32_t
**) xmalloc (ctype
->map_collection_nr
1303 * sizeof (u_int32_t
*));
1305 /* Fill in all mappings. */
1306 for (idx
= 0; idx
< ctype
->map_collection_nr
; ++idx
)
1310 /* Allocate table. */
1311 ctype
->map_eb
[idx
] = (u_int32_t
*) xmalloc ((ctype
->plane_size
1312 * ctype
->plane_cnt
+ 128)
1313 * sizeof (u_int32_t
));
1314 ctype
->map_el
[idx
] = (u_int32_t
*) xmalloc ((ctype
->plane_size
1315 * ctype
->plane_cnt
+ 128)
1316 * sizeof (u_int32_t
));
1318 #if __BYTE_ORDER == __LITTLE_ENDIAN
1319 # define MAP_B1 ctype->map_el
1320 # define MAP_B2 ctype->map_eb
1322 # define MAP_B1 ctype->map_eb
1323 # define MAP_B2 ctype->map_el
1326 /* Copy default value (identity mapping). */
1327 memcpy (&MAP_B1
[idx
][128], NAMES_B1
,
1328 ctype
->plane_size
* ctype
->plane_cnt
* sizeof (u_int32_t
));
1330 /* Copy values from collection. */
1331 for (idx2
= 0; idx2
< ctype
->map_collection_act
[idx
]; ++idx2
)
1332 if (ctype
->map_collection
[idx
][idx2
] != 0)
1333 MAP_B1
[idx
][128 + ctype
->charnames
[idx2
]] =
1334 ctype
->map_collection
[idx
][idx2
];
1336 /* Mirror first 127 entries. We must take care not to map entry
1337 -1 because EOF == -1. */
1338 for (idx2
= 0; idx2
< 127; ++idx2
)
1339 MAP_B1
[idx
][idx2
] = MAP_B1
[idx
][256 + idx2
];
1341 /* EOF must map to EOF. */
1342 MAP_B1
[idx
][127] = EOF
;
1344 /* And now the other byte order. */
1345 for (idx2
= 0; idx2
< ctype
->plane_size
* ctype
->plane_cnt
+ 128; ++idx2
)
1346 MAP_B2
[idx
][idx2
] = SWAPU32 (MAP_B1
[idx
][idx2
]);
1349 /* Extra array for class and map names. */
1350 ctype
->class_name_ptr
= (u_int32_t
*) xmalloc (ctype
->nr_charclass
1351 * sizeof (u_int32_t
));
1352 ctype
->map_name_ptr
= (u_int32_t
*) xmalloc (ctype
->map_collection_nr
1353 * sizeof (u_int32_t
));
1355 /* Array for width information. Because the expected widths are very
1356 small we use only a single byte. This saves space and we need
1357 not provide the information twice, once for each endianness. */
1358 ctype
->width
= (unsigned char *) xmalloc (ctype
->plane_size
1359 * ctype
->plane_cnt
);
1360 /* Initialize with default width value. */
1361 memset (ctype
->width
, charset
->width_default
,
1362 ctype
->plane_size
* ctype
->plane_cnt
);
1363 if (charset
->width_rules
!= NULL
)
1367 for (cnt
= 0; cnt
< charset
->nwidth_rules
; ++cnt
)
1368 if (charset
->width_rules
[cnt
].width
!= charset
->width_default
)
1369 for (idx
= charset
->width_rules
[cnt
].from
;
1370 idx
<= charset
->width_rules
[cnt
].to
; ++idx
)
1372 size_t nr
= idx
% ctype
->plane_size
;
1375 while (NAMES_B1
[nr
+ depth
* ctype
->plane_size
] != nr
)
1377 assert (depth
< ctype
->plane_cnt
);
1379 ctype
->width
[nr
+ depth
* ctype
->plane_size
]
1380 = charset
->width_rules
[cnt
].width
;
1384 /* Compute MB_CUR_MAX. Please note the value mb_cur_max in the
1385 character set definition gives the number of bytes in the wide
1386 character representation. We compute the number of bytes used
1387 for the UTF-8 encoded form. */
1388 ctype
->mb_cur_max
= ((int []) { 2, 3, 5, 6 }) [charset
->mb_cur_max
- 1];
1390 /* We need the name of the currently used 8-bit character set to
1391 make correct conversion between this 8-bit representation and the
1392 ISO 10646 character set used internally for wide characters. */
1393 ctype
->codeset_name
= charset
->code_set_name
;