]>
git.ipfire.org Git - thirdparty/glibc.git/blob - locale/programs/ld-ctype.c
1 /* Copyright (C) 1995, 1996 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper, <drepper@gnu.ai.mit.edu>.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If
17 not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
29 #include "localeinfo.h"
31 #include "locfile-token.h"
32 #include "stringtrans.h"
34 /* Uncomment the following line in the production version. */
39 void *xmalloc (size_t __n
);
40 void *xcalloc (size_t __n
, size_t __s
);
41 void *xrealloc (void *__ptr
, size_t __n
);
44 /* The bit used for representing a special class. */
45 #define BITPOS(class) ((class) - tok_upper)
46 #define BIT(class) (1 << BITPOS (class))
48 #define ELEM(ctype, collection, idx, value) \
49 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
50 &ctype->collection##_act idx, value)
53 (((w) << 24) | (((w) & 0xff00) << 8) | (((w) >> 8) & 0xff00) | ((w) >> 24))
56 ((((w) >> 8) & 0xff) | (((w) & 0xff) << 8))
59 /* To be compatible with former implementations we for now restrict
60 the number of bits for character classes to 16. When compatibility
61 is not necessary anymore increase the number to 32. */
62 #define char_class_t u16_t
63 #define CHAR_CLASS_TRANS SWAPU16
64 #define char_class32_t u32_t
65 #define CHAR_CLASS32_TRANS SWAPU32
68 /* The real definition of the struct for the LC_CTYPE locale. */
71 unsigned int *charnames
;
75 /* We will allow up to 8 * sizeof(u32_t) - 1 character classes. */
76 #define MAX_NR_CHARCLASS (8 * sizeof (u32_t) - 1)
78 const char *classnames
[MAX_NR_CHARCLASS
];
79 unsigned long int current_class_mask
;
80 unsigned int last_class_char
;
81 u32_t
*class_collection
;
82 size_t class_collection_max
;
83 size_t class_collection_act
;
84 unsigned long int class_done
;
86 /* If the following number ever turns out to be too small simply
87 increase it. But I doubt it will. --drepper@gnu */
88 #define MAX_NR_CHARMAP 16
89 const char *mapnames
[MAX_NR_CHARMAP
];
90 u32_t
*map_collection
[MAX_NR_CHARMAP
];
91 unsigned int map_collection_max
[MAX_NR_CHARMAP
];
92 unsigned int map_collection_act
[MAX_NR_CHARMAP
];
93 size_t map_collection_nr
;
95 unsigned int from_map_char
;
99 /* The arrays for the binary representation. */
102 char_class_t
*ctype_b
;
103 char_class32_t
*ctype32_b
;
108 u32_t
*class_name_ptr
;
113 /* Prototypes for local functions. */
114 static void ctype_class_newP (struct linereader
*lr
,
115 struct locale_ctype_t
*ctype
, const char *name
);
116 static void ctype_map_newP (struct linereader
*lr
,
117 struct locale_ctype_t
*ctype
,
118 const char *name
, struct charset_t
*charset
);
119 static u32_t
*find_idx (struct locale_ctype_t
*ctype
, u32_t
**table
,
120 size_t *max
, size_t *act
, unsigned int idx
);
121 static void set_class_defaults (struct locale_ctype_t
*ctype
,
122 struct charset_t
*charset
);
123 static void allocate_arrays (struct locale_ctype_t
*ctype
);
127 ctype_startup (struct linereader
*lr
, struct localedef_t
*locale
,
128 struct charset_t
*charset
)
131 struct locale_ctype_t
*ctype
;
133 /* It is important that we always use UCS1 encoding for strings now. */
134 encoding_method
= ENC_UCS1
;
136 /* Allocate the needed room. */
137 locale
->categories
[LC_CTYPE
].ctype
= ctype
=
138 (struct locale_ctype_t
*) xmalloc (sizeof (struct locale_ctype_t
));
140 /* We have no names seen yet. */
141 ctype
->charnames_max
= charset
->mb_cur_max
== 1 ? 256 : 512;
143 (unsigned int *) xmalloc (ctype
->charnames_max
* sizeof (unsigned int));
144 for (cnt
= 0; cnt
< 256; ++cnt
)
145 ctype
->charnames
[cnt
] = cnt
;
146 ctype
->charnames_act
= 256;
148 /* Fill character class information. */
149 ctype
->nr_charclass
= 0;
150 ctype
->current_class_mask
= 0;
151 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
152 /* The order of the following instructions determines the bit
154 ctype_class_newP (lr
, ctype
, "upper");
155 ctype_class_newP (lr
, ctype
, "lower");
156 ctype_class_newP (lr
, ctype
, "alpha");
157 ctype_class_newP (lr
, ctype
, "digit");
158 ctype_class_newP (lr
, ctype
, "xdigit");
159 ctype_class_newP (lr
, ctype
, "space");
160 ctype_class_newP (lr
, ctype
, "print");
161 ctype_class_newP (lr
, ctype
, "graph");
162 ctype_class_newP (lr
, ctype
, "blank");
163 ctype_class_newP (lr
, ctype
, "cntrl");
164 ctype_class_newP (lr
, ctype
, "punct");
165 ctype_class_newP (lr
, ctype
, "alnum");
167 ctype
->class_collection_max
= charset
->mb_cur_max
== 1 ? 256 : 512;
168 ctype
->class_collection
= (u32_t
*) xmalloc (sizeof (unsigned long int)
169 * ctype
->class_collection_max
);
170 memset (ctype
->class_collection
, '\0',
171 sizeof (unsigned long int) * ctype
->class_collection_max
);
172 ctype
->class_collection_act
= 256;
174 /* Fill character map information. */
175 ctype
->map_collection_nr
= 0;
176 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
177 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
178 ctype_map_newP (lr
, ctype
, "toupper", charset
);
179 ctype_map_newP (lr
, ctype
, "tolower", charset
);
181 /* Fill first 256 entries in `toupper' and `tolower' arrays. */
182 for (cnt
= 0; cnt
< 256; ++cnt
)
184 ctype
->map_collection
[0][cnt
] = cnt
;
185 ctype
->map_collection
[1][cnt
] = cnt
;
191 ctype_finish (struct localedef_t
*locale
, struct charset_t
*charset
)
193 /* See POSIX.2, table 2-6 for the meaning of the following table. */
198 const char allow
[NCLASS
];
200 valid_table
[NCLASS
] =
202 /* The order is important. See token.h for more information.
203 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
204 { "upper", "--MX-XDDXXX-" },
205 { "lower", "--MX-XDDXXX-" },
206 { "alpha", "---X-XDDXXX-" },
207 { "digit", "XXX--XDDXXX-" },
208 { "xdigit", "-----XDDXXX-" },
209 { "space", "XXXXX------X" },
210 { "print", "---------X--" },
211 { "graph", "---------X--" },
212 { "blank", "XXXXXM-----X" },
213 { "cntrl", "XXXXX-XX--XX" },
214 { "punct", "XXXXX-DD-X-X" },
215 { "alnum", "-----XDDXXX-" }
219 unsigned int space_value
;
220 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
222 /* Set default value for classes not specified. */
223 set_class_defaults (ctype
, charset
);
225 /* Check according to table. */
226 for (cnt
= 0; cnt
< ctype
->class_collection_max
; ++cnt
)
228 unsigned long int tmp
;
230 tmp
= ctype
->class_collection
[cnt
];
234 for (cls1
= 0; cls1
< NCLASS
; ++cls1
)
235 if ((tmp
& (1 << cls1
)) != 0)
236 for (cls2
= 0; cls2
< NCLASS
; ++cls2
)
237 if (valid_table
[cls1
].allow
[cls2
] != '-')
239 int eq
= (tmp
& (1 << cls2
)) != 0;
240 switch (valid_table
[cls1
].allow
[cls2
])
249 value
= ctype
->charnames
[cnt
];
251 if ((value
& 0xff000000) != 0)
252 cp
+= sprintf (cp
, "\\%o", (value
>> 24) & 0xff);
253 if ((value
& 0xffff0000) != 0)
254 cp
+= sprintf (cp
, "\\%o", (value
>> 16) & 0xff);
255 if ((value
& 0xffffff00) != 0)
256 cp
+= sprintf (cp
, "\\%o", (value
>> 8) & 0xff);
257 sprintf (cp
, "\\%o", value
& 0xff);
260 character %s'%s' in class `%s' must be in class `%s'"), value
> 256 ? "L" : "",
261 cp
, valid_table
[cls1
].name
,
262 valid_table
[cls2
].name
);
273 value
= ctype
->charnames
[cnt
];
275 if ((value
& 0xff000000) != 0)
276 cp
+= sprintf (cp
, "\\%o", value
>> 24);
277 if ((value
& 0xffff0000) != 0)
278 cp
+= sprintf (cp
, "\\%o", (value
>> 16) & 0xff);
279 if ((value
& 0xffffff00) != 0)
280 cp
+= sprintf (cp
, "\\%o", (value
>> 8) & 0xff);
281 sprintf (cp
, "\\%o", value
& 0xff);
284 character %s'%s' in class `%s' must not be in class `%s'"),
285 value
> 256 ? "L" : "", cp
,
286 valid_table
[cls1
].name
, valid_table
[cls2
].name
);
291 ctype
->class_collection
[cnt
] |= 1 << cls2
;
295 error (5, 0, _("internal error in %s, line %u"),
296 __FUNCTION__
, __LINE__
);
301 /* ... and now test <SP> as a special case. */
302 space_value
= charset_find_value (charset
, "SP", 2);
303 if (space_value
== ILLEGAL_CHAR_VALUE
)
304 error (0, 0, _("character <SP> not defined in character map"));
305 else if ((cnt
= BITPOS (tok_space
),
306 (ELEM (ctype
, class_collection
, , space_value
)
307 & BIT (tok_space
)) == 0)
308 || (cnt
= BITPOS (tok_blank
),
309 (ELEM (ctype
, class_collection
, , space_value
)
310 & BIT (tok_blank
)) == 0))
311 error (0, 0, _("<SP> character not in class `%s'"),
312 valid_table
[cnt
].name
);
313 else if ((cnt
= BITPOS (tok_punct
),
314 (ELEM (ctype
, class_collection
, , space_value
)
315 & BIT (tok_punct
)) != 0)
316 || (cnt
= BITPOS (tok_graph
),
317 (ELEM (ctype
, class_collection
, , space_value
)
320 error (0, 0, _("<SP> character must not be in class `%s'"),
321 valid_table
[cnt
].name
);
323 ELEM (ctype
, class_collection
, , space_value
) |= BIT (tok_print
);
328 ctype_output (struct localedef_t
*locale
, const char *output_path
)
330 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
331 const size_t nelems
= (_NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)
332 + 2 * (ctype
->map_collection_nr
- 2));
333 struct iovec iov
[2 + nelems
+ (ctype
->nr_charclass
+ 1)
334 + (ctype
->map_collection_nr
+ 1)];
335 struct locale_file data
;
337 size_t elem
, cnt
, offset
;
340 if ((locale
->binary
& (1 << LC_CTYPE
)) != 0)
342 iov
[0].iov_base
= ctype
;
343 iov
[0].iov_len
= locale
->len
[LC_CTYPE
];
345 write_locale_data (output_path
, "LC_CTYPE", 1, iov
);
351 /* Now prepare the output: Find the sizes of the table we can use. */
352 allocate_arrays (ctype
);
354 data
.magic
= LIMAGIC (LC_CTYPE
);
356 iov
[0].iov_base
= (void *) &data
;
357 iov
[0].iov_len
= sizeof (data
);
359 iov
[1].iov_base
= (void *) idx
;
360 iov
[1].iov_len
= sizeof (idx
);
362 idx
[0] = iov
[0].iov_len
+ iov
[1].iov_len
;
365 for (elem
= 0; elem
< nelems
; ++elem
)
367 if (elem
< _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
))
370 #define CTYPE_DATA(name, base, len) \
371 case _NL_ITEM_INDEX (name): \
372 iov[2 + elem].iov_base = base; \
373 iov[2 + elem].iov_len = len; \
376 CTYPE_DATA (_NL_CTYPE_CLASS
,
378 (256 + 128) * sizeof (char_class_t
));
380 CTYPE_DATA (_NL_CTYPE_TOUPPER_EB
,
382 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
384 CTYPE_DATA (_NL_CTYPE_TOLOWER_EB
,
386 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
389 CTYPE_DATA (_NL_CTYPE_TOUPPER_EL
,
391 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
393 CTYPE_DATA (_NL_CTYPE_TOLOWER_EL
,
395 (ctype
->plane_size
* ctype
->plane_cnt
+ 128)
398 CTYPE_DATA (_NL_CTYPE_CLASS32
,
400 (ctype
->plane_size
* ctype
->plane_cnt
401 * sizeof (char_class32_t
)));
403 CTYPE_DATA (_NL_CTYPE_NAMES_EB
,
405 ctype
->plane_size
* ctype
->plane_cnt
* sizeof (u32_t
));
406 CTYPE_DATA (_NL_CTYPE_NAMES_EL
,
408 ctype
->plane_size
* ctype
->plane_cnt
* sizeof (u32_t
));
410 CTYPE_DATA (_NL_CTYPE_HASH_SIZE
,
411 &ctype
->plane_size
, sizeof (u32_t
));
412 CTYPE_DATA (_NL_CTYPE_HASH_LAYERS
,
413 &ctype
->plane_cnt
, sizeof (u32_t
));
415 CTYPE_DATA (_NL_CTYPE_CLASS_NAMES
,
416 ctype
->class_name_ptr
,
417 ctype
->nr_charclass
* sizeof (u32_t
));
418 CTYPE_DATA (_NL_CTYPE_MAP_NAMES
,
420 ctype
->map_collection_nr
* sizeof (u32_t
));
422 CTYPE_DATA (_NL_CTYPE_WIDTH
,
423 NULL
, 0); /* Not yet implemented. */
426 assert (! "unknown CTYPE element");
430 /* Handle extra maps. */
431 size_t nr
= (elem
- _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)) >> 1;
433 if (((elem
- _NL_ITEM_INDEX (_NL_NUM_LC_CTYPE
)) & 1) == 0)
434 iov
[2 + elem
].iov_base
= ctype
->map_eb
[nr
];
436 iov
[2 + elem
].iov_base
= ctype
->map_el
[nr
];
438 iov
[2 + elem
].iov_len
= ((ctype
->plane_size
* ctype
->plane_cnt
+ 128)
442 if (elem
+ 1 < nelems
)
443 idx
[elem
+ 1] = idx
[elem
] + iov
[2 + elem
].iov_len
;
446 offset
= idx
[elem
- 1] + iov
[2 + elem
- 1].iov_len
;
448 /* The class name array. */
449 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
, ++elem
)
451 iov
[2 + elem
].iov_base
= (void *) ctype
->classnames
[cnt
];
452 iov
[2 + elem
].iov_len
= strlen (ctype
->classnames
[cnt
]) + 1;
454 ctype
->class_name_ptr
[cnt
] = offset
;
455 offset
+= iov
[2 + elem
].iov_len
;
457 iov
[2 + elem
].iov_base
= (void *) "";
458 iov
[2 + elem
].iov_len
= 1;
461 /* The map name array. */
462 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
, ++elem
)
464 iov
[2 + elem
].iov_base
= (void *) ctype
->mapnames
[cnt
];
465 iov
[2 + elem
].iov_len
= strlen (ctype
->mapnames
[cnt
]) + 1;
467 ctype
->map_name_ptr
[cnt
] = offset
;
468 offset
+= iov
[2 + elem
].iov_len
;
470 iov
[2 + elem
].iov_base
= (void *) "";
471 iov
[2 + elem
].iov_len
= 1;
474 assert (elem
== nelems
+ ctype
->nr_charclass
+ ctype
->map_collection_nr
+ 2);
476 write_locale_data (output_path
, "LC_CTYPE", 2 + elem
, iov
);
480 /* Character class handling. */
482 ctype_class_new (struct linereader
*lr
, struct localedef_t
*locale
,
483 enum token_t tok
, struct token
*code
,
484 struct charset_t
*charset
)
486 ctype_class_newP (lr
, locale
->categories
[LC_CTYPE
].ctype
,
487 code
->val
.str
.start
);
492 ctype_is_charclass (struct linereader
*lr
, struct localedef_t
*locale
,
497 for (cnt
= 0; cnt
< locale
->categories
[LC_CTYPE
].ctype
->nr_charclass
; ++cnt
)
498 if (strcmp (name
, locale
->categories
[LC_CTYPE
].ctype
->classnames
[cnt
])
507 ctype_class_start (struct linereader
*lr
, struct localedef_t
*locale
,
508 enum token_t tok
, const char *str
,
509 struct charset_t
*charset
)
511 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
555 assert (! "illegal token as class name: should not happen");
558 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
559 if (strcmp (str
, ctype
->classnames
[cnt
]) == 0)
562 if (cnt
>= ctype
->nr_charclass
)
563 assert (! "unknown class in class definition: should not happen");
565 ctype
->class_done
|= BIT (tok
);
567 ctype
->current_class_mask
= 1 << cnt
;
568 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
573 ctype_class_from (struct linereader
*lr
, struct localedef_t
*locale
,
574 struct token
*code
, struct charset_t
*charset
)
576 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
579 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
581 ctype
->last_class_char
= value
;
583 if (value
== ILLEGAL_CHAR_VALUE
)
584 /* In the LC_CTYPE category it is no error when a character is
585 not found. This has to be ignored silently. */
588 *find_idx (ctype
, &ctype
->class_collection
, &ctype
->class_collection_max
,
589 &ctype
->class_collection_act
, value
)
590 |= ctype
->current_class_mask
;
595 ctype_class_to (struct linereader
*lr
, struct localedef_t
*locale
,
596 struct token
*code
, struct charset_t
*charset
)
598 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
599 unsigned int value
, cnt
;
601 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
603 assert (value
>= ctype
->last_class_char
);
605 for (cnt
= ctype
->last_class_char
+ 1; cnt
<= value
; ++cnt
)
606 *find_idx (ctype
, &ctype
->class_collection
, &ctype
->class_collection_max
,
607 &ctype
->class_collection_act
, cnt
)
608 |= ctype
->current_class_mask
;
610 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
615 ctype_class_end (struct linereader
*lr
, struct localedef_t
*locale
)
617 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
619 /* We have no special actions to perform here. */
620 ctype
->current_class_mask
= 0;
621 ctype
->last_class_char
= ILLEGAL_CHAR_VALUE
;
625 /* Character map handling. */
627 ctype_map_new (struct linereader
*lr
, struct localedef_t
*locale
,
628 enum token_t tok
, struct token
*code
,
629 struct charset_t
*charset
)
631 ctype_map_newP (lr
, locale
->categories
[LC_CTYPE
].ctype
,
632 code
->val
.str
.start
, charset
);
637 ctype_is_charmap (struct linereader
*lr
, struct localedef_t
*locale
,
640 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
643 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
644 if (strcmp (name
, ctype
->mapnames
[cnt
]) == 0)
652 ctype_map_start (struct linereader
*lr
, struct localedef_t
*locale
,
653 enum token_t tok
, const char *name
, struct charset_t
*charset
)
655 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
661 ctype
->toupper_done
= 1;
665 ctype
->tolower_done
= 1;
671 assert (! "unknown token in category `LC_CTYPE' should not happen");
674 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
675 if (strcmp (name
, ctype
->mapnames
[cnt
]) == 0)
678 if (cnt
== ctype
->map_collection_nr
)
679 assert (! "unknown token in category `LC_CTYPE' should not happen");
681 ctype
->last_map_idx
= cnt
;
682 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
687 ctype_map_from (struct linereader
*lr
, struct localedef_t
*locale
,
688 struct token
*code
, struct charset_t
*charset
)
690 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
693 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
695 if (value
== ILLEGAL_CHAR_VALUE
)
696 /* In the LC_CTYPE category it is no error when a character is
697 not found. This has to be ignored silently. */
700 assert (ctype
->last_map_idx
< ctype
->map_collection_nr
);
702 ctype
->from_map_char
= value
;
707 ctype_map_to (struct linereader
*lr
, struct localedef_t
*locale
,
708 struct token
*code
, struct charset_t
*charset
)
710 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
713 value
= charset_find_value (charset
, code
->val
.str
.start
, code
->val
.str
.len
);
715 if (ctype
->from_map_char
== ILLEGAL_CHAR_VALUE
716 || value
== ILLEGAL_CHAR_VALUE
)
718 /* In the LC_CTYPE category it is no error when a character is
719 not found. This has to be ignored silently. */
720 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
724 *find_idx (ctype
, &ctype
->map_collection
[ctype
->last_map_idx
],
725 &ctype
->map_collection_max
[ctype
->last_map_idx
],
726 &ctype
->map_collection_act
[ctype
->last_map_idx
],
727 ctype
->from_map_char
) = value
;
729 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
734 ctype_map_end (struct linereader
*lr
, struct localedef_t
*locale
)
736 struct locale_ctype_t
*ctype
= locale
->categories
[LC_CTYPE
].ctype
;
738 ctype
->last_map_idx
= MAX_NR_CHARMAP
;
739 ctype
->from_map_char
= ILLEGAL_CHAR_VALUE
;
743 /* Local functions. */
745 ctype_class_newP (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
750 for (cnt
= 0; cnt
< ctype
->nr_charclass
; ++cnt
)
751 if (strcmp (ctype
->classnames
[cnt
], name
) == 0)
754 if (cnt
< ctype
->nr_charclass
)
756 lr_error (lr
, _("character class `%s' already defined"));
760 if (ctype
->nr_charclass
== MAX_NR_CHARCLASS
)
761 /* Exit code 2 is prescribed in P1003.2b. */
763 implementation limit: no more than %d character classes allowed"),
766 ctype
->classnames
[ctype
->nr_charclass
++] = name
;
771 ctype_map_newP (struct linereader
*lr
, struct locale_ctype_t
*ctype
,
772 const char *name
, struct charset_t
*charset
)
774 size_t max_chars
= 0;
777 for (cnt
= 0; cnt
< ctype
->map_collection_nr
; ++cnt
)
779 if (strcmp (ctype
->mapnames
[cnt
], name
) == 0)
782 if (max_chars
< ctype
->map_collection_max
[cnt
])
783 max_chars
= ctype
->map_collection_max
[cnt
];
786 if (cnt
< ctype
->map_collection_nr
)
788 lr_error (lr
, _("character map `%s' already defined"));
792 if (ctype
->map_collection_nr
== MAX_NR_CHARMAP
)
793 /* Exit code 2 is prescribed in P1003.2b. */
795 implementation limit: no more than %d character maps allowed"),
798 ctype
->mapnames
[cnt
] = name
;
801 ctype
->map_collection_max
[cnt
] = charset
->mb_cur_max
== 1 ? 256
804 ctype
->map_collection_max
[cnt
] = max_chars
;
806 ctype
->map_collection
[cnt
] =
807 (u32_t
*) xmalloc (sizeof (u32_t
) * ctype
->map_collection_max
[cnt
]);
808 memset (ctype
->map_collection
[cnt
], '\0',
809 sizeof (u32_t
) * ctype
->map_collection_max
[cnt
]);
810 ctype
->map_collection_act
[cnt
] = 256;
812 ++ctype
->map_collection_nr
;
817 find_idx (struct locale_ctype_t
*ctype
, u32_t
**table
, size_t *max
,
818 size_t *act
, unsigned int idx
)
823 return &(*table
)[idx
];
825 for (cnt
= 256; cnt
< ctype
->charnames_act
; ++cnt
)
826 if (ctype
->charnames
[cnt
] == idx
)
829 /* We have to distinguish two cases: the name is found or not. */
830 if (cnt
== ctype
->charnames_act
)
832 /* Extend the name array. */
833 if (ctype
->charnames_act
== ctype
->charnames_max
)
835 ctype
->charnames_max
*= 2;
836 ctype
->charnames
= (unsigned int *)
837 xrealloc (ctype
->charnames
,
838 sizeof (unsigned int) * ctype
->charnames_max
);
840 ctype
->charnames
[ctype
->charnames_act
++] = idx
;
847 size_t old_max
= *max
;
853 (u32_t
*) xrealloc (*table
, *max
* sizeof (unsigned long int));
854 memset (&(*table
)[old_max
], '\0', (*max
- old_max
) * sizeof (u32_t
));
861 return &(*table
)[cnt
];
866 set_class_defaults (struct locale_ctype_t
*ctype
, struct charset_t
*charset
)
868 /* This function defines the default values for the classes and conversions
869 according to POSIX.2 2.5.2.1.
870 It may seem that the order of these if-blocks is arbitrary but it is NOT.
871 Don't move them unless you know what you are doing! */
873 void set_default (int bit
, int from
, int to
)
880 for (ch
= from
; ch
<= to
; ++ch
)
885 value
= charset_find_value (charset
, tmp
, 1);
886 if (value
== ILLEGAL_CHAR_VALUE
)
889 character `%s' not defined while needed as default value"),
894 ELEM (ctype
, class_collection
, , value
) |= bit
;
898 /* Set default values if keyword was not present. */
899 if ((ctype
->class_done
& BIT (tok_upper
)) == 0)
900 /* "If this keyword [upper] is not specified, the uppercase letters
901 `A' through `Z', ..., shall automatically belong to this class,
902 with implementation defined character values." [P1003.2, 2.5.2.1] */
903 set_default (BIT (tok_upper
), 'A', 'Z');
905 if ((ctype
->class_done
& BIT (tok_lower
)) == 0)
906 /* "If this keyword [lower] is not specified, the lowercase letters
907 `a' through `z', ..., shall automatically belong to this class,
908 with implementation defined character values." [P1003.2, 2.5.2.1] */
909 set_default (BIT (tok_lower
), 'a', 'z');
911 if ((ctype
->class_done
& BIT (tok_alpha
)) == 0)
913 /* Table 2-6 in P1003.2 says that characters in class `upper' or
914 class `lower' *must* be in class `alpha'. */
915 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
);
918 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
919 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
920 ctype
->class_collection
[cnt
] |= BIT (tok_alpha
);
923 if ((ctype
->class_done
& BIT (tok_digit
)) == 0)
924 /* "If this keyword [digit] is not specified, the digits `0' through
925 `9', ..., shall automatically belong to this class, with
926 implementation-defined character values." [P1003.2, 2.5.2.1] */
927 set_default (BIT (tok_digit
), '0', '9');
929 /* "Only characters specified for the `alpha' and `digit' keyword
930 shall be specified. Characters specified for the keyword `alpha'
931 and `digit' are automatically included in this class. */
933 unsigned long int mask
= BIT (tok_alpha
) | BIT (tok_digit
);
936 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
937 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
938 ctype
->class_collection
[cnt
] |= BIT (tok_alnum
);
941 if ((ctype
->class_done
& BIT (tok_space
)) == 0)
942 /* "If this keyword [space] is not specified, the characters <space>,
943 <form-feed>, <newline>, <carriage-return>, <tab>, and
944 <vertical-tab>, ..., shall automatically belong to this class,
945 with implementation-defined character values." [P1003.2, 2.5.2.1] */
949 value
= charset_find_value (charset
, "space", 5);
950 if (value
== ILLEGAL_CHAR_VALUE
)
952 character `%s' not defined while needed as default value"),
955 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
957 value
= charset_find_value (charset
, "form-feed", 9);
958 if (value
== ILLEGAL_CHAR_VALUE
)
960 character `%s' not defined while needed as default value"),
963 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
965 value
= charset_find_value (charset
, "newline", 7);
966 if (value
== ILLEGAL_CHAR_VALUE
)
968 character `%s' not defined while needed as default value"),
971 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
973 value
= charset_find_value (charset
, "carriage-return", 15);
974 if (value
== ILLEGAL_CHAR_VALUE
)
976 character `%s' not defined while needed as default value"),
977 "<carriage-return>");
979 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
981 value
= charset_find_value (charset
, "tab", 3);
982 if (value
== ILLEGAL_CHAR_VALUE
)
984 character `%s' not defined while needed as default value"),
987 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
989 value
= charset_find_value (charset
, "vertical-tab", 12);
990 if (value
== ILLEGAL_CHAR_VALUE
)
992 character `%s' not defined while needed as default value"),
995 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_space
);
998 if ((ctype
->class_done
& BIT (tok_xdigit
)) == 0)
999 /* "If this keyword is not specified, the digits `0' to `9', the
1000 uppercase letters `A' through `F', and the lowercase letters `a'
1001 through `f', ..., shall automatically belong to this class, with
1002 implementation defined character values." [P1003.2, 2.5.2.1] */
1004 set_default (BIT (tok_xdigit
), '0', '9');
1005 set_default (BIT (tok_xdigit
), 'A', 'F');
1006 set_default (BIT (tok_xdigit
), 'a', 'f');
1009 if ((ctype
->class_done
& BIT (tok_blank
)) == 0)
1010 /* "If this keyword [blank] is unspecified, the characters <space> and
1011 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
1015 value
= charset_find_value (charset
, "space", 5);
1016 if (value
== ILLEGAL_CHAR_VALUE
)
1018 character `%s' not defined while needed as default value"),
1021 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
1023 value
= charset_find_value (charset
, "tab", 3);
1024 if (value
== ILLEGAL_CHAR_VALUE
)
1026 character `%s' not defined while needed as default value"),
1029 ELEM (ctype
, class_collection
, , value
) |= BIT (tok_blank
);
1032 if ((ctype
->class_done
& BIT (tok_graph
)) == 0)
1033 /* "If this keyword [graph] is not specified, characters specified for
1034 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
1035 shall belong to this character class." [P1003.2, 2.5.2.1] */
1037 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
1038 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
1041 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
1042 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
1043 ctype
->class_collection
[cnt
] |= BIT (tok_graph
);
1046 if ((ctype
->class_done
& BIT (tok_print
)) == 0)
1047 /* "If this keyword [print] is not provided, characters specified for
1048 the keywords `upper', `lower', `alpha', `digit', `xdigit', `punct',
1049 and the <space> character shall belong to this character class."
1050 [P1003.2, 2.5.2.1] */
1052 unsigned long int mask
= BIT (tok_upper
) | BIT (tok_lower
) |
1053 BIT (tok_alpha
) | BIT (tok_digit
) | BIT (tok_xdigit
) | BIT (tok_punct
);
1057 for (cnt
= 0; cnt
< ctype
->class_collection_act
; ++cnt
)
1058 if ((ctype
->class_collection
[cnt
] & mask
) != 0)
1059 ctype
->class_collection
[cnt
] |= BIT (tok_print
);
1061 space
= charset_find_value (charset
, "space", 5);
1062 if (space
== ILLEGAL_CHAR_VALUE
)
1064 character `%s' not defined while needed as default value"),
1067 ELEM (ctype
, class_collection
, , space
) |= BIT (tok_print
);
1070 if (ctype
->toupper_done
== 0)
1071 /* "If this keyword [toupper] is not specified, the lowercase letters
1072 `a' through `z', and their corresponding uppercase letters `A' to
1073 `Z', ..., shall automatically be included, with implementation-
1074 defined character values." [P1003.2, 2.5.2.1] */
1079 strcpy (tmp
, "<?>");
1081 for (ch
= 'a'; ch
<= 'z'; ++ch
)
1083 unsigned int value_from
, value_to
;
1087 value_from
= charset_find_value (charset
, &tmp
[1], 1);
1088 if (value_from
== ILLEGAL_CHAR_VALUE
)
1091 character `%c' not defined while needed as default value"),
1096 /* This conversion is implementation defined. */
1097 tmp
[1] = (char) (ch
+ ('A' - 'a'));
1098 value_to
= charset_find_value (charset
, &tmp
[1], 1);
1102 character `%s' not defined while needed as default value"),
1107 /* The index [0] is determined by the order of the
1108 `ctype_map_newP' calls in `ctype_startup'. */
1109 ELEM (ctype
, map_collection
, [0], value_from
) = value_to
;
1113 if (ctype
->tolower_done
== 0)
1114 /* "If this keyword [tolower] is not specified, the mapping shall be
1115 the reverse mapping of the one specified to `toupper'." [P1003.2] */
1119 for (cnt
= 0; cnt
< ctype
->map_collection_act
[0]; ++cnt
)
1120 if (ctype
->map_collection
[0][cnt
] != 0)
1121 ELEM (ctype
, map_collection
, [1],
1122 ctype
->map_collection
[0][cnt
])
1123 = ctype
->charnames
[cnt
];
1129 allocate_arrays (struct locale_ctype_t
*ctype
)
1133 /* First we have to decide how we organize the arrays. It is easy for
1134 a one-byte character set. But multi-byte character sets cannot be
1135 stored flat because they might be sparsely used. So we determine an
1136 optimal hashing function for the used characters.
1138 We use a very trivial hashing function to store the sparse table.
1139 CH % TABSIZE is used as an index. To solve multiple hits we have
1140 N planes. This guarantees a fixed search time for a character [N
1141 / 2]. In the following code we determine the minimum value for
1142 TABSIZE * N, where TABSIZE >= 256. */
1143 size_t min_total
= UINT_MAX
;
1144 size_t act_size
= 256;
1147 Computing table size for character classes might take a while..."),
1150 while (act_size
< min_total
)
1152 size_t cnt
[act_size
];
1153 size_t act_planes
= 1;
1155 memset (cnt
, '\0', sizeof cnt
);
1157 for (idx
= 0; idx
< 256; ++idx
)
1160 for (idx
= 0; idx
< ctype
->charnames_act
; ++idx
)
1161 if (ctype
->charnames
[idx
] >= 256)
1163 size_t nr
= ctype
->charnames
[idx
] % act_size
;
1165 if (++cnt
[nr
] > act_planes
)
1167 act_planes
= cnt
[nr
];
1168 if (act_size
* act_planes
>= min_total
)
1173 if (act_size
* act_planes
< min_total
)
1175 min_total
= act_size
* act_planes
;
1176 ctype
->plane_size
= act_size
;
1177 ctype
->plane_cnt
= act_planes
;
1183 fprintf (stderr
, _(" done\n"));
1185 #if __BYTE_ORDER == __LITTLE_ENDIAN
1186 # define NAMES_B1 ctype->names_el
1187 # define NAMES_B2 ctype->names_eb
1189 # define NAMES_B1 ctype->names_eb
1190 # define NAMES_B2 ctype->names_el
1193 ctype
->names_eb
= (u32_t
*) xcalloc (ctype
->plane_size
* ctype
->plane_cnt
,
1195 ctype
->names_el
= (u32_t
*) xcalloc (ctype
->plane_size
* ctype
->plane_cnt
,
1198 for (idx
= 1; idx
< 256; ++idx
)
1199 NAMES_B1
[idx
] = idx
;
1201 /* Trick: change the 0th entry's name to 1 to mark the cell occupied. */
1204 for (idx
= 256; idx
< ctype
->charnames_act
; ++idx
)
1206 size_t nr
= (ctype
->charnames
[idx
] % ctype
->plane_size
);
1209 while (NAMES_B1
[nr
+ depth
* ctype
->plane_size
])
1211 assert (depth
< ctype
->plane_cnt
);
1213 NAMES_B1
[nr
+ depth
* ctype
->plane_size
] = ctype
->charnames
[idx
];
1215 /* Now for faster access remember the index in the NAMES_B array. */
1216 ctype
->charnames
[idx
] = nr
+ depth
* ctype
->plane_size
;
1220 for (idx
= 0; idx
< ctype
->plane_size
* ctype
->plane_cnt
; ++idx
)
1221 NAMES_B2
[idx
] = SWAPU32 (NAMES_B1
[idx
]);
1224 /* You wonder about this amount of memory? This is only because some
1225 users do not manage to address the array with unsigned values or
1226 data types with range >= 256. '\200' would result in the array
1227 index -128. To help these poor people we duplicate the entries for
1228 128 up to 255 below the entry for \0. */
1229 ctype
->ctype_b
= (char_class_t
*) xcalloc (256 + 128,
1230 sizeof (char_class_t
));
1231 ctype
->ctype32_b
= (char_class32_t
*) xcalloc (ctype
->plane_size
1233 sizeof (char_class32_t
));
1235 /* Fill in the character class information. */
1236 #if __BYTE_ORDER == __LITTLE_ENDIAN
1237 # define TRANS(w) CHAR_CLASS_TRANS (w)
1238 # define TRANS32(w) CHAR_CLASS32_TRANS (w)
1240 # define TRANS(w) (w)
1241 # define TRANS32(w) (w)
1244 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
1245 if (ctype
->charnames
[idx
] < 256)
1246 ctype
->ctype_b
[128 + ctype
->charnames
[idx
]]
1247 = TRANS (ctype
->class_collection
[idx
]);
1249 /* Mirror first 128 entries. */
1250 for (idx
= 0; idx
< 128; ++idx
)
1251 ctype
->ctype_b
[idx
] = ctype
->ctype_b
[256 + idx
];
1253 /* The 32 bit array contains all characters. */
1254 for (idx
= 0; idx
< ctype
->class_collection_act
; ++idx
)
1255 ctype
->ctype32_b
[ctype
->charnames
[idx
]]
1256 = TRANS32 (ctype
->class_collection
[idx
]);
1258 /* Room for table of mappings. */
1259 ctype
->map_eb
= (u32_t
**) xmalloc (ctype
->map_collection_nr
1260 * sizeof (u32_t
*));
1261 ctype
->map_el
= (u32_t
**) xmalloc (ctype
->map_collection_nr
1262 * sizeof (u32_t
*));
1264 /* Fill in all mappings. */
1265 for (idx
= 0; idx
< ctype
->map_collection_nr
; ++idx
)
1269 /* Allocate table. */
1270 ctype
->map_eb
[idx
] = (u32_t
*) xmalloc ((ctype
->plane_size
1271 * ctype
->plane_cnt
+ 128)
1273 ctype
->map_el
[idx
] = (u32_t
*) xmalloc ((ctype
->plane_size
1274 * ctype
->plane_cnt
+ 128)
1277 #if __BYTE_ORDER == __LITTLE_ENDIAN
1278 # define MAP_B1 ctype->map_el
1279 # define MAP_B2 ctype->map_eb
1281 # define MAP_B1 ctype->map_eb
1282 # define MAP_B2 ctype->map_el
1285 /* Copy default value (identity mapping). */
1286 memcpy (&MAP_B1
[idx
][128], NAMES_B1
,
1287 ctype
->plane_size
* ctype
->plane_cnt
* sizeof (u32_t
));
1289 /* Copy values from collection. */
1290 for (idx2
= 0; idx2
< ctype
->map_collection_act
[idx
]; ++idx2
)
1291 if (ctype
->map_collection
[idx
][idx2
] != 0)
1292 MAP_B1
[idx
][128 + ctype
->charnames
[idx2
]] =
1293 ctype
->map_collection
[idx
][idx2
];
1295 /* Mirror first 128 entries. */
1296 for (idx2
= 0; idx2
< 128; ++idx2
)
1297 MAP_B1
[idx
][idx2
] = MAP_B1
[idx
][256 + idx2
];
1300 /* And now the other byte order. */
1301 for (idx2
= 0; idx2
< ctype
->plane_size
* ctype
->plane_cnt
+ 128; ++idx2
)
1302 MAP_B2
[idx
][idx2
] = SWAPU32 (MAP_B1
[idx
][idx2
]);
1305 /* Extra array for class and map names. */
1306 ctype
->class_name_ptr
= (u32_t
*) xmalloc (ctype
->nr_charclass
1308 ctype
->map_name_ptr
= (u32_t
*) xmalloc (ctype
->map_collection_nr