2 * Copyright (c) 2014 SGI.
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
7 * published by the Free Software Foundation.
9 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 /* Generator for a compact trie for unicode normalization */
21 #include <sys/types.h>
30 /* Default names of the in- and output files. */
32 #define AGE_NAME "DerivedAge.txt"
33 #define CCC_NAME "DerivedCombiningClass.txt"
34 #define PROP_NAME "DerivedCoreProperties.txt"
35 #define DATA_NAME "UnicodeData.txt"
36 #define FOLD_NAME "CaseFolding.txt"
37 #define NORM_NAME "NormalizationCorrections.txt"
38 #define TEST_NAME "NormalizationTest.txt"
39 #define UTF8_NAME "utf8data.h"
41 const char *age_name
= AGE_NAME
;
42 const char *ccc_name
= CCC_NAME
;
43 const char *prop_name
= PROP_NAME
;
44 const char *data_name
= DATA_NAME
;
45 const char *fold_name
= FOLD_NAME
;
46 const char *norm_name
= NORM_NAME
;
47 const char *test_name
= TEST_NAME
;
48 const char *utf8_name
= UTF8_NAME
;
52 /* An arbitrary line size limit on input lines. */
63 /* ------------------------------------------------------------------ */
66 * Unicode version numbers consist of three parts: major, minor, and a
67 * revision. These numbers are packed into an unsigned int to obtain
68 * a single version number.
70 * To save space in the generated trie, the unicode version is not
71 * stored directly, instead we calculate a generation number from the
72 * unicode versions seen in the DerivedAge file, and use that as an
73 * index into a table of unicode versions.
75 #define UNICODE_MAJ_SHIFT (16)
76 #define UNICODE_MIN_SHIFT (8)
78 #define UNICODE_MAJ_MAX ((unsigned short)-1)
79 #define UNICODE_MIN_MAX ((unsigned char)-1)
80 #define UNICODE_REV_MAX ((unsigned char)-1)
82 #define UNICODE_AGE(MAJ,MIN,REV) \
83 (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) | \
84 ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) | \
85 ((unsigned int)(REV)))
90 unsigned int unicode_maxage
;
92 static int age_valid(unsigned int major
, unsigned int minor
,
93 unsigned int revision
)
95 if (major
> UNICODE_MAJ_MAX
)
97 if (minor
> UNICODE_MIN_MAX
)
99 if (revision
> UNICODE_REV_MAX
)
104 /* ------------------------------------------------------------------ */
109 * A compact binary tree, used to decode UTF-8 characters.
111 * Internal nodes are one byte for the node itself, and up to three
112 * bytes for an offset into the tree. The first byte contains the
113 * following information:
114 * NEXTBYTE - flag - advance to next byte if set
115 * BITNUM - 3 bit field - the bit number to tested
116 * OFFLEN - 2 bit field - number of bytes in the offset
117 * if offlen == 0 (non-branching node)
118 * RIGHTPATH - 1 bit field - set if the following node is for the
119 * right-hand path (tested bit is set)
120 * TRIENODE - 1 bit field - set if the following node is an internal
121 * node, otherwise it is a leaf node
122 * if offlen != 0 (branching node)
123 * LEFTNODE - 1 bit field - set if the left-hand node is internal
124 * RIGHTNODE - 1 bit field - set if the right-hand node is internal
126 * Due to the way utf8 works, there cannot be branching nodes with
127 * NEXTBYTE set, and moreover those nodes always have a righthand
130 typedef unsigned char utf8trie_t
;
132 #define NEXTBYTE 0x08
134 #define OFFLEN_SHIFT 4
135 #define RIGHTPATH 0x40
136 #define TRIENODE 0x80
137 #define RIGHTNODE 0x40
138 #define LEFTNODE 0x80
143 * The leaves of the trie are embedded in the trie, and so the same
144 * underlying datatype, unsigned char.
146 * leaf[0]: The unicode version, stored as a generation number that is
147 * an index into utf8agetab[]. With this we can filter code
148 * points based on the unicode version in which they were
149 * defined. The CCC of a non-defined code point is 0.
150 * leaf[1]: Canonical Combining Class. During normalization, we need
151 * to do a stable sort into ascending order of all characters
152 * with a non-zero CCC that occur between two characters with
153 * a CCC of 0, or at the begin or end of a string.
154 * The unicode standard guarantees that all CCC values are
155 * between 0 and 254 inclusive, which leaves 255 available as
157 * Code points with CCC 0 are known as stoppers.
158 * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
159 * start of a NUL-terminated string that is the decomposition
161 * The CCC of a decomposable character is the same as the CCC
162 * of the first character of its decomposition.
163 * Some characters decompose as the empty string: these are
164 * characters with the Default_Ignorable_Code_Point property.
165 * These do affect normalization, as they all have CCC 0.
167 * The decompositions in the trie have been fully expanded.
169 * Casefolding, if applicable, is also done using decompositions.
171 typedef unsigned char utf8leaf_t
;
173 #define LEAF_GEN(LEAF) ((LEAF)[0])
174 #define LEAF_CCC(LEAF) ((LEAF)[1])
175 #define LEAF_STR(LEAF) ((const char*)((LEAF) + 2))
182 #define DECOMPOSE (255)
183 #define HANGUL ((char)(255))
185 #define UTF8HANGULLEAF (12)
188 static utf8leaf_t
*utf8nlookup(struct tree
*, unsigned char *,
189 const char *, size_t);
190 static utf8leaf_t
*utf8lookup(struct tree
*, unsigned char *, const char *);
192 unsigned char *utf8data
;
193 size_t utf8data_size
;
198 /* ------------------------------------------------------------------ */
203 * The UTF-8 encoding spreads the bits of a 32bit word over several
204 * bytes. This table gives the ranges that can be held and how they'd
207 * 0x00000000 0x0000007F: 0xxxxxxx
208 * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
209 * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
210 * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
211 * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
212 * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
214 * There is an additional requirement on UTF-8, in that only the
215 * shortest representation of a 32bit value is to be used. A decoder
216 * must not decode sequences that do not satisfy this requirement.
217 * Thus the allowed ranges have a lower bound.
219 * 0x00000000 0x0000007F: 0xxxxxxx
220 * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
221 * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
222 * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
223 * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
224 * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
226 * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
227 * 17 planes of 65536 values. This limits the sequences actually seen
228 * even more, to just the following.
231 * 0x80 - 0x7ff: 0xc2 0x80 0xdf 0xbf
232 * 0x800 - 0xffff: 0xe0 0xa0 0x80 0xef 0xbf 0xbf
233 * 0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80 0xf4 0x8f 0xbf 0xbf
235 * Even within those ranges not all values are allowed: the surrogates
236 * 0xd800 - 0xdfff should never be seen.
238 * Note that the longest sequence seen with valid usage is 4 bytes,
239 * the same a single UTF-32 character. This makes the UTF-8
240 * representation of Unicode strictly smaller than UTF-32.
242 * The shortest sequence requirement was introduced by:
243 * Corrigendum #1: UTF-8 Shortest Form
244 * It can be found here:
245 * http://www.unicode.org/versions/corrigendum1.html
249 #define UTF8_2_BITS 0xC0
250 #define UTF8_3_BITS 0xE0
251 #define UTF8_4_BITS 0xF0
252 #define UTF8_N_BITS 0x80
253 #define UTF8_2_MASK 0xE0
254 #define UTF8_3_MASK 0xF0
255 #define UTF8_4_MASK 0xF8
256 #define UTF8_N_MASK 0xC0
257 #define UTF8_V_MASK 0x3F
258 #define UTF8_V_SHIFT 6
260 static int utf8encode(char *str
, unsigned int val
)
267 } else if (val
< 0x800) {
268 str
[1] = val
& UTF8_V_MASK
;
269 str
[1] |= UTF8_N_BITS
;
270 val
>>= UTF8_V_SHIFT
;
272 str
[0] |= UTF8_2_BITS
;
274 } else if (val
< 0x10000) {
275 str
[2] = val
& UTF8_V_MASK
;
276 str
[2] |= UTF8_N_BITS
;
277 val
>>= UTF8_V_SHIFT
;
278 str
[1] = val
& UTF8_V_MASK
;
279 str
[1] |= UTF8_N_BITS
;
280 val
>>= UTF8_V_SHIFT
;
282 str
[0] |= UTF8_3_BITS
;
284 } else if (val
< 0x110000) {
285 str
[3] = val
& UTF8_V_MASK
;
286 str
[3] |= UTF8_N_BITS
;
287 val
>>= UTF8_V_SHIFT
;
288 str
[2] = val
& UTF8_V_MASK
;
289 str
[2] |= UTF8_N_BITS
;
290 val
>>= UTF8_V_SHIFT
;
291 str
[1] = val
& UTF8_V_MASK
;
292 str
[1] |= UTF8_N_BITS
;
293 val
>>= UTF8_V_SHIFT
;
295 str
[0] |= UTF8_4_BITS
;
298 printf("%#x: illegal val\n", val
);
304 static unsigned int utf8decode(const char *str
)
306 const unsigned char *s
= (const unsigned char*)str
;
307 unsigned int unichar
= 0;
311 } else if (*s
< UTF8_3_BITS
) {
312 unichar
= *s
++ & 0x1F;
313 unichar
<<= UTF8_V_SHIFT
;
314 unichar
|= *s
& 0x3F;
315 } else if (*s
< UTF8_4_BITS
) {
316 unichar
= *s
++ & 0x0F;
317 unichar
<<= UTF8_V_SHIFT
;
318 unichar
|= *s
++ & 0x3F;
319 unichar
<<= UTF8_V_SHIFT
;
320 unichar
|= *s
& 0x3F;
322 unichar
= *s
++ & 0x0F;
323 unichar
<<= UTF8_V_SHIFT
;
324 unichar
|= *s
++ & 0x3F;
325 unichar
<<= UTF8_V_SHIFT
;
326 unichar
|= *s
++ & 0x3F;
327 unichar
<<= UTF8_V_SHIFT
;
328 unichar
|= *s
& 0x3F;
333 static int utf32valid(unsigned int unichar
)
335 return unichar
< 0x110000;
338 #define HANGUL_SYLLABLE(U) ((U) >= 0xAC00 && (U) <= 0xD7A3)
349 int (*leaf_equal
)(void *, void *);
350 void (*leaf_print
)(void *, int);
351 int (*leaf_mark
)(void *);
352 int (*leaf_size
)(void *);
353 int *(*leaf_index
)(struct tree
*, void *);
354 unsigned char *(*leaf_emit
)(void *, unsigned char *);
355 int leafindex
[0x110000];
367 unsigned char bitnum
;
368 unsigned char nextbyte
;
369 unsigned char leftnode
;
370 unsigned char rightnode
;
371 unsigned int keybits
;
372 unsigned int keymask
;
376 * Example lookup function for a tree.
378 static void *lookup(struct tree
*tree
, const char *key
)
384 while (!leaf
&& node
) {
387 if (*key
& (1 << (node
->bitnum
& 7))) {
389 if (node
->rightnode
== NODE
) {
391 } else if (node
->rightnode
== LEAF
) {
398 if (node
->leftnode
== NODE
) {
400 } else if (node
->leftnode
== LEAF
) {
412 * A simple non-recursive tree walker: keep track of visits to the
413 * left and right branches in the leftmask and rightmask.
415 static void tree_walk(struct tree
*tree
)
418 unsigned int leftmask
;
419 unsigned int rightmask
;
420 unsigned int bitmask
;
422 int nodes
, singletons
, leaves
;
424 nodes
= singletons
= leaves
= 0;
426 printf("%s_%x root %p\n", tree
->type
, tree
->maxage
, tree
->root
);
427 if (tree
->childnode
== LEAF
) {
429 tree
->leaf_print(tree
->root
, indent
);
432 assert(tree
->childnode
== NODE
);
434 leftmask
= rightmask
= 0;
436 printf("%*snode @ %p bitnum %d nextbyte %d"
437 " left %p right %p mask %x bits %x\n",
439 node
->bitnum
, node
->nextbyte
,
440 node
->left
, node
->right
,
441 node
->keymask
, node
->keybits
);
443 if (!(node
->left
&& node
->right
))
447 bitmask
= 1 << node
->bitnum
;
448 if ((leftmask
& bitmask
) == 0) {
450 if (node
->leftnode
== LEAF
) {
452 tree
->leaf_print(node
->left
,
455 } else if (node
->left
) {
456 assert(node
->leftnode
== NODE
);
462 if ((rightmask
& bitmask
) == 0) {
463 rightmask
|= bitmask
;
464 if (node
->rightnode
== LEAF
) {
466 tree
->leaf_print(node
->right
,
469 } else if (node
->right
) {
470 assert(node
->rightnode
== NODE
);
476 leftmask
&= ~bitmask
;
477 rightmask
&= ~bitmask
;
483 printf("nodes %d leaves %d singletons %d\n",
484 nodes
, leaves
, singletons
);
488 * Allocate an initialize a new internal node.
490 static struct node
*alloc_node(struct node
*parent
)
495 node
= malloc(sizeof(*node
));
496 node
->left
= node
->right
= NULL
;
497 node
->parent
= parent
;
498 node
->leftnode
= NODE
;
499 node
->rightnode
= NODE
;
508 bitnum
= parent
->bitnum
;
509 if ((bitnum
& 7) == 0) {
510 node
->bitnum
= bitnum
+ 7 + 8;
513 node
->bitnum
= bitnum
- 1;
525 * Insert a new leaf into the tree, and collapse any subtrees that are
526 * fully populated and end in identical leaves. A nextbyte tagged
527 * internal node will not be removed to preserve the tree's integrity.
528 * Note that due to the structure of utf8, no nextbyte tagged node
529 * will be a candidate for removal.
531 static int insert(struct tree
*tree
, char *key
, int keylen
, void *leaf
)
538 assert(keylen
>= 1 && keylen
<= 4);
541 cursor
= &tree
->root
;
542 keybits
= 8 * keylen
;
544 /* Insert, creating path along the way. */
547 *cursor
= alloc_node(node
);
551 if (*key
& (1 << (node
->bitnum
& 7)))
552 cursor
= &node
->right
;
554 cursor
= &node
->left
;
559 /* Merge subtrees if possible. */
561 if (*key
& (1 << (node
->bitnum
& 7)))
562 node
->rightnode
= LEAF
;
564 node
->leftnode
= LEAF
;
567 if (node
->leftnode
== NODE
|| node
->rightnode
== NODE
)
572 if (! tree
->leaf_equal(node
->left
, node
->right
))
574 /* Keep left, drop right leaf. */
576 /* Check in parent */
577 parent
= node
->parent
;
581 tree
->childnode
= LEAF
;
582 } else if (parent
->left
== node
) {
584 parent
->leftnode
= LEAF
;
589 parent
->keymask
|= (1 << node
->bitnum
);
591 } else if (parent
->right
== node
) {
592 parent
->right
= leaf
;
593 parent
->rightnode
= LEAF
;
598 parent
->keymask
|= (1 << node
->bitnum
);
599 parent
->keybits
|= (1 << node
->bitnum
);
602 /* internal tree error */
609 /* Propagate keymasks up along singleton chains. */
611 parent
= node
->parent
;
614 /* Nix the mask for parents with two children. */
615 if (node
->keymask
== 0) {
618 } else if (parent
->left
&& parent
->right
) {
622 assert((parent
->keymask
& node
->keymask
) == 0);
623 parent
->keymask
|= node
->keymask
;
624 parent
->keymask
|= (1 << parent
->bitnum
);
625 parent
->keybits
|= node
->keybits
;
627 parent
->keybits
|= (1 << parent
->bitnum
);
636 * Prune internal nodes.
638 * Fully populated subtrees that end at the same leaf have already
639 * been collapsed. There are still internal nodes that have for both
640 * their left and right branches a sequence of singletons that make
641 * identical choices and end in identical leaves. The keymask and
642 * keybits collected in the nodes describe the choices made in these
643 * singleton chains. When they are identical for the left and right
644 * branch of a node, and the two leaves comare identical, the node in
645 * question can be removed.
647 * Note that nodes with the nextbyte tag set will not be removed by
648 * this to ensure tree integrity. Note as well that the structure of
649 * utf8 ensures that these nodes would not have been candidates for
650 * removal in any case.
652 static void prune(struct tree
*tree
)
660 unsigned int leftmask
;
661 unsigned int rightmask
;
662 unsigned int bitmask
;
666 printf("Pruning %s_%x\n", tree
->type
, tree
->maxage
);
669 if (tree
->childnode
== LEAF
)
674 leftmask
= rightmask
= 0;
679 if (node
->leftnode
== LEAF
)
681 if (node
->rightnode
== LEAF
)
689 if (left
->keymask
== 0)
691 if (right
->keymask
== 0)
693 if (left
->keymask
!= right
->keymask
)
695 if (left
->keybits
!= right
->keybits
)
699 assert(left
->left
|| left
->right
);
700 if (left
->leftnode
== LEAF
)
701 leftleaf
= left
->left
;
702 else if (left
->rightnode
== LEAF
)
703 leftleaf
= left
->right
;
706 else if (left
->right
)
713 assert(right
->left
|| right
->right
);
714 if (right
->leftnode
== LEAF
)
715 rightleaf
= right
->left
;
716 else if (right
->rightnode
== LEAF
)
717 rightleaf
= right
->right
;
718 else if (right
->left
)
720 else if (right
->right
)
721 right
= right
->right
;
725 if (! tree
->leaf_equal(leftleaf
, rightleaf
))
728 * This node has identical singleton-only subtrees.
731 parent
= node
->parent
;
734 if (parent
->left
== node
)
736 else if (parent
->right
== node
)
737 parent
->right
= left
;
740 left
->parent
= parent
;
741 left
->keymask
|= (1 << node
->bitnum
);
744 bitmask
= 1 << node
->bitnum
;
745 leftmask
&= ~bitmask
;
746 rightmask
&= ~bitmask
;
747 if (node
->leftnode
== NODE
&& node
->left
) {
752 } else if (node
->rightnode
== NODE
&& node
->right
) {
761 /* Propagate keymasks up along singleton chains. */
764 bitmask
= 1 << node
->bitnum
;
765 leftmask
&= ~bitmask
;
766 rightmask
&= ~bitmask
;
768 if (node
->left
&& node
->right
)
772 node
->keymask
|= left
->keymask
;
773 node
->keybits
|= left
->keybits
;
777 node
->keymask
|= right
->keymask
;
778 node
->keybits
|= right
->keybits
;
780 node
->keymask
|= (1 << node
->bitnum
);
783 bitmask
= 1 << node
->bitnum
;
784 leftmask
&= ~bitmask
;
785 rightmask
&= ~bitmask
;
788 bitmask
= 1 << node
->bitnum
;
789 if ((leftmask
& bitmask
) == 0 &&
790 node
->leftnode
== NODE
&&
794 } else if ((rightmask
& bitmask
) == 0 &&
795 node
->rightnode
== NODE
&&
797 rightmask
|= bitmask
;
800 leftmask
&= ~bitmask
;
801 rightmask
&= ~bitmask
;
806 printf("Pruned %d nodes\n", count
);
810 * Mark the nodes in the tree that lead to leaves that must be
813 static void mark_nodes(struct tree
*tree
)
817 unsigned int leftmask
;
818 unsigned int rightmask
;
819 unsigned int bitmask
;
824 printf("Marking %s_%x\n", tree
->type
, tree
->maxage
);
825 if (tree
->childnode
== LEAF
)
828 assert(tree
->childnode
== NODE
);
830 leftmask
= rightmask
= 0;
832 bitmask
= 1 << node
->bitnum
;
833 if ((leftmask
& bitmask
) == 0) {
835 if (node
->leftnode
== LEAF
) {
837 if (tree
->leaf_mark(node
->left
)) {
839 while (n
&& !n
->mark
) {
845 } else if (node
->left
) {
846 assert(node
->leftnode
== NODE
);
851 if ((rightmask
& bitmask
) == 0) {
852 rightmask
|= bitmask
;
853 if (node
->rightnode
== LEAF
) {
855 if (tree
->leaf_mark(node
->right
)) {
857 while (n
&& !n
->mark
) {
863 } else if (node
->right
) {
864 assert(node
->rightnode
== NODE
);
869 leftmask
&= ~bitmask
;
870 rightmask
&= ~bitmask
;
874 /* second pass: left siblings and singletons */
876 assert(tree
->childnode
== NODE
);
878 leftmask
= rightmask
= 0;
880 bitmask
= 1 << node
->bitnum
;
881 if ((leftmask
& bitmask
) == 0) {
883 if (node
->leftnode
== LEAF
) {
885 if (tree
->leaf_mark(node
->left
)) {
887 while (n
&& !n
->mark
) {
893 } else if (node
->left
) {
894 assert(node
->leftnode
== NODE
);
896 if (!node
->mark
&& node
->parent
->mark
) {
903 if ((rightmask
& bitmask
) == 0) {
904 rightmask
|= bitmask
;
905 if (node
->rightnode
== LEAF
) {
907 if (tree
->leaf_mark(node
->right
)) {
909 while (n
&& !n
->mark
) {
915 } else if (node
->right
) {
916 assert(node
->rightnode
== NODE
);
918 if (!node
->mark
&& node
->parent
->mark
&&
919 !node
->parent
->left
) {
926 leftmask
&= ~bitmask
;
927 rightmask
&= ~bitmask
;
932 printf("Marked %d nodes\n", marked
);
936 * Compute the index of each node and leaf, which is the offset in the
937 * emitted trie. These values must be pre-computed because relative
938 * offsets between nodes are used to navigate the tree.
940 static int index_nodes(struct tree
*tree
, int index
)
943 unsigned int leftmask
;
944 unsigned int rightmask
;
945 unsigned int bitmask
;
949 /* Align to a cache line (or half a cache line?). */
957 printf("Indexing %s_%x: %d\n", tree
->type
, tree
->maxage
, index
);
958 if (tree
->childnode
== LEAF
) {
959 index
+= tree
->leaf_size(tree
->root
);
963 assert(tree
->childnode
== NODE
);
965 leftmask
= rightmask
= 0;
970 if (node
->index
!= index
)
975 bitmask
= 1 << node
->bitnum
;
976 if (node
->mark
&& (leftmask
& bitmask
) == 0) {
978 if (node
->leftnode
== LEAF
) {
980 *tree
->leaf_index(tree
, node
->left
) =
982 index
+= tree
->leaf_size(node
->left
);
984 } else if (node
->left
) {
985 assert(node
->leftnode
== NODE
);
991 if (node
->mark
&& (rightmask
& bitmask
) == 0) {
992 rightmask
|= bitmask
;
993 if (node
->rightnode
== LEAF
) {
995 *tree
->leaf_index(tree
, node
->right
) = index
;
996 index
+= tree
->leaf_size(node
->right
);
998 } else if (node
->right
) {
999 assert(node
->rightnode
== NODE
);
1005 leftmask
&= ~bitmask
;
1006 rightmask
&= ~bitmask
;
1007 node
= node
->parent
;
1012 /* Round up to a multiple of 16 */
1016 printf("Final index %d\n", index
);
1021 * Mark the nodes in a subtree, helper for size_nodes().
1023 static int mark_subtree(struct node
*node
)
1027 if (!node
|| node
->mark
)
1030 node
->index
= node
->parent
->index
;
1032 if (node
->leftnode
== NODE
)
1033 changed
+= mark_subtree(node
->left
);
1034 if (node
->rightnode
== NODE
)
1035 changed
+= mark_subtree(node
->right
);
1040 * Compute the size of nodes and leaves. We start by assuming that
1041 * each node needs to store a three-byte offset. The indexes of the
1042 * nodes are calculated based on that, and then this function is
1043 * called to see if the sizes of some nodes can be reduced. This is
1044 * repeated until no more changes are seen.
1046 static int size_nodes(struct tree
*tree
)
1052 unsigned int leftmask
;
1053 unsigned int rightmask
;
1054 unsigned int bitmask
;
1055 unsigned int pathbits
;
1056 unsigned int pathmask
;
1068 printf("Sizing %s_%x\n", tree
->type
, tree
->maxage
);
1069 if (tree
->childnode
== LEAF
)
1072 assert(tree
->childnode
== NODE
);
1076 leftmask
= rightmask
= 0;
1081 if (!node
->left
|| !node
->right
) {
1084 if (node
->rightnode
== NODE
) {
1086 * If the right node is not marked,
1087 * look for a corresponding node in
1088 * the next tree. Such a node need
1091 right
= node
->right
;
1093 while (!right
->mark
) {
1096 while (n
->bitnum
!= node
->bitnum
) {
1097 nbit
= 1 << n
->bitnum
;
1098 if (!(pathmask
& nbit
))
1100 if (pathbits
& nbit
) {
1101 if (n
->rightnode
== LEAF
)
1105 if (n
->leftnode
== LEAF
)
1110 if (n
->bitnum
!= node
->bitnum
)
1116 /* Make sure the right node is marked. */
1118 changed
+= mark_subtree(right
);
1119 offset
= right
->index
- node
->index
;
1121 offset
= *tree
->leaf_index(tree
, node
->right
);
1122 offset
-= node
->index
;
1124 assert(offset
>= 0);
1125 assert(offset
<= 0xffffff);
1126 if (offset
<= 0xff) {
1128 } else if (offset
<= 0xffff) {
1130 } else { /* offset <= 0xffffff */
1134 if (node
->size
!= size
|| node
->offset
!= offset
) {
1136 node
->offset
= offset
;
1141 bitmask
= 1 << node
->bitnum
;
1142 pathmask
|= bitmask
;
1143 if (node
->mark
&& (leftmask
& bitmask
) == 0) {
1144 leftmask
|= bitmask
;
1145 if (node
->leftnode
== LEAF
) {
1147 } else if (node
->left
) {
1148 assert(node
->leftnode
== NODE
);
1154 if (node
->mark
&& (rightmask
& bitmask
) == 0) {
1155 rightmask
|= bitmask
;
1156 pathbits
|= bitmask
;
1157 if (node
->rightnode
== LEAF
) {
1158 assert(node
->right
);
1159 } else if (node
->right
) {
1160 assert(node
->rightnode
== NODE
);
1166 leftmask
&= ~bitmask
;
1167 rightmask
&= ~bitmask
;
1168 pathmask
&= ~bitmask
;
1169 pathbits
&= ~bitmask
;
1170 node
= node
->parent
;
1176 printf("Found %d changes\n", changed
);
1181 * Emit a trie for the given tree into the data array.
1183 static void emit(struct tree
*tree
, unsigned char *data
)
1186 unsigned int leftmask
;
1187 unsigned int rightmask
;
1188 unsigned int bitmask
;
1199 nodes
[0] = nodes
[1] = nodes
[2] = nodes
[3] = 0;
1202 index
= tree
->index
;
1206 printf("Emitting %s_%x\n", tree
->type
, tree
->maxage
);
1207 if (tree
->childnode
== LEAF
) {
1209 tree
->leaf_emit(tree
->root
, data
);
1210 size
= tree
->leaf_size(tree
->root
);
1216 assert(tree
->childnode
== NODE
);
1218 leftmask
= rightmask
= 0;
1222 assert(node
->offset
!= -1);
1223 assert(node
->index
== index
);
1228 byte
|= (node
->bitnum
& BITNUM
);
1229 if (node
->left
&& node
->right
) {
1230 if (node
->leftnode
== NODE
)
1232 if (node
->rightnode
== NODE
)
1234 if (node
->offset
<= 0xff)
1236 else if (node
->offset
<= 0xffff)
1241 offset
= node
->offset
;
1242 byte
|= offlen
<< OFFLEN_SHIFT
;
1246 *data
++ = offset
& 0xff;
1250 } else if (node
->left
) {
1251 if (node
->leftnode
== NODE
)
1256 } else if (node
->right
) {
1258 if (node
->rightnode
== NODE
)
1268 bitmask
= 1 << node
->bitnum
;
1269 if (node
->mark
&& (leftmask
& bitmask
) == 0) {
1270 leftmask
|= bitmask
;
1271 if (node
->leftnode
== LEAF
) {
1273 data
= tree
->leaf_emit(node
->left
,
1275 size
= tree
->leaf_size(node
->left
);
1279 } else if (node
->left
) {
1280 assert(node
->leftnode
== NODE
);
1286 if (node
->mark
&& (rightmask
& bitmask
) == 0) {
1287 rightmask
|= bitmask
;
1288 if (node
->rightnode
== LEAF
) {
1289 assert(node
->right
);
1290 data
= tree
->leaf_emit(node
->right
,
1292 size
= tree
->leaf_size(node
->right
);
1296 } else if (node
->right
) {
1297 assert(node
->rightnode
== NODE
);
1303 leftmask
&= ~bitmask
;
1304 rightmask
&= ~bitmask
;
1305 node
= node
->parent
;
1311 printf("Emitted %d (%d) leaves",
1313 printf(" %d (%d+%d+%d+%d) nodes",
1314 nodes
[0] + nodes
[1] + nodes
[2] + nodes
[3],
1315 nodes
[0], nodes
[1], nodes
[2], nodes
[3]);
1316 printf(" %d total\n", index
- tree
->index
);
1320 /* ------------------------------------------------------------------ */
1325 * We need to keep track of the Canonical Combining Class, the Age,
1326 * and decompositions for a code point.
1328 * For the Age, we store the index into the ages table. Effectively
1329 * this is a generation number that the table maps to a unicode
1332 * The correction field is used to indicate that this entry is in the
1333 * corrections array, which contains decompositions that were
1334 * corrected in later revisions. The value of the correction field is
1335 * the Unicode version in which the mapping was corrected.
1337 struct unicode_data
{
1342 unsigned int *utf32nfkdi
;
1343 unsigned int *utf32nfkdicf
;
1348 struct unicode_data unicode_data
[0x110000];
1349 struct unicode_data
*corrections
;
1350 int corrections_count
;
1352 struct tree
*nfkdi_tree
;
1353 struct tree
*nfkdicf_tree
;
1359 * Check the corrections array to see if this entry was corrected at
1362 static struct unicode_data
*corrections_lookup(struct unicode_data
*u
)
1366 for (i
= 0; i
!= corrections_count
; i
++)
1367 if (u
->code
== corrections
[i
].code
)
1368 return &corrections
[i
];
1372 static int nfkdi_equal(void *l
, void *r
)
1374 struct unicode_data
*left
= l
;
1375 struct unicode_data
*right
= r
;
1377 if (left
->gen
!= right
->gen
)
1379 if (left
->ccc
!= right
->ccc
)
1381 if (left
->utf8nfkdi
&& right
->utf8nfkdi
&&
1382 strcmp(left
->utf8nfkdi
, right
->utf8nfkdi
) == 0)
1384 if (left
->utf8nfkdi
|| right
->utf8nfkdi
)
1389 static int nfkdicf_equal(void *l
, void *r
)
1391 struct unicode_data
*left
= l
;
1392 struct unicode_data
*right
= r
;
1394 if (left
->gen
!= right
->gen
)
1396 if (left
->ccc
!= right
->ccc
)
1398 if (left
->utf8nfkdicf
&& right
->utf8nfkdicf
&&
1399 strcmp(left
->utf8nfkdicf
, right
->utf8nfkdicf
) == 0)
1401 if (left
->utf8nfkdicf
&& right
->utf8nfkdicf
)
1403 if (left
->utf8nfkdicf
|| right
->utf8nfkdicf
)
1405 if (left
->utf8nfkdi
&& right
->utf8nfkdi
&&
1406 strcmp(left
->utf8nfkdi
, right
->utf8nfkdi
) == 0)
1408 if (left
->utf8nfkdi
|| right
->utf8nfkdi
)
1413 static void nfkdi_print(void *l
, int indent
)
1415 struct unicode_data
*leaf
= l
;
1417 printf("%*sleaf @ %p code %X ccc %d gen %d", indent
, "", leaf
,
1418 leaf
->code
, leaf
->ccc
, leaf
->gen
);
1419 if (leaf
->utf8nfkdi
&& leaf
->utf8nfkdi
[0] == HANGUL
)
1420 printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1421 else if (leaf
->utf8nfkdi
)
1422 printf(" nfkdi \"%s\"", (const char*)leaf
->utf8nfkdi
);
1426 static void nfkdicf_print(void *l
, int indent
)
1428 struct unicode_data
*leaf
= l
;
1430 printf("%*sleaf @ %p code %X ccc %d gen %d", indent
, "", leaf
,
1431 leaf
->code
, leaf
->ccc
, leaf
->gen
);
1432 if (leaf
->utf8nfkdicf
)
1433 printf(" nfkdicf \"%s\"", (const char*)leaf
->utf8nfkdicf
);
1434 else if (leaf
->utf8nfkdi
&& leaf
->utf8nfkdi
[0] == HANGUL
)
1435 printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1436 else if (leaf
->utf8nfkdi
)
1437 printf(" nfkdi \"%s\"", (const char*)leaf
->utf8nfkdi
);
1441 static int nfkdi_mark(void *l
)
1446 static int nfkdicf_mark(void *l
)
1448 struct unicode_data
*leaf
= l
;
1450 if (leaf
->utf8nfkdicf
)
1455 static int correction_mark(void *l
)
1457 struct unicode_data
*leaf
= l
;
1459 return leaf
->correction
;
1462 static int nfkdi_size(void *l
)
1464 struct unicode_data
*leaf
= l
;
1467 if (HANGUL_SYLLABLE(leaf
->code
))
1469 else if (leaf
->utf8nfkdi
)
1470 size
+= strlen(leaf
->utf8nfkdi
) + 1;
1474 static int nfkdicf_size(void *l
)
1476 struct unicode_data
*leaf
= l
;
1479 if (HANGUL_SYLLABLE(leaf
->code
))
1481 else if (leaf
->utf8nfkdicf
)
1482 size
+= strlen(leaf
->utf8nfkdicf
) + 1;
1483 else if (leaf
->utf8nfkdi
)
1484 size
+= strlen(leaf
->utf8nfkdi
) + 1;
1488 static int *nfkdi_index(struct tree
*tree
, void *l
)
1490 struct unicode_data
*leaf
= l
;
1492 return &tree
->leafindex
[leaf
->code
];
1495 static int *nfkdicf_index(struct tree
*tree
, void *l
)
1497 struct unicode_data
*leaf
= l
;
1499 return &tree
->leafindex
[leaf
->code
];
1502 static unsigned char *nfkdi_emit(void *l
, unsigned char *data
)
1504 struct unicode_data
*leaf
= l
;
1507 *data
++ = leaf
->gen
;
1508 if (HANGUL_SYLLABLE(leaf
->code
)) {
1509 *data
++ = DECOMPOSE
;
1511 } else if (leaf
->utf8nfkdi
) {
1512 *data
++ = DECOMPOSE
;
1513 s
= (unsigned char*)leaf
->utf8nfkdi
;
1514 while ((*data
++ = *s
++) != 0)
1517 *data
++ = leaf
->ccc
;
1522 static unsigned char *nfkdicf_emit(void *l
, unsigned char *data
)
1524 struct unicode_data
*leaf
= l
;
1527 *data
++ = leaf
->gen
;
1528 if (HANGUL_SYLLABLE(leaf
->code
)) {
1529 *data
++ = DECOMPOSE
;
1531 } else if (leaf
->utf8nfkdicf
) {
1532 *data
++ = DECOMPOSE
;
1533 s
= (unsigned char*)leaf
->utf8nfkdicf
;
1534 while ((*data
++ = *s
++) != 0)
1536 } else if (leaf
->utf8nfkdi
) {
1537 *data
++ = DECOMPOSE
;
1538 s
= (unsigned char*)leaf
->utf8nfkdi
;
1539 while ((*data
++ = *s
++) != 0)
1542 *data
++ = leaf
->ccc
;
1547 static void utf8_create(struct unicode_data
*data
)
1554 if (data
->utf8nfkdi
) {
1555 assert(data
->utf8nfkdi
[0] == HANGUL
);
1560 um
= data
->utf32nfkdi
;
1562 for (i
= 0; um
[i
]; i
++)
1563 u
+= utf8encode(u
, um
[i
]);
1565 data
->utf8nfkdi
= strdup(utf
);
1568 um
= data
->utf32nfkdicf
;
1570 for (i
= 0; um
[i
]; i
++)
1571 u
+= utf8encode(u
, um
[i
]);
1573 if (!data
->utf8nfkdi
|| strcmp(data
->utf8nfkdi
, utf
))
1574 data
->utf8nfkdicf
= strdup(utf
);
1578 static void utf8_init(void)
1580 unsigned int unichar
;
1583 for (unichar
= 0; unichar
!= 0x110000; unichar
++)
1584 utf8_create(&unicode_data
[unichar
]);
1586 for (i
= 0; i
!= corrections_count
; i
++)
1587 utf8_create(&corrections
[i
]);
1590 static void trees_init(void)
1592 struct unicode_data
*data
;
1593 unsigned int maxage
;
1594 unsigned int nextage
;
1599 /* Count the number of different ages. */
1601 nextage
= (unsigned int)-1;
1605 for (i
= 0; i
<= corrections_count
; i
++) {
1606 data
= &corrections
[i
];
1607 if (nextage
< data
->correction
&&
1608 data
->correction
< maxage
)
1609 nextage
= data
->correction
;
1614 /* Two trees per age: nfkdi and nfkdicf */
1615 trees_count
= count
* 2;
1616 trees
= calloc(trees_count
, sizeof(struct tree
));
1618 /* Assign ages to the trees. */
1619 count
= trees_count
;
1620 nextage
= (unsigned int)-1;
1623 trees
[--count
].maxage
= maxage
;
1624 trees
[--count
].maxage
= maxage
;
1626 for (i
= 0; i
<= corrections_count
; i
++) {
1627 data
= &corrections
[i
];
1628 if (nextage
< data
->correction
&&
1629 data
->correction
< maxage
)
1630 nextage
= data
->correction
;
1634 /* The ages assigned above are off by one. */
1635 for (i
= 0; i
!= trees_count
; i
++) {
1637 while (ages
[j
] < trees
[i
].maxage
)
1639 trees
[i
].maxage
= ages
[j
-1];
1642 /* Set up the forwarding between trees. */
1643 trees
[trees_count
-2].next
= &trees
[trees_count
-1];
1644 trees
[trees_count
-1].leaf_mark
= nfkdi_mark
;
1645 trees
[trees_count
-2].leaf_mark
= nfkdicf_mark
;
1646 for (i
= 0; i
!= trees_count
-2; i
+= 2) {
1647 trees
[i
].next
= &trees
[trees_count
-2];
1648 trees
[i
].leaf_mark
= correction_mark
;
1649 trees
[i
+1].next
= &trees
[trees_count
-1];
1650 trees
[i
+1].leaf_mark
= correction_mark
;
1653 /* Assign the callouts. */
1654 for (i
= 0; i
!= trees_count
; i
+= 2) {
1655 trees
[i
].type
= "nfkdicf";
1656 trees
[i
].leaf_equal
= nfkdicf_equal
;
1657 trees
[i
].leaf_print
= nfkdicf_print
;
1658 trees
[i
].leaf_size
= nfkdicf_size
;
1659 trees
[i
].leaf_index
= nfkdicf_index
;
1660 trees
[i
].leaf_emit
= nfkdicf_emit
;
1662 trees
[i
+1].type
= "nfkdi";
1663 trees
[i
+1].leaf_equal
= nfkdi_equal
;
1664 trees
[i
+1].leaf_print
= nfkdi_print
;
1665 trees
[i
+1].leaf_size
= nfkdi_size
;
1666 trees
[i
+1].leaf_index
= nfkdi_index
;
1667 trees
[i
+1].leaf_emit
= nfkdi_emit
;
1671 for (i
= 0; i
!= trees_count
; i
++)
1672 trees
[i
].childnode
= NODE
;
1675 static void trees_populate(void)
1677 struct unicode_data
*data
;
1678 unsigned int unichar
;
1683 for (i
= 0; i
!= trees_count
; i
++) {
1685 printf("Populating %s_%x\n",
1686 trees
[i
].type
, trees
[i
].maxage
);
1688 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
1689 if (unicode_data
[unichar
].gen
< 0)
1691 keylen
= utf8encode(keyval
, unichar
);
1692 data
= corrections_lookup(&unicode_data
[unichar
]);
1693 if (data
->correction
<= trees
[i
].maxage
)
1694 data
= &unicode_data
[unichar
];
1695 insert(&trees
[i
], keyval
, keylen
, data
);
1700 static void trees_reduce(void)
1706 for (i
= 0; i
!= trees_count
; i
++)
1708 for (i
= 0; i
!= trees_count
; i
++)
1709 mark_nodes(&trees
[i
]);
1712 for (i
= 0; i
!= trees_count
; i
++)
1713 size
= index_nodes(&trees
[i
], size
);
1715 for (i
= 0; i
!= trees_count
; i
++)
1716 changed
+= size_nodes(&trees
[i
]);
1719 utf8data
= calloc(size
, 1);
1720 utf8data_size
= size
;
1721 for (i
= 0; i
!= trees_count
; i
++)
1722 emit(&trees
[i
], utf8data
);
1725 for (i
= 0; i
!= trees_count
; i
++) {
1726 printf("%s_%x idx %d\n",
1727 trees
[i
].type
, trees
[i
].maxage
, trees
[i
].index
);
1731 nfkdi
= utf8data
+ trees
[trees_count
-1].index
;
1732 nfkdicf
= utf8data
+ trees
[trees_count
-2].index
;
1734 nfkdi_tree
= &trees
[trees_count
-1];
1735 nfkdicf_tree
= &trees
[trees_count
-2];
1738 static void verify(struct tree
*tree
)
1740 struct unicode_data
*data
;
1742 unsigned int unichar
;
1744 unsigned char hangul
[UTF8HANGULLEAF
];
1749 printf("Verifying %s_%x\n", tree
->type
, tree
->maxage
);
1750 nocf
= strcmp(tree
->type
, "nfkdicf");
1752 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
1754 data
= corrections_lookup(&unicode_data
[unichar
]);
1755 if (data
->correction
<= tree
->maxage
)
1756 data
= &unicode_data
[unichar
];
1757 utf8encode(key
,unichar
);
1758 leaf
= utf8lookup(tree
, hangul
, key
);
1761 if (data
->gen
!= -1)
1763 if (unichar
< 0xd800 || unichar
> 0xdfff)
1766 if (unichar
>= 0xd800 && unichar
<= 0xdfff)
1768 if (data
->gen
== -1)
1770 if (data
->gen
!= LEAF_GEN(leaf
))
1772 if (LEAF_CCC(leaf
) == DECOMPOSE
) {
1773 if (HANGUL_SYLLABLE(data
->code
)) {
1774 if (data
->utf8nfkdi
[0] != HANGUL
)
1777 if (!data
->utf8nfkdi
) {
1779 } else if (strcmp(data
->utf8nfkdi
,
1784 if (!data
->utf8nfkdicf
&&
1787 } else if (data
->utf8nfkdicf
) {
1788 if (strcmp(data
->utf8nfkdicf
,
1791 } else if (strcmp(data
->utf8nfkdi
,
1796 } else if (data
->ccc
!= LEAF_CCC(leaf
)) {
1801 printf("%X code %X gen %d ccc %d"
1803 unichar
, data
->code
, data
->gen
,
1807 printf(" gen %d ccc %d"
1811 LEAF_CCC(leaf
) == DECOMPOSE
?
1812 LEAF_STR(leaf
) : "");
1819 static void trees_verify(void)
1823 for (i
= 0; i
!= trees_count
; i
++)
1827 /* ------------------------------------------------------------------ */
1829 static void help(void)
1831 printf("Usage: %s [options]\n", argv0
);
1833 printf("This program creates an a data trie used for parsing and\n");
1834 printf("normalization of UTF-8 strings. The trie is derived from\n");
1835 printf("a set of input files from the Unicode character database\n");
1836 printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1838 printf("The generated tree supports two normalization forms:\n");
1840 printf("\tnfkdi:\n");
1841 printf("\t- Apply unicode normalization form NFKD.\n");
1842 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1844 printf("\tnfkdicf:\n");
1845 printf("\t- Apply unicode normalization form NFKD.\n");
1846 printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1847 printf("\t- Apply a full casefold (C + F).\n");
1849 printf("These forms were chosen as being most useful when dealing\n");
1850 printf("with file names: NFKD catches most cases where characters\n");
1851 printf("should be considered equivalent. The ignorables are mostly\n");
1852 printf("invisible, making names hard to type.\n");
1854 printf("The options to specify the files to be used are listed\n");
1855 printf("below with their default values, which are the names used\n");
1856 printf("by version 11.0.0 of the Unicode Character Database.\n");
1858 printf("The input files:\n");
1859 printf("\t-a %s\n", AGE_NAME
);
1860 printf("\t-c %s\n", CCC_NAME
);
1861 printf("\t-p %s\n", PROP_NAME
);
1862 printf("\t-d %s\n", DATA_NAME
);
1863 printf("\t-f %s\n", FOLD_NAME
);
1864 printf("\t-n %s\n", NORM_NAME
);
1866 printf("Additionally, the generated tables are tested using:\n");
1867 printf("\t-t %s\n", TEST_NAME
);
1869 printf("Finally, the output file:\n");
1870 printf("\t-o %s\n", UTF8_NAME
);
1874 static void usage(void)
1880 static void open_fail(const char *name
, int error
)
1882 printf("Error %d opening %s: %s\n", error
, name
, strerror(error
));
1886 static void file_fail(const char *filename
)
1888 printf("Error parsing %s\n", filename
);
1892 static void line_fail(const char *filename
, const char *line
)
1894 printf("Error parsing %s:%s\n", filename
, line
);
1898 /* ------------------------------------------------------------------ */
1900 static void print_utf32(unsigned int *utf32str
)
1904 for (i
= 0; utf32str
[i
]; i
++)
1905 printf(" %X", utf32str
[i
]);
1908 static void print_utf32nfkdi(unsigned int unichar
)
1910 printf(" %X ->", unichar
);
1911 print_utf32(unicode_data
[unichar
].utf32nfkdi
);
1915 static void print_utf32nfkdicf(unsigned int unichar
)
1917 printf(" %X ->", unichar
);
1918 print_utf32(unicode_data
[unichar
].utf32nfkdicf
);
1922 /* ------------------------------------------------------------------ */
1924 static void age_init(void)
1929 unsigned int unichar
;
1932 unsigned int revision
;
1938 printf("Parsing %s\n", age_name
);
1940 file
= fopen(age_name
, "r");
1942 open_fail(age_name
, errno
);
1946 while (fgets(line
, LINESIZE
, file
)) {
1947 ret
= sscanf(line
, "# Age=V%d_%d_%d",
1948 &major
, &minor
, &revision
);
1952 printf(" Age V%d_%d_%d\n",
1953 major
, minor
, revision
);
1954 if (!age_valid(major
, minor
, revision
))
1955 line_fail(age_name
, line
);
1958 ret
= sscanf(line
, "# Age=V%d_%d", &major
, &minor
);
1962 printf(" Age V%d_%d\n", major
, minor
);
1963 if (!age_valid(major
, minor
, 0))
1964 line_fail(age_name
, line
);
1969 /* We must have found something above. */
1971 printf("%d age entries\n", ages_count
);
1972 if (ages_count
== 0 || ages_count
> MAXGEN
)
1973 file_fail(age_name
);
1975 /* There is a 0 entry. */
1977 ages
= calloc(ages_count
+ 1, sizeof(*ages
));
1978 /* And a guard entry. */
1979 ages
[ages_count
] = (unsigned int)-1;
1984 while (fgets(line
, LINESIZE
, file
)) {
1985 ret
= sscanf(line
, "# Age=V%d_%d_%d",
1986 &major
, &minor
, &revision
);
1989 UNICODE_AGE(major
, minor
, revision
);
1991 printf(" Age V%d_%d_%d = gen %d\n",
1992 major
, minor
, revision
, gen
);
1993 if (!age_valid(major
, minor
, revision
))
1994 line_fail(age_name
, line
);
1997 ret
= sscanf(line
, "# Age=V%d_%d", &major
, &minor
);
1999 ages
[++gen
] = UNICODE_AGE(major
, minor
, 0);
2001 printf(" Age V%d_%d = %d\n",
2003 if (!age_valid(major
, minor
, 0))
2004 line_fail(age_name
, line
);
2007 ret
= sscanf(line
, "%X..%X ; %d.%d #",
2008 &first
, &last
, &major
, &minor
);
2010 for (unichar
= first
; unichar
<= last
; unichar
++)
2011 unicode_data
[unichar
].gen
= gen
;
2012 count
+= 1 + last
- first
;
2014 printf(" %X..%X gen %d\n", first
, last
, gen
);
2015 if (!utf32valid(first
) || !utf32valid(last
))
2016 line_fail(age_name
, line
);
2019 ret
= sscanf(line
, "%X ; %d.%d #", &unichar
, &major
, &minor
);
2021 unicode_data
[unichar
].gen
= gen
;
2024 printf(" %X gen %d\n", unichar
, gen
);
2025 if (!utf32valid(unichar
))
2026 line_fail(age_name
, line
);
2030 unicode_maxage
= ages
[gen
];
2033 /* Nix surrogate block */
2035 printf(" Removing surrogate block D800..DFFF\n");
2036 for (unichar
= 0xd800; unichar
<= 0xdfff; unichar
++)
2037 unicode_data
[unichar
].gen
= -1;
2040 printf("Found %d entries\n", count
);
2042 file_fail(age_name
);
2045 static void ccc_init(void)
2050 unsigned int unichar
;
2056 printf("Parsing %s\n", ccc_name
);
2058 file
= fopen(ccc_name
, "r");
2060 open_fail(ccc_name
, errno
);
2063 while (fgets(line
, LINESIZE
, file
)) {
2064 ret
= sscanf(line
, "%X..%X ; %d #", &first
, &last
, &value
);
2066 for (unichar
= first
; unichar
<= last
; unichar
++) {
2067 unicode_data
[unichar
].ccc
= value
;
2071 printf(" %X..%X ccc %d\n", first
, last
, value
);
2072 if (!utf32valid(first
) || !utf32valid(last
))
2073 line_fail(ccc_name
, line
);
2076 ret
= sscanf(line
, "%X ; %d #", &unichar
, &value
);
2078 unicode_data
[unichar
].ccc
= value
;
2081 printf(" %X ccc %d\n", unichar
, value
);
2082 if (!utf32valid(unichar
))
2083 line_fail(ccc_name
, line
);
2090 printf("Found %d entries\n", count
);
2092 file_fail(ccc_name
);
2095 static void nfkdi_init(void)
2098 unsigned int unichar
;
2099 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2107 printf("Parsing %s\n", data_name
);
2108 file
= fopen(data_name
, "r");
2110 open_fail(data_name
, errno
);
2113 while (fgets(line
, LINESIZE
, file
)) {
2114 ret
= sscanf(line
, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2118 if (!utf32valid(unichar
))
2119 line_fail(data_name
, line
);
2122 /* skip over <tag> */
2126 /* decode the decomposition into UTF-32 */
2129 mapping
[i
] = strtoul(s
, &s
, 16);
2130 if (!utf32valid(mapping
[i
]))
2131 line_fail(data_name
, line
);
2136 um
= malloc(i
* sizeof(unsigned int));
2137 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2138 unicode_data
[unichar
].utf32nfkdi
= um
;
2141 print_utf32nfkdi(unichar
);
2146 printf("Found %d entries\n", count
);
2148 file_fail(data_name
);
2151 static void nfkdicf_init(void)
2154 unsigned int unichar
;
2155 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2164 printf("Parsing %s\n", fold_name
);
2165 file
= fopen(fold_name
, "r");
2167 open_fail(fold_name
, errno
);
2170 while (fgets(line
, LINESIZE
, file
)) {
2171 ret
= sscanf(line
, "%X; %c; %[^;];", &unichar
, &status
, buf0
);
2174 if (!utf32valid(unichar
))
2175 line_fail(fold_name
, line
);
2176 /* Use the C+F casefold. */
2177 if (status
!= 'C' && status
!= 'F')
2185 mapping
[i
] = strtoul(s
, &s
, 16);
2186 if (!utf32valid(mapping
[i
]))
2187 line_fail(fold_name
, line
);
2192 um
= malloc(i
* sizeof(unsigned int));
2193 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2194 unicode_data
[unichar
].utf32nfkdicf
= um
;
2197 print_utf32nfkdicf(unichar
);
2202 printf("Found %d entries\n", count
);
2204 file_fail(fold_name
);
2207 static void ignore_init(void)
2210 unsigned int unichar
;
2218 printf("Parsing %s\n", prop_name
);
2219 file
= fopen(prop_name
, "r");
2221 open_fail(prop_name
, errno
);
2224 while (fgets(line
, LINESIZE
, file
)) {
2225 ret
= sscanf(line
, "%X..%X ; %s # ", &first
, &last
, buf0
);
2227 if (strcmp(buf0
, "Default_Ignorable_Code_Point"))
2229 if (!utf32valid(first
) || !utf32valid(last
))
2230 line_fail(prop_name
, line
);
2231 for (unichar
= first
; unichar
<= last
; unichar
++) {
2232 free(unicode_data
[unichar
].utf32nfkdi
);
2233 um
= malloc(sizeof(unsigned int));
2235 unicode_data
[unichar
].utf32nfkdi
= um
;
2236 free(unicode_data
[unichar
].utf32nfkdicf
);
2237 um
= malloc(sizeof(unsigned int));
2239 unicode_data
[unichar
].utf32nfkdicf
= um
;
2243 printf(" %X..%X Default_Ignorable_Code_Point\n",
2247 ret
= sscanf(line
, "%X ; %s # ", &unichar
, buf0
);
2249 if (strcmp(buf0
, "Default_Ignorable_Code_Point"))
2251 if (!utf32valid(unichar
))
2252 line_fail(prop_name
, line
);
2253 free(unicode_data
[unichar
].utf32nfkdi
);
2254 um
= malloc(sizeof(unsigned int));
2256 unicode_data
[unichar
].utf32nfkdi
= um
;
2257 free(unicode_data
[unichar
].utf32nfkdicf
);
2258 um
= malloc(sizeof(unsigned int));
2260 unicode_data
[unichar
].utf32nfkdicf
= um
;
2262 printf(" %X Default_Ignorable_Code_Point\n",
2271 printf("Found %d entries\n", count
);
2273 file_fail(prop_name
);
2276 static void corrections_init(void)
2279 unsigned int unichar
;
2282 unsigned int revision
;
2285 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2292 printf("Parsing %s\n", norm_name
);
2293 file
= fopen(norm_name
, "r");
2295 open_fail(norm_name
, errno
);
2298 while (fgets(line
, LINESIZE
, file
)) {
2299 ret
= sscanf(line
, "%X;%[^;];%[^;];%d.%d.%d #",
2300 &unichar
, buf0
, buf1
,
2301 &major
, &minor
, &revision
);
2304 if (!utf32valid(unichar
) || !age_valid(major
, minor
, revision
))
2305 line_fail(norm_name
, line
);
2308 corrections
= calloc(count
, sizeof(struct unicode_data
));
2309 corrections_count
= count
;
2313 while (fgets(line
, LINESIZE
, file
)) {
2314 ret
= sscanf(line
, "%X;%[^;];%[^;];%d.%d.%d #",
2315 &unichar
, buf0
, buf1
,
2316 &major
, &minor
, &revision
);
2319 if (!utf32valid(unichar
) || !age_valid(major
, minor
, revision
))
2320 line_fail(norm_name
, line
);
2321 corrections
[count
] = unicode_data
[unichar
];
2322 assert(corrections
[count
].code
== unichar
);
2323 age
= UNICODE_AGE(major
, minor
, revision
);
2324 corrections
[count
].correction
= age
;
2329 mapping
[i
] = strtoul(s
, &s
, 16);
2330 if (!utf32valid(mapping
[i
]))
2331 line_fail(norm_name
, line
);
2336 um
= malloc(i
* sizeof(unsigned int));
2337 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2338 corrections
[count
].utf32nfkdi
= um
;
2341 printf(" %X -> %s -> %s V%d_%d_%d\n",
2342 unichar
, buf0
, buf1
, major
, minor
, revision
);
2348 printf("Found %d entries\n", count
);
2350 file_fail(norm_name
);
2353 /* ------------------------------------------------------------------ */
2356 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2358 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2359 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2368 * NCount = 588 (VCount * TCount)
2369 * SCount = 11172 (LCount * NCount)
2372 * SIndex = s - SBase
2374 * LV (Canonical/Full)
2375 * LIndex = SIndex / NCount
2376 * VIndex = (Sindex % NCount) / TCount
2377 * LPart = LBase + LIndex
2378 * VPart = VBase + VIndex
2381 * LVIndex = (SIndex / TCount) * TCount
2382 * TIndex = (Sindex % TCount)
2383 * LVPart = SBase + LVIndex
2384 * TPart = TBase + TIndex
2387 * LIndex = SIndex / NCount
2388 * VIndex = (Sindex % NCount) / TCount
2389 * TIndex = (Sindex % TCount)
2390 * LPart = LBase + LIndex
2391 * VPart = VBase + VIndex
2392 * if (TIndex == 0) {
2393 * d = <LPart, VPart>
2395 * TPart = TBase + TIndex
2396 * d = <LPart, VPart, TPart>
2401 static void hangul_decompose(void)
2403 unsigned int sb
= 0xAC00;
2404 unsigned int lb
= 0x1100;
2405 unsigned int vb
= 0x1161;
2406 unsigned int tb
= 0x11a7;
2407 /* unsigned int lc = 19; */
2408 unsigned int vc
= 21;
2409 unsigned int tc
= 28;
2410 unsigned int nc
= (vc
* tc
);
2411 /* unsigned int sc = (lc * nc); */
2412 unsigned int unichar
;
2413 unsigned int mapping
[4];
2419 printf("Decomposing hangul\n");
2422 for (unichar
= 0xAC00; unichar
<= 0xD7A3; unichar
++) {
2423 unsigned int si
= unichar
- sb
;
2424 unsigned int li
= si
/ nc
;
2425 unsigned int vi
= (si
% nc
) / tc
;
2426 unsigned int ti
= si
% tc
;
2429 mapping
[i
++] = lb
+ li
;
2430 mapping
[i
++] = vb
+ vi
;
2432 mapping
[i
++] = tb
+ ti
;
2435 assert(!unicode_data
[unichar
].utf32nfkdi
);
2436 um
= malloc(i
* sizeof(unsigned int));
2437 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2438 unicode_data
[unichar
].utf32nfkdi
= um
;
2440 assert(!unicode_data
[unichar
].utf32nfkdicf
);
2441 um
= malloc(i
* sizeof(unsigned int));
2442 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2443 unicode_data
[unichar
].utf32nfkdicf
= um
;
2446 * Add a cookie as a reminder that the hangul syllable
2447 * decompositions must not be stored in the generated
2450 unicode_data
[unichar
].utf8nfkdi
= malloc(2);
2451 unicode_data
[unichar
].utf8nfkdi
[0] = HANGUL
;
2452 unicode_data
[unichar
].utf8nfkdi
[1] = '\0';
2455 print_utf32nfkdi(unichar
);
2460 printf("Created %d entries\n", count
);
2463 static void nfkdi_decompose(void)
2465 unsigned int unichar
;
2466 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2475 printf("Decomposing nfkdi\n");
2478 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
2479 if (!unicode_data
[unichar
].utf32nfkdi
)
2484 um
= unicode_data
[unichar
].utf32nfkdi
;
2486 dc
= unicode_data
[*um
].utf32nfkdi
;
2488 for (j
= 0; dc
[j
]; j
++)
2489 mapping
[i
++] = dc
[j
];
2499 free(unicode_data
[unichar
].utf32nfkdi
);
2500 um
= malloc(i
* sizeof(unsigned int));
2501 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2502 unicode_data
[unichar
].utf32nfkdi
= um
;
2504 /* Add this decomposition to nfkdicf if there is no entry. */
2505 if (!unicode_data
[unichar
].utf32nfkdicf
) {
2506 um
= malloc(i
* sizeof(unsigned int));
2507 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2508 unicode_data
[unichar
].utf32nfkdicf
= um
;
2511 print_utf32nfkdi(unichar
);
2515 printf("Processed %d entries\n", count
);
2518 static void nfkdicf_decompose(void)
2520 unsigned int unichar
;
2521 unsigned int mapping
[19]; /* Magic - guaranteed not to be exceeded. */
2530 printf("Decomposing nfkdicf\n");
2532 for (unichar
= 0; unichar
!= 0x110000; unichar
++) {
2533 if (!unicode_data
[unichar
].utf32nfkdicf
)
2538 um
= unicode_data
[unichar
].utf32nfkdicf
;
2540 dc
= unicode_data
[*um
].utf32nfkdicf
;
2542 for (j
= 0; dc
[j
]; j
++)
2543 mapping
[i
++] = dc
[j
];
2553 free(unicode_data
[unichar
].utf32nfkdicf
);
2554 um
= malloc(i
* sizeof(unsigned int));
2555 memcpy(um
, mapping
, i
* sizeof(unsigned int));
2556 unicode_data
[unichar
].utf32nfkdicf
= um
;
2559 print_utf32nfkdicf(unichar
);
2563 printf("Processed %d entries\n", count
);
2566 /* ------------------------------------------------------------------ */
2568 int utf8agemax(struct tree
*, const char *);
2569 int utf8nagemax(struct tree
*, const char *, size_t);
2570 int utf8agemin(struct tree
*, const char *);
2571 int utf8nagemin(struct tree
*, const char *, size_t);
2572 ssize_t
utf8len(struct tree
*, const char *);
2573 ssize_t
utf8nlen(struct tree
*, const char *, size_t);
2575 int utf8cursor(struct utf8cursor
*, struct tree
*, const char *);
2576 int utf8ncursor(struct utf8cursor
*, struct tree
*, const char *, size_t);
2577 int utf8byte(struct utf8cursor
*);
2580 * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2582 * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2583 * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2592 * NCount = 588 (VCount * TCount)
2593 * SCount = 11172 (LCount * NCount)
2596 * SIndex = s - SBase
2598 * LV (Canonical/Full)
2599 * LIndex = SIndex / NCount
2600 * VIndex = (Sindex % NCount) / TCount
2601 * LPart = LBase + LIndex
2602 * VPart = VBase + VIndex
2605 * LVIndex = (SIndex / TCount) * TCount
2606 * TIndex = (Sindex % TCount)
2607 * LVPart = SBase + LVIndex
2608 * TPart = TBase + TIndex
2611 * LIndex = SIndex / NCount
2612 * VIndex = (Sindex % NCount) / TCount
2613 * TIndex = (Sindex % TCount)
2614 * LPart = LBase + LIndex
2615 * VPart = VBase + VIndex
2616 * if (TIndex == 0) {
2617 * d = <LPart, VPart>
2619 * TPart = TBase + TIndex
2620 * d = <LPart, VPart, TPart>
2632 #define NC (VC * TC)
2633 #define SC (LC * NC)
2635 /* Algorithmic decomposition of hangul syllable. */
2636 static utf8leaf_t
*utf8hangul(const char *str
, unsigned char *hangul
)
2644 /* Calculate the SI, LI, VI, and TI values. */
2645 si
= utf8decode(str
) - SB
;
2647 vi
= (si
% NC
) / TC
;
2650 /* Fill in base of leaf. */
2653 LEAF_CCC(h
) = DECOMPOSE
;
2656 /* Add LPart, a 3-byte UTF-8 sequence. */
2657 h
+= utf8encode((char *)h
, li
+ LB
);
2659 /* Add VPart, a 3-byte UTF-8 sequence. */
2660 h
+= utf8encode((char *)h
, vi
+ VB
);
2662 /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2664 h
+= utf8encode((char *)h
, ti
+ TB
);
2666 /* Terminate string. */
2673 * Use trie to scan s, touching at most len bytes.
2674 * Returns the leaf if one exists, NULL otherwise.
2676 * A non-NULL return guarantees that the UTF-8 sequence starting at s
2677 * is well-formed and corresponds to a known unicode code point. The
2678 * shorthand for this will be "is valid UTF-8 unicode".
2680 static utf8leaf_t
*utf8nlookup(struct tree
*tree
, unsigned char *hangul
,
2681 const char *s
, size_t len
)
2683 utf8trie_t
*trie
= utf8data
+ tree
->index
;
2695 offlen
= (*trie
& OFFLEN
) >> OFFLEN_SHIFT
;
2696 if (*trie
& NEXTBYTE
) {
2701 mask
= 1 << (*trie
& BITNUM
);
2705 /* Right node at offset of trie */
2706 node
= (*trie
& RIGHTNODE
);
2707 offset
= trie
[offlen
];
2710 offset
|= trie
[offlen
];
2713 } else if (*trie
& RIGHTPATH
) {
2714 /* Right node after this node */
2715 node
= (*trie
& TRIENODE
);
2718 /* No right node. */
2724 /* Left node after this node. */
2725 node
= (*trie
& LEFTNODE
);
2727 } else if (*trie
& RIGHTPATH
) {
2731 /* Left node after this node */
2732 node
= (*trie
& TRIENODE
);
2738 * Hangul decomposition is done algorithmically. These are the
2739 * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2740 * always 3 bytes long, so s has been advanced twice, and the
2741 * start of the sequence is at s-2.
2743 if (LEAF_CCC(trie
) == DECOMPOSE
&& LEAF_STR(trie
)[0] == HANGUL
)
2744 trie
= utf8hangul(s
- 2, hangul
);
2749 * Use trie to scan s.
2750 * Returns the leaf if one exists, NULL otherwise.
2752 * Forwards to trie_nlookup().
2754 static utf8leaf_t
*utf8lookup(struct tree
*tree
, unsigned char *hangul
,
2757 return utf8nlookup(tree
, hangul
, s
, (size_t)-1);
2761 * Return the number of bytes used by the current UTF-8 sequence.
2762 * Assumes the input points to the first byte of a valid UTF-8
2765 static inline int utf8clen(const char *s
)
2767 unsigned char c
= *s
;
2768 return 1 + (c
>= 0xC0) + (c
>= 0xE0) + (c
>= 0xF0);
2772 * Maximum age of any character in s.
2773 * Return -1 if s is not valid UTF-8 unicode.
2774 * Return 0 if only non-assigned code points are used.
2776 int utf8agemax(struct tree
*tree
, const char *s
)
2781 unsigned char hangul
[UTF8HANGULLEAF
];
2787 leaf
= utf8lookup(tree
, hangul
, s
);
2790 leaf_age
= ages
[LEAF_GEN(leaf
)];
2791 if (leaf_age
<= tree
->maxage
&& leaf_age
> age
)
2799 * Minimum age of any character in s.
2800 * Return -1 if s is not valid UTF-8 unicode.
2801 * Return 0 if non-assigned code points are used.
2803 int utf8agemin(struct tree
*tree
, const char *s
)
2808 unsigned char hangul
[UTF8HANGULLEAF
];
2814 leaf
= utf8lookup(tree
, hangul
, s
);
2817 leaf_age
= ages
[LEAF_GEN(leaf
)];
2818 if (leaf_age
<= tree
->maxage
&& leaf_age
< age
)
2826 * Maximum age of any character in s, touch at most len bytes.
2827 * Return -1 if s is not valid UTF-8 unicode.
2829 int utf8nagemax(struct tree
*tree
, const char *s
, size_t len
)
2834 unsigned char hangul
[UTF8HANGULLEAF
];
2840 leaf
= utf8nlookup(tree
, hangul
, s
, len
);
2843 leaf_age
= ages
[LEAF_GEN(leaf
)];
2844 if (leaf_age
<= tree
->maxage
&& leaf_age
> age
)
2853 * Maximum age of any character in s, touch at most len bytes.
2854 * Return -1 if s is not valid UTF-8 unicode.
2856 int utf8nagemin(struct tree
*tree
, const char *s
, size_t len
)
2861 unsigned char hangul
[UTF8HANGULLEAF
];
2867 leaf
= utf8nlookup(tree
, hangul
, s
, len
);
2870 leaf_age
= ages
[LEAF_GEN(leaf
)];
2871 if (leaf_age
<= tree
->maxage
&& leaf_age
< age
)
2880 * Length of the normalization of s.
2881 * Return -1 if s is not valid UTF-8 unicode.
2883 * A string of Default_Ignorable_Code_Point has length 0.
2885 ssize_t
utf8len(struct tree
*tree
, const char *s
)
2889 unsigned char hangul
[UTF8HANGULLEAF
];
2894 leaf
= utf8lookup(tree
, hangul
, s
);
2897 if (ages
[LEAF_GEN(leaf
)] > tree
->maxage
)
2899 else if (LEAF_CCC(leaf
) == DECOMPOSE
)
2900 ret
+= strlen(LEAF_STR(leaf
));
2909 * Length of the normalization of s, touch at most len bytes.
2910 * Return -1 if s is not valid UTF-8 unicode.
2912 ssize_t
utf8nlen(struct tree
*tree
, const char *s
, size_t len
)
2916 unsigned char hangul
[UTF8HANGULLEAF
];
2921 leaf
= utf8nlookup(tree
, hangul
, s
, len
);
2924 if (ages
[LEAF_GEN(leaf
)] > tree
->maxage
)
2926 else if (LEAF_CCC(leaf
) == DECOMPOSE
)
2927 ret
+= strlen(LEAF_STR(leaf
));
2937 * Cursor structure used by the normalizer.
2949 unsigned int unichar
;
2950 unsigned char hangul
[UTF8HANGULLEAF
];
2954 * Set up an utf8cursor for use by utf8byte().
2957 * len : length of s.
2958 * u8c : pointer to cursor.
2959 * trie : utf8trie_t to use for normalization.
2961 * Returns -1 on error, 0 on success.
2963 int utf8ncursor(struct utf8cursor
*u8c
, struct tree
*tree
, const char *s
,
2978 u8c
->nccc
= STOPPER
;
2980 /* Check we didn't clobber the maximum length. */
2981 if (u8c
->len
!= len
)
2983 /* The first byte of s may not be an utf8 continuation. */
2984 if (len
> 0 && (*s
& 0xC0) == 0x80)
2990 * Set up an utf8cursor for use by utf8byte().
2992 * s : NUL-terminated string.
2993 * u8c : pointer to cursor.
2994 * trie : utf8trie_t to use for normalization.
2996 * Returns -1 on error, 0 on success.
2998 int utf8cursor(struct utf8cursor
*u8c
, struct tree
*tree
, const char *s
)
3000 return utf8ncursor(u8c
, tree
, s
, (unsigned int)-1);
3004 * Get one byte from the normalized form of the string described by u8c.
3006 * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3008 * The cursor keeps track of the location in the string in u8c->s.
3009 * When a character is decomposed, the current location is stored in
3010 * u8c->p, and u8c->s is set to the start of the decomposition. Note
3011 * that bytes from a decomposition do not count against u8c->len.
3013 * Characters are emitted if they match the current CCC in u8c->ccc.
3014 * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3015 * and the function returns 0 in that case.
3017 * Sorting by CCC is done by repeatedly scanning the string. The
3018 * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3019 * the start of the scan. The first pass finds the lowest CCC to be
3020 * emitted and stores it in u8c->nccc, the second pass emits the
3021 * characters with this CCC and finds the next lowest CCC. This limits
3022 * the number of passes to 1 + the number of different CCCs in the
3023 * sequence being scanned.
3026 * u8c->p != NULL -> a decomposition is being scanned.
3027 * u8c->ss != NULL -> this is a repeating scan.
3028 * u8c->ccc == -1 -> this is the first scan of a repeating scan.
3030 int utf8byte(struct utf8cursor
*u8c
)
3036 /* Check for the end of a decomposed character. */
3037 if (u8c
->p
&& *u8c
->s
== '\0') {
3042 /* Check for end-of-string. */
3043 if (!u8c
->p
&& (u8c
->len
== 0 || *u8c
->s
== '\0')) {
3044 /* There is no next byte. */
3045 if (u8c
->ccc
== STOPPER
)
3047 /* End-of-string during a scan counts as a stopper. */
3050 } else if ((*u8c
->s
& 0xC0) == 0x80) {
3051 /* This is a continuation of the current character. */
3054 return (unsigned char)*u8c
->s
++;
3057 /* Look up the data for the current character. */
3059 leaf
= utf8lookup(u8c
->tree
, u8c
->hangul
, u8c
->s
);
3061 leaf
= utf8nlookup(u8c
->tree
, u8c
->hangul
,
3065 /* No leaf found implies that the input is a binary blob. */
3069 /* Characters that are too new have CCC 0. */
3070 if (ages
[LEAF_GEN(leaf
)] > u8c
->tree
->maxage
) {
3072 } else if ((ccc
= LEAF_CCC(leaf
)) == DECOMPOSE
) {
3073 u8c
->len
-= utf8clen(u8c
->s
);
3074 u8c
->p
= u8c
->s
+ utf8clen(u8c
->s
);
3075 u8c
->s
= LEAF_STR(leaf
);
3076 /* Empty decomposition implies CCC 0. */
3077 if (*u8c
->s
== '\0') {
3078 if (u8c
->ccc
== STOPPER
)
3083 leaf
= utf8lookup(u8c
->tree
, u8c
->hangul
, u8c
->s
);
3084 ccc
= LEAF_CCC(leaf
);
3086 u8c
->unichar
= utf8decode(u8c
->s
);
3089 * If this is not a stopper, then see if it updates
3090 * the next canonical class to be emitted.
3092 if (ccc
!= STOPPER
&& u8c
->ccc
< ccc
&& ccc
< u8c
->nccc
)
3096 * Return the current byte if this is the current
3099 if (ccc
== u8c
->ccc
) {
3102 return (unsigned char)*u8c
->s
++;
3105 /* Current combining class mismatch. */
3107 if (u8c
->nccc
== STOPPER
) {
3109 * Scan forward for the first canonical class
3110 * to be emitted. Save the position from
3113 assert(u8c
->ccc
== STOPPER
);
3114 u8c
->ccc
= MINCCC
- 1;
3118 u8c
->slen
= u8c
->len
;
3120 u8c
->len
-= utf8clen(u8c
->s
);
3121 u8c
->s
+= utf8clen(u8c
->s
);
3122 } else if (ccc
!= STOPPER
) {
3123 /* Not a stopper, and not the ccc we're emitting. */
3125 u8c
->len
-= utf8clen(u8c
->s
);
3126 u8c
->s
+= utf8clen(u8c
->s
);
3127 } else if (u8c
->nccc
!= MAXCCC
+ 1) {
3128 /* At a stopper, restart for next ccc. */
3129 u8c
->ccc
= u8c
->nccc
;
3130 u8c
->nccc
= MAXCCC
+ 1;
3133 u8c
->len
= u8c
->slen
;
3135 /* All done, proceed from here. */
3137 u8c
->nccc
= STOPPER
;
3145 /* ------------------------------------------------------------------ */
3147 static int normalize_line(struct tree
*tree
)
3152 struct utf8cursor u8c
;
3154 /* First test: null-terminated string. */
3157 if (utf8cursor(&u8c
, tree
, s
))
3159 while ((c
= utf8byte(&u8c
)) > 0)
3160 if (c
!= (unsigned char)*t
++)
3167 /* Second test: length-limited string. */
3169 /* Replace NUL with a value that will cause an error if seen. */
3170 s
[strlen(s
) + 1] = -1;
3172 if (utf8cursor(&u8c
, tree
, s
))
3174 while ((c
= utf8byte(&u8c
)) > 0)
3175 if (c
!= (unsigned char)*t
++)
3185 static void normalization_test(void)
3188 unsigned int unichar
;
3189 struct unicode_data
*data
;
3198 printf("Parsing %s\n", test_name
);
3199 /* Step one, read data from file. */
3200 file
= fopen(test_name
, "r");
3202 open_fail(test_name
, errno
);
3204 while (fgets(line
, LINESIZE
, file
)) {
3205 ret
= sscanf(line
, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];",
3207 if (ret
!= 2 || *line
== '#')
3212 unichar
= strtoul(s
, &s
, 16);
3213 t
+= utf8encode(t
, unichar
);
3221 unichar
= strtoul(s
, &s
, 16);
3222 data
= &unicode_data
[unichar
];
3223 if (data
->utf8nfkdi
&& !*data
->utf8nfkdi
)
3226 t
+= utf8encode(t
, unichar
);
3231 if (normalize_line(nfkdi_tree
) < 0) {
3232 printf("Line %s -> %s", buf0
, buf1
);
3234 printf(" (ignorables removed)");
3235 printf(" failure\n");
3241 printf("Ran %d tests with %d failures\n", tests
, failures
);
3243 file_fail(test_name
);
3246 /* ------------------------------------------------------------------ */
3248 static void write_file(void)
3257 printf("Writing %s\n", utf8_name
);
3258 file
= fopen(utf8_name
, "w");
3260 open_fail(utf8_name
, errno
);
3262 fprintf(file
, "/* This file is generated code, do not edit. */\n");
3263 fprintf(file
, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3264 fprintf(file
, "#error Only nls_utf8-norm.c should include this file.\n");
3265 fprintf(file
, "#endif\n");
3266 fprintf(file
, "\n");
3267 fprintf(file
, "static const unsigned int utf8vers = %#x;\n",
3269 fprintf(file
, "\n");
3270 fprintf(file
, "static const unsigned int utf8agetab[] = {\n");
3271 for (i
= 0; i
!= ages_count
; i
++)
3272 fprintf(file
, "\t%#x%s\n", ages
[i
],
3273 ages
[i
] == unicode_maxage
? "" : ",");
3274 fprintf(file
, "};\n");
3275 fprintf(file
, "\n");
3276 fprintf(file
, "static const struct utf8data utf8nfkdicfdata[] = {\n");
3278 for (gen
= 0; gen
< ages_count
; gen
++) {
3279 fprintf(file
, "\t{ %#x, %d }%s\n",
3280 ages
[gen
], trees
[t
].index
,
3281 ages
[gen
] == unicode_maxage
? "" : ",");
3282 if (trees
[t
].maxage
== ages
[gen
])
3285 fprintf(file
, "};\n");
3286 fprintf(file
, "\n");
3287 fprintf(file
, "static const struct utf8data utf8nfkdidata[] = {\n");
3289 for (gen
= 0; gen
< ages_count
; gen
++) {
3290 fprintf(file
, "\t{ %#x, %d }%s\n",
3291 ages
[gen
], trees
[t
].index
,
3292 ages
[gen
] == unicode_maxage
? "" : ",");
3293 if (trees
[t
].maxage
== ages
[gen
])
3296 fprintf(file
, "};\n");
3297 fprintf(file
, "\n");
3298 fprintf(file
, "static const unsigned char utf8data[%zd] = {\n",
3301 for (i
= 0; i
!= utf8data_size
; i
+= 16) {
3302 if (i
== trees
[t
].index
) {
3303 fprintf(file
, "\t/* %s_%x */\n",
3304 trees
[t
].type
, trees
[t
].maxage
);
3305 if (t
< trees_count
-1)
3308 fprintf(file
, "\t");
3309 for (j
= i
; j
!= i
+ 16; j
++)
3310 fprintf(file
, "0x%.2x%s", utf8data
[j
],
3311 (j
< utf8data_size
-1 ? "," : ""));
3312 fprintf(file
, "\n");
3314 fprintf(file
, "};\n");
3318 /* ------------------------------------------------------------------ */
3320 int main(int argc
, char *argv
[])
3322 unsigned int unichar
;
3327 while ((opt
= getopt(argc
, argv
, "a:c:d:f:hn:o:p:t:v")) != -1) {
3366 for (unichar
= 0; unichar
!= 0x110000; unichar
++)
3367 unicode_data
[unichar
].code
= unichar
;
3376 nfkdicf_decompose();
3382 /* Prevent "unused function" warning. */
3383 (void)lookup(nfkdi_tree
, " ");
3385 tree_walk(nfkdi_tree
);
3387 tree_walk(nfkdicf_tree
);
3388 normalization_test();