util/mkutf8data.c

   1 /*
   2  * Copyright (c) 2014 SGI.
   3  * All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18
  19 /* Generator for a compact trie for unicode normalization */
  20
  21 #include <sys/types.h>
  22 #include <stddef.h>
  23 #include <stdlib.h>
  24 #include <stdio.h>
  25 #include <assert.h>
  26 #include <string.h>
  27 #include <unistd.h>
  28 #include <errno.h>
  29
  30 /* Default names of the in- and output files. */
  31
  32 #define AGE_NAME        "DerivedAge.txt"
  33 #define CCC_NAME        "DerivedCombiningClass.txt"
  34 #define PROP_NAME       "DerivedCoreProperties.txt"
  35 #define DATA_NAME       "UnicodeData.txt"
  36 #define FOLD_NAME       "CaseFolding.txt"
  37 #define NORM_NAME       "NormalizationCorrections.txt"
  38 #define TEST_NAME       "NormalizationTest.txt"
  39 #define UTF8_NAME       "utf8data.h"
  40
  41 const char      *age_name  = AGE_NAME;
  42 const char      *ccc_name  = CCC_NAME;
  43 const char      *prop_name = PROP_NAME;
  44 const char      *data_name = DATA_NAME;
  45 const char      *fold_name = FOLD_NAME;
  46 const char      *norm_name = NORM_NAME;
  47 const char      *test_name = TEST_NAME;
  48 const char      *utf8_name = UTF8_NAME;
  49
  50 int verbose = 0;
  51
  52 /* An arbitrary line size limit on input lines. */
  53
  54 #define LINESIZE        1024
  55 char line[LINESIZE];
  56 char buf0[LINESIZE];
  57 char buf1[LINESIZE];
  58 char buf2[LINESIZE];
  59 char buf3[LINESIZE];
  60
  61 const char *argv0;
  62
  63 /* ------------------------------------------------------------------ */
  64
  65 /*
  66  * Unicode version numbers consist of three parts: major, minor, and a
  67  * revision.  These numbers are packed into an unsigned int to obtain
  68  * a single version number.
  69  *
  70  * To save space in the generated trie, the unicode version is not
  71  * stored directly, instead we calculate a generation number from the
  72  * unicode versions seen in the DerivedAge file, and use that as an
  73  * index into a table of unicode versions.
  74  */
  75 #define UNICODE_MAJ_SHIFT               (16)
  76 #define UNICODE_MIN_SHIFT               (8)
  77
  78 #define UNICODE_MAJ_MAX                 ((unsigned short)-1)
  79 #define UNICODE_MIN_MAX                 ((unsigned char)-1)
  80 #define UNICODE_REV_MAX                 ((unsigned char)-1)
  81
  82 #define UNICODE_AGE(MAJ,MIN,REV)                        \
  83         (((unsigned int)(MAJ) << UNICODE_MAJ_SHIFT) |   \
  84          ((unsigned int)(MIN) << UNICODE_MIN_SHIFT) |   \
  85          ((unsigned int)(REV)))
  86
  87 unsigned int *ages;
  88 int ages_count;
  89
  90 unsigned int unicode_maxage;
  91
  92 static int age_valid(unsigned int major, unsigned int minor,
  93                      unsigned int revision)
  94 {
  95         if (major > UNICODE_MAJ_MAX)
  96                 return 0;
  97         if (minor > UNICODE_MIN_MAX)
  98                 return 0;
  99         if (revision > UNICODE_REV_MAX)
 100                 return 0;
 101         return 1;
 102 }
 103
 104 /* ------------------------------------------------------------------ */
 105
 106 /*
 107  * utf8trie_t
 108  *
 109  * A compact binary tree, used to decode UTF-8 characters.
 110  *
 111  * Internal nodes are one byte for the node itself, and up to three
 112  * bytes for an offset into the tree.  The first byte contains the
 113  * following information:
 114  *  NEXTBYTE  - flag        - advance to next byte if set
 115  *  BITNUM    - 3 bit field - the bit number to tested
 116  *  OFFLEN    - 2 bit field - number of bytes in the offset
 117  * if offlen == 0 (non-branching node)
 118  *  RIGHTPATH - 1 bit field - set if the following node is for the
 119  *                            right-hand path (tested bit is set)
 120  *  TRIENODE  - 1 bit field - set if the following node is an internal
 121  *                            node, otherwise it is a leaf node
 122  * if offlen != 0 (branching node)
 123  *  LEFTNODE  - 1 bit field - set if the left-hand node is internal
 124  *  RIGHTNODE - 1 bit field - set if the right-hand node is internal
 125  *
 126  * Due to the way utf8 works, there cannot be branching nodes with
 127  * NEXTBYTE set, and moreover those nodes always have a righthand
 128  * descendant.
 129  */
 130 typedef unsigned char utf8trie_t;
 131 #define BITNUM          0x07
 132 #define NEXTBYTE        0x08
 133 #define OFFLEN          0x30
 134 #define OFFLEN_SHIFT    4
 135 #define RIGHTPATH       0x40
 136 #define TRIENODE        0x80
 137 #define RIGHTNODE       0x40
 138 #define LEFTNODE        0x80
 139
 140 /*
 141  * utf8leaf_t
 142  *
 143  * The leaves of the trie are embedded in the trie, and so the same
 144  * underlying datatype, unsigned char.
 145  *
 146  * leaf[0]: The unicode version, stored as a generation number that is
 147  *          an index into utf8agetab[].  With this we can filter code
 148  *          points based on the unicode version in which they were
 149  *          defined.  The CCC of a non-defined code point is 0.
 150  * leaf[1]: Canonical Combining Class. During normalization, we need
 151  *          to do a stable sort into ascending order of all characters
 152  *          with a non-zero CCC that occur between two characters with
 153  *          a CCC of 0, or at the begin or end of a string.
 154  *          The unicode standard guarantees that all CCC values are
 155  *          between 0 and 254 inclusive, which leaves 255 available as
 156  *          a special value.
 157  *          Code points with CCC 0 are known as stoppers.
 158  * leaf[2]: Decomposition. If leaf[1] == 255, then leaf[2] is the
 159  *          start of a NUL-terminated string that is the decomposition
 160  *          of the character.
 161  *          The CCC of a decomposable character is the same as the CCC
 162  *          of the first character of its decomposition.
 163  *          Some characters decompose as the empty string: these are
 164  *          characters with the Default_Ignorable_Code_Point property.
 165  *          These do affect normalization, as they all have CCC 0.
 166  *
 167  * The decompositions in the trie have been fully expanded.
 168  *
 169  * Casefolding, if applicable, is also done using decompositions.
 170  */
 171 typedef unsigned char utf8leaf_t;
 172
 173 #define LEAF_GEN(LEAF)  ((LEAF)[0])
 174 #define LEAF_CCC(LEAF)  ((LEAF)[1])
 175 #define LEAF_STR(LEAF)  ((const char*)((LEAF) + 2))
 176
 177 #define MAXGEN          (255)
 178
 179 #define MINCCC          (0)
 180 #define MAXCCC          (254)
 181 #define STOPPER         (0)
 182 #define DECOMPOSE       (255)
 183 #define HANGUL          ((char)(255))
 184
 185 #define UTF8HANGULLEAF  (12)
 186
 187 struct tree;
 188 static utf8leaf_t *utf8nlookup(struct tree *, unsigned char *,
 189                                const char *, size_t);
 190 static utf8leaf_t *utf8lookup(struct tree *, unsigned char *, const char *);
 191
 192 unsigned char *utf8data;
 193 size_t utf8data_size;
 194
 195 utf8trie_t *nfkdi;
 196 utf8trie_t *nfkdicf;
 197
 198 /* ------------------------------------------------------------------ */
 199
 200 /*
 201  * UTF8 valid ranges.
 202  *
 203  * The UTF-8 encoding spreads the bits of a 32bit word over several
 204  * bytes. This table gives the ranges that can be held and how they'd
 205  * be represented.
 206  *
 207  * 0x00000000 0x0000007F: 0xxxxxxx
 208  * 0x00000000 0x000007FF: 110xxxxx 10xxxxxx
 209  * 0x00000000 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 210  * 0x00000000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 211  * 0x00000000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 212  * 0x00000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 213  *
 214  * There is an additional requirement on UTF-8, in that only the
 215  * shortest representation of a 32bit value is to be used.  A decoder
 216  * must not decode sequences that do not satisfy this requirement.
 217  * Thus the allowed ranges have a lower bound.
 218  *
 219  * 0x00000000 0x0000007F: 0xxxxxxx
 220  * 0x00000080 0x000007FF: 110xxxxx 10xxxxxx
 221  * 0x00000800 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
 222  * 0x00010000 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 223  * 0x00200000 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 224  * 0x04000000 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 225  *
 226  * Actual unicode characters are limited to the range 0x0 - 0x10FFFF,
 227  * 17 planes of 65536 values.  This limits the sequences actually seen
 228  * even more, to just the following.
 229  *
 230  *          0 -     0x7f: 0                     0x7f
 231  *       0x80 -    0x7ff: 0xc2 0x80             0xdf 0xbf
 232  *      0x800 -   0xffff: 0xe0 0xa0 0x80        0xef 0xbf 0xbf
 233  *    0x10000 - 0x10ffff: 0xf0 0x90 0x80 0x80   0xf4 0x8f 0xbf 0xbf
 234  *
 235  * Even within those ranges not all values are allowed: the surrogates
 236  * 0xd800 - 0xdfff should never be seen.
 237  *
 238  * Note that the longest sequence seen with valid usage is 4 bytes,
 239  * the same a single UTF-32 character.  This makes the UTF-8
 240  * representation of Unicode strictly smaller than UTF-32.
 241  *
 242  * The shortest sequence requirement was introduced by:
 243  *    Corrigendum #1: UTF-8 Shortest Form
 244  * It can be found here:
 245  *    http://www.unicode.org/versions/corrigendum1.html
 246  *
 247  */
 248
 249 #define UTF8_2_BITS     0xC0
 250 #define UTF8_3_BITS     0xE0
 251 #define UTF8_4_BITS     0xF0
 252 #define UTF8_N_BITS     0x80
 253 #define UTF8_2_MASK     0xE0
 254 #define UTF8_3_MASK     0xF0
 255 #define UTF8_4_MASK     0xF8
 256 #define UTF8_N_MASK     0xC0
 257 #define UTF8_V_MASK     0x3F
 258 #define UTF8_V_SHIFT    6
 259
 260 static int utf8encode(char *str, unsigned int val)
 261 {
 262         int len;
 263
 264         if (val < 0x80) {
 265                 str[0] = val;
 266                 len = 1;
 267         } else if (val < 0x800) {
 268                 str[1] = val & UTF8_V_MASK;
 269                 str[1] |= UTF8_N_BITS;
 270                 val >>= UTF8_V_SHIFT;
 271                 str[0] = val;
 272                 str[0] |= UTF8_2_BITS;
 273                 len = 2;
 274         } else if (val < 0x10000) {
 275                 str[2] = val & UTF8_V_MASK;
 276                 str[2] |= UTF8_N_BITS;
 277                 val >>= UTF8_V_SHIFT;
 278                 str[1] = val & UTF8_V_MASK;
 279                 str[1] |= UTF8_N_BITS;
 280                 val >>= UTF8_V_SHIFT;
 281                 str[0] = val;
 282                 str[0] |= UTF8_3_BITS;
 283                 len = 3;
 284         } else if (val < 0x110000) {
 285                 str[3] = val & UTF8_V_MASK;
 286                 str[3] |= UTF8_N_BITS;
 287                 val >>= UTF8_V_SHIFT;
 288                 str[2] = val & UTF8_V_MASK;
 289                 str[2] |= UTF8_N_BITS;
 290                 val >>= UTF8_V_SHIFT;
 291                 str[1] = val & UTF8_V_MASK;
 292                 str[1] |= UTF8_N_BITS;
 293                 val >>= UTF8_V_SHIFT;
 294                 str[0] = val;
 295                 str[0] |= UTF8_4_BITS;
 296                 len = 4;
 297         } else {
 298                 printf("%#x: illegal val\n", val);
 299                 len = 0;
 300         }
 301         return len;
 302 }
 303
 304 static unsigned int utf8decode(const char *str)
 305 {
 306         const unsigned char *s = (const unsigned char*)str;
 307         unsigned int unichar = 0;
 308
 309         if (*s < 0x80) {
 310                 unichar = *s;
 311         } else if (*s < UTF8_3_BITS) {
 312                 unichar = *s++ & 0x1F;
 313                 unichar <<= UTF8_V_SHIFT;
 314                 unichar |= *s & 0x3F;
 315         } else if (*s < UTF8_4_BITS) {
 316                 unichar = *s++ & 0x0F;
 317                 unichar <<= UTF8_V_SHIFT;
 318                 unichar |= *s++ & 0x3F;
 319                 unichar <<= UTF8_V_SHIFT;
 320                 unichar |= *s & 0x3F;
 321         } else {
 322                 unichar = *s++ & 0x0F;
 323                 unichar <<= UTF8_V_SHIFT;
 324                 unichar |= *s++ & 0x3F;
 325                 unichar <<= UTF8_V_SHIFT;
 326                 unichar |= *s++ & 0x3F;
 327                 unichar <<= UTF8_V_SHIFT;
 328                 unichar |= *s & 0x3F;
 329         }
 330         return unichar;
 331 }
 332
 333 static int utf32valid(unsigned int unichar)
 334 {
 335         return unichar < 0x110000;
 336 }
 337
 338 #define HANGUL_SYLLABLE(U)      ((U) >= 0xAC00 && (U) <= 0xD7A3)
 339
 340 #define NODE 1
 341 #define LEAF 0
 342
 343 struct tree {
 344         void *root;
 345         int childnode;
 346         const char *type;
 347         unsigned int maxage;
 348         struct tree *next;
 349         int (*leaf_equal)(void *, void *);
 350         void (*leaf_print)(void *, int);
 351         int (*leaf_mark)(void *);
 352         int (*leaf_size)(void *);
 353         int *(*leaf_index)(struct tree *, void *);
 354         unsigned char *(*leaf_emit)(void *, unsigned char *);
 355         int leafindex[0x110000];
 356         int index;
 357 };
 358
 359 struct node {
 360         int index;
 361         int offset;
 362         int mark;
 363         int size;
 364         struct node *parent;
 365         void *left;
 366         void *right;
 367         unsigned char bitnum;
 368         unsigned char nextbyte;
 369         unsigned char leftnode;
 370         unsigned char rightnode;
 371         unsigned int keybits;
 372         unsigned int keymask;
 373 };
 374
 375 /*
 376  * Example lookup function for a tree.
 377  */
 378 static void *lookup(struct tree *tree, const char *key)
 379 {
 380         struct node *node;
 381         void *leaf = NULL;
 382
 383         node = tree->root;
 384         while (!leaf && node) {
 385                 if (node->nextbyte)
 386                         key++;
 387                 if (*key & (1 << (node->bitnum & 7))) {
 388                         /* Right leg */
 389                         if (node->rightnode == NODE) {
 390                                 node = node->right;
 391                         } else if (node->rightnode == LEAF) {
 392                                 leaf = node->right;
 393                         } else {
 394                                 node = NULL;
 395                         }
 396                 } else {
 397                         /* Left leg */
 398                         if (node->leftnode == NODE) {
 399                                 node = node->left;
 400                         } else if (node->leftnode == LEAF) {
 401                                 leaf = node->left;
 402                         } else {
 403                                 node = NULL;
 404                         }
 405                 }
 406         }
 407
 408         return leaf;
 409 }
 410
 411 /*
 412  * A simple non-recursive tree walker: keep track of visits to the
 413  * left and right branches in the leftmask and rightmask.
 414  */
 415 static void tree_walk(struct tree *tree)
 416 {
 417         struct node *node;
 418         unsigned int leftmask;
 419         unsigned int rightmask;
 420         unsigned int bitmask;
 421         int indent = 1;
 422         int nodes, singletons, leaves;
 423
 424         nodes = singletons = leaves = 0;
 425
 426         printf("%s_%x root %p\n", tree->type, tree->maxage, tree->root);
 427         if (tree->childnode == LEAF) {
 428                 assert(tree->root);
 429                 tree->leaf_print(tree->root, indent);
 430                 leaves = 1;
 431         } else {
 432                 assert(tree->childnode == NODE);
 433                 node = tree->root;
 434                 leftmask = rightmask = 0;
 435                 while (node) {
 436                         printf("%*snode @ %p bitnum %d nextbyte %d"
 437                                " left %p right %p mask %x bits %x\n",
 438                                 indent, "", node,
 439                                 node->bitnum, node->nextbyte,
 440                                 node->left, node->right,
 441                                 node->keymask, node->keybits);
 442                         nodes += 1;
 443                         if (!(node->left && node->right))
 444                                 singletons += 1;
 445
 446                         while (node) {
 447                                 bitmask = 1 << node->bitnum;
 448                                 if ((leftmask & bitmask) == 0) {
 449                                         leftmask |= bitmask;
 450                                         if (node->leftnode == LEAF) {
 451                                                 assert(node->left);
 452                                                 tree->leaf_print(node->left,
 453                                                                  indent+1);
 454                                                 leaves += 1;
 455                                         } else if (node->left) {
 456                                                 assert(node->leftnode == NODE);
 457                                                 indent += 1;
 458                                                 node = node->left;
 459                                                 break;
 460                                         }
 461                                 }
 462                                 if ((rightmask & bitmask) == 0) {
 463                                         rightmask |= bitmask;
 464                                         if (node->rightnode == LEAF) {
 465                                                 assert(node->right);
 466                                                 tree->leaf_print(node->right,
 467                                                                  indent+1);
 468                                                 leaves += 1;
 469                                         } else if (node->right) {
 470                                                 assert(node->rightnode == NODE);
 471                                                 indent += 1;
 472                                                 node = node->right;
 473                                                 break;
 474                                         }
 475                                 }
 476                                 leftmask &= ~bitmask;
 477                                 rightmask &= ~bitmask;
 478                                 node = node->parent;
 479                                 indent -= 1;
 480                         }
 481                 }
 482         }
 483         printf("nodes %d leaves %d singletons %d\n",
 484                nodes, leaves, singletons);
 485 }
 486
 487 /*
 488  * Allocate an initialize a new internal node.
 489  */
 490 static struct node *alloc_node(struct node *parent)
 491 {
 492         struct node *node;
 493         int bitnum;
 494
 495         node = malloc(sizeof(*node));
 496         node->left = node->right = NULL;
 497         node->parent = parent;
 498         node->leftnode = NODE;
 499         node->rightnode = NODE;
 500         node->keybits = 0;
 501         node->keymask = 0;
 502         node->mark = 0;
 503         node->index = 0;
 504         node->offset = -1;
 505         node->size = 4;
 506
 507         if (node->parent) {
 508                 bitnum = parent->bitnum;
 509                 if ((bitnum & 7) == 0) {
 510                         node->bitnum = bitnum + 7 + 8;
 511                         node->nextbyte = 1;
 512                 } else {
 513                         node->bitnum = bitnum - 1;
 514                         node->nextbyte = 0;
 515                 }
 516         } else {
 517                 node->bitnum = 7;
 518                 node->nextbyte = 0;
 519         }
 520
 521         return node;
 522 }
 523
 524 /*
 525  * Insert a new leaf into the tree, and collapse any subtrees that are
 526  * fully populated and end in identical leaves. A nextbyte tagged
 527  * internal node will not be removed to preserve the tree's integrity.
 528  * Note that due to the structure of utf8, no nextbyte tagged node
 529  * will be a candidate for removal.
 530  */
 531 static int insert(struct tree *tree, char *key, int keylen, void *leaf)
 532 {
 533         struct node *node;
 534         struct node *parent;
 535         void **cursor;
 536         int keybits;
 537
 538         assert(keylen >= 1 && keylen <= 4);
 539
 540         node = NULL;
 541         cursor = &tree->root;
 542         keybits = 8 * keylen;
 543
 544         /* Insert, creating path along the way. */
 545         while (keybits) {
 546                 if (!*cursor)
 547                         *cursor = alloc_node(node);
 548                 node = *cursor;
 549                 if (node->nextbyte)
 550                         key++;
 551                 if (*key & (1 << (node->bitnum & 7)))
 552                         cursor = &node->right;
 553                 else
 554                         cursor = &node->left;
 555                 keybits--;
 556         }
 557         *cursor = leaf;
 558
 559         /* Merge subtrees if possible. */
 560         while (node) {
 561                 if (*key & (1 << (node->bitnum & 7)))
 562                         node->rightnode = LEAF;
 563                 else
 564                         node->leftnode = LEAF;
 565                 if (node->nextbyte)
 566                         break;
 567                 if (node->leftnode == NODE || node->rightnode == NODE)
 568                         break;
 569                 assert(node->left);
 570                 assert(node->right);
 571                 /* Compare */
 572                 if (! tree->leaf_equal(node->left, node->right))
 573                         break;
 574                 /* Keep left, drop right leaf. */
 575                 leaf = node->left;
 576                 /* Check in parent */
 577                 parent = node->parent;
 578                 if (!parent) {
 579                         /* root of tree! */
 580                         tree->root = leaf;
 581                         tree->childnode = LEAF;
 582                 } else if (parent->left == node) {
 583                         parent->left = leaf;
 584                         parent->leftnode = LEAF;
 585                         if (parent->right) {
 586                                 parent->keymask = 0;
 587                                 parent->keybits = 0;
 588                         } else {
 589                                 parent->keymask |= (1 << node->bitnum);
 590                         }
 591                 } else if (parent->right == node) {
 592                         parent->right = leaf;
 593                         parent->rightnode = LEAF;
 594                         if (parent->left) {
 595                                 parent->keymask = 0;
 596                                 parent->keybits = 0;
 597                         } else {
 598                                 parent->keymask |= (1 << node->bitnum);
 599                                 parent->keybits |= (1 << node->bitnum);
 600                         }
 601                 } else {
 602                         /* internal tree error */
 603                         assert(0);
 604                 }
 605                 free(node);
 606                 node = parent;
 607         }
 608
 609         /* Propagate keymasks up along singleton chains. */
 610         while (node) {
 611                 parent = node->parent;
 612                 if (!parent)
 613                         break;
 614                 /* Nix the mask for parents with two children. */
 615                 if (node->keymask == 0) {
 616                         parent->keymask = 0;
 617                         parent->keybits = 0;
 618                 } else if (parent->left && parent->right) {
 619                         parent->keymask = 0;
 620                         parent->keybits = 0;
 621                 } else {
 622                         assert((parent->keymask & node->keymask) == 0);
 623                         parent->keymask |= node->keymask;
 624                         parent->keymask |= (1 << parent->bitnum);
 625                         parent->keybits |= node->keybits;
 626                         if (parent->right)
 627                                 parent->keybits |= (1 << parent->bitnum);
 628                 }
 629                 node = parent;
 630         }
 631
 632         return 0;
 633 }
 634
 635 /*
 636  * Prune internal nodes.
 637  *
 638  * Fully populated subtrees that end at the same leaf have already
 639  * been collapsed.  There are still internal nodes that have for both
 640  * their left and right branches a sequence of singletons that make
 641  * identical choices and end in identical leaves.  The keymask and
 642  * keybits collected in the nodes describe the choices made in these
 643  * singleton chains.  When they are identical for the left and right
 644  * branch of a node, and the two leaves comare identical, the node in
 645  * question can be removed.
 646  *
 647  * Note that nodes with the nextbyte tag set will not be removed by
 648  * this to ensure tree integrity.  Note as well that the structure of
 649  * utf8 ensures that these nodes would not have been candidates for
 650  * removal in any case.
 651  */
 652 static void prune(struct tree *tree)
 653 {
 654         struct node *node;
 655         struct node *left;
 656         struct node *right;
 657         struct node *parent;
 658         void *leftleaf;
 659         void *rightleaf;
 660         unsigned int leftmask;
 661         unsigned int rightmask;
 662         unsigned int bitmask;
 663         int count;
 664
 665         if (verbose > 0)
 666                 printf("Pruning %s_%x\n", tree->type, tree->maxage);
 667
 668         count = 0;
 669         if (tree->childnode == LEAF)
 670                 return;
 671         if (!tree->root)
 672                 return;
 673
 674         leftmask = rightmask = 0;
 675         node = tree->root;
 676         while (node) {
 677                 if (node->nextbyte)
 678                         goto advance;
 679                 if (node->leftnode == LEAF)
 680                         goto advance;
 681                 if (node->rightnode == LEAF)
 682                         goto advance;
 683                 if (!node->left)
 684                         goto advance;
 685                 if (!node->right)
 686                         goto advance;
 687                 left = node->left;
 688                 right = node->right;
 689                 if (left->keymask == 0)
 690                         goto advance;
 691                 if (right->keymask == 0)
 692                         goto advance;
 693                 if (left->keymask != right->keymask)
 694                         goto advance;
 695                 if (left->keybits != right->keybits)
 696                         goto advance;
 697                 leftleaf = NULL;
 698                 while (!leftleaf) {
 699                         assert(left->left || left->right);
 700                         if (left->leftnode == LEAF)
 701                                 leftleaf = left->left;
 702                         else if (left->rightnode == LEAF)
 703                                 leftleaf = left->right;
 704                         else if (left->left)
 705                                 left = left->left;
 706                         else if (left->right)
 707                                 left = left->right;
 708                         else
 709                                 assert(0);
 710                 }
 711                 rightleaf = NULL;
 712                 while (!rightleaf) {
 713                         assert(right->left || right->right);
 714                         if (right->leftnode == LEAF)
 715                                 rightleaf = right->left;
 716                         else if (right->rightnode == LEAF)
 717                                 rightleaf = right->right;
 718                         else if (right->left)
 719                                 right = right->left;
 720                         else if (right->right)
 721                                 right = right->right;
 722                         else
 723                                 assert(0);
 724                 }
 725                 if (! tree->leaf_equal(leftleaf, rightleaf))
 726                         goto advance;
 727                 /*
 728                  * This node has identical singleton-only subtrees.
 729                  * Remove it.
 730                  */
 731                 parent = node->parent;
 732                 left = node->left;
 733                 right = node->right;
 734                 if (parent->left == node)
 735                         parent->left = left;
 736                 else if (parent->right == node)
 737                         parent->right = left;
 738                 else
 739                         assert(0);
 740                 left->parent = parent;
 741                 left->keymask |= (1 << node->bitnum);
 742                 node->left = NULL;
 743                 while (node) {
 744                         bitmask = 1 << node->bitnum;
 745                         leftmask &= ~bitmask;
 746                         rightmask &= ~bitmask;
 747                         if (node->leftnode == NODE && node->left) {
 748                                 left = node->left;
 749                                 free(node);
 750                                 count++;
 751                                 node = left;
 752                         } else if (node->rightnode == NODE && node->right) {
 753                                 right = node->right;
 754                                 free(node);
 755                                 count++;
 756                                 node = right;
 757                         } else {
 758                                 node = NULL;
 759                         }
 760                 }
 761                 /* Propagate keymasks up along singleton chains. */
 762                 node = parent;
 763                 /* Force re-check */
 764                 bitmask = 1 << node->bitnum;
 765                 leftmask &= ~bitmask;
 766                 rightmask &= ~bitmask;
 767                 for (;;) {
 768                         if (node->left && node->right)
 769                                 break;
 770                         if (node->left) {
 771                                 left = node->left;
 772                                 node->keymask |= left->keymask;
 773                                 node->keybits |= left->keybits;
 774                         }
 775                         if (node->right) {
 776                                 right = node->right;
 777                                 node->keymask |= right->keymask;
 778                                 node->keybits |= right->keybits;
 779                         }
 780                         node->keymask |= (1 << node->bitnum);
 781                         node = node->parent;
 782                         /* Force re-check */
 783                         bitmask = 1 << node->bitnum;
 784                         leftmask &= ~bitmask;
 785                         rightmask &= ~bitmask;
 786                 }
 787         advance:
 788                 bitmask = 1 << node->bitnum;
 789                 if ((leftmask & bitmask) == 0 &&
 790                     node->leftnode == NODE &&
 791                     node->left) {
 792                         leftmask |= bitmask;
 793                         node = node->left;
 794                 } else if ((rightmask & bitmask) == 0 &&
 795                            node->rightnode == NODE &&
 796                            node->right) {
 797                         rightmask |= bitmask;
 798                         node = node->right;
 799                 } else {
 800                         leftmask &= ~bitmask;
 801                         rightmask &= ~bitmask;
 802                         node = node->parent;
 803                 }
 804         }
 805         if (verbose > 0)
 806                 printf("Pruned %d nodes\n", count);
 807 }
 808
 809 /*
 810  * Mark the nodes in the tree that lead to leaves that must be
 811  * emitted.
 812  */
 813 static void mark_nodes(struct tree *tree)
 814 {
 815         struct node *node;
 816         struct node *n;
 817         unsigned int leftmask;
 818         unsigned int rightmask;
 819         unsigned int bitmask;
 820         int marked;
 821
 822         marked = 0;
 823         if (verbose > 0)
 824                 printf("Marking %s_%x\n", tree->type, tree->maxage);
 825         if (tree->childnode == LEAF)
 826                 goto done;
 827
 828         assert(tree->childnode == NODE);
 829         node = tree->root;
 830         leftmask = rightmask = 0;
 831         while (node) {
 832                 bitmask = 1 << node->bitnum;
 833                 if ((leftmask & bitmask) == 0) {
 834                         leftmask |= bitmask;
 835                         if (node->leftnode == LEAF) {
 836                                 assert(node->left);
 837                                 if (tree->leaf_mark(node->left)) {
 838                                         n = node;
 839                                         while (n && !n->mark) {
 840                                                 marked++;
 841                                                 n->mark = 1;
 842                                                 n = n->parent;
 843                                         }
 844                                 }
 845                         } else if (node->left) {
 846                                 assert(node->leftnode == NODE);
 847                                 node = node->left;
 848                                 continue;
 849                         }
 850                 }
 851                 if ((rightmask & bitmask) == 0) {
 852                         rightmask |= bitmask;
 853                         if (node->rightnode == LEAF) {
 854                                 assert(node->right);
 855                                 if (tree->leaf_mark(node->right)) {
 856                                         n = node;
 857                                         while (n && !n->mark) {
 858                                                 marked++;
 859                                                 n->mark = 1;
 860                                                 n = n->parent;
 861                                         }
 862                                 }
 863                         } else if (node->right) {
 864                                 assert(node->rightnode == NODE);
 865                                 node = node->right;
 866                                 continue;
 867                         }
 868                 }
 869                 leftmask &= ~bitmask;
 870                 rightmask &= ~bitmask;
 871                 node = node->parent;
 872         }
 873
 874         /* second pass: left siblings and singletons */
 875
 876         assert(tree->childnode == NODE);
 877         node = tree->root;
 878         leftmask = rightmask = 0;
 879         while (node) {
 880                 bitmask = 1 << node->bitnum;
 881                 if ((leftmask & bitmask) == 0) {
 882                         leftmask |= bitmask;
 883                         if (node->leftnode == LEAF) {
 884                                 assert(node->left);
 885                                 if (tree->leaf_mark(node->left)) {
 886                                         n = node;
 887                                         while (n && !n->mark) {
 888                                                 marked++;
 889                                                 n->mark = 1;
 890                                                 n = n->parent;
 891                                         }
 892                                 }
 893                         } else if (node->left) {
 894                                 assert(node->leftnode == NODE);
 895                                 node = node->left;
 896                                 if (!node->mark && node->parent->mark) {
 897                                         marked++;
 898                                         node->mark = 1;
 899                                 }
 900                                 continue;
 901                         }
 902                 }
 903                 if ((rightmask & bitmask) == 0) {
 904                         rightmask |= bitmask;
 905                         if (node->rightnode == LEAF) {
 906                                 assert(node->right);
 907                                 if (tree->leaf_mark(node->right)) {
 908                                         n = node;
 909                                         while (n && !n->mark) {
 910                                                 marked++;
 911                                                 n->mark = 1;
 912                                                 n = n->parent;
 913                                         }
 914                                 }
 915                         } else if (node->right) {
 916                                 assert(node->rightnode == NODE);
 917                                 node = node->right;
 918                                 if (!node->mark && node->parent->mark &&
 919                                     !node->parent->left) {
 920                                         marked++;
 921                                         node->mark = 1;
 922                                 }
 923                                 continue;
 924                         }
 925                 }
 926                 leftmask &= ~bitmask;
 927                 rightmask &= ~bitmask;
 928                 node = node->parent;
 929         }
 930 done:
 931         if (verbose > 0)
 932                 printf("Marked %d nodes\n", marked);
 933 }
 934
 935 /*
 936  * Compute the index of each node and leaf, which is the offset in the
 937  * emitted trie.  These values must be pre-computed because relative
 938  * offsets between nodes are used to navigate the tree.
 939  */
 940 static int index_nodes(struct tree *tree, int index)
 941 {
 942         struct node *node;
 943         unsigned int leftmask;
 944         unsigned int rightmask;
 945         unsigned int bitmask;
 946         int count;
 947         int indent;
 948
 949         /* Align to a cache line (or half a cache line?). */
 950         while (index % 64)
 951                 index++;
 952         tree->index = index;
 953         indent = 1;
 954         count = 0;
 955
 956         if (verbose > 0)
 957                 printf("Indexing %s_%x: %d\n", tree->type, tree->maxage, index);
 958         if (tree->childnode == LEAF) {
 959                 index += tree->leaf_size(tree->root);
 960                 goto done;
 961         }
 962
 963         assert(tree->childnode == NODE);
 964         node = tree->root;
 965         leftmask = rightmask = 0;
 966         while (node) {
 967                 if (!node->mark)
 968                         goto skip;
 969                 count++;
 970                 if (node->index != index)
 971                         node->index = index;
 972                 index += node->size;
 973 skip:
 974                 while (node) {
 975                         bitmask = 1 << node->bitnum;
 976                         if (node->mark && (leftmask & bitmask) == 0) {
 977                                 leftmask |= bitmask;
 978                                 if (node->leftnode == LEAF) {
 979                                         assert(node->left);
 980                                         *tree->leaf_index(tree, node->left) =
 981                                                                         index;
 982                                         index += tree->leaf_size(node->left);
 983                                         count++;
 984                                 } else if (node->left) {
 985                                         assert(node->leftnode == NODE);
 986                                         indent += 1;
 987                                         node = node->left;
 988                                         break;
 989                                 }
 990                         }
 991                         if (node->mark && (rightmask & bitmask) == 0) {
 992                                 rightmask |= bitmask;
 993                                 if (node->rightnode == LEAF) {
 994                                         assert(node->right);
 995                                         *tree->leaf_index(tree, node->right) = index;
 996                                         index += tree->leaf_size(node->right);
 997                                         count++;
 998                                 } else if (node->right) {
 999                                         assert(node->rightnode == NODE);
1000                                         indent += 1;
1001                                         node = node->right;
1002                                         break;
1003                                 }
1004                         }
1005                         leftmask &= ~bitmask;
1006                         rightmask &= ~bitmask;
1007                         node = node->parent;
1008                         indent -= 1;
1009                 }
1010         }
1011 done:
1012         /* Round up to a multiple of 16 */
1013         while (index % 16)
1014                 index++;
1015         if (verbose > 0)
1016                 printf("Final index %d\n", index);
1017         return index;
1018 }
1019
1020 /*
1021  * Mark the nodes in a subtree, helper for size_nodes().
1022  */
1023 static int mark_subtree(struct node *node)
1024 {
1025         int changed;
1026
1027         if (!node || node->mark)
1028                 return 0;
1029         node->mark = 1;
1030         node->index = node->parent->index;
1031         changed = 1;
1032         if (node->leftnode == NODE)
1033                 changed += mark_subtree(node->left);
1034         if (node->rightnode == NODE)
1035                 changed += mark_subtree(node->right);
1036         return changed;
1037 }
1038
1039 /*
1040  * Compute the size of nodes and leaves. We start by assuming that
1041  * each node needs to store a three-byte offset. The indexes of the
1042  * nodes are calculated based on that, and then this function is
1043  * called to see if the sizes of some nodes can be reduced.  This is
1044  * repeated until no more changes are seen.
1045  */
1046 static int size_nodes(struct tree *tree)
1047 {
1048         struct tree *next;
1049         struct node *node;
1050         struct node *right;
1051         struct node *n;
1052         unsigned int leftmask;
1053         unsigned int rightmask;
1054         unsigned int bitmask;
1055         unsigned int pathbits;
1056         unsigned int pathmask;
1057         unsigned int nbit;
1058         int changed;
1059         int offset;
1060         int size;
1061         int indent;
1062
1063         indent = 1;
1064         changed = 0;
1065         size = 0;
1066
1067         if (verbose > 0)
1068                 printf("Sizing %s_%x\n", tree->type, tree->maxage);
1069         if (tree->childnode == LEAF)
1070                 goto done;
1071
1072         assert(tree->childnode == NODE);
1073         pathbits = 0;
1074         pathmask = 0;
1075         node = tree->root;
1076         leftmask = rightmask = 0;
1077         while (node) {
1078                 if (!node->mark)
1079                         goto skip;
1080                 offset = 0;
1081                 if (!node->left || !node->right) {
1082                         size = 1;
1083                 } else {
1084                         if (node->rightnode == NODE) {
1085                                 /*
1086                                  * If the right node is not marked,
1087                                  * look for a corresponding node in
1088                                  * the next tree.  Such a node need
1089                                  * not exist.
1090                                  */
1091                                 right = node->right;
1092                                 next = tree->next;
1093                                 while (!right->mark) {
1094                                         assert(next);
1095                                         n = next->root;
1096                                         while (n->bitnum != node->bitnum) {
1097                                                 nbit = 1 << n->bitnum;
1098                                                 if (!(pathmask & nbit))
1099                                                         break;
1100                                                 if (pathbits & nbit) {
1101                                                         if (n->rightnode == LEAF)
1102                                                                 break;
1103                                                         n = n->right;
1104                                                 } else {
1105                                                         if (n->leftnode == LEAF)
1106                                                                 break;
1107                                                         n = n->left;
1108                                                 }
1109                                         }
1110                                         if (n->bitnum != node->bitnum)
1111                                                 break;
1112                                         n = n->right;
1113                                         right = n;
1114                                         next = next->next;
1115                                 }
1116                                 /* Make sure the right node is marked. */
1117                                 if (!right->mark)
1118                                         changed += mark_subtree(right);
1119                                 offset = right->index - node->index;
1120                         } else {
1121                                 offset = *tree->leaf_index(tree, node->right);
1122                                 offset -= node->index;
1123                         }
1124                         assert(offset >= 0);
1125                         assert(offset <= 0xffffff);
1126                         if (offset <= 0xff) {
1127                                 size = 2;
1128                         } else if (offset <= 0xffff) {
1129                                 size = 3;
1130                         } else { /* offset <= 0xffffff */
1131                                 size = 4;
1132                         }
1133                 }
1134                 if (node->size != size || node->offset != offset) {
1135                         node->size = size;
1136                         node->offset = offset;
1137                         changed++;
1138                 }
1139 skip:
1140                 while (node) {
1141                         bitmask = 1 << node->bitnum;
1142                         pathmask |= bitmask;
1143                         if (node->mark && (leftmask & bitmask) == 0) {
1144                                 leftmask |= bitmask;
1145                                 if (node->leftnode == LEAF) {
1146                                         assert(node->left);
1147                                 } else if (node->left) {
1148                                         assert(node->leftnode == NODE);
1149                                         indent += 1;
1150                                         node = node->left;
1151                                         break;
1152                                 }
1153                         }
1154                         if (node->mark && (rightmask & bitmask) == 0) {
1155                                 rightmask |= bitmask;
1156                                 pathbits |= bitmask;
1157                                 if (node->rightnode == LEAF) {
1158                                         assert(node->right);
1159                                 } else if (node->right) {
1160                                         assert(node->rightnode == NODE);
1161                                         indent += 1;
1162                                         node = node->right;
1163                                         break;
1164                                 }
1165                         }
1166                         leftmask &= ~bitmask;
1167                         rightmask &= ~bitmask;
1168                         pathmask &= ~bitmask;
1169                         pathbits &= ~bitmask;
1170                         node = node->parent;
1171                         indent -= 1;
1172                 }
1173         }
1174 done:
1175         if (verbose > 0)
1176                 printf("Found %d changes\n", changed);
1177         return changed;
1178 }
1179
1180 /*
1181  * Emit a trie for the given tree into the data array.
1182  */
1183 static void emit(struct tree *tree, unsigned char *data)
1184 {
1185         struct node *node;
1186         unsigned int leftmask;
1187         unsigned int rightmask;
1188         unsigned int bitmask;
1189         int offlen;
1190         int offset;
1191         int index;
1192         int indent;
1193         int size;
1194         int bytes;
1195         int leaves;
1196         int nodes[4];
1197         unsigned char byte;
1198
1199         nodes[0] = nodes[1] = nodes[2] = nodes[3] = 0;
1200         leaves = 0;
1201         bytes = 0;
1202         index = tree->index;
1203         data += index;
1204         indent = 1;
1205         if (verbose > 0)
1206                 printf("Emitting %s_%x\n", tree->type, tree->maxage);
1207         if (tree->childnode == LEAF) {
1208                 assert(tree->root);
1209                 tree->leaf_emit(tree->root, data);
1210                 size = tree->leaf_size(tree->root);
1211                 index += size;
1212                 leaves++;
1213                 goto done;
1214         }
1215
1216         assert(tree->childnode == NODE);
1217         node = tree->root;
1218         leftmask = rightmask = 0;
1219         while (node) {
1220                 if (!node->mark)
1221                         goto skip;
1222                 assert(node->offset != -1);
1223                 assert(node->index == index);
1224
1225                 byte = 0;
1226                 if (node->nextbyte)
1227                         byte |= NEXTBYTE;
1228                 byte |= (node->bitnum & BITNUM);
1229                 if (node->left && node->right) {
1230                         if (node->leftnode == NODE)
1231                                 byte |= LEFTNODE;
1232                         if (node->rightnode == NODE)
1233                                 byte |= RIGHTNODE;
1234                         if (node->offset <= 0xff)
1235                                 offlen = 1;
1236                         else if (node->offset <= 0xffff)
1237                                 offlen = 2;
1238                         else
1239                                 offlen = 3;
1240                         nodes[offlen]++;
1241                         offset = node->offset;
1242                         byte |= offlen << OFFLEN_SHIFT;
1243                         *data++ = byte;
1244                         index++;
1245                         while (offlen--) {
1246                                 *data++ = offset & 0xff;
1247                                 index++;
1248                                 offset >>= 8;
1249                         }
1250                 } else if (node->left) {
1251                         if (node->leftnode == NODE)
1252                                 byte |= TRIENODE;
1253                         nodes[0]++;
1254                         *data++ = byte;
1255                         index++;
1256                 } else if (node->right) {
1257                         byte |= RIGHTNODE;
1258                         if (node->rightnode == NODE)
1259                                 byte |= TRIENODE;
1260                         nodes[0]++;
1261                         *data++ = byte;
1262                         index++;
1263                 } else {
1264                         assert(0);
1265                 }
1266 skip:
1267                 while (node) {
1268                         bitmask = 1 << node->bitnum;
1269                         if (node->mark && (leftmask & bitmask) == 0) {
1270                                 leftmask |= bitmask;
1271                                 if (node->leftnode == LEAF) {
1272                                         assert(node->left);
1273                                         data = tree->leaf_emit(node->left,
1274                                                                data);
1275                                         size = tree->leaf_size(node->left);
1276                                         index += size;
1277                                         bytes += size;
1278                                         leaves++;
1279                                 } else if (node->left) {
1280                                         assert(node->leftnode == NODE);
1281                                         indent += 1;
1282                                         node = node->left;
1283                                         break;
1284                                 }
1285                         }
1286                         if (node->mark && (rightmask & bitmask) == 0) {
1287                                 rightmask |= bitmask;
1288                                 if (node->rightnode == LEAF) {
1289                                         assert(node->right);
1290                                         data = tree->leaf_emit(node->right,
1291                                                                data);
1292                                         size = tree->leaf_size(node->right);
1293                                         index += size;
1294                                         bytes += size;
1295                                         leaves++;
1296                                 } else if (node->right) {
1297                                         assert(node->rightnode == NODE);
1298                                         indent += 1;
1299                                         node = node->right;
1300                                         break;
1301                                 }
1302                         }
1303                         leftmask &= ~bitmask;
1304                         rightmask &= ~bitmask;
1305                         node = node->parent;
1306                         indent -= 1;
1307                 }
1308         }
1309 done:
1310         if (verbose > 0) {
1311                 printf("Emitted %d (%d) leaves",
1312                         leaves, bytes);
1313                 printf(" %d (%d+%d+%d+%d) nodes",
1314                         nodes[0] + nodes[1] + nodes[2] + nodes[3],
1315                         nodes[0], nodes[1], nodes[2], nodes[3]);
1316                 printf(" %d total\n", index - tree->index);
1317         }
1318 }
1319
1320 /* ------------------------------------------------------------------ */
1321
1322 /*
1323  * Unicode data.
1324  *
1325  * We need to keep track of the Canonical Combining Class, the Age,
1326  * and decompositions for a code point.
1327  *
1328  * For the Age, we store the index into the ages table.  Effectively
1329  * this is a generation number that the table maps to a unicode
1330  * version.
1331  *
1332  * The correction field is used to indicate that this entry is in the
1333  * corrections array, which contains decompositions that were
1334  * corrected in later revisions.  The value of the correction field is
1335  * the Unicode version in which the mapping was corrected.
1336  */
1337 struct unicode_data {
1338         unsigned int code;
1339         int ccc;
1340         int gen;
1341         int correction;
1342         unsigned int *utf32nfkdi;
1343         unsigned int *utf32nfkdicf;
1344         char *utf8nfkdi;
1345         char *utf8nfkdicf;
1346 };
1347
1348 struct unicode_data unicode_data[0x110000];
1349 struct unicode_data *corrections;
1350 int    corrections_count;
1351
1352 struct tree *nfkdi_tree;
1353 struct tree *nfkdicf_tree;
1354
1355 struct tree *trees;
1356 int          trees_count;
1357
1358 /*
1359  * Check the corrections array to see if this entry was corrected at
1360  * some point.
1361  */
1362 static struct unicode_data *corrections_lookup(struct unicode_data *u)
1363 {
1364         int i;
1365
1366         for (i = 0; i != corrections_count; i++)
1367                 if (u->code == corrections[i].code)
1368                         return &corrections[i];
1369         return u;
1370 }
1371
1372 static int nfkdi_equal(void *l, void *r)
1373 {
1374         struct unicode_data *left = l;
1375         struct unicode_data *right = r;
1376
1377         if (left->gen != right->gen)
1378                 return 0;
1379         if (left->ccc != right->ccc)
1380                 return 0;
1381         if (left->utf8nfkdi && right->utf8nfkdi &&
1382             strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
1383                 return 1;
1384         if (left->utf8nfkdi || right->utf8nfkdi)
1385                 return 0;
1386         return 1;
1387 }
1388
1389 static int nfkdicf_equal(void *l, void *r)
1390 {
1391         struct unicode_data *left = l;
1392         struct unicode_data *right = r;
1393
1394         if (left->gen != right->gen)
1395                 return 0;
1396         if (left->ccc != right->ccc)
1397                 return 0;
1398         if (left->utf8nfkdicf && right->utf8nfkdicf &&
1399             strcmp(left->utf8nfkdicf, right->utf8nfkdicf) == 0)
1400                 return 1;
1401         if (left->utf8nfkdicf && right->utf8nfkdicf)
1402                 return 0;
1403         if (left->utf8nfkdicf || right->utf8nfkdicf)
1404                 return 0;
1405         if (left->utf8nfkdi && right->utf8nfkdi &&
1406             strcmp(left->utf8nfkdi, right->utf8nfkdi) == 0)
1407                 return 1;
1408         if (left->utf8nfkdi || right->utf8nfkdi)
1409                 return 0;
1410         return 1;
1411 }
1412
1413 static void nfkdi_print(void *l, int indent)
1414 {
1415         struct unicode_data *leaf = l;
1416
1417         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1418                 leaf->code, leaf->ccc, leaf->gen);
1419         if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
1420                 printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1421         else if (leaf->utf8nfkdi)
1422                 printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
1423         printf("\n");
1424 }
1425
1426 static void nfkdicf_print(void *l, int indent)
1427 {
1428         struct unicode_data *leaf = l;
1429
1430         printf("%*sleaf @ %p code %X ccc %d gen %d", indent, "", leaf,
1431                 leaf->code, leaf->ccc, leaf->gen);
1432         if (leaf->utf8nfkdicf)
1433                 printf(" nfkdicf \"%s\"", (const char*)leaf->utf8nfkdicf);
1434         else if (leaf->utf8nfkdi && leaf->utf8nfkdi[0] == HANGUL)
1435                 printf(" nfkdi \"%s\"", "HANGUL SYLLABLE");
1436         else if (leaf->utf8nfkdi)
1437                 printf(" nfkdi \"%s\"", (const char*)leaf->utf8nfkdi);
1438         printf("\n");
1439 }
1440
1441 static int nfkdi_mark(void *l)
1442 {
1443         return 1;
1444 }
1445
1446 static int nfkdicf_mark(void *l)
1447 {
1448         struct unicode_data *leaf = l;
1449
1450         if (leaf->utf8nfkdicf)
1451                 return 1;
1452         return 0;
1453 }
1454
1455 static int correction_mark(void *l)
1456 {
1457         struct unicode_data *leaf = l;
1458
1459         return leaf->correction;
1460 }
1461
1462 static int nfkdi_size(void *l)
1463 {
1464         struct unicode_data *leaf = l;
1465
1466         int size = 2;
1467         if (HANGUL_SYLLABLE(leaf->code))
1468                 size += 1;
1469         else if (leaf->utf8nfkdi)
1470                 size += strlen(leaf->utf8nfkdi) + 1;
1471         return size;
1472 }
1473
1474 static int nfkdicf_size(void *l)
1475 {
1476         struct unicode_data *leaf = l;
1477
1478         int size = 2;
1479         if (HANGUL_SYLLABLE(leaf->code))
1480                 size += 1;
1481         else if (leaf->utf8nfkdicf)
1482                 size += strlen(leaf->utf8nfkdicf) + 1;
1483         else if (leaf->utf8nfkdi)
1484                 size += strlen(leaf->utf8nfkdi) + 1;
1485         return size;
1486 }
1487
1488 static int *nfkdi_index(struct tree *tree, void *l)
1489 {
1490         struct unicode_data *leaf = l;
1491
1492         return &tree->leafindex[leaf->code];
1493 }
1494
1495 static int *nfkdicf_index(struct tree *tree, void *l)
1496 {
1497         struct unicode_data *leaf = l;
1498
1499         return &tree->leafindex[leaf->code];
1500 }
1501
1502 static unsigned char *nfkdi_emit(void *l, unsigned char *data)
1503 {
1504         struct unicode_data *leaf = l;
1505         unsigned char *s;
1506
1507         *data++ = leaf->gen;
1508         if (HANGUL_SYLLABLE(leaf->code)) {
1509                 *data++ = DECOMPOSE;
1510                 *data++ = HANGUL;
1511         } else if (leaf->utf8nfkdi) {
1512                 *data++ = DECOMPOSE;
1513                 s = (unsigned char*)leaf->utf8nfkdi;
1514                 while ((*data++ = *s++) != 0)
1515                         ;
1516         } else {
1517                 *data++ = leaf->ccc;
1518         }
1519         return data;
1520 }
1521
1522 static unsigned char *nfkdicf_emit(void *l, unsigned char *data)
1523 {
1524         struct unicode_data *leaf = l;
1525         unsigned char *s;
1526
1527         *data++ = leaf->gen;
1528         if (HANGUL_SYLLABLE(leaf->code)) {
1529                 *data++ = DECOMPOSE;
1530                 *data++ = HANGUL;
1531         } else if (leaf->utf8nfkdicf) {
1532                 *data++ = DECOMPOSE;
1533                 s = (unsigned char*)leaf->utf8nfkdicf;
1534                 while ((*data++ = *s++) != 0)
1535                         ;
1536         } else if (leaf->utf8nfkdi) {
1537                 *data++ = DECOMPOSE;
1538                 s = (unsigned char*)leaf->utf8nfkdi;
1539                 while ((*data++ = *s++) != 0)
1540                         ;
1541         } else {
1542                 *data++ = leaf->ccc;
1543         }
1544         return data;
1545 }
1546
1547 static void utf8_create(struct unicode_data *data)
1548 {
1549         char utf[18*4+1];
1550         char *u;
1551         unsigned int *um;
1552         int i;
1553
1554         if (data->utf8nfkdi) {
1555                 assert(data->utf8nfkdi[0] == HANGUL);
1556                 return;
1557         }
1558
1559         u = utf;
1560         um = data->utf32nfkdi;
1561         if (um) {
1562                 for (i = 0; um[i]; i++)
1563                         u += utf8encode(u, um[i]);
1564                 *u = '\0';
1565                 data->utf8nfkdi = strdup(utf);
1566         }
1567         u = utf;
1568         um = data->utf32nfkdicf;
1569         if (um) {
1570                 for (i = 0; um[i]; i++)
1571                         u += utf8encode(u, um[i]);
1572                 *u = '\0';
1573                 if (!data->utf8nfkdi || strcmp(data->utf8nfkdi, utf))
1574                         data->utf8nfkdicf = strdup(utf);
1575         }
1576 }
1577
1578 static void utf8_init(void)
1579 {
1580         unsigned int unichar;
1581         int i;
1582
1583         for (unichar = 0; unichar != 0x110000; unichar++)
1584                 utf8_create(&unicode_data[unichar]);
1585
1586         for (i = 0; i != corrections_count; i++)
1587                 utf8_create(&corrections[i]);
1588 }
1589
1590 static void trees_init(void)
1591 {
1592         struct unicode_data *data;
1593         unsigned int maxage;
1594         unsigned int nextage;
1595         int count;
1596         int i;
1597         int j;
1598
1599         /* Count the number of different ages. */
1600         count = 0;
1601         nextage = (unsigned int)-1;
1602         do {
1603                 maxage = nextage;
1604                 nextage = 0;
1605                 for (i = 0; i <= corrections_count; i++) {
1606                         data = &corrections[i];
1607                         if (nextage < data->correction &&
1608                             data->correction < maxage)
1609                                 nextage = data->correction;
1610                 }
1611                 count++;
1612         } while (nextage);
1613
1614         /* Two trees per age: nfkdi and nfkdicf */
1615         trees_count = count * 2;
1616         trees = calloc(trees_count, sizeof(struct tree));
1617
1618         /* Assign ages to the trees. */
1619         count = trees_count;
1620         nextage = (unsigned int)-1;
1621         do {
1622                 maxage = nextage;
1623                 trees[--count].maxage = maxage;
1624                 trees[--count].maxage = maxage;
1625                 nextage = 0;
1626                 for (i = 0; i <= corrections_count; i++) {
1627                         data = &corrections[i];
1628                         if (nextage < data->correction &&
1629                             data->correction < maxage)
1630                                 nextage = data->correction;
1631                 }
1632         } while (nextage);
1633
1634         /* The ages assigned above are off by one. */
1635         for (i = 0; i != trees_count; i++) {
1636                 j = 0;
1637                 while (ages[j] < trees[i].maxage)
1638                         j++;
1639                 trees[i].maxage = ages[j-1];
1640         }
1641
1642         /* Set up the forwarding between trees. */
1643         trees[trees_count-2].next = &trees[trees_count-1];
1644         trees[trees_count-1].leaf_mark = nfkdi_mark;
1645         trees[trees_count-2].leaf_mark = nfkdicf_mark;
1646         for (i = 0; i != trees_count-2; i += 2) {
1647                 trees[i].next = &trees[trees_count-2];
1648                 trees[i].leaf_mark = correction_mark;
1649                 trees[i+1].next = &trees[trees_count-1];
1650                 trees[i+1].leaf_mark = correction_mark;
1651         }
1652
1653         /* Assign the callouts. */
1654         for (i = 0; i != trees_count; i += 2) {
1655                 trees[i].type = "nfkdicf";
1656                 trees[i].leaf_equal = nfkdicf_equal;
1657                 trees[i].leaf_print = nfkdicf_print;
1658                 trees[i].leaf_size = nfkdicf_size;
1659                 trees[i].leaf_index = nfkdicf_index;
1660                 trees[i].leaf_emit = nfkdicf_emit;
1661
1662                 trees[i+1].type = "nfkdi";
1663                 trees[i+1].leaf_equal = nfkdi_equal;
1664                 trees[i+1].leaf_print = nfkdi_print;
1665                 trees[i+1].leaf_size = nfkdi_size;
1666                 trees[i+1].leaf_index = nfkdi_index;
1667                 trees[i+1].leaf_emit = nfkdi_emit;
1668         }
1669
1670         /* Finish init. */
1671         for (i = 0; i != trees_count; i++)
1672                 trees[i].childnode = NODE;
1673 }
1674
1675 static void trees_populate(void)
1676 {
1677         struct unicode_data *data;
1678         unsigned int unichar;
1679         char keyval[4];
1680         int keylen;
1681         int i;
1682
1683         for (i = 0; i != trees_count; i++) {
1684                 if (verbose > 0) {
1685                         printf("Populating %s_%x\n",
1686                                 trees[i].type, trees[i].maxage);
1687                 }
1688                 for (unichar = 0; unichar != 0x110000; unichar++) {
1689                         if (unicode_data[unichar].gen < 0)
1690                                 continue;
1691                         keylen = utf8encode(keyval, unichar);
1692                         data = corrections_lookup(&unicode_data[unichar]);
1693                         if (data->correction <= trees[i].maxage)
1694                                 data = &unicode_data[unichar];
1695                         insert(&trees[i], keyval, keylen, data);
1696                 }
1697         }
1698 }
1699
1700 static void trees_reduce(void)
1701 {
1702         int i;
1703         int size;
1704         int changed;
1705
1706         for (i = 0; i != trees_count; i++)
1707                 prune(&trees[i]);
1708         for (i = 0; i != trees_count; i++)
1709                 mark_nodes(&trees[i]);
1710         do {
1711                 size = 0;
1712                 for (i = 0; i != trees_count; i++)
1713                         size = index_nodes(&trees[i], size);
1714                 changed = 0;
1715                 for (i = 0; i != trees_count; i++)
1716                         changed += size_nodes(&trees[i]);
1717         } while (changed);
1718
1719         utf8data = calloc(size, 1);
1720         utf8data_size = size;
1721         for (i = 0; i != trees_count; i++)
1722                 emit(&trees[i], utf8data);
1723
1724         if (verbose > 0) {
1725                 for (i = 0; i != trees_count; i++) {
1726                         printf("%s_%x idx %d\n",
1727                                 trees[i].type, trees[i].maxage, trees[i].index);
1728                 }
1729         }
1730
1731         nfkdi = utf8data + trees[trees_count-1].index;
1732         nfkdicf = utf8data + trees[trees_count-2].index;
1733
1734         nfkdi_tree = &trees[trees_count-1];
1735         nfkdicf_tree = &trees[trees_count-2];
1736 }
1737
1738 static void verify(struct tree *tree)
1739 {
1740         struct unicode_data *data;
1741         utf8leaf_t      *leaf;
1742         unsigned int    unichar;
1743         char            key[4];
1744         unsigned char   hangul[UTF8HANGULLEAF];
1745         int             report;
1746         int             nocf;
1747
1748         if (verbose > 0)
1749                 printf("Verifying %s_%x\n", tree->type, tree->maxage);
1750         nocf = strcmp(tree->type, "nfkdicf");
1751
1752         for (unichar = 0; unichar != 0x110000; unichar++) {
1753                 report = 0;
1754                 data = corrections_lookup(&unicode_data[unichar]);
1755                 if (data->correction <= tree->maxage)
1756                         data = &unicode_data[unichar];
1757                 utf8encode(key,unichar);
1758                 leaf = utf8lookup(tree, hangul, key);
1759
1760                 if (!leaf) {
1761                         if (data->gen != -1)
1762                                 report++;
1763                         if (unichar < 0xd800 || unichar > 0xdfff)
1764                                 report++;
1765                 } else {
1766                         if (unichar >= 0xd800 && unichar <= 0xdfff)
1767                                 report++;
1768                         if (data->gen == -1)
1769                                 report++;
1770                         if (data->gen != LEAF_GEN(leaf))
1771                                 report++;
1772                         if (LEAF_CCC(leaf) == DECOMPOSE) {
1773                                 if (HANGUL_SYLLABLE(data->code)) {
1774                                         if (data->utf8nfkdi[0] != HANGUL)
1775                                                 report++;
1776                                 } else if (nocf) {
1777                                         if (!data->utf8nfkdi) {
1778                                                 report++;
1779                                         } else if (strcmp(data->utf8nfkdi,
1780                                                           LEAF_STR(leaf))) {
1781                                                 report++;
1782                                         }
1783                                 } else {
1784                                         if (!data->utf8nfkdicf &&
1785                                             !data->utf8nfkdi) {
1786                                                 report++;
1787                                         } else if (data->utf8nfkdicf) {
1788                                                 if (strcmp(data->utf8nfkdicf,
1789                                                            LEAF_STR(leaf)))
1790                                                         report++;
1791                                         } else if (strcmp(data->utf8nfkdi,
1792                                                           LEAF_STR(leaf))) {
1793                                                 report++;
1794                                         }
1795                                 }
1796                         } else if (data->ccc != LEAF_CCC(leaf)) {
1797                                 report++;
1798                         }
1799                 }
1800                 if (report) {
1801                         printf("%X code %X gen %d ccc %d"
1802                                 " nfkdi -> \"%s\"",
1803                                 unichar, data->code, data->gen,
1804                                 data->ccc,
1805                                 data->utf8nfkdi);
1806                         if (leaf) {
1807                                 printf(" gen %d ccc %d"
1808                                         " nfkdi -> \"%s\"",
1809                                         LEAF_GEN(leaf),
1810                                         LEAF_CCC(leaf),
1811                                         LEAF_CCC(leaf) == DECOMPOSE ?
1812                                                 LEAF_STR(leaf) : "");
1813                         }
1814                         printf("\n");
1815                 }
1816         }
1817 }
1818
1819 static void trees_verify(void)
1820 {
1821         int i;
1822
1823         for (i = 0; i != trees_count; i++)
1824                 verify(&trees[i]);
1825 }
1826
1827 /* ------------------------------------------------------------------ */
1828
1829 static void help(void)
1830 {
1831         printf("Usage: %s [options]\n", argv0);
1832         printf("\n");
1833         printf("This program creates an a data trie used for parsing and\n");
1834         printf("normalization of UTF-8 strings. The trie is derived from\n");
1835         printf("a set of input files from the Unicode character database\n");
1836         printf("found at: http://www.unicode.org/Public/UCD/latest/ucd/\n");
1837         printf("\n");
1838         printf("The generated tree supports two normalization forms:\n");
1839         printf("\n");
1840         printf("\tnfkdi:\n");
1841         printf("\t- Apply unicode normalization form NFKD.\n");
1842         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1843         printf("\n");
1844         printf("\tnfkdicf:\n");
1845         printf("\t- Apply unicode normalization form NFKD.\n");
1846         printf("\t- Remove any Default_Ignorable_Code_Point.\n");
1847         printf("\t- Apply a full casefold (C + F).\n");
1848         printf("\n");
1849         printf("These forms were chosen as being most useful when dealing\n");
1850         printf("with file names: NFKD catches most cases where characters\n");
1851         printf("should be considered equivalent. The ignorables are mostly\n");
1852         printf("invisible, making names hard to type.\n");
1853         printf("\n");
1854         printf("The options to specify the files to be used are listed\n");
1855         printf("below with their default values, which are the names used\n");
1856         printf("by version 11.0.0 of the Unicode Character Database.\n");
1857         printf("\n");
1858         printf("The input files:\n");
1859         printf("\t-a %s\n", AGE_NAME);
1860         printf("\t-c %s\n", CCC_NAME);
1861         printf("\t-p %s\n", PROP_NAME);
1862         printf("\t-d %s\n", DATA_NAME);
1863         printf("\t-f %s\n", FOLD_NAME);
1864         printf("\t-n %s\n", NORM_NAME);
1865         printf("\n");
1866         printf("Additionally, the generated tables are tested using:\n");
1867         printf("\t-t %s\n", TEST_NAME);
1868         printf("\n");
1869         printf("Finally, the output file:\n");
1870         printf("\t-o %s\n", UTF8_NAME);
1871         printf("\n");
1872 }
1873
1874 static void usage(void)
1875 {
1876         help();
1877         exit(1);
1878 }
1879
1880 static void open_fail(const char *name, int error)
1881 {
1882         printf("Error %d opening %s: %s\n", error, name, strerror(error));
1883         exit(1);
1884 }
1885
1886 static void file_fail(const char *filename)
1887 {
1888         printf("Error parsing %s\n", filename);
1889         exit(1);
1890 }
1891
1892 static void line_fail(const char *filename, const char *line)
1893 {
1894         printf("Error parsing %s:%s\n", filename, line);
1895         exit(1);
1896 }
1897
1898 /* ------------------------------------------------------------------ */
1899
1900 static void print_utf32(unsigned int *utf32str)
1901 {
1902         int     i;
1903
1904         for (i = 0; utf32str[i]; i++)
1905                 printf(" %X", utf32str[i]);
1906 }
1907
1908 static void print_utf32nfkdi(unsigned int unichar)
1909 {
1910         printf(" %X ->", unichar);
1911         print_utf32(unicode_data[unichar].utf32nfkdi);
1912         printf("\n");
1913 }
1914
1915 static void print_utf32nfkdicf(unsigned int unichar)
1916 {
1917         printf(" %X ->", unichar);
1918         print_utf32(unicode_data[unichar].utf32nfkdicf);
1919         printf("\n");
1920 }
1921
1922 /* ------------------------------------------------------------------ */
1923
1924 static void age_init(void)
1925 {
1926         FILE *file;
1927         unsigned int first;
1928         unsigned int last;
1929         unsigned int unichar;
1930         unsigned int major;
1931         unsigned int minor;
1932         unsigned int revision;
1933         int gen;
1934         int count;
1935         int ret;
1936
1937         if (verbose > 0)
1938                 printf("Parsing %s\n", age_name);
1939
1940         file = fopen(age_name, "r");
1941         if (!file)
1942                 open_fail(age_name, errno);
1943         count = 0;
1944
1945         gen = 0;
1946         while (fgets(line, LINESIZE, file)) {
1947                 ret = sscanf(line, "# Age=V%d_%d_%d",
1948                                 &major, &minor, &revision);
1949                 if (ret == 3) {
1950                         ages_count++;
1951                         if (verbose > 1)
1952                                 printf(" Age V%d_%d_%d\n",
1953                                         major, minor, revision);
1954                         if (!age_valid(major, minor, revision))
1955                                 line_fail(age_name, line);
1956                         continue;
1957                 }
1958                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1959                 if (ret == 2) {
1960                         ages_count++;
1961                         if (verbose > 1)
1962                                 printf(" Age V%d_%d\n", major, minor);
1963                         if (!age_valid(major, minor, 0))
1964                                 line_fail(age_name, line);
1965                         continue;
1966                 }
1967         }
1968
1969         /* We must have found something above. */
1970         if (verbose > 1)
1971                 printf("%d age entries\n", ages_count);
1972         if (ages_count == 0 || ages_count > MAXGEN)
1973                 file_fail(age_name);
1974
1975         /* There is a 0 entry. */
1976         ages_count++;
1977         ages = calloc(ages_count + 1, sizeof(*ages));
1978         /* And a guard entry. */
1979         ages[ages_count] = (unsigned int)-1;
1980
1981         rewind(file);
1982         count = 0;
1983         gen = 0;
1984         while (fgets(line, LINESIZE, file)) {
1985                 ret = sscanf(line, "# Age=V%d_%d_%d",
1986                                 &major, &minor, &revision);
1987                 if (ret == 3) {
1988                         ages[++gen] =
1989                                 UNICODE_AGE(major, minor, revision);
1990                         if (verbose > 1)
1991                                 printf(" Age V%d_%d_%d = gen %d\n",
1992                                         major, minor, revision, gen);
1993                         if (!age_valid(major, minor, revision))
1994                                 line_fail(age_name, line);
1995                         continue;
1996                 }
1997                 ret = sscanf(line, "# Age=V%d_%d", &major, &minor);
1998                 if (ret == 2) {
1999                         ages[++gen] = UNICODE_AGE(major, minor, 0);
2000                         if (verbose > 1)
2001                                 printf(" Age V%d_%d = %d\n",
2002                                         major, minor, gen);
2003                         if (!age_valid(major, minor, 0))
2004                                 line_fail(age_name, line);
2005                         continue;
2006                 }
2007                 ret = sscanf(line, "%X..%X ; %d.%d #",
2008                              &first, &last, &major, &minor);
2009                 if (ret == 4) {
2010                         for (unichar = first; unichar <= last; unichar++)
2011                                 unicode_data[unichar].gen = gen;
2012                         count += 1 + last - first;
2013                         if (verbose > 1)
2014                                 printf("  %X..%X gen %d\n", first, last, gen);
2015                         if (!utf32valid(first) || !utf32valid(last))
2016                                 line_fail(age_name, line);
2017                         continue;
2018                 }
2019                 ret = sscanf(line, "%X ; %d.%d #", &unichar, &major, &minor);
2020                 if (ret == 3) {
2021                         unicode_data[unichar].gen = gen;
2022                         count++;
2023                         if (verbose > 1)
2024                                 printf("  %X gen %d\n", unichar, gen);
2025                         if (!utf32valid(unichar))
2026                                 line_fail(age_name, line);
2027                         continue;
2028                 }
2029         }
2030         unicode_maxage = ages[gen];
2031         fclose(file);
2032
2033         /* Nix surrogate block */
2034         if (verbose > 1)
2035                 printf(" Removing surrogate block D800..DFFF\n");
2036         for (unichar = 0xd800; unichar <= 0xdfff; unichar++)
2037                 unicode_data[unichar].gen = -1;
2038
2039         if (verbose > 0)
2040                 printf("Found %d entries\n", count);
2041         if (count == 0)
2042                 file_fail(age_name);
2043 }
2044
2045 static void ccc_init(void)
2046 {
2047         FILE *file;
2048         unsigned int first;
2049         unsigned int last;
2050         unsigned int unichar;
2051         unsigned int value;
2052         int count;
2053         int ret;
2054
2055         if (verbose > 0)
2056                 printf("Parsing %s\n", ccc_name);
2057
2058         file = fopen(ccc_name, "r");
2059         if (!file)
2060                 open_fail(ccc_name, errno);
2061
2062         count = 0;
2063         while (fgets(line, LINESIZE, file)) {
2064                 ret = sscanf(line, "%X..%X ; %d #", &first, &last, &value);
2065                 if (ret == 3) {
2066                         for (unichar = first; unichar <= last; unichar++) {
2067                                 unicode_data[unichar].ccc = value;
2068                                 count++;
2069                         }
2070                         if (verbose > 1)
2071                                 printf(" %X..%X ccc %d\n", first, last, value);
2072                         if (!utf32valid(first) || !utf32valid(last))
2073                                 line_fail(ccc_name, line);
2074                         continue;
2075                 }
2076                 ret = sscanf(line, "%X ; %d #", &unichar, &value);
2077                 if (ret == 2) {
2078                         unicode_data[unichar].ccc = value;
2079                         count++;
2080                         if (verbose > 1)
2081                                 printf(" %X ccc %d\n", unichar, value);
2082                         if (!utf32valid(unichar))
2083                                 line_fail(ccc_name, line);
2084                         continue;
2085                 }
2086         }
2087         fclose(file);
2088
2089         if (verbose > 0)
2090                 printf("Found %d entries\n", count);
2091         if (count == 0)
2092                 file_fail(ccc_name);
2093 }
2094
2095 static void nfkdi_init(void)
2096 {
2097         FILE *file;
2098         unsigned int unichar;
2099         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2100         char *s;
2101         unsigned int *um;
2102         int count;
2103         int i;
2104         int ret;
2105
2106         if (verbose > 0)
2107                 printf("Parsing %s\n", data_name);
2108         file = fopen(data_name, "r");
2109         if (!file)
2110                 open_fail(data_name, errno);
2111
2112         count = 0;
2113         while (fgets(line, LINESIZE, file)) {
2114                 ret = sscanf(line, "%X;%*[^;];%*[^;];%*[^;];%*[^;];%[^;];",
2115                              &unichar, buf0);
2116                 if (ret != 2)
2117                         continue;
2118                 if (!utf32valid(unichar))
2119                         line_fail(data_name, line);
2120
2121                 s = buf0;
2122                 /* skip over <tag> */
2123                 if (*s == '<')
2124                         while (*s++ != ' ')
2125                                 ;
2126                 /* decode the decomposition into UTF-32 */
2127                 i = 0;
2128                 while (*s) {
2129                         mapping[i] = strtoul(s, &s, 16);
2130                         if (!utf32valid(mapping[i]))
2131                                 line_fail(data_name, line);
2132                         i++;
2133                 }
2134                 mapping[i++] = 0;
2135
2136                 um = malloc(i * sizeof(unsigned int));
2137                 memcpy(um, mapping, i * sizeof(unsigned int));
2138                 unicode_data[unichar].utf32nfkdi = um;
2139
2140                 if (verbose > 1)
2141                         print_utf32nfkdi(unichar);
2142                 count++;
2143         }
2144         fclose(file);
2145         if (verbose > 0)
2146                 printf("Found %d entries\n", count);
2147         if (count == 0)
2148                 file_fail(data_name);
2149 }
2150
2151 static void nfkdicf_init(void)
2152 {
2153         FILE *file;
2154         unsigned int unichar;
2155         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2156         char status;
2157         char *s;
2158         unsigned int *um;
2159         int i;
2160         int count;
2161         int ret;
2162
2163         if (verbose > 0)
2164                 printf("Parsing %s\n", fold_name);
2165         file = fopen(fold_name, "r");
2166         if (!file)
2167                 open_fail(fold_name, errno);
2168
2169         count = 0;
2170         while (fgets(line, LINESIZE, file)) {
2171                 ret = sscanf(line, "%X; %c; %[^;];", &unichar, &status, buf0);
2172                 if (ret != 3)
2173                         continue;
2174                 if (!utf32valid(unichar))
2175                         line_fail(fold_name, line);
2176                 /* Use the C+F casefold. */
2177                 if (status != 'C' && status != 'F')
2178                         continue;
2179                 s = buf0;
2180                 if (*s == '<')
2181                         while (*s++ != ' ')
2182                                 ;
2183                 i = 0;
2184                 while (*s) {
2185                         mapping[i] = strtoul(s, &s, 16);
2186                         if (!utf32valid(mapping[i]))
2187                                 line_fail(fold_name, line);
2188                         i++;
2189                 }
2190                 mapping[i++] = 0;
2191
2192                 um = malloc(i * sizeof(unsigned int));
2193                 memcpy(um, mapping, i * sizeof(unsigned int));
2194                 unicode_data[unichar].utf32nfkdicf = um;
2195
2196                 if (verbose > 1)
2197                         print_utf32nfkdicf(unichar);
2198                 count++;
2199         }
2200         fclose(file);
2201         if (verbose > 0)
2202                 printf("Found %d entries\n", count);
2203         if (count == 0)
2204                 file_fail(fold_name);
2205 }
2206
2207 static void ignore_init(void)
2208 {
2209         FILE *file;
2210         unsigned int unichar;
2211         unsigned int first;
2212         unsigned int last;
2213         unsigned int *um;
2214         int count;
2215         int ret;
2216
2217         if (verbose > 0)
2218                 printf("Parsing %s\n", prop_name);
2219         file = fopen(prop_name, "r");
2220         if (!file)
2221                 open_fail(prop_name, errno);
2222         assert(file);
2223         count = 0;
2224         while (fgets(line, LINESIZE, file)) {
2225                 ret = sscanf(line, "%X..%X ; %s # ", &first, &last, buf0);
2226                 if (ret == 3) {
2227                         if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2228                                 continue;
2229                         if (!utf32valid(first) || !utf32valid(last))
2230                                 line_fail(prop_name, line);
2231                         for (unichar = first; unichar <= last; unichar++) {
2232                                 free(unicode_data[unichar].utf32nfkdi);
2233                                 um = malloc(sizeof(unsigned int));
2234                                 *um = 0;
2235                                 unicode_data[unichar].utf32nfkdi = um;
2236                                 free(unicode_data[unichar].utf32nfkdicf);
2237                                 um = malloc(sizeof(unsigned int));
2238                                 *um = 0;
2239                                 unicode_data[unichar].utf32nfkdicf = um;
2240                                 count++;
2241                         }
2242                         if (verbose > 1)
2243                                 printf(" %X..%X Default_Ignorable_Code_Point\n",
2244                                         first, last);
2245                         continue;
2246                 }
2247                 ret = sscanf(line, "%X ; %s # ", &unichar, buf0);
2248                 if (ret == 2) {
2249                         if (strcmp(buf0, "Default_Ignorable_Code_Point"))
2250                                 continue;
2251                         if (!utf32valid(unichar))
2252                                 line_fail(prop_name, line);
2253                         free(unicode_data[unichar].utf32nfkdi);
2254                         um = malloc(sizeof(unsigned int));
2255                         *um = 0;
2256                         unicode_data[unichar].utf32nfkdi = um;
2257                         free(unicode_data[unichar].utf32nfkdicf);
2258                         um = malloc(sizeof(unsigned int));
2259                         *um = 0;
2260                         unicode_data[unichar].utf32nfkdicf = um;
2261                         if (verbose > 1)
2262                                 printf(" %X Default_Ignorable_Code_Point\n",
2263                                         unichar);
2264                         count++;
2265                         continue;
2266                 }
2267         }
2268         fclose(file);
2269
2270         if (verbose > 0)
2271                 printf("Found %d entries\n", count);
2272         if (count == 0)
2273                 file_fail(prop_name);
2274 }
2275
2276 static void corrections_init(void)
2277 {
2278         FILE *file;
2279         unsigned int unichar;
2280         unsigned int major;
2281         unsigned int minor;
2282         unsigned int revision;
2283         unsigned int age;
2284         unsigned int *um;
2285         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2286         char *s;
2287         int i;
2288         int count;
2289         int ret;
2290
2291         if (verbose > 0)
2292                 printf("Parsing %s\n", norm_name);
2293         file = fopen(norm_name, "r");
2294         if (!file)
2295                 open_fail(norm_name, errno);
2296
2297         count = 0;
2298         while (fgets(line, LINESIZE, file)) {
2299                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2300                                 &unichar, buf0, buf1,
2301                                 &major, &minor, &revision);
2302                 if (ret != 6)
2303                         continue;
2304                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2305                         line_fail(norm_name, line);
2306                 count++;
2307         }
2308         corrections = calloc(count, sizeof(struct unicode_data));
2309         corrections_count = count;
2310         rewind(file);
2311
2312         count = 0;
2313         while (fgets(line, LINESIZE, file)) {
2314                 ret = sscanf(line, "%X;%[^;];%[^;];%d.%d.%d #",
2315                                 &unichar, buf0, buf1,
2316                                 &major, &minor, &revision);
2317                 if (ret != 6)
2318                         continue;
2319                 if (!utf32valid(unichar) || !age_valid(major, minor, revision))
2320                         line_fail(norm_name, line);
2321                 corrections[count] = unicode_data[unichar];
2322                 assert(corrections[count].code == unichar);
2323                 age = UNICODE_AGE(major, minor, revision);
2324                 corrections[count].correction = age;
2325
2326                 i = 0;
2327                 s = buf0;
2328                 while (*s) {
2329                         mapping[i] = strtoul(s, &s, 16);
2330                         if (!utf32valid(mapping[i]))
2331                                 line_fail(norm_name, line);
2332                         i++;
2333                 }
2334                 mapping[i++] = 0;
2335
2336                 um = malloc(i * sizeof(unsigned int));
2337                 memcpy(um, mapping, i * sizeof(unsigned int));
2338                 corrections[count].utf32nfkdi = um;
2339
2340                 if (verbose > 1)
2341                         printf(" %X -> %s -> %s V%d_%d_%d\n",
2342                                 unichar, buf0, buf1, major, minor, revision);
2343                 count++;
2344         }
2345         fclose(file);
2346
2347         if (verbose > 0)
2348                 printf("Found %d entries\n", count);
2349         if (count == 0)
2350                 file_fail(norm_name);
2351 }
2352
2353 /* ------------------------------------------------------------------ */
2354
2355 /*
2356  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2357  *
2358  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2359  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2360  *
2361  * SBase = 0xAC00
2362  * LBase = 0x1100
2363  * VBase = 0x1161
2364  * TBase = 0x11A7
2365  * LCount = 19
2366  * VCount = 21
2367  * TCount = 28
2368  * NCount = 588 (VCount * TCount)
2369  * SCount = 11172 (LCount * NCount)
2370  *
2371  * Decomposition:
2372  *   SIndex = s - SBase
2373  *
2374  * LV (Canonical/Full)
2375  *   LIndex = SIndex / NCount
2376  *   VIndex = (Sindex % NCount) / TCount
2377  *   LPart = LBase + LIndex
2378  *   VPart = VBase + VIndex
2379  *
2380  * LVT (Canonical)
2381  *   LVIndex = (SIndex / TCount) * TCount
2382  *   TIndex = (Sindex % TCount)
2383  *   LVPart = SBase + LVIndex
2384  *   TPart = TBase + TIndex
2385  *
2386  * LVT (Full)
2387  *   LIndex = SIndex / NCount
2388  *   VIndex = (Sindex % NCount) / TCount
2389  *   TIndex = (Sindex % TCount)
2390  *   LPart = LBase + LIndex
2391  *   VPart = VBase + VIndex
2392  *   if (TIndex == 0) {
2393  *          d = <LPart, VPart>
2394  *   } else {
2395  *          TPart = TBase + TIndex
2396  *          d = <LPart, VPart, TPart>
2397  *   }
2398  *
2399  */
2400
2401 static void hangul_decompose(void)
2402 {
2403         unsigned int sb = 0xAC00;
2404         unsigned int lb = 0x1100;
2405         unsigned int vb = 0x1161;
2406         unsigned int tb = 0x11a7;
2407         /* unsigned int lc = 19; */
2408         unsigned int vc = 21;
2409         unsigned int tc = 28;
2410         unsigned int nc = (vc * tc);
2411         /* unsigned int sc = (lc * nc); */
2412         unsigned int unichar;
2413         unsigned int mapping[4];
2414         unsigned int *um;
2415         int count;
2416         int i;
2417
2418         if (verbose > 0)
2419                 printf("Decomposing hangul\n");
2420         /* Hangul */
2421         count = 0;
2422         for (unichar = 0xAC00; unichar <= 0xD7A3; unichar++) {
2423                 unsigned int si = unichar - sb;
2424                 unsigned int li = si / nc;
2425                 unsigned int vi = (si % nc) / tc;
2426                 unsigned int ti = si % tc;
2427
2428                 i = 0;
2429                 mapping[i++] = lb + li;
2430                 mapping[i++] = vb + vi;
2431                 if (ti)
2432                         mapping[i++] = tb + ti;
2433                 mapping[i++] = 0;
2434
2435                 assert(!unicode_data[unichar].utf32nfkdi);
2436                 um = malloc(i * sizeof(unsigned int));
2437                 memcpy(um, mapping, i * sizeof(unsigned int));
2438                 unicode_data[unichar].utf32nfkdi = um;
2439
2440                 assert(!unicode_data[unichar].utf32nfkdicf);
2441                 um = malloc(i * sizeof(unsigned int));
2442                 memcpy(um, mapping, i * sizeof(unsigned int));
2443                 unicode_data[unichar].utf32nfkdicf = um;
2444
2445                 /*
2446                  * Add a cookie as a reminder that the hangul syllable
2447                  * decompositions must not be stored in the generated
2448                  * trie.
2449                  */
2450                 unicode_data[unichar].utf8nfkdi = malloc(2);
2451                 unicode_data[unichar].utf8nfkdi[0] = HANGUL;
2452                 unicode_data[unichar].utf8nfkdi[1] = '\0';
2453
2454                 if (verbose > 1)
2455                         print_utf32nfkdi(unichar);
2456
2457                 count++;
2458         }
2459         if (verbose > 0)
2460                 printf("Created %d entries\n", count);
2461 }
2462
2463 static void nfkdi_decompose(void)
2464 {
2465         unsigned int unichar;
2466         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2467         unsigned int *um;
2468         unsigned int *dc;
2469         int count;
2470         int i;
2471         int j;
2472         int ret;
2473
2474         if (verbose > 0)
2475                 printf("Decomposing nfkdi\n");
2476
2477         count = 0;
2478         for (unichar = 0; unichar != 0x110000; unichar++) {
2479                 if (!unicode_data[unichar].utf32nfkdi)
2480                         continue;
2481                 for (;;) {
2482                         ret = 1;
2483                         i = 0;
2484                         um = unicode_data[unichar].utf32nfkdi;
2485                         while (*um) {
2486                                 dc = unicode_data[*um].utf32nfkdi;
2487                                 if (dc) {
2488                                         for (j = 0; dc[j]; j++)
2489                                                 mapping[i++] = dc[j];
2490                                         ret = 0;
2491                                 } else {
2492                                         mapping[i++] = *um;
2493                                 }
2494                                 um++;
2495                         }
2496                         mapping[i++] = 0;
2497                         if (ret)
2498                                 break;
2499                         free(unicode_data[unichar].utf32nfkdi);
2500                         um = malloc(i * sizeof(unsigned int));
2501                         memcpy(um, mapping, i * sizeof(unsigned int));
2502                         unicode_data[unichar].utf32nfkdi = um;
2503                 }
2504                 /* Add this decomposition to nfkdicf if there is no entry. */
2505                 if (!unicode_data[unichar].utf32nfkdicf) {
2506                         um = malloc(i * sizeof(unsigned int));
2507                         memcpy(um, mapping, i * sizeof(unsigned int));
2508                         unicode_data[unichar].utf32nfkdicf = um;
2509                 }
2510                 if (verbose > 1)
2511                         print_utf32nfkdi(unichar);
2512                 count++;
2513         }
2514         if (verbose > 0)
2515                 printf("Processed %d entries\n", count);
2516 }
2517
2518 static void nfkdicf_decompose(void)
2519 {
2520         unsigned int unichar;
2521         unsigned int mapping[19]; /* Magic - guaranteed not to be exceeded. */
2522         unsigned int *um;
2523         unsigned int *dc;
2524         int count;
2525         int i;
2526         int j;
2527         int ret;
2528
2529         if (verbose > 0)
2530                 printf("Decomposing nfkdicf\n");
2531         count = 0;
2532         for (unichar = 0; unichar != 0x110000; unichar++) {
2533                 if (!unicode_data[unichar].utf32nfkdicf)
2534                         continue;
2535                 for (;;) {
2536                         ret = 1;
2537                         i = 0;
2538                         um = unicode_data[unichar].utf32nfkdicf;
2539                         while (*um) {
2540                                 dc = unicode_data[*um].utf32nfkdicf;
2541                                 if (dc) {
2542                                         for (j = 0; dc[j]; j++)
2543                                                 mapping[i++] = dc[j];
2544                                         ret = 0;
2545                                 } else {
2546                                         mapping[i++] = *um;
2547                                 }
2548                                 um++;
2549                         }
2550                         mapping[i++] = 0;
2551                         if (ret)
2552                                 break;
2553                         free(unicode_data[unichar].utf32nfkdicf);
2554                         um = malloc(i * sizeof(unsigned int));
2555                         memcpy(um, mapping, i * sizeof(unsigned int));
2556                         unicode_data[unichar].utf32nfkdicf = um;
2557                 }
2558                 if (verbose > 1)
2559                         print_utf32nfkdicf(unichar);
2560                 count++;
2561         }
2562         if (verbose > 0)
2563                 printf("Processed %d entries\n", count);
2564 }
2565
2566 /* ------------------------------------------------------------------ */
2567
2568 int utf8agemax(struct tree *, const char *);
2569 int utf8nagemax(struct tree *, const char *, size_t);
2570 int utf8agemin(struct tree *, const char *);
2571 int utf8nagemin(struct tree *, const char *, size_t);
2572 ssize_t utf8len(struct tree *, const char *);
2573 ssize_t utf8nlen(struct tree *, const char *, size_t);
2574 struct utf8cursor;
2575 int utf8cursor(struct utf8cursor *, struct tree *, const char *);
2576 int utf8ncursor(struct utf8cursor *, struct tree *, const char *, size_t);
2577 int utf8byte(struct utf8cursor *);
2578
2579 /*
2580  * Hangul decomposition (algorithm from Section 3.12 of Unicode 6.3.0)
2581  *
2582  * AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
2583  * D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
2584  *
2585  * SBase = 0xAC00
2586  * LBase = 0x1100
2587  * VBase = 0x1161
2588  * TBase = 0x11A7
2589  * LCount = 19
2590  * VCount = 21
2591  * TCount = 28
2592  * NCount = 588 (VCount * TCount)
2593  * SCount = 11172 (LCount * NCount)
2594  *
2595  * Decomposition:
2596  *   SIndex = s - SBase
2597  *
2598  * LV (Canonical/Full)
2599  *   LIndex = SIndex / NCount
2600  *   VIndex = (Sindex % NCount) / TCount
2601  *   LPart = LBase + LIndex
2602  *   VPart = VBase + VIndex
2603  *
2604  * LVT (Canonical)
2605  *   LVIndex = (SIndex / TCount) * TCount
2606  *   TIndex = (Sindex % TCount)
2607  *   LVPart = SBase + LVIndex
2608  *   TPart = TBase + TIndex
2609  *
2610  * LVT (Full)
2611  *   LIndex = SIndex / NCount
2612  *   VIndex = (Sindex % NCount) / TCount
2613  *   TIndex = (Sindex % TCount)
2614  *   LPart = LBase + LIndex
2615  *   VPart = VBase + VIndex
2616  *   if (TIndex == 0) {
2617  *          d = <LPart, VPart>
2618  *   } else {
2619  *          TPart = TBase + TIndex
2620  *          d = <LPart, VPart, TPart>
2621  *   }
2622  */
2623
2624 /* Constants */
2625 #define SB      (0xAC00)
2626 #define LB      (0x1100)
2627 #define VB      (0x1161)
2628 #define TB      (0x11A7)
2629 #define LC      (19)
2630 #define VC      (21)
2631 #define TC      (28)
2632 #define NC      (VC * TC)
2633 #define SC      (LC * NC)
2634
2635 /* Algorithmic decomposition of hangul syllable. */
2636 static utf8leaf_t *utf8hangul(const char *str, unsigned char *hangul)
2637 {
2638         unsigned int    si;
2639         unsigned int    li;
2640         unsigned int    vi;
2641         unsigned int    ti;
2642         unsigned char   *h;
2643
2644         /* Calculate the SI, LI, VI, and TI values. */
2645         si = utf8decode(str) - SB;
2646         li = si / NC;
2647         vi = (si % NC) / TC;
2648         ti = si % TC;
2649
2650         /* Fill in base of leaf. */
2651         h = hangul;
2652         LEAF_GEN(h) = 2;
2653         LEAF_CCC(h) = DECOMPOSE;
2654         h += 2;
2655
2656         /* Add LPart, a 3-byte UTF-8 sequence. */
2657         h += utf8encode((char *)h, li + LB);
2658
2659         /* Add VPart, a 3-byte UTF-8 sequence. */
2660         h += utf8encode((char *)h, vi + VB);
2661
2662         /* Add TPart if required, also a 3-byte UTF-8 sequence. */
2663         if (ti)
2664                 h += utf8encode((char *)h, ti + TB);
2665
2666         /* Terminate string. */
2667         h[0] = '\0';
2668
2669         return hangul;
2670 }
2671
2672 /*
2673  * Use trie to scan s, touching at most len bytes.
2674  * Returns the leaf if one exists, NULL otherwise.
2675  *
2676  * A non-NULL return guarantees that the UTF-8 sequence starting at s
2677  * is well-formed and corresponds to a known unicode code point.  The
2678  * shorthand for this will be "is valid UTF-8 unicode".
2679  */
2680 static utf8leaf_t *utf8nlookup(struct tree *tree, unsigned char *hangul,
2681                                const char *s, size_t len)
2682 {
2683         utf8trie_t      *trie = utf8data + tree->index;
2684         int             offlen;
2685         int             offset;
2686         int             mask;
2687         int             node;
2688
2689         if (!tree)
2690                 return NULL;
2691         if (len == 0)
2692                 return NULL;
2693         node = 1;
2694         while (node) {
2695                 offlen = (*trie & OFFLEN) >> OFFLEN_SHIFT;
2696                 if (*trie & NEXTBYTE) {
2697                         if (--len == 0)
2698                                 return NULL;
2699                         s++;
2700                 }
2701                 mask = 1 << (*trie & BITNUM);
2702                 if (*s & mask) {
2703                         /* Right leg */
2704                         if (offlen) {
2705                                 /* Right node at offset of trie */
2706                                 node = (*trie & RIGHTNODE);
2707                                 offset = trie[offlen];
2708                                 while (--offlen) {
2709                                         offset <<= 8;
2710                                         offset |= trie[offlen];
2711                                 }
2712                                 trie += offset;
2713                         } else if (*trie & RIGHTPATH) {
2714                                 /* Right node after this node */
2715                                 node = (*trie & TRIENODE);
2716                                 trie++;
2717                         } else {
2718                                 /* No right node. */
2719                                 return NULL;
2720                         }
2721                 } else {
2722                         /* Left leg */
2723                         if (offlen) {
2724                                 /* Left node after this node. */
2725                                 node = (*trie & LEFTNODE);
2726                                 trie += offlen + 1;
2727                         } else if (*trie & RIGHTPATH) {
2728                                 /* No left node. */
2729                                 return NULL;
2730                         } else {
2731                                 /* Left node after this node */
2732                                 node = (*trie & TRIENODE);
2733                                 trie++;
2734                         }
2735                 }
2736         }
2737         /*
2738          * Hangul decomposition is done algorithmically. These are the
2739          * codepoints >= 0xAC00 and <= 0xD7A3. Their UTF-8 encoding is
2740          * always 3 bytes long, so s has been advanced twice, and the
2741          * start of the sequence is at s-2.
2742          */
2743         if (LEAF_CCC(trie) == DECOMPOSE && LEAF_STR(trie)[0] == HANGUL)
2744                 trie = utf8hangul(s - 2, hangul);
2745         return trie;
2746 }
2747
2748 /*
2749  * Use trie to scan s.
2750  * Returns the leaf if one exists, NULL otherwise.
2751  *
2752  * Forwards to trie_nlookup().
2753  */
2754 static utf8leaf_t *utf8lookup(struct tree *tree, unsigned char *hangul,
2755                               const char *s)
2756 {
2757         return utf8nlookup(tree, hangul, s, (size_t)-1);
2758 }
2759
2760 /*
2761  * Return the number of bytes used by the current UTF-8 sequence.
2762  * Assumes the input points to the first byte of a valid UTF-8
2763  * sequence.
2764  */
2765 static inline int utf8clen(const char *s)
2766 {
2767         unsigned char c = *s;
2768         return 1 + (c >= 0xC0) + (c >= 0xE0) + (c >= 0xF0);
2769 }
2770
2771 /*
2772  * Maximum age of any character in s.
2773  * Return -1 if s is not valid UTF-8 unicode.
2774  * Return 0 if only non-assigned code points are used.
2775  */
2776 int utf8agemax(struct tree *tree, const char *s)
2777 {
2778         utf8leaf_t      *leaf;
2779         int             age = 0;
2780         int             leaf_age;
2781         unsigned char   hangul[UTF8HANGULLEAF];
2782
2783         if (!tree)
2784                 return -1;
2785
2786         while (*s) {
2787                 leaf = utf8lookup(tree, hangul, s);
2788                 if (!leaf)
2789                         return -1;
2790                 leaf_age = ages[LEAF_GEN(leaf)];
2791                 if (leaf_age <= tree->maxage && leaf_age > age)
2792                         age = leaf_age;
2793                 s += utf8clen(s);
2794         }
2795         return age;
2796 }
2797
2798 /*
2799  * Minimum age of any character in s.
2800  * Return -1 if s is not valid UTF-8 unicode.
2801  * Return 0 if non-assigned code points are used.
2802  */
2803 int utf8agemin(struct tree *tree, const char *s)
2804 {
2805         utf8leaf_t      *leaf;
2806         int             age;
2807         int             leaf_age;
2808         unsigned char   hangul[UTF8HANGULLEAF];
2809
2810         if (!tree)
2811                 return -1;
2812         age = tree->maxage;
2813         while (*s) {
2814                 leaf = utf8lookup(tree, hangul, s);
2815                 if (!leaf)
2816                         return -1;
2817                 leaf_age = ages[LEAF_GEN(leaf)];
2818                 if (leaf_age <= tree->maxage && leaf_age < age)
2819                         age = leaf_age;
2820                 s += utf8clen(s);
2821         }
2822         return age;
2823 }
2824
2825 /*
2826  * Maximum age of any character in s, touch at most len bytes.
2827  * Return -1 if s is not valid UTF-8 unicode.
2828  */
2829 int utf8nagemax(struct tree *tree, const char *s, size_t len)
2830 {
2831         utf8leaf_t      *leaf;
2832         int             age = 0;
2833         int             leaf_age;
2834         unsigned char   hangul[UTF8HANGULLEAF];
2835
2836         if (!tree)
2837                 return -1;
2838
2839         while (len && *s) {
2840                 leaf = utf8nlookup(tree, hangul, s, len);
2841                 if (!leaf)
2842                         return -1;
2843                 leaf_age = ages[LEAF_GEN(leaf)];
2844                 if (leaf_age <= tree->maxage && leaf_age > age)
2845                         age = leaf_age;
2846                 len -= utf8clen(s);
2847                 s += utf8clen(s);
2848         }
2849         return age;
2850 }
2851
2852 /*
2853  * Maximum age of any character in s, touch at most len bytes.
2854  * Return -1 if s is not valid UTF-8 unicode.
2855  */
2856 int utf8nagemin(struct tree *tree, const char *s, size_t len)
2857 {
2858         utf8leaf_t      *leaf;
2859         int             leaf_age;
2860         int             age;
2861         unsigned char   hangul[UTF8HANGULLEAF];
2862
2863         if (!tree)
2864                 return -1;
2865         age = tree->maxage;
2866         while (len && *s) {
2867                 leaf = utf8nlookup(tree, hangul, s, len);
2868                 if (!leaf)
2869                         return -1;
2870                 leaf_age = ages[LEAF_GEN(leaf)];
2871                 if (leaf_age <= tree->maxage && leaf_age < age)
2872                         age = leaf_age;
2873                 len -= utf8clen(s);
2874                 s += utf8clen(s);
2875         }
2876         return age;
2877 }
2878
2879 /*
2880  * Length of the normalization of s.
2881  * Return -1 if s is not valid UTF-8 unicode.
2882  *
2883  * A string of Default_Ignorable_Code_Point has length 0.
2884  */
2885 ssize_t utf8len(struct tree *tree, const char *s)
2886 {
2887         utf8leaf_t      *leaf;
2888         size_t          ret = 0;
2889         unsigned char   hangul[UTF8HANGULLEAF];
2890
2891         if (!tree)
2892                 return -1;
2893         while (*s) {
2894                 leaf = utf8lookup(tree, hangul, s);
2895                 if (!leaf)
2896                         return -1;
2897                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2898                         ret += utf8clen(s);
2899                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2900                         ret += strlen(LEAF_STR(leaf));
2901                 else
2902                         ret += utf8clen(s);
2903                 s += utf8clen(s);
2904         }
2905         return ret;
2906 }
2907
2908 /*
2909  * Length of the normalization of s, touch at most len bytes.
2910  * Return -1 if s is not valid UTF-8 unicode.
2911  */
2912 ssize_t utf8nlen(struct tree *tree, const char *s, size_t len)
2913 {
2914         utf8leaf_t      *leaf;
2915         size_t          ret = 0;
2916         unsigned char   hangul[UTF8HANGULLEAF];
2917
2918         if (!tree)
2919                 return -1;
2920         while (len && *s) {
2921                 leaf = utf8nlookup(tree, hangul, s, len);
2922                 if (!leaf)
2923                         return -1;
2924                 if (ages[LEAF_GEN(leaf)] > tree->maxage)
2925                         ret += utf8clen(s);
2926                 else if (LEAF_CCC(leaf) == DECOMPOSE)
2927                         ret += strlen(LEAF_STR(leaf));
2928                 else
2929                         ret += utf8clen(s);
2930                 len -= utf8clen(s);
2931                 s += utf8clen(s);
2932         }
2933         return ret;
2934 }
2935
2936 /*
2937  * Cursor structure used by the normalizer.
2938  */
2939 struct utf8cursor {
2940         struct tree     *tree;
2941         const char      *s;
2942         const char      *p;
2943         const char      *ss;
2944         const char      *sp;
2945         unsigned int    len;
2946         unsigned int    slen;
2947         short int       ccc;
2948         short int       nccc;
2949         unsigned int    unichar;
2950         unsigned char   hangul[UTF8HANGULLEAF];
2951 };
2952
2953 /*
2954  * Set up an utf8cursor for use by utf8byte().
2955  *
2956  *   s      : string.
2957  *   len    : length of s.
2958  *   u8c    : pointer to cursor.
2959  *   trie   : utf8trie_t to use for normalization.
2960  *
2961  * Returns -1 on error, 0 on success.
2962  */
2963 int utf8ncursor(struct utf8cursor *u8c, struct tree *tree, const char *s,
2964                 size_t len)
2965 {
2966         if (!tree)
2967                 return -1;
2968         if (!s)
2969                 return -1;
2970         u8c->tree = tree;
2971         u8c->s = s;
2972         u8c->p = NULL;
2973         u8c->ss = NULL;
2974         u8c->sp = NULL;
2975         u8c->len = len;
2976         u8c->slen = 0;
2977         u8c->ccc = STOPPER;
2978         u8c->nccc = STOPPER;
2979         u8c->unichar = 0;
2980         /* Check we didn't clobber the maximum length. */
2981         if (u8c->len != len)
2982                 return -1;
2983         /* The first byte of s may not be an utf8 continuation. */
2984         if (len > 0 && (*s & 0xC0) == 0x80)
2985                 return -1;
2986         return 0;
2987 }
2988
2989 /*
2990  * Set up an utf8cursor for use by utf8byte().
2991  *
2992  *   s      : NUL-terminated string.
2993  *   u8c    : pointer to cursor.
2994  *   trie   : utf8trie_t to use for normalization.
2995  *
2996  * Returns -1 on error, 0 on success.
2997  */
2998 int utf8cursor(struct utf8cursor *u8c, struct tree *tree, const char *s)
2999 {
3000         return utf8ncursor(u8c, tree, s, (unsigned int)-1);
3001 }
3002
3003 /*
3004  * Get one byte from the normalized form of the string described by u8c.
3005  *
3006  * Returns the byte cast to an unsigned char on succes, and -1 on failure.
3007  *
3008  * The cursor keeps track of the location in the string in u8c->s.
3009  * When a character is decomposed, the current location is stored in
3010  * u8c->p, and u8c->s is set to the start of the decomposition. Note
3011  * that bytes from a decomposition do not count against u8c->len.
3012  *
3013  * Characters are emitted if they match the current CCC in u8c->ccc.
3014  * Hitting end-of-string while u8c->ccc == STOPPER means we're done,
3015  * and the function returns 0 in that case.
3016  *
3017  * Sorting by CCC is done by repeatedly scanning the string.  The
3018  * values of u8c->s and u8c->p are stored in u8c->ss and u8c->sp at
3019  * the start of the scan.  The first pass finds the lowest CCC to be
3020  * emitted and stores it in u8c->nccc, the second pass emits the
3021  * characters with this CCC and finds the next lowest CCC. This limits
3022  * the number of passes to 1 + the number of different CCCs in the
3023  * sequence being scanned.
3024  *
3025  * Therefore:
3026  *  u8c->p  != NULL -> a decomposition is being scanned.
3027  *  u8c->ss != NULL -> this is a repeating scan.
3028  *  u8c->ccc == -1  -> this is the first scan of a repeating scan.
3029  */
3030 int utf8byte(struct utf8cursor *u8c)
3031 {
3032         utf8leaf_t *leaf;
3033         int ccc;
3034
3035         for (;;) {
3036                 /* Check for the end of a decomposed character. */
3037                 if (u8c->p && *u8c->s == '\0') {
3038                         u8c->s = u8c->p;
3039                         u8c->p = NULL;
3040                 }
3041
3042                 /* Check for end-of-string. */
3043                 if (!u8c->p && (u8c->len == 0 || *u8c->s == '\0')) {
3044                         /* There is no next byte. */
3045                         if (u8c->ccc == STOPPER)
3046                                 return 0;
3047                         /* End-of-string during a scan counts as a stopper. */
3048                         ccc = STOPPER;
3049                         goto ccc_mismatch;
3050                 } else if ((*u8c->s & 0xC0) == 0x80) {
3051                         /* This is a continuation of the current character. */
3052                         if (!u8c->p)
3053                                 u8c->len--;
3054                         return (unsigned char)*u8c->s++;
3055                 }
3056
3057                 /* Look up the data for the current character. */
3058                 if (u8c->p) {
3059                         leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3060                 } else {
3061                         leaf = utf8nlookup(u8c->tree, u8c->hangul,
3062                                            u8c->s, u8c->len);
3063                 }
3064
3065                 /* No leaf found implies that the input is a binary blob. */
3066                 if (!leaf)
3067                         return -1;
3068
3069                 /* Characters that are too new have CCC 0. */
3070                 if (ages[LEAF_GEN(leaf)] > u8c->tree->maxage) {
3071                         ccc = STOPPER;
3072                 } else if ((ccc = LEAF_CCC(leaf)) == DECOMPOSE) {
3073                         u8c->len -= utf8clen(u8c->s);
3074                         u8c->p = u8c->s + utf8clen(u8c->s);
3075                         u8c->s = LEAF_STR(leaf);
3076                         /* Empty decomposition implies CCC 0. */
3077                         if (*u8c->s == '\0') {
3078                                 if (u8c->ccc == STOPPER)
3079                                         continue;
3080                                 ccc = STOPPER;
3081                                 goto ccc_mismatch;
3082                         }
3083                         leaf = utf8lookup(u8c->tree, u8c->hangul, u8c->s);
3084                         ccc = LEAF_CCC(leaf);
3085                 }
3086                 u8c->unichar = utf8decode(u8c->s);
3087
3088                 /*
3089                  * If this is not a stopper, then see if it updates
3090                  * the next canonical class to be emitted.
3091                  */
3092                 if (ccc != STOPPER && u8c->ccc < ccc && ccc < u8c->nccc)
3093                         u8c->nccc = ccc;
3094
3095                 /*
3096                  * Return the current byte if this is the current
3097                  * combining class.
3098                  */
3099                 if (ccc == u8c->ccc) {
3100                         if (!u8c->p)
3101                                 u8c->len--;
3102                         return (unsigned char)*u8c->s++;
3103                 }
3104
3105                 /* Current combining class mismatch. */
3106         ccc_mismatch:
3107                 if (u8c->nccc == STOPPER) {
3108                         /*
3109                          * Scan forward for the first canonical class
3110                          * to be emitted.  Save the position from
3111                          * which to restart.
3112                          */
3113                         assert(u8c->ccc == STOPPER);
3114                         u8c->ccc = MINCCC - 1;
3115                         u8c->nccc = ccc;
3116                         u8c->sp = u8c->p;
3117                         u8c->ss = u8c->s;
3118                         u8c->slen = u8c->len;
3119                         if (!u8c->p)
3120                                 u8c->len -= utf8clen(u8c->s);
3121                         u8c->s += utf8clen(u8c->s);
3122                 } else if (ccc != STOPPER) {
3123                         /* Not a stopper, and not the ccc we're emitting. */
3124                         if (!u8c->p)
3125                                 u8c->len -= utf8clen(u8c->s);
3126                         u8c->s += utf8clen(u8c->s);
3127                 } else if (u8c->nccc != MAXCCC + 1) {
3128                         /* At a stopper, restart for next ccc. */
3129                         u8c->ccc = u8c->nccc;
3130                         u8c->nccc = MAXCCC + 1;
3131                         u8c->s = u8c->ss;
3132                         u8c->p = u8c->sp;
3133                         u8c->len = u8c->slen;
3134                 } else {
3135                         /* All done, proceed from here. */
3136                         u8c->ccc = STOPPER;
3137                         u8c->nccc = STOPPER;
3138                         u8c->sp = NULL;
3139                         u8c->ss = NULL;
3140                         u8c->slen = 0;
3141                 }
3142         }
3143 }
3144
3145 /* ------------------------------------------------------------------ */
3146
3147 static int normalize_line(struct tree *tree)
3148 {
3149         char *s;
3150         char *t;
3151         int c;
3152         struct utf8cursor u8c;
3153
3154         /* First test: null-terminated string. */
3155         s = buf2;
3156         t = buf3;
3157         if (utf8cursor(&u8c, tree, s))
3158                 return -1;
3159         while ((c = utf8byte(&u8c)) > 0)
3160                 if (c != (unsigned char)*t++)
3161                         return -1;
3162         if (c < 0)
3163                 return -1;
3164         if (*t != 0)
3165                 return -1;
3166
3167         /* Second test: length-limited string. */
3168         s = buf2;
3169         /* Replace NUL with a value that will cause an error if seen. */
3170         s[strlen(s) + 1] = -1;
3171         t = buf3;
3172         if (utf8cursor(&u8c, tree, s))
3173                 return -1;
3174         while ((c = utf8byte(&u8c)) > 0)
3175                 if (c != (unsigned char)*t++)
3176                         return -1;
3177         if (c < 0)
3178                 return -1;
3179         if (*t != 0)
3180                 return -1;
3181
3182         return 0;
3183 }
3184
3185 static void normalization_test(void)
3186 {
3187         FILE *file;
3188         unsigned int unichar;
3189         struct unicode_data *data;
3190         char *s;
3191         char *t;
3192         int ret;
3193         int ignorables;
3194         int tests = 0;
3195         int failures = 0;
3196
3197         if (verbose > 0)
3198                 printf("Parsing %s\n", test_name);
3199         /* Step one, read data from file. */
3200         file = fopen(test_name, "r");
3201         if (!file)
3202                 open_fail(test_name, errno);
3203
3204         while (fgets(line, LINESIZE, file)) {
3205                 ret = sscanf(line, "%[^;];%*[^;];%*[^;];%*[^;];%[^;];",
3206                              buf0, buf1);
3207                 if (ret != 2 || *line == '#')
3208                         continue;
3209                 s = buf0;
3210                 t = buf2;
3211                 while (*s) {
3212                         unichar = strtoul(s, &s, 16);
3213                         t += utf8encode(t, unichar);
3214                 }
3215                 *t = '\0';
3216
3217                 ignorables = 0;
3218                 s = buf1;
3219                 t = buf3;
3220                 while (*s) {
3221                         unichar = strtoul(s, &s, 16);
3222                         data = &unicode_data[unichar];
3223                         if (data->utf8nfkdi && !*data->utf8nfkdi)
3224                                 ignorables = 1;
3225                         else
3226                                 t += utf8encode(t, unichar);
3227                 }
3228                 *t = '\0';
3229
3230                 tests++;
3231                 if (normalize_line(nfkdi_tree) < 0) {
3232                         printf("Line %s -> %s", buf0, buf1);
3233                         if (ignorables)
3234                                 printf(" (ignorables removed)");
3235                         printf(" failure\n");
3236                         failures++;
3237                 }
3238         }
3239         fclose(file);
3240         if (verbose > 0)
3241                 printf("Ran %d tests with %d failures\n", tests, failures);
3242         if (failures)
3243                 file_fail(test_name);
3244 }
3245
3246 /* ------------------------------------------------------------------ */
3247
3248 static void write_file(void)
3249 {
3250         FILE *file;
3251         int i;
3252         int j;
3253         int t;
3254         int gen;
3255
3256         if (verbose > 0)
3257                 printf("Writing %s\n", utf8_name);
3258         file = fopen(utf8_name, "w");
3259         if (!file)
3260                 open_fail(utf8_name, errno);
3261
3262         fprintf(file, "/* This file is generated code, do not edit. */\n");
3263         fprintf(file, "#ifndef __INCLUDED_FROM_UTF8NORM_C__\n");
3264         fprintf(file, "#error Only nls_utf8-norm.c should include this file.\n");
3265         fprintf(file, "#endif\n");
3266         fprintf(file, "\n");
3267         fprintf(file, "static const unsigned int utf8vers = %#x;\n",
3268                 unicode_maxage);
3269         fprintf(file, "\n");
3270         fprintf(file, "static const unsigned int utf8agetab[] = {\n");
3271         for (i = 0; i != ages_count; i++)
3272                 fprintf(file, "\t%#x%s\n", ages[i],
3273                         ages[i] == unicode_maxage ? "" : ",");
3274         fprintf(file, "};\n");
3275         fprintf(file, "\n");
3276         fprintf(file, "static const struct utf8data utf8nfkdicfdata[] = {\n");
3277         t = 0;
3278         for (gen = 0; gen < ages_count; gen++) {
3279                 fprintf(file, "\t{ %#x, %d }%s\n",
3280                         ages[gen], trees[t].index,
3281                         ages[gen] == unicode_maxage ? "" : ",");
3282                 if (trees[t].maxage == ages[gen])
3283                         t += 2;
3284         }
3285         fprintf(file, "};\n");
3286         fprintf(file, "\n");
3287         fprintf(file, "static const struct utf8data utf8nfkdidata[] = {\n");
3288         t = 1;
3289         for (gen = 0; gen < ages_count; gen++) {
3290                 fprintf(file, "\t{ %#x, %d }%s\n",
3291                         ages[gen], trees[t].index,
3292                         ages[gen] == unicode_maxage ? "" : ",");
3293                 if (trees[t].maxage == ages[gen])
3294                         t += 2;
3295         }
3296         fprintf(file, "};\n");
3297         fprintf(file, "\n");
3298         fprintf(file, "static const unsigned char utf8data[%zd] = {\n",
3299                 utf8data_size);
3300         t = 0;
3301         for (i = 0; i != utf8data_size; i += 16) {
3302                 if (i == trees[t].index) {
3303                         fprintf(file, "\t/* %s_%x */\n",
3304                                 trees[t].type, trees[t].maxage);
3305                         if (t < trees_count-1)
3306                                 t++;
3307                 }
3308                 fprintf(file, "\t");
3309                 for (j = i; j != i + 16; j++)
3310                         fprintf(file, "0x%.2x%s", utf8data[j],
3311                                 (j < utf8data_size -1 ? "," : ""));
3312                 fprintf(file, "\n");
3313         }
3314         fprintf(file, "};\n");
3315         fclose(file);
3316 }
3317
3318 /* ------------------------------------------------------------------ */
3319
3320 int main(int argc, char *argv[])
3321 {
3322         unsigned int unichar;
3323         int opt;
3324
3325         argv0 = argv[0];
3326
3327         while ((opt = getopt(argc, argv, "a:c:d:f:hn:o:p:t:v")) != -1) {
3328                 switch (opt) {
3329                 case 'a':
3330                         age_name = optarg;
3331                         break;
3332                 case 'c':
3333                         ccc_name = optarg;
3334                         break;
3335                 case 'd':
3336                         data_name = optarg;
3337                         break;
3338                 case 'f':
3339                         fold_name = optarg;
3340                         break;
3341                 case 'n':
3342                         norm_name = optarg;
3343                         break;
3344                 case 'o':
3345                         utf8_name = optarg;
3346                         break;
3347                 case 'p':
3348                         prop_name = optarg;
3349                         break;
3350                 case 't':
3351                         test_name = optarg;
3352                         break;
3353                 case 'v':
3354                         verbose++;
3355                         break;
3356                 case 'h':
3357                         help();
3358                         exit(0);
3359                 default:
3360                         usage();
3361                 }
3362         }
3363
3364         if (verbose > 1)
3365                 help();
3366         for (unichar = 0; unichar != 0x110000; unichar++)
3367                 unicode_data[unichar].code = unichar;
3368         age_init();
3369         ccc_init();
3370         nfkdi_init();
3371         nfkdicf_init();
3372         ignore_init();
3373         corrections_init();
3374         hangul_decompose();
3375         nfkdi_decompose();
3376         nfkdicf_decompose();
3377         utf8_init();
3378         trees_init();
3379         trees_populate();
3380         trees_reduce();
3381         trees_verify();
3382         /* Prevent "unused function" warning. */
3383         (void)lookup(nfkdi_tree, " ");
3384         if (verbose > 2)
3385                 tree_walk(nfkdi_tree);
3386         if (verbose > 2)
3387                 tree_walk(nfkdicf_tree);
3388         normalization_test();
3389         write_file();
3390
3391         return 0;
3392 }