libcpp/makeuname2c.cc

   1 /* Make uname2c.h from various sources.
   2    Copyright (C) 2005-2023 Free Software Foundation, Inc.
   3    Contributed by Jakub Jelinek <jakub@redhat.com>
   4
   5 This program is free software; you can redistribute it and/or modify it
   6 under the terms of the GNU General Public License as published by the
   7 Free Software Foundation; either version 3, or (at your option) any
   8 later version.
   9
  10 This program is distributed in the hope that it will be useful,
  11 but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 GNU General Public License for more details.
  14
  15 You should have received a copy of the GNU General Public License
  16 along with this program; see the file COPYING3.  If not see
  17 <http://www.gnu.org/licenses/>.  */
  18
  19 /* Run this program as
  20    ./makeuname2c UnicodeData.txt NameAliases.txt > uname2c.h
  21
  22    This program generates 2 big arrays and 2 small ones.
  23    The large ones are uname2c_dict, initialized by string literal
  24    representing dictionary, and uname2c_tree, which is a space optimized
  25    radix tree.
  26    The format of the radix tree is:
  27    byte 0       either 0x80 + (key[0] - ' ')    (if key_len == 1)
  28                 or key_len                      (otherwise)
  29                 either of them ored with 0x40 if it has a codepoint
  30    byte 1       LSB of offset into uname2c_dict for key (only if key_len > 1)
  31    byte 2       MSB of offset into uname2c_dict for key (only if key_len > 1)
  32                 if key_len == 1, the above 2 bytes are omitted
  33    byte 3       LSB of codepoint (only if it has a codepoint)
  34    byte 4       middle byte of codepoint (ditto)
  35    byte 5       MSB of codepoint (ditto), ored with 0x80 if node has children
  36                                    ored with 0x40 if it doesn't have siblings
  37                 if it doesn't have a codepoint, the above 3 bytes are omitted
  38                 and we assume that the node has children
  39    byte 6, 7, 8 uleb128 encoded offset to first child relative to the end
  40                 of the uleb128 (only if node has children)
  41    byte 9       0xff (only if node doesn't have a codepoint and doesn't
  42                       have siblings)
  43
  44    For prefixes of Unicode NR1 or NR2 rule generated names, on a node
  45    representing end of the prefix codepoint is 0xd800 + index into
  46    uname2c_generated array with indexes into uname2c_pairs array of
  47    code points (low, high) of the ranges terminated by single 0.
  48    0xd800 is NR1 rule (Hangul syllables), rest are NR2 rules.
  49 */
  50
  51 #include <assert.h>
  52 #include <stdio.h>
  53 #include <string.h>
  54 #include <stdint.h>
  55 #include <ctype.h>
  56 #include <limits.h>
  57 #include <stdarg.h>
  58 #include <stdbool.h>
  59 #include <stdlib.h>
  60
  61 #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
  62
  63 #define NUM_CODE_POINTS 0x110000
  64 #define MAX_CODE_POINT 0x10ffff
  65 #define NO_VALUE 0xdc00
  66 #define GENERATED 0xd800
  67
  68 struct entry { const char *name; unsigned long codepoint; };
  69 static struct entry *entries;
  70 static unsigned long num_allocated, num_entries;
  71
  72 /* Unicode 15 Table 4-8.  */
  73 struct generated {
  74   const char *prefix;
  75   /* max_high is a workaround for UnicodeData.txt inconsistencies
  76      on a few CJK UNIFIED IDEOGRAPH- ranges where the "*, Last>"
  77      entry is a few code points above the end of the range.  */
  78   unsigned long low, high, max_high;
  79   int idx, ok;
  80 };
  81 static struct generated generated_ranges[] =
  82 { { "HANGUL SYLLABLE ", 0xac00, 0xd7a3, 0, 0, 0 }, /* NR1 rule */
  83   { "CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4dbf, 0, 1, 0 }, /* NR2 rules */
  84   { "CJK UNIFIED IDEOGRAPH-", 0x4e00, 0x9fff, 0, 1, 0 },
  85   { "CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2a6df, 0, 1, 0 },
  86   { "CJK UNIFIED IDEOGRAPH-", 0x2a700, 0x2b739, 0, 1, 0 },
  87   { "CJK UNIFIED IDEOGRAPH-", 0x2b740, 0x2b81d, 0, 1, 0 },
  88   { "CJK UNIFIED IDEOGRAPH-", 0x2b820, 0x2cea1, 0, 1, 0 },
  89   { "CJK UNIFIED IDEOGRAPH-", 0x2ceb0, 0x2ebe0, 0, 1, 0 },
  90   { "CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134a, 0, 1, 0 },
  91   { "CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323af, 0, 1, 0 },
  92   { "TANGUT IDEOGRAPH-", 0x17000, 0x187f7, 0, 2, 0 },
  93   { "TANGUT IDEOGRAPH-", 0x18d00, 0x18d08, 0, 2, 0 },
  94   { "KHITAN SMALL SCRIPT CHARACTER-", 0x18b00, 0x18cd5, 0, 3, 0 },
  95   { "NUSHU CHARACTER-", 0x1b170, 0x1b2fb, 0, 4, 0 },
  96   { "CJK COMPATIBILITY IDEOGRAPH-", 0xf900, 0xfa6d, 0, 5, 0 },
  97   { "CJK COMPATIBILITY IDEOGRAPH-", 0xfa70, 0xfad9, 0, 5, 0 },
  98   { "CJK COMPATIBILITY IDEOGRAPH-", 0x2f800, 0x2fa1d, 0, 5, 0 }
  99 };
 100
 101 struct node {
 102   struct node *sibling, *child;
 103   const char *key;
 104   size_t key_len, key_idx, node_size, size_sum, child_off;
 105   unsigned long codepoint;
 106   bool in_dict;
 107 };
 108 static struct node *root, **nodes;
 109 static unsigned long num_nodes;
 110 static size_t dict_size, tree_size, max_entry_len;
 111 static char *dict;
 112 static unsigned char *tree;
 113
 114 /* Die!  */
 115
 116 static void
 117 fail (const char *s, ...)
 118 {
 119   va_list ap;
 120
 121   va_start (ap, s);
 122   vfprintf (stderr, s, ap);
 123   va_end (ap);
 124   fputc ('\n', stderr);
 125   exit (1);
 126 }
 127
 128 static void *
 129 xmalloc (size_t size)
 130 {
 131   void *ret = malloc (size);
 132
 133   if (ret == NULL)
 134     fail ("failed to allocate %ld bytes", (long) size);
 135   return ret;
 136 }
 137
 138 static void *
 139 xrealloc (void *p, size_t size)
 140 {
 141   void *ret = p ? realloc (p, size) : malloc (size);
 142
 143   if (ret == NULL)
 144     fail ("failed to allocate %ld bytes", (long) size);
 145   return ret;
 146 }
 147
 148 static int
 149 entrycmp (const void *p1, const void *p2)
 150 {
 151   const struct entry *e1 = (const struct entry *) p1;
 152   const struct entry *e2 = (const struct entry *) p2;
 153   int ret = strcmp (e1->name, e2->name);
 154
 155   if (ret != 0)
 156     return ret;
 157   if (e1->codepoint < e2->codepoint)
 158     return -1;
 159   if (e1->codepoint > e2->codepoint)
 160     return 1;
 161   return 0;
 162 }
 163
 164 static int
 165 nodecmp (const void *p1, const void *p2)
 166 {
 167   const struct node *n1 = *(const struct node *const *) p1;
 168   const struct node *n2 = *(const struct node *const *) p2;
 169   if (n1->key_len > n2->key_len)
 170     return -1;
 171   if (n1->key_len < n2->key_len)
 172     return 1;
 173   return memcmp (n1->key, n2->key, n1->key_len);
 174 }
 175
 176 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
 177    decompositions of characters for which both the character
 178    decomposed and all the code points in the decomposition are valid
 179    for some supported language version, and the 'all_decomp' table to
 180    be the decompositions of all characters without those
 181    constraints.  */
 182
 183 static void
 184 read_table (char *fname, bool aliases_p)
 185 {
 186   FILE *f = fopen (fname, "r");
 187   const char *sname = aliases_p ? "NameAliases.txt" : "UnicodeData.txt";
 188
 189   if (!f)
 190     fail ("opening %s", sname);
 191   for (;;)
 192     {
 193       char line[256];
 194       unsigned long codepoint;
 195       const char *name, *aname;
 196       char *l;
 197       size_t i;
 198
 199       if (!fgets (line, sizeof (line), f))
 200         break;
 201       codepoint = strtoul (line, &l, 16);
 202       if (l == line && aliases_p)
 203         {
 204           /* NameAliased.txt can contain comments and empty lines.  */
 205           if (*line == '#' || *line == '\n')
 206             continue;
 207         }
 208       if (l == line || *l != ';')
 209         fail ("parsing %s, reading code point", sname);
 210       if (codepoint > MAX_CODE_POINT)
 211         fail ("parsing %s, code point too large", sname);
 212
 213       name = l + 1;
 214       do {
 215         ++l;
 216       } while (*l != ';');
 217
 218       aname = NULL;
 219       if (aliases_p)
 220         {
 221           /* Ignore figment and abbreviation aliases.  */
 222           if (strcmp (l + 1, "correction\n") != 0
 223               && strcmp (l + 1, "control\n") != 0
 224               && strcmp (l + 1, "alternate\n") != 0)
 225             continue;
 226           i = ARRAY_SIZE (generated_ranges);
 227         }
 228       else
 229         {
 230           for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 231             if (codepoint >= generated_ranges[i].low
 232                 && codepoint <= generated_ranges[i].max_high)
 233               break;
 234           if (i != ARRAY_SIZE (generated_ranges))
 235             {
 236               if (*name == '<' && l[-1] == '>')
 237                 {
 238                   if (codepoint == generated_ranges[i].low
 239                       && l - name >= 9
 240                       && memcmp (l - 8, ", First>", 8) == 0
 241                       && generated_ranges[i].ok == 0)
 242                     {
 243                       generated_ranges[i].ok = INT_MAX - 1;
 244                       aname = generated_ranges[i].prefix;
 245                       codepoint = GENERATED + generated_ranges[i].idx;
 246                     }
 247                   /* Unfortunately, UnicodeData.txt isn't consistent
 248                      with the Table 4-8 range endpoints in 3 cases,
 249                      the ranges are longer there by a few codepoints.
 250                      So use the max_high hack to avoid verification
 251                      failures.  */
 252                   else if (codepoint == generated_ranges[i].max_high
 253                            && l - name >= 8
 254                            && memcmp (l - 7, ", Last>", 7) == 0
 255                            && generated_ranges[i].ok == INT_MAX - 1)
 256                     {
 257                       generated_ranges[i].ok = INT_MAX;
 258                       continue;
 259                     }
 260                   else
 261                     fail ("unexpected generated entry %lx %.*s",
 262                           codepoint, (int) (l - name), name);
 263                 }
 264               else if (codepoint
 265                        == generated_ranges[i].low + generated_ranges[i].ok
 266                        && l - name == (strlen (generated_ranges[i].prefix)
 267                                        + (name - 1 - line))
 268                        && memcmp (name, generated_ranges[i].prefix,
 269                                   strlen (generated_ranges[i].prefix)) == 0
 270                        && memcmp (name + strlen (generated_ranges[i].prefix),
 271                                   line, name - 1 - line) == 0)
 272                 {
 273                   ++generated_ranges[i].ok;
 274                   if (codepoint != generated_ranges[i].low)
 275                     continue;
 276                   aname = generated_ranges[i].prefix;
 277                   codepoint = GENERATED + generated_ranges[i].idx;
 278                 }
 279               else
 280                 fail ("unexpected generated entry %lx %.*s",
 281                       codepoint, (int) (l - name), name);
 282               if (aname == generated_ranges[i].prefix)
 283                 {
 284                   size_t j;
 285
 286                   /* Don't add an entry for a generated range where the
 287                      same prefix has been added already.  */
 288                   for (j = 0; j < i; ++j)
 289                     if (generated_ranges[j].idx == generated_ranges[i].idx
 290                         && generated_ranges[j].ok != 0)
 291                       break;
 292                   if (j < i)
 293                     continue;
 294                 }
 295             }
 296           else if (*name == '<' && l[-1] == '>')
 297             continue;
 298         }
 299
 300       if (num_entries == num_allocated)
 301         {
 302           num_allocated = num_allocated ? 2 * num_allocated : 65536;
 303           entries = (struct entry *) xrealloc (entries, num_allocated
 304                                                         * sizeof (entries[0]));
 305         }
 306
 307       if (aname == NULL)
 308         {
 309           char *a = (char *) xmalloc (l + 1 - name);
 310           if (l - name > max_entry_len)
 311             max_entry_len = l - name;
 312           memcpy (a, name, l - name);
 313           a[l - name] = '\0';
 314           aname = a;
 315         }
 316       entries[num_entries].name = aname;
 317       entries[num_entries++].codepoint = codepoint;
 318     }
 319   if (ferror (f))
 320     fail ("reading %s", sname);
 321   fclose (f);
 322 }
 323
 324 /* Assumes nodes are added from sorted array, so we never
 325    add any node before existing one, only after it.  */
 326
 327 static void
 328 node_add (struct node **p, const char *key, size_t key_len,
 329           unsigned long codepoint)
 330 {
 331   struct node *n;
 332   size_t i;
 333
 334   do
 335     {
 336       if (*p == NULL)
 337         {
 338           *p = n = (struct node *) xmalloc (sizeof (struct node));
 339           ++num_nodes;
 340           assert (key_len);
 341           n->sibling = NULL;
 342           n->child = NULL;
 343           n->key = key;
 344           n->key_len = key_len;
 345           n->codepoint = codepoint;
 346           return;
 347         }
 348       n = *p;
 349       for (i = 0; i < n->key_len && i < key_len; ++i)
 350         if (n->key[i] != key[i])
 351           break;
 352       if (i == 0)
 353         {
 354           p = &n->sibling;
 355           continue;
 356         }
 357       if (i == n->key_len)
 358         {
 359           assert (key_len > n->key_len);
 360           p = &n->child;
 361           key += n->key_len;
 362           key_len -= n->key_len;
 363           continue;
 364         }
 365       /* Need to split the node.  */
 366       assert (i < key_len);
 367       n = (struct node *) xmalloc (sizeof (struct node));
 368       ++num_nodes;
 369       n->sibling = NULL;
 370       n->child = (*p)->child;
 371       n->key = (*p)->key + i;
 372       n->key_len = (*p)->key_len - i;
 373       n->codepoint = (*p)->codepoint;
 374       (*p)->child = n;
 375       (*p)->key_len = i;
 376       (*p)->codepoint = NO_VALUE;
 377       key += i;
 378       key_len -= i;
 379       p = &n->sibling;
 380     }
 381   while (1);
 382 }
 383
 384 static void
 385 append_nodes (struct node *n)
 386 {
 387   for (; n; n = n->sibling)
 388     {
 389       nodes[num_nodes++] = n;
 390       append_nodes (n->child);
 391     }
 392 }
 393
 394 static size_t
 395 sizeof_uleb128 (size_t val)
 396 {
 397   size_t sz = 0;
 398   do
 399     {
 400       val >>= 7;
 401       sz += 1;
 402     }
 403   while (val != 0);
 404   return sz;
 405 }
 406
 407 static void
 408 size_nodes (struct node *n)
 409 {
 410   if (n->child)
 411     size_nodes (n->child);
 412   if (n->sibling)
 413     size_nodes (n->sibling);
 414   n->node_size = 1 + (n->key_len > 1) * 2;
 415   if (n->codepoint != NO_VALUE)
 416     n->node_size += 3;
 417   else if (n->sibling == NULL)
 418     ++n->node_size;
 419   n->size_sum = 0;
 420   n->child_off = 0;
 421   if (n->sibling)
 422     n->size_sum += n->sibling->size_sum;
 423   if (n->child)
 424     {
 425       n->child_off = n->size_sum + (n->codepoint == NO_VALUE
 426                                     && n->sibling == NULL);
 427       n->node_size += sizeof_uleb128 (n->child_off);
 428     }
 429   n->size_sum += n->node_size;
 430   if (n->child)
 431     n->size_sum += n->child->size_sum;
 432   tree_size += n->node_size;
 433 }
 434
 435 static void
 436 write_uleb128 (unsigned char *p, size_t val)
 437 {
 438   unsigned char c;
 439   do
 440     {
 441       c = val & 0x7f;
 442       val >>= 7;
 443       if (val)
 444         c |= 0x80;
 445       *p++ = c;
 446     }
 447   while (val);
 448 }
 449
 450 static void
 451 write_nodes (struct node *n, size_t off)
 452 {
 453   for (; n; n = n->sibling)
 454     {
 455       assert (off < tree_size && tree[off] == 0);
 456       if (n->key_len > 1)
 457         {
 458           assert (n->key_len < 64);
 459           tree[off] = n->key_len;
 460         }
 461       else
 462         tree[off] = (n->key[0] - ' ') | 0x80;
 463       assert ((tree[off] & 0x40) == 0);
 464       if (n->codepoint != NO_VALUE)
 465         tree[off] |= 0x40;
 466       off++;
 467       if (n->key_len > 1)
 468         {
 469           tree[off++] = n->key_idx & 0xff;
 470           tree[off++] = (n->key_idx >> 8) & 0xff;
 471         }
 472       if (n->codepoint != NO_VALUE)
 473         {
 474           assert (n->codepoint < (1L << 21));
 475           tree[off++] = n->codepoint & 0xff;
 476           tree[off++] = (n->codepoint >> 8) & 0xff;
 477           tree[off] = (n->codepoint >> 16) & 0xff;
 478           if (n->child)
 479             tree[off] |= 0x80;
 480           if (!n->sibling)
 481             tree[off] |= 0x40;
 482           off++;
 483         }
 484       if (n->child)
 485         {
 486           write_uleb128 (&tree[off], n->child_off);
 487           off += sizeof_uleb128 (n->child_off);
 488           write_nodes (n->child, off + n->child_off);
 489         }
 490       if (n->codepoint == NO_VALUE
 491           && n->sibling == NULL)
 492         tree[off++] = 0xff;
 493     }
 494   assert (off <= tree_size);
 495 }
 496
 497 static void
 498 build_radix_tree (void)
 499 {
 500   size_t i, j, k, key_idx;
 501
 502   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 503     if (generated_ranges[i].ok == INT_MAX)
 504       {
 505         if (generated_ranges[i].max_high - generated_ranges[i].high > 15UL)
 506           break;
 507       }
 508     else if (generated_ranges[i].ok == (generated_ranges[i].high
 509                                         - generated_ranges[i].low + 1))
 510       {
 511         if (generated_ranges[i].max_high != generated_ranges[i].high)
 512           break;
 513       }
 514     else
 515       break;
 516   if (i < ARRAY_SIZE (generated_ranges))
 517     fail ("uncovered generated range %s %lx %lx",
 518           generated_ranges[i].prefix, generated_ranges[i].low,
 519           generated_ranges[i].high);
 520   /* Sort entries alphabetically, node_add relies on that.  */
 521   qsort (entries, num_entries, sizeof (struct entry), entrycmp);
 522   for (i = 1; i < num_entries; ++i)
 523     if (i && strcmp (entries[i].name, entries[i - 1].name) == 0)
 524       fail ("multiple entries for name %s", entries[i].name);
 525
 526   for (i = 0; i < num_entries; ++i)
 527     node_add (&root, entries[i].name, strlen (entries[i].name),
 528               entries[i].codepoint);
 529
 530   nodes = (struct node **) xmalloc (num_nodes * sizeof (struct node *));
 531   i = num_nodes;
 532   num_nodes = 0;
 533   append_nodes (root);
 534   assert (num_nodes == i);
 535   /* Sort node pointers by decreasing string length to handle substrings
 536      right.  */
 537   qsort (nodes, num_nodes, sizeof (struct node *), nodecmp);
 538   if (nodes[0]->key_len >= 64)
 539     /* We could actually encode even 64 and 65, as key_len 0 and 1 will
 540        never appear in the multiple letter key encodings, so could subtract
 541        2.  */
 542     fail ("can't encode key length %d >= 64, so need to split some radix "
 543           "tree nodes to ensure length fits", nodes[0]->key_len);
 544
 545   /* Verify a property charset.cc UAX44-LM2 matching relies on:
 546      if - is at the end of key of some node, then all its siblings
 547      start with alphanumeric characters.
 548      Only 2 character names and 1 alias have - followed by space:
 549      U+0F0A TIBETAN MARK BKA- SHOG YIG MGO
 550      U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN
 551      U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN
 552      so the KA- in there will always be followed at least by SHOG
 553      in the same node.
 554      If this changes, charset.cc needs to change.  */
 555   for (i = 0; i < num_nodes; ++i)
 556     if (nodes[i]->key[nodes[i]->key_len - 1] == '-'
 557         && nodes[i]->child)
 558       {
 559         struct node *n;
 560
 561         for (n = nodes[i]->child; n; n = n->sibling)
 562           if (n->key[0] == ' ')
 563             fail ("node with key %.*s followed by node with key %.*s",
 564                   (int) nodes[i]->key_len, nodes[i]->key,
 565                   (int) n->key_len, n->key);
 566       }
 567
 568   /* This is expensive, O(num_nodes * num_nodes * nodes[0]->key_len), but
 569      fortunately num_nodes is < 64K and key_len < 64.  */
 570   key_idx = 0;
 571   for (i = 0; i < num_nodes; ++i)
 572     {
 573       nodes[i]->key_idx = SIZE_MAX;
 574       nodes[i]->in_dict = false;
 575       if (nodes[i]->key_len > 1)
 576         {
 577           for (j = 0; j < i; ++j)
 578             /* Can't rely on memmem unfortunately.  */
 579             if (nodes[j]->in_dict)
 580               {
 581                 for (k = 0; k <= nodes[j]->key_len - nodes[i]->key_len; ++k)
 582                   if (nodes[j]->key[k] == nodes[i]->key[0]
 583                       && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
 584                                  nodes[i]->key_len - 1) == 0)
 585                     {
 586                       nodes[i]->key_idx = nodes[j]->key_idx + k;
 587                       j = i;
 588                       break;
 589                     }
 590                 if (j == i)
 591                   break;
 592                 for (; k < nodes[j]->key_len; ++k)
 593                   if (nodes[j]->key[k] == nodes[i]->key[0]
 594                       && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
 595                                  nodes[j]->key_len - 1 - k) == 0)
 596                     {
 597                       size_t l;
 598
 599                       for (l = j + 1; l < i; ++l)
 600                         if (nodes[l]->in_dict)
 601                           break;
 602                       if (l < i
 603                           && memcmp (nodes[l]->key,
 604                                      nodes[i]->key + (nodes[j]->key_len - k),
 605                                      nodes[i]->key_len
 606                                      - (nodes[j]->key_len - k)) == 0)
 607                         {
 608                           nodes[i]->key_idx = nodes[j]->key_idx + k;
 609                           j = i;
 610                         }
 611                       else
 612                         j = l - 1;
 613                       break;
 614                     }
 615               }
 616           if (nodes[i]->key_idx == SIZE_MAX)
 617             {
 618               nodes[i]->key_idx = key_idx;
 619               nodes[i]->in_dict = true;
 620               key_idx += nodes[i]->key_len;
 621             }
 622         }
 623     }
 624   if (key_idx >= 65536)
 625     /* We only use 2 bytes for offsets into the dictionary.
 626        If it grows more, there is e.g. a possibility to replace
 627        most often seen words or substrings in the dictionary
 628        with characters other than [A-Z0-9 -] (say LETTER occurs
 629        in the dictionary almost 197 times and so by using a
 630        instead of LETTER we could save (6 - 1) * 197 bytes,
 631        with some on the side table mapping 'a' to "LETTER".  */
 632     fail ("too large dictionary %ld", (long) key_idx);
 633   dict_size = key_idx;
 634
 635   size_nodes (root);
 636
 637   dict = (char *) xmalloc (dict_size + 1);
 638   for (i = 0; i < num_nodes; ++i)
 639     if (nodes[i]->in_dict)
 640       memcpy (dict + nodes[i]->key_idx, nodes[i]->key, nodes[i]->key_len);
 641   dict[dict_size] = '\0';
 642
 643   tree = (unsigned char *) xmalloc (tree_size);
 644   memset (tree, 0, tree_size);
 645   write_nodes (root, 0);
 646 }
 647
 648 /* Print out the huge copyright notice.  */
 649
 650 static void
 651 write_copyright (void)
 652 {
 653   static const char copyright[] = "\
 654 /* Unicode name to codepoint.\n\
 655    Copyright (C) 2005-2023 Free Software Foundation, Inc.\n\
 656 \n\
 657    This program is free software; you can redistribute it and/or modify it\n\
 658    under the terms of the GNU General Public License as published by the\n\
 659    Free Software Foundation; either version 3, or (at your option) any\n\
 660    later version.\n\
 661 \n\
 662    This program is distributed in the hope that it will be useful,\n\
 663    but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
 664    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n\
 665    GNU General Public License for more details.\n\
 666 \n\
 667    You should have received a copy of the GNU General Public License\n\
 668    along with this program; see the file COPYING3.  If not see\n\
 669    <http://www.gnu.org/licenses/>.\n\
 670 \n\
 671 \n\
 672    Copyright (C) 1991-2022 Unicode, Inc.  All rights reserved.\n\
 673    Distributed under the Terms of Use in\n\
 674    http://www.unicode.org/copyright.html.\n\
 675 \n\
 676    Permission is hereby granted, free of charge, to any person\n\
 677    obtaining a copy of the Unicode data files and any associated\n\
 678    documentation (the \"Data Files\") or Unicode software and any\n\
 679    associated documentation (the \"Software\") to deal in the Data Files\n\
 680    or Software without restriction, including without limitation the\n\
 681    rights to use, copy, modify, merge, publish, distribute, and/or\n\
 682    sell copies of the Data Files or Software, and to permit persons to\n\
 683    whom the Data Files or Software are furnished to do so, provided\n\
 684    that (a) the above copyright notice(s) and this permission notice\n\
 685    appear with all copies of the Data Files or Software, (b) both the\n\
 686    above copyright notice(s) and this permission notice appear in\n\
 687    associated documentation, and (c) there is clear notice in each\n\
 688    modified Data File or in the Software as well as in the\n\
 689    documentation associated with the Data File(s) or Software that the\n\
 690    data or software has been modified.\n\
 691 \n\
 692    THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\
 693    OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\
 694    WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\
 695    NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\
 696    COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\
 697    ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\
 698    DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\
 699    WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\
 700    ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\
 701    OF THE DATA FILES OR SOFTWARE.\n\
 702 \n\
 703    Except as contained in this notice, the name of a copyright holder\n\
 704    shall not be used in advertising or otherwise to promote the sale,\n\
 705    use or other dealings in these Data Files or Software without prior\n\
 706    written authorization of the copyright holder.  */\n";
 707
 708    puts (copyright);
 709 }
 710
 711 static void
 712 write_dict (void)
 713 {
 714   size_t i;
 715
 716   printf ("static const char uname2c_dict[%ld] =\n", (long) (dict_size + 1));
 717   for (i = 0; i < dict_size; i += 77)
 718     printf ("\"%.77s\"%s\n", dict + i, i + 76 > dict_size ? ";" : "");
 719   puts ("");
 720 }
 721
 722 static void
 723 write_tree (void)
 724 {
 725   size_t i, j;
 726
 727   printf ("static const unsigned char uname2c_tree[%ld] = {\n",
 728           (long) tree_size);
 729   for (i = 0, j = 0; i < tree_size; ++i)
 730     {
 731       printf ("%s0x%02x%s", j == 0 ? "  " : "", tree[i],
 732               i == tree_size - 1 ? " };\n\n" : j == 11 ? ",\n" : ", ");
 733       if (j == 11)
 734         j = 0;
 735       else
 736         ++j;
 737     }
 738 }
 739
 740 static void
 741 write_generated (void)
 742 {
 743   size_t i, j;
 744
 745   puts ("static const cppchar_t uname2c_pairs[] = {");
 746   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 747     {
 748       if (i == 0)
 749         ;
 750       else if (generated_ranges[i - 1].idx != generated_ranges[i].idx)
 751         puts (", 0,");
 752       else
 753         puts (",");
 754       printf ("  0x%lx, 0x%lx /* %s */",
 755               generated_ranges[i].low,
 756               generated_ranges[i].high,
 757               generated_ranges[i].prefix);
 758     }
 759   puts (", 0 };\n");
 760
 761   puts ("static const unsigned char uname2c_generated[] = {");
 762   for (i = 0, j = -1; i < ARRAY_SIZE (generated_ranges); ++i)
 763     {
 764       if (i == 0 || generated_ranges[i - 1].idx != generated_ranges[i].idx)
 765         printf ("%s  %d /* %s */", i ? ",\n" : "",
 766                 ++j, generated_ranges[i].prefix);
 767       j += 2;
 768     }
 769   puts (" };\n");
 770 }
 771
 772 /* Main program.  */
 773
 774 int
 775 main (int argc, char **argv)
 776 {
 777   size_t i;
 778
 779   if (argc != 3)
 780     fail ("too few arguments to makeradixtree");
 781   for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
 782     if (!generated_ranges[i].max_high)
 783       generated_ranges[i].max_high = generated_ranges[i].high;
 784   read_table (argv[1], false);
 785   read_table (argv[2], true);
 786   build_radix_tree ();
 787
 788   write_copyright ();
 789   write_dict ();
 790   write_tree ();
 791   write_generated ();
 792   printf ("static const unsigned int uname2c_max_name_len = %ld;\n\n", max_entry_len);
 793   return 0;
 794 }