]> git.ipfire.org Git - thirdparty/gcc.git/blob - libcpp/makeuname2c.cc
updat_bb_profile_for_threading TLC
[thirdparty/gcc.git] / libcpp / makeuname2c.cc
1 /* Make uname2c.h from various sources.
2 Copyright (C) 2005-2023 Free Software Foundation, Inc.
3 Contributed by Jakub Jelinek <jakub@redhat.com>
4
5 This program is free software; you can redistribute it and/or modify it
6 under the terms of the GNU General Public License as published by the
7 Free Software Foundation; either version 3, or (at your option) any
8 later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; see the file COPYING3. If not see
17 <http://www.gnu.org/licenses/>. */
18
19 /* Run this program as
20 ./makeuname2c UnicodeData.txt NameAliases.txt > uname2c.h
21
22 This program generates 2 big arrays and 2 small ones.
23 The large ones are uname2c_dict, initialized by string literal
24 representing dictionary, and uname2c_tree, which is a space optimized
25 radix tree.
26 The format of the radix tree is:
27 byte 0 either 0x80 + (key[0] - ' ') (if key_len == 1)
28 or key_len (otherwise)
29 either of them ored with 0x40 if it has a codepoint
30 byte 1 LSB of offset into uname2c_dict for key (only if key_len > 1)
31 byte 2 MSB of offset into uname2c_dict for key (only if key_len > 1)
32 if key_len == 1, the above 2 bytes are omitted
33 byte 3 LSB of codepoint (only if it has a codepoint)
34 byte 4 middle byte of codepoint (ditto)
35 byte 5 MSB of codepoint (ditto), ored with 0x80 if node has children
36 ored with 0x40 if it doesn't have siblings
37 if it doesn't have a codepoint, the above 3 bytes are omitted
38 and we assume that the node has children
39 byte 6, 7, 8 uleb128 encoded offset to first child relative to the end
40 of the uleb128 (only if node has children)
41 byte 9 0xff (only if node doesn't have a codepoint and doesn't
42 have siblings)
43
44 For prefixes of Unicode NR1 or NR2 rule generated names, on a node
45 representing end of the prefix codepoint is 0xd800 + index into
46 uname2c_generated array with indexes into uname2c_pairs array of
47 code points (low, high) of the ranges terminated by single 0.
48 0xd800 is NR1 rule (Hangul syllables), rest are NR2 rules.
49 */
50
51 #include <assert.h>
52 #include <stdio.h>
53 #include <string.h>
54 #include <stdint.h>
55 #include <ctype.h>
56 #include <limits.h>
57 #include <stdarg.h>
58 #include <stdbool.h>
59 #include <stdlib.h>
60
61 #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
62
63 #define NUM_CODE_POINTS 0x110000
64 #define MAX_CODE_POINT 0x10ffff
65 #define NO_VALUE 0xdc00
66 #define GENERATED 0xd800
67
68 struct entry { const char *name; unsigned long codepoint; };
69 static struct entry *entries;
70 static unsigned long num_allocated, num_entries;
71
72 /* Unicode 15 Table 4-8. */
73 struct generated {
74 const char *prefix;
75 /* max_high is a workaround for UnicodeData.txt inconsistencies
76 on a few CJK UNIFIED IDEOGRAPH- ranges where the "*, Last>"
77 entry is a few code points above the end of the range. */
78 unsigned long low, high, max_high;
79 int idx, ok;
80 };
81 static struct generated generated_ranges[] =
82 { { "HANGUL SYLLABLE ", 0xac00, 0xd7a3, 0, 0, 0 }, /* NR1 rule */
83 { "CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4dbf, 0, 1, 0 }, /* NR2 rules */
84 { "CJK UNIFIED IDEOGRAPH-", 0x4e00, 0x9fff, 0, 1, 0 },
85 { "CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2a6df, 0, 1, 0 },
86 { "CJK UNIFIED IDEOGRAPH-", 0x2a700, 0x2b739, 0, 1, 0 },
87 { "CJK UNIFIED IDEOGRAPH-", 0x2b740, 0x2b81d, 0, 1, 0 },
88 { "CJK UNIFIED IDEOGRAPH-", 0x2b820, 0x2cea1, 0, 1, 0 },
89 { "CJK UNIFIED IDEOGRAPH-", 0x2ceb0, 0x2ebe0, 0, 1, 0 },
90 { "CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134a, 0, 1, 0 },
91 { "CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323af, 0, 1, 0 },
92 { "TANGUT IDEOGRAPH-", 0x17000, 0x187f7, 0, 2, 0 },
93 { "TANGUT IDEOGRAPH-", 0x18d00, 0x18d08, 0, 2, 0 },
94 { "KHITAN SMALL SCRIPT CHARACTER-", 0x18b00, 0x18cd5, 0, 3, 0 },
95 { "NUSHU CHARACTER-", 0x1b170, 0x1b2fb, 0, 4, 0 },
96 { "CJK COMPATIBILITY IDEOGRAPH-", 0xf900, 0xfa6d, 0, 5, 0 },
97 { "CJK COMPATIBILITY IDEOGRAPH-", 0xfa70, 0xfad9, 0, 5, 0 },
98 { "CJK COMPATIBILITY IDEOGRAPH-", 0x2f800, 0x2fa1d, 0, 5, 0 }
99 };
100
101 struct node {
102 struct node *sibling, *child;
103 const char *key;
104 size_t key_len, key_idx, node_size, size_sum, child_off;
105 unsigned long codepoint;
106 bool in_dict;
107 };
108 static struct node *root, **nodes;
109 static unsigned long num_nodes;
110 static size_t dict_size, tree_size, max_entry_len;
111 static char *dict;
112 static unsigned char *tree;
113
114 /* Die! */
115
116 static void
117 fail (const char *s, ...)
118 {
119 va_list ap;
120
121 va_start (ap, s);
122 vfprintf (stderr, s, ap);
123 va_end (ap);
124 fputc ('\n', stderr);
125 exit (1);
126 }
127
128 static void *
129 xmalloc (size_t size)
130 {
131 void *ret = malloc (size);
132
133 if (ret == NULL)
134 fail ("failed to allocate %ld bytes", (long) size);
135 return ret;
136 }
137
138 static void *
139 xrealloc (void *p, size_t size)
140 {
141 void *ret = p ? realloc (p, size) : malloc (size);
142
143 if (ret == NULL)
144 fail ("failed to allocate %ld bytes", (long) size);
145 return ret;
146 }
147
148 static int
149 entrycmp (const void *p1, const void *p2)
150 {
151 const struct entry *e1 = (const struct entry *) p1;
152 const struct entry *e2 = (const struct entry *) p2;
153 int ret = strcmp (e1->name, e2->name);
154
155 if (ret != 0)
156 return ret;
157 if (e1->codepoint < e2->codepoint)
158 return -1;
159 if (e1->codepoint > e2->codepoint)
160 return 1;
161 return 0;
162 }
163
164 static int
165 nodecmp (const void *p1, const void *p2)
166 {
167 const struct node *n1 = *(const struct node *const *) p1;
168 const struct node *n2 = *(const struct node *const *) p2;
169 if (n1->key_len > n2->key_len)
170 return -1;
171 if (n1->key_len < n2->key_len)
172 return 1;
173 return memcmp (n1->key, n2->key, n1->key_len);
174 }
175
176 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
177 decompositions of characters for which both the character
178 decomposed and all the code points in the decomposition are valid
179 for some supported language version, and the 'all_decomp' table to
180 be the decompositions of all characters without those
181 constraints. */
182
183 static void
184 read_table (char *fname, bool aliases_p)
185 {
186 FILE *f = fopen (fname, "r");
187 const char *sname = aliases_p ? "NameAliases.txt" : "UnicodeData.txt";
188
189 if (!f)
190 fail ("opening %s", sname);
191 for (;;)
192 {
193 char line[256];
194 unsigned long codepoint;
195 const char *name, *aname;
196 char *l;
197 size_t i;
198
199 if (!fgets (line, sizeof (line), f))
200 break;
201 codepoint = strtoul (line, &l, 16);
202 if (l == line && aliases_p)
203 {
204 /* NameAliased.txt can contain comments and empty lines. */
205 if (*line == '#' || *line == '\n')
206 continue;
207 }
208 if (l == line || *l != ';')
209 fail ("parsing %s, reading code point", sname);
210 if (codepoint > MAX_CODE_POINT)
211 fail ("parsing %s, code point too large", sname);
212
213 name = l + 1;
214 do {
215 ++l;
216 } while (*l != ';');
217
218 aname = NULL;
219 if (aliases_p)
220 {
221 /* Ignore figment and abbreviation aliases. */
222 if (strcmp (l + 1, "correction\n") != 0
223 && strcmp (l + 1, "control\n") != 0
224 && strcmp (l + 1, "alternate\n") != 0)
225 continue;
226 i = ARRAY_SIZE (generated_ranges);
227 }
228 else
229 {
230 for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
231 if (codepoint >= generated_ranges[i].low
232 && codepoint <= generated_ranges[i].max_high)
233 break;
234 if (i != ARRAY_SIZE (generated_ranges))
235 {
236 if (*name == '<' && l[-1] == '>')
237 {
238 if (codepoint == generated_ranges[i].low
239 && l - name >= 9
240 && memcmp (l - 8, ", First>", 8) == 0
241 && generated_ranges[i].ok == 0)
242 {
243 generated_ranges[i].ok = INT_MAX - 1;
244 aname = generated_ranges[i].prefix;
245 codepoint = GENERATED + generated_ranges[i].idx;
246 }
247 /* Unfortunately, UnicodeData.txt isn't consistent
248 with the Table 4-8 range endpoints in 3 cases,
249 the ranges are longer there by a few codepoints.
250 So use the max_high hack to avoid verification
251 failures. */
252 else if (codepoint == generated_ranges[i].max_high
253 && l - name >= 8
254 && memcmp (l - 7, ", Last>", 7) == 0
255 && generated_ranges[i].ok == INT_MAX - 1)
256 {
257 generated_ranges[i].ok = INT_MAX;
258 continue;
259 }
260 else
261 fail ("unexpected generated entry %lx %.*s",
262 codepoint, (int) (l - name), name);
263 }
264 else if (codepoint
265 == generated_ranges[i].low + generated_ranges[i].ok
266 && l - name == (strlen (generated_ranges[i].prefix)
267 + (name - 1 - line))
268 && memcmp (name, generated_ranges[i].prefix,
269 strlen (generated_ranges[i].prefix)) == 0
270 && memcmp (name + strlen (generated_ranges[i].prefix),
271 line, name - 1 - line) == 0)
272 {
273 ++generated_ranges[i].ok;
274 if (codepoint != generated_ranges[i].low)
275 continue;
276 aname = generated_ranges[i].prefix;
277 codepoint = GENERATED + generated_ranges[i].idx;
278 }
279 else
280 fail ("unexpected generated entry %lx %.*s",
281 codepoint, (int) (l - name), name);
282 if (aname == generated_ranges[i].prefix)
283 {
284 size_t j;
285
286 /* Don't add an entry for a generated range where the
287 same prefix has been added already. */
288 for (j = 0; j < i; ++j)
289 if (generated_ranges[j].idx == generated_ranges[i].idx
290 && generated_ranges[j].ok != 0)
291 break;
292 if (j < i)
293 continue;
294 }
295 }
296 else if (*name == '<' && l[-1] == '>')
297 continue;
298 }
299
300 if (num_entries == num_allocated)
301 {
302 num_allocated = num_allocated ? 2 * num_allocated : 65536;
303 entries = (struct entry *) xrealloc (entries, num_allocated
304 * sizeof (entries[0]));
305 }
306
307 if (aname == NULL)
308 {
309 char *a = (char *) xmalloc (l + 1 - name);
310 if (l - name > max_entry_len)
311 max_entry_len = l - name;
312 memcpy (a, name, l - name);
313 a[l - name] = '\0';
314 aname = a;
315 }
316 entries[num_entries].name = aname;
317 entries[num_entries++].codepoint = codepoint;
318 }
319 if (ferror (f))
320 fail ("reading %s", sname);
321 fclose (f);
322 }
323
324 /* Assumes nodes are added from sorted array, so we never
325 add any node before existing one, only after it. */
326
327 static void
328 node_add (struct node **p, const char *key, size_t key_len,
329 unsigned long codepoint)
330 {
331 struct node *n;
332 size_t i;
333
334 do
335 {
336 if (*p == NULL)
337 {
338 *p = n = (struct node *) xmalloc (sizeof (struct node));
339 ++num_nodes;
340 assert (key_len);
341 n->sibling = NULL;
342 n->child = NULL;
343 n->key = key;
344 n->key_len = key_len;
345 n->codepoint = codepoint;
346 return;
347 }
348 n = *p;
349 for (i = 0; i < n->key_len && i < key_len; ++i)
350 if (n->key[i] != key[i])
351 break;
352 if (i == 0)
353 {
354 p = &n->sibling;
355 continue;
356 }
357 if (i == n->key_len)
358 {
359 assert (key_len > n->key_len);
360 p = &n->child;
361 key += n->key_len;
362 key_len -= n->key_len;
363 continue;
364 }
365 /* Need to split the node. */
366 assert (i < key_len);
367 n = (struct node *) xmalloc (sizeof (struct node));
368 ++num_nodes;
369 n->sibling = NULL;
370 n->child = (*p)->child;
371 n->key = (*p)->key + i;
372 n->key_len = (*p)->key_len - i;
373 n->codepoint = (*p)->codepoint;
374 (*p)->child = n;
375 (*p)->key_len = i;
376 (*p)->codepoint = NO_VALUE;
377 key += i;
378 key_len -= i;
379 p = &n->sibling;
380 }
381 while (1);
382 }
383
384 static void
385 append_nodes (struct node *n)
386 {
387 for (; n; n = n->sibling)
388 {
389 nodes[num_nodes++] = n;
390 append_nodes (n->child);
391 }
392 }
393
394 static size_t
395 sizeof_uleb128 (size_t val)
396 {
397 size_t sz = 0;
398 do
399 {
400 val >>= 7;
401 sz += 1;
402 }
403 while (val != 0);
404 return sz;
405 }
406
407 static void
408 size_nodes (struct node *n)
409 {
410 if (n->child)
411 size_nodes (n->child);
412 if (n->sibling)
413 size_nodes (n->sibling);
414 n->node_size = 1 + (n->key_len > 1) * 2;
415 if (n->codepoint != NO_VALUE)
416 n->node_size += 3;
417 else if (n->sibling == NULL)
418 ++n->node_size;
419 n->size_sum = 0;
420 n->child_off = 0;
421 if (n->sibling)
422 n->size_sum += n->sibling->size_sum;
423 if (n->child)
424 {
425 n->child_off = n->size_sum + (n->codepoint == NO_VALUE
426 && n->sibling == NULL);
427 n->node_size += sizeof_uleb128 (n->child_off);
428 }
429 n->size_sum += n->node_size;
430 if (n->child)
431 n->size_sum += n->child->size_sum;
432 tree_size += n->node_size;
433 }
434
435 static void
436 write_uleb128 (unsigned char *p, size_t val)
437 {
438 unsigned char c;
439 do
440 {
441 c = val & 0x7f;
442 val >>= 7;
443 if (val)
444 c |= 0x80;
445 *p++ = c;
446 }
447 while (val);
448 }
449
450 static void
451 write_nodes (struct node *n, size_t off)
452 {
453 for (; n; n = n->sibling)
454 {
455 assert (off < tree_size && tree[off] == 0);
456 if (n->key_len > 1)
457 {
458 assert (n->key_len < 64);
459 tree[off] = n->key_len;
460 }
461 else
462 tree[off] = (n->key[0] - ' ') | 0x80;
463 assert ((tree[off] & 0x40) == 0);
464 if (n->codepoint != NO_VALUE)
465 tree[off] |= 0x40;
466 off++;
467 if (n->key_len > 1)
468 {
469 tree[off++] = n->key_idx & 0xff;
470 tree[off++] = (n->key_idx >> 8) & 0xff;
471 }
472 if (n->codepoint != NO_VALUE)
473 {
474 assert (n->codepoint < (1L << 21));
475 tree[off++] = n->codepoint & 0xff;
476 tree[off++] = (n->codepoint >> 8) & 0xff;
477 tree[off] = (n->codepoint >> 16) & 0xff;
478 if (n->child)
479 tree[off] |= 0x80;
480 if (!n->sibling)
481 tree[off] |= 0x40;
482 off++;
483 }
484 if (n->child)
485 {
486 write_uleb128 (&tree[off], n->child_off);
487 off += sizeof_uleb128 (n->child_off);
488 write_nodes (n->child, off + n->child_off);
489 }
490 if (n->codepoint == NO_VALUE
491 && n->sibling == NULL)
492 tree[off++] = 0xff;
493 }
494 assert (off <= tree_size);
495 }
496
497 static void
498 build_radix_tree (void)
499 {
500 size_t i, j, k, key_idx;
501
502 for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
503 if (generated_ranges[i].ok == INT_MAX)
504 {
505 if (generated_ranges[i].max_high - generated_ranges[i].high > 15UL)
506 break;
507 }
508 else if (generated_ranges[i].ok == (generated_ranges[i].high
509 - generated_ranges[i].low + 1))
510 {
511 if (generated_ranges[i].max_high != generated_ranges[i].high)
512 break;
513 }
514 else
515 break;
516 if (i < ARRAY_SIZE (generated_ranges))
517 fail ("uncovered generated range %s %lx %lx",
518 generated_ranges[i].prefix, generated_ranges[i].low,
519 generated_ranges[i].high);
520 /* Sort entries alphabetically, node_add relies on that. */
521 qsort (entries, num_entries, sizeof (struct entry), entrycmp);
522 for (i = 1; i < num_entries; ++i)
523 if (i && strcmp (entries[i].name, entries[i - 1].name) == 0)
524 fail ("multiple entries for name %s", entries[i].name);
525
526 for (i = 0; i < num_entries; ++i)
527 node_add (&root, entries[i].name, strlen (entries[i].name),
528 entries[i].codepoint);
529
530 nodes = (struct node **) xmalloc (num_nodes * sizeof (struct node *));
531 i = num_nodes;
532 num_nodes = 0;
533 append_nodes (root);
534 assert (num_nodes == i);
535 /* Sort node pointers by decreasing string length to handle substrings
536 right. */
537 qsort (nodes, num_nodes, sizeof (struct node *), nodecmp);
538 if (nodes[0]->key_len >= 64)
539 /* We could actually encode even 64 and 65, as key_len 0 and 1 will
540 never appear in the multiple letter key encodings, so could subtract
541 2. */
542 fail ("can't encode key length %d >= 64, so need to split some radix "
543 "tree nodes to ensure length fits", nodes[0]->key_len);
544
545 /* Verify a property charset.cc UAX44-LM2 matching relies on:
546 if - is at the end of key of some node, then all its siblings
547 start with alphanumeric characters.
548 Only 2 character names and 1 alias have - followed by space:
549 U+0F0A TIBETAN MARK BKA- SHOG YIG MGO
550 U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN
551 U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN
552 so the KA- in there will always be followed at least by SHOG
553 in the same node.
554 If this changes, charset.cc needs to change. */
555 for (i = 0; i < num_nodes; ++i)
556 if (nodes[i]->key[nodes[i]->key_len - 1] == '-'
557 && nodes[i]->child)
558 {
559 struct node *n;
560
561 for (n = nodes[i]->child; n; n = n->sibling)
562 if (n->key[0] == ' ')
563 fail ("node with key %.*s followed by node with key %.*s",
564 (int) nodes[i]->key_len, nodes[i]->key,
565 (int) n->key_len, n->key);
566 }
567
568 /* This is expensive, O(num_nodes * num_nodes * nodes[0]->key_len), but
569 fortunately num_nodes is < 64K and key_len < 64. */
570 key_idx = 0;
571 for (i = 0; i < num_nodes; ++i)
572 {
573 nodes[i]->key_idx = SIZE_MAX;
574 nodes[i]->in_dict = false;
575 if (nodes[i]->key_len > 1)
576 {
577 for (j = 0; j < i; ++j)
578 /* Can't rely on memmem unfortunately. */
579 if (nodes[j]->in_dict)
580 {
581 for (k = 0; k <= nodes[j]->key_len - nodes[i]->key_len; ++k)
582 if (nodes[j]->key[k] == nodes[i]->key[0]
583 && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
584 nodes[i]->key_len - 1) == 0)
585 {
586 nodes[i]->key_idx = nodes[j]->key_idx + k;
587 j = i;
588 break;
589 }
590 if (j == i)
591 break;
592 for (; k < nodes[j]->key_len; ++k)
593 if (nodes[j]->key[k] == nodes[i]->key[0]
594 && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
595 nodes[j]->key_len - 1 - k) == 0)
596 {
597 size_t l;
598
599 for (l = j + 1; l < i; ++l)
600 if (nodes[l]->in_dict)
601 break;
602 if (l < i
603 && memcmp (nodes[l]->key,
604 nodes[i]->key + (nodes[j]->key_len - k),
605 nodes[i]->key_len
606 - (nodes[j]->key_len - k)) == 0)
607 {
608 nodes[i]->key_idx = nodes[j]->key_idx + k;
609 j = i;
610 }
611 else
612 j = l - 1;
613 break;
614 }
615 }
616 if (nodes[i]->key_idx == SIZE_MAX)
617 {
618 nodes[i]->key_idx = key_idx;
619 nodes[i]->in_dict = true;
620 key_idx += nodes[i]->key_len;
621 }
622 }
623 }
624 if (key_idx >= 65536)
625 /* We only use 2 bytes for offsets into the dictionary.
626 If it grows more, there is e.g. a possibility to replace
627 most often seen words or substrings in the dictionary
628 with characters other than [A-Z0-9 -] (say LETTER occurs
629 in the dictionary almost 197 times and so by using a
630 instead of LETTER we could save (6 - 1) * 197 bytes,
631 with some on the side table mapping 'a' to "LETTER". */
632 fail ("too large dictionary %ld", (long) key_idx);
633 dict_size = key_idx;
634
635 size_nodes (root);
636
637 dict = (char *) xmalloc (dict_size + 1);
638 for (i = 0; i < num_nodes; ++i)
639 if (nodes[i]->in_dict)
640 memcpy (dict + nodes[i]->key_idx, nodes[i]->key, nodes[i]->key_len);
641 dict[dict_size] = '\0';
642
643 tree = (unsigned char *) xmalloc (tree_size);
644 memset (tree, 0, tree_size);
645 write_nodes (root, 0);
646 }
647
648 /* Print out the huge copyright notice. */
649
650 static void
651 write_copyright (void)
652 {
653 static const char copyright[] = "\
654 /* Unicode name to codepoint.\n\
655 Copyright (C) 2005-2023 Free Software Foundation, Inc.\n\
656 \n\
657 This program is free software; you can redistribute it and/or modify it\n\
658 under the terms of the GNU General Public License as published by the\n\
659 Free Software Foundation; either version 3, or (at your option) any\n\
660 later version.\n\
661 \n\
662 This program is distributed in the hope that it will be useful,\n\
663 but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
664 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\
665 GNU General Public License for more details.\n\
666 \n\
667 You should have received a copy of the GNU General Public License\n\
668 along with this program; see the file COPYING3. If not see\n\
669 <http://www.gnu.org/licenses/>.\n\
670 \n\
671 \n\
672 Copyright (C) 1991-2022 Unicode, Inc. All rights reserved.\n\
673 Distributed under the Terms of Use in\n\
674 http://www.unicode.org/copyright.html.\n\
675 \n\
676 Permission is hereby granted, free of charge, to any person\n\
677 obtaining a copy of the Unicode data files and any associated\n\
678 documentation (the \"Data Files\") or Unicode software and any\n\
679 associated documentation (the \"Software\") to deal in the Data Files\n\
680 or Software without restriction, including without limitation the\n\
681 rights to use, copy, modify, merge, publish, distribute, and/or\n\
682 sell copies of the Data Files or Software, and to permit persons to\n\
683 whom the Data Files or Software are furnished to do so, provided\n\
684 that (a) the above copyright notice(s) and this permission notice\n\
685 appear with all copies of the Data Files or Software, (b) both the\n\
686 above copyright notice(s) and this permission notice appear in\n\
687 associated documentation, and (c) there is clear notice in each\n\
688 modified Data File or in the Software as well as in the\n\
689 documentation associated with the Data File(s) or Software that the\n\
690 data or software has been modified.\n\
691 \n\
692 THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\
693 OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\
694 WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\
695 NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\
696 COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\
697 ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\
698 DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\
699 WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\
700 ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\
701 OF THE DATA FILES OR SOFTWARE.\n\
702 \n\
703 Except as contained in this notice, the name of a copyright holder\n\
704 shall not be used in advertising or otherwise to promote the sale,\n\
705 use or other dealings in these Data Files or Software without prior\n\
706 written authorization of the copyright holder. */\n";
707
708 puts (copyright);
709 }
710
711 static void
712 write_dict (void)
713 {
714 size_t i;
715
716 printf ("static const char uname2c_dict[%ld] =\n", (long) (dict_size + 1));
717 for (i = 0; i < dict_size; i += 77)
718 printf ("\"%.77s\"%s\n", dict + i, i + 76 > dict_size ? ";" : "");
719 puts ("");
720 }
721
722 static void
723 write_tree (void)
724 {
725 size_t i, j;
726
727 printf ("static const unsigned char uname2c_tree[%ld] = {\n",
728 (long) tree_size);
729 for (i = 0, j = 0; i < tree_size; ++i)
730 {
731 printf ("%s0x%02x%s", j == 0 ? " " : "", tree[i],
732 i == tree_size - 1 ? " };\n\n" : j == 11 ? ",\n" : ", ");
733 if (j == 11)
734 j = 0;
735 else
736 ++j;
737 }
738 }
739
740 static void
741 write_generated (void)
742 {
743 size_t i, j;
744
745 puts ("static const cppchar_t uname2c_pairs[] = {");
746 for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
747 {
748 if (i == 0)
749 ;
750 else if (generated_ranges[i - 1].idx != generated_ranges[i].idx)
751 puts (", 0,");
752 else
753 puts (",");
754 printf (" 0x%lx, 0x%lx /* %s */",
755 generated_ranges[i].low,
756 generated_ranges[i].high,
757 generated_ranges[i].prefix);
758 }
759 puts (", 0 };\n");
760
761 puts ("static const unsigned char uname2c_generated[] = {");
762 for (i = 0, j = -1; i < ARRAY_SIZE (generated_ranges); ++i)
763 {
764 if (i == 0 || generated_ranges[i - 1].idx != generated_ranges[i].idx)
765 printf ("%s %d /* %s */", i ? ",\n" : "",
766 ++j, generated_ranges[i].prefix);
767 j += 2;
768 }
769 puts (" };\n");
770 }
771
772 /* Main program. */
773
774 int
775 main (int argc, char **argv)
776 {
777 size_t i;
778
779 if (argc != 3)
780 fail ("too few arguments to makeradixtree");
781 for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
782 if (!generated_ranges[i].max_high)
783 generated_ranges[i].max_high = generated_ranges[i].high;
784 read_table (argv[1], false);
785 read_table (argv[2], true);
786 build_radix_tree ();
787
788 write_copyright ();
789 write_dict ();
790 write_tree ();
791 write_generated ();
792 printf ("static const unsigned int uname2c_max_name_len = %ld;\n\n", max_entry_len);
793 return 0;
794 }