]>
Commit | Line | Data |
---|---|---|
eb4879ab | 1 | /* Make uname2c.h from various sources. |
a945c346 | 2 | Copyright (C) 2005-2024 Free Software Foundation, Inc. |
eb4879ab JJ |
3 | Contributed by Jakub Jelinek <jakub@redhat.com> |
4 | ||
5 | This program is free software; you can redistribute it and/or modify it | |
6 | under the terms of the GNU General Public License as published by the | |
7 | Free Software Foundation; either version 3, or (at your option) any | |
8 | later version. | |
9 | ||
10 | This program is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU General Public License | |
16 | along with this program; see the file COPYING3. If not see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
19 | /* Run this program as | |
20 | ./makeuname2c UnicodeData.txt NameAliases.txt > uname2c.h | |
21 | ||
22 | This program generates 2 big arrays and 2 small ones. | |
23 | The large ones are uname2c_dict, initialized by string literal | |
24 | representing dictionary, and uname2c_tree, which is a space optimized | |
25 | radix tree. | |
26 | The format of the radix tree is: | |
27 | byte 0 either 0x80 + (key[0] - ' ') (if key_len == 1) | |
28 | or key_len (otherwise) | |
29 | either of them ored with 0x40 if it has a codepoint | |
30 | byte 1 LSB of offset into uname2c_dict for key (only if key_len > 1) | |
31 | byte 2 MSB of offset into uname2c_dict for key (only if key_len > 1) | |
32 | if key_len == 1, the above 2 bytes are omitted | |
33 | byte 3 LSB of codepoint (only if it has a codepoint) | |
34 | byte 4 middle byte of codepoint (ditto) | |
35 | byte 5 MSB of codepoint (ditto), ored with 0x80 if node has children | |
36 | ored with 0x40 if it doesn't have siblings | |
37 | if it doesn't have a codepoint, the above 3 bytes are omitted | |
38 | and we assume that the node has children | |
39 | byte 6, 7, 8 uleb128 encoded offset to first child relative to the end | |
40 | of the uleb128 (only if node has children) | |
41 | byte 9 0xff (only if node doesn't have a codepoint and doesn't | |
42 | have siblings) | |
43 | ||
44 | For prefixes of Unicode NR1 or NR2 rule generated names, on a node | |
45 | representing end of the prefix codepoint is 0xd800 + index into | |
46 | uname2c_generated array with indexes into uname2c_pairs array of | |
47 | code points (low, high) of the ranges terminated by single 0. | |
48 | 0xd800 is NR1 rule (Hangul syllables), rest are NR2 rules. | |
49 | */ | |
50 | ||
51 | #include <assert.h> | |
52 | #include <stdio.h> | |
53 | #include <string.h> | |
54 | #include <stdint.h> | |
55 | #include <ctype.h> | |
56 | #include <limits.h> | |
57 | #include <stdarg.h> | |
58 | #include <stdbool.h> | |
59 | #include <stdlib.h> | |
60 | ||
61 | #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0])) | |
62 | ||
63 | #define NUM_CODE_POINTS 0x110000 | |
64 | #define MAX_CODE_POINT 0x10ffff | |
65 | #define NO_VALUE 0xdc00 | |
66 | #define GENERATED 0xd800 | |
67 | ||
68 | struct entry { const char *name; unsigned long codepoint; }; | |
69 | static struct entry *entries; | |
70 | static unsigned long num_allocated, num_entries; | |
71 | ||
d64b7c82 | 72 | /* Unicode 15.1 Table 4-8. */ |
eb4879ab JJ |
73 | struct generated { |
74 | const char *prefix; | |
75 | /* max_high is a workaround for UnicodeData.txt inconsistencies | |
76 | on a few CJK UNIFIED IDEOGRAPH- ranges where the "*, Last>" | |
77 | entry is a few code points above the end of the range. */ | |
78 | unsigned long low, high, max_high; | |
79 | int idx, ok; | |
80 | }; | |
81 | static struct generated generated_ranges[] = | |
82 | { { "HANGUL SYLLABLE ", 0xac00, 0xd7a3, 0, 0, 0 }, /* NR1 rule */ | |
83 | { "CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4dbf, 0, 1, 0 }, /* NR2 rules */ | |
2662d537 JJ |
84 | { "CJK UNIFIED IDEOGRAPH-", 0x4e00, 0x9fff, 0, 1, 0 }, |
85 | { "CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2a6df, 0, 1, 0 }, | |
86 | { "CJK UNIFIED IDEOGRAPH-", 0x2a700, 0x2b739, 0, 1, 0 }, | |
eb4879ab JJ |
87 | { "CJK UNIFIED IDEOGRAPH-", 0x2b740, 0x2b81d, 0, 1, 0 }, |
88 | { "CJK UNIFIED IDEOGRAPH-", 0x2b820, 0x2cea1, 0, 1, 0 }, | |
89 | { "CJK UNIFIED IDEOGRAPH-", 0x2ceb0, 0x2ebe0, 0, 1, 0 }, | |
d64b7c82 | 90 | { "CJK UNIFIED IDEOGRAPH-", 0x2ebf0, 0x2ee5d, 0, 1, 0 }, |
eb4879ab | 91 | { "CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134a, 0, 1, 0 }, |
2662d537 | 92 | { "CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323af, 0, 1, 0 }, |
eb4879ab JJ |
93 | { "TANGUT IDEOGRAPH-", 0x17000, 0x187f7, 0, 2, 0 }, |
94 | { "TANGUT IDEOGRAPH-", 0x18d00, 0x18d08, 0, 2, 0 }, | |
95 | { "KHITAN SMALL SCRIPT CHARACTER-", 0x18b00, 0x18cd5, 0, 3, 0 }, | |
96 | { "NUSHU CHARACTER-", 0x1b170, 0x1b2fb, 0, 4, 0 }, | |
97 | { "CJK COMPATIBILITY IDEOGRAPH-", 0xf900, 0xfa6d, 0, 5, 0 }, | |
98 | { "CJK COMPATIBILITY IDEOGRAPH-", 0xfa70, 0xfad9, 0, 5, 0 }, | |
99 | { "CJK COMPATIBILITY IDEOGRAPH-", 0x2f800, 0x2fa1d, 0, 5, 0 } | |
100 | }; | |
101 | ||
102 | struct node { | |
103 | struct node *sibling, *child; | |
104 | const char *key; | |
105 | size_t key_len, key_idx, node_size, size_sum, child_off; | |
106 | unsigned long codepoint; | |
107 | bool in_dict; | |
108 | }; | |
109 | static struct node *root, **nodes; | |
110 | static unsigned long num_nodes; | |
111 | static size_t dict_size, tree_size, max_entry_len; | |
112 | static char *dict; | |
113 | static unsigned char *tree; | |
114 | ||
115 | /* Die! */ | |
116 | ||
117 | static void | |
118 | fail (const char *s, ...) | |
119 | { | |
120 | va_list ap; | |
121 | ||
122 | va_start (ap, s); | |
123 | vfprintf (stderr, s, ap); | |
124 | va_end (ap); | |
125 | fputc ('\n', stderr); | |
126 | exit (1); | |
127 | } | |
128 | ||
129 | static void * | |
130 | xmalloc (size_t size) | |
131 | { | |
132 | void *ret = malloc (size); | |
133 | ||
134 | if (ret == NULL) | |
135 | fail ("failed to allocate %ld bytes", (long) size); | |
136 | return ret; | |
137 | } | |
138 | ||
139 | static void * | |
140 | xrealloc (void *p, size_t size) | |
141 | { | |
142 | void *ret = p ? realloc (p, size) : malloc (size); | |
143 | ||
144 | if (ret == NULL) | |
145 | fail ("failed to allocate %ld bytes", (long) size); | |
146 | return ret; | |
147 | } | |
148 | ||
149 | static int | |
150 | entrycmp (const void *p1, const void *p2) | |
151 | { | |
152 | const struct entry *e1 = (const struct entry *) p1; | |
153 | const struct entry *e2 = (const struct entry *) p2; | |
154 | int ret = strcmp (e1->name, e2->name); | |
155 | ||
156 | if (ret != 0) | |
157 | return ret; | |
158 | if (e1->codepoint < e2->codepoint) | |
159 | return -1; | |
160 | if (e1->codepoint > e2->codepoint) | |
161 | return 1; | |
162 | return 0; | |
163 | } | |
164 | ||
165 | static int | |
166 | nodecmp (const void *p1, const void *p2) | |
167 | { | |
168 | const struct node *n1 = *(const struct node *const *) p1; | |
169 | const struct node *n2 = *(const struct node *const *) p2; | |
170 | if (n1->key_len > n2->key_len) | |
171 | return -1; | |
172 | if (n1->key_len < n2->key_len) | |
173 | return 1; | |
174 | return memcmp (n1->key, n2->key, n1->key_len); | |
175 | } | |
176 | ||
177 | /* Read UnicodeData.txt and fill in the 'decomp' table to be the | |
178 | decompositions of characters for which both the character | |
179 | decomposed and all the code points in the decomposition are valid | |
180 | for some supported language version, and the 'all_decomp' table to | |
181 | be the decompositions of all characters without those | |
182 | constraints. */ | |
183 | ||
184 | static void | |
185 | read_table (char *fname, bool aliases_p) | |
186 | { | |
187 | FILE *f = fopen (fname, "r"); | |
188 | const char *sname = aliases_p ? "NameAliases.txt" : "UnicodeData.txt"; | |
189 | ||
190 | if (!f) | |
191 | fail ("opening %s", sname); | |
192 | for (;;) | |
193 | { | |
194 | char line[256]; | |
195 | unsigned long codepoint; | |
196 | const char *name, *aname; | |
197 | char *l; | |
198 | size_t i; | |
199 | ||
200 | if (!fgets (line, sizeof (line), f)) | |
201 | break; | |
202 | codepoint = strtoul (line, &l, 16); | |
203 | if (l == line && aliases_p) | |
204 | { | |
205 | /* NameAliased.txt can contain comments and empty lines. */ | |
206 | if (*line == '#' || *line == '\n') | |
207 | continue; | |
208 | } | |
209 | if (l == line || *l != ';') | |
210 | fail ("parsing %s, reading code point", sname); | |
211 | if (codepoint > MAX_CODE_POINT) | |
212 | fail ("parsing %s, code point too large", sname); | |
213 | ||
214 | name = l + 1; | |
215 | do { | |
216 | ++l; | |
217 | } while (*l != ';'); | |
218 | ||
219 | aname = NULL; | |
220 | if (aliases_p) | |
221 | { | |
222 | /* Ignore figment and abbreviation aliases. */ | |
223 | if (strcmp (l + 1, "correction\n") != 0 | |
224 | && strcmp (l + 1, "control\n") != 0 | |
225 | && strcmp (l + 1, "alternate\n") != 0) | |
226 | continue; | |
227 | i = ARRAY_SIZE (generated_ranges); | |
228 | } | |
229 | else | |
230 | { | |
231 | for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) | |
232 | if (codepoint >= generated_ranges[i].low | |
233 | && codepoint <= generated_ranges[i].max_high) | |
234 | break; | |
235 | if (i != ARRAY_SIZE (generated_ranges)) | |
236 | { | |
237 | if (*name == '<' && l[-1] == '>') | |
238 | { | |
239 | if (codepoint == generated_ranges[i].low | |
240 | && l - name >= 9 | |
241 | && memcmp (l - 8, ", First>", 8) == 0 | |
242 | && generated_ranges[i].ok == 0) | |
243 | { | |
244 | generated_ranges[i].ok = INT_MAX - 1; | |
245 | aname = generated_ranges[i].prefix; | |
246 | codepoint = GENERATED + generated_ranges[i].idx; | |
247 | } | |
248 | /* Unfortunately, UnicodeData.txt isn't consistent | |
249 | with the Table 4-8 range endpoints in 3 cases, | |
250 | the ranges are longer there by a few codepoints. | |
251 | So use the max_high hack to avoid verification | |
252 | failures. */ | |
253 | else if (codepoint == generated_ranges[i].max_high | |
254 | && l - name >= 8 | |
255 | && memcmp (l - 7, ", Last>", 7) == 0 | |
256 | && generated_ranges[i].ok == INT_MAX - 1) | |
257 | { | |
258 | generated_ranges[i].ok = INT_MAX; | |
259 | continue; | |
260 | } | |
261 | else | |
262 | fail ("unexpected generated entry %lx %.*s", | |
263 | codepoint, (int) (l - name), name); | |
264 | } | |
265 | else if (codepoint | |
266 | == generated_ranges[i].low + generated_ranges[i].ok | |
267 | && l - name == (strlen (generated_ranges[i].prefix) | |
268 | + (name - 1 - line)) | |
269 | && memcmp (name, generated_ranges[i].prefix, | |
270 | strlen (generated_ranges[i].prefix)) == 0 | |
271 | && memcmp (name + strlen (generated_ranges[i].prefix), | |
272 | line, name - 1 - line) == 0) | |
273 | { | |
274 | ++generated_ranges[i].ok; | |
275 | if (codepoint != generated_ranges[i].low) | |
276 | continue; | |
277 | aname = generated_ranges[i].prefix; | |
278 | codepoint = GENERATED + generated_ranges[i].idx; | |
279 | } | |
280 | else | |
281 | fail ("unexpected generated entry %lx %.*s", | |
282 | codepoint, (int) (l - name), name); | |
283 | if (aname == generated_ranges[i].prefix) | |
284 | { | |
285 | size_t j; | |
286 | ||
287 | /* Don't add an entry for a generated range where the | |
288 | same prefix has been added already. */ | |
289 | for (j = 0; j < i; ++j) | |
290 | if (generated_ranges[j].idx == generated_ranges[i].idx | |
291 | && generated_ranges[j].ok != 0) | |
292 | break; | |
293 | if (j < i) | |
294 | continue; | |
295 | } | |
296 | } | |
297 | else if (*name == '<' && l[-1] == '>') | |
298 | continue; | |
299 | } | |
300 | ||
301 | if (num_entries == num_allocated) | |
302 | { | |
303 | num_allocated = num_allocated ? 2 * num_allocated : 65536; | |
304 | entries = (struct entry *) xrealloc (entries, num_allocated | |
305 | * sizeof (entries[0])); | |
306 | } | |
307 | ||
308 | if (aname == NULL) | |
309 | { | |
310 | char *a = (char *) xmalloc (l + 1 - name); | |
311 | if (l - name > max_entry_len) | |
312 | max_entry_len = l - name; | |
313 | memcpy (a, name, l - name); | |
314 | a[l - name] = '\0'; | |
315 | aname = a; | |
316 | } | |
317 | entries[num_entries].name = aname; | |
318 | entries[num_entries++].codepoint = codepoint; | |
319 | } | |
320 | if (ferror (f)) | |
321 | fail ("reading %s", sname); | |
322 | fclose (f); | |
323 | } | |
324 | ||
325 | /* Assumes nodes are added from sorted array, so we never | |
326 | add any node before existing one, only after it. */ | |
327 | ||
328 | static void | |
329 | node_add (struct node **p, const char *key, size_t key_len, | |
330 | unsigned long codepoint) | |
331 | { | |
332 | struct node *n; | |
333 | size_t i; | |
334 | ||
335 | do | |
336 | { | |
337 | if (*p == NULL) | |
338 | { | |
339 | *p = n = (struct node *) xmalloc (sizeof (struct node)); | |
340 | ++num_nodes; | |
341 | assert (key_len); | |
342 | n->sibling = NULL; | |
343 | n->child = NULL; | |
344 | n->key = key; | |
345 | n->key_len = key_len; | |
346 | n->codepoint = codepoint; | |
347 | return; | |
348 | } | |
349 | n = *p; | |
350 | for (i = 0; i < n->key_len && i < key_len; ++i) | |
351 | if (n->key[i] != key[i]) | |
352 | break; | |
353 | if (i == 0) | |
354 | { | |
355 | p = &n->sibling; | |
356 | continue; | |
357 | } | |
358 | if (i == n->key_len) | |
359 | { | |
360 | assert (key_len > n->key_len); | |
361 | p = &n->child; | |
362 | key += n->key_len; | |
363 | key_len -= n->key_len; | |
364 | continue; | |
365 | } | |
366 | /* Need to split the node. */ | |
367 | assert (i < key_len); | |
368 | n = (struct node *) xmalloc (sizeof (struct node)); | |
369 | ++num_nodes; | |
370 | n->sibling = NULL; | |
371 | n->child = (*p)->child; | |
372 | n->key = (*p)->key + i; | |
373 | n->key_len = (*p)->key_len - i; | |
374 | n->codepoint = (*p)->codepoint; | |
375 | (*p)->child = n; | |
376 | (*p)->key_len = i; | |
377 | (*p)->codepoint = NO_VALUE; | |
378 | key += i; | |
379 | key_len -= i; | |
380 | p = &n->sibling; | |
381 | } | |
382 | while (1); | |
383 | } | |
384 | ||
385 | static void | |
386 | append_nodes (struct node *n) | |
387 | { | |
388 | for (; n; n = n->sibling) | |
389 | { | |
390 | nodes[num_nodes++] = n; | |
391 | append_nodes (n->child); | |
392 | } | |
393 | } | |
394 | ||
395 | static size_t | |
396 | sizeof_uleb128 (size_t val) | |
397 | { | |
398 | size_t sz = 0; | |
399 | do | |
400 | { | |
401 | val >>= 7; | |
402 | sz += 1; | |
403 | } | |
404 | while (val != 0); | |
405 | return sz; | |
406 | } | |
407 | ||
408 | static void | |
409 | size_nodes (struct node *n) | |
410 | { | |
411 | if (n->child) | |
412 | size_nodes (n->child); | |
413 | if (n->sibling) | |
414 | size_nodes (n->sibling); | |
415 | n->node_size = 1 + (n->key_len > 1) * 2; | |
416 | if (n->codepoint != NO_VALUE) | |
417 | n->node_size += 3; | |
418 | else if (n->sibling == NULL) | |
419 | ++n->node_size; | |
420 | n->size_sum = 0; | |
421 | n->child_off = 0; | |
422 | if (n->sibling) | |
423 | n->size_sum += n->sibling->size_sum; | |
424 | if (n->child) | |
425 | { | |
426 | n->child_off = n->size_sum + (n->codepoint == NO_VALUE | |
427 | && n->sibling == NULL); | |
428 | n->node_size += sizeof_uleb128 (n->child_off); | |
429 | } | |
430 | n->size_sum += n->node_size; | |
431 | if (n->child) | |
432 | n->size_sum += n->child->size_sum; | |
433 | tree_size += n->node_size; | |
434 | } | |
435 | ||
436 | static void | |
437 | write_uleb128 (unsigned char *p, size_t val) | |
438 | { | |
439 | unsigned char c; | |
440 | do | |
441 | { | |
442 | c = val & 0x7f; | |
443 | val >>= 7; | |
444 | if (val) | |
445 | c |= 0x80; | |
446 | *p++ = c; | |
447 | } | |
448 | while (val); | |
449 | } | |
450 | ||
451 | static void | |
452 | write_nodes (struct node *n, size_t off) | |
453 | { | |
454 | for (; n; n = n->sibling) | |
455 | { | |
b3048b6f | 456 | assert (off < tree_size && tree[off] == 0); |
eb4879ab JJ |
457 | if (n->key_len > 1) |
458 | { | |
459 | assert (n->key_len < 64); | |
460 | tree[off] = n->key_len; | |
461 | } | |
462 | else | |
463 | tree[off] = (n->key[0] - ' ') | 0x80; | |
464 | assert ((tree[off] & 0x40) == 0); | |
465 | if (n->codepoint != NO_VALUE) | |
466 | tree[off] |= 0x40; | |
467 | off++; | |
468 | if (n->key_len > 1) | |
469 | { | |
470 | tree[off++] = n->key_idx & 0xff; | |
471 | tree[off++] = (n->key_idx >> 8) & 0xff; | |
472 | } | |
473 | if (n->codepoint != NO_VALUE) | |
474 | { | |
475 | assert (n->codepoint < (1L << 21)); | |
476 | tree[off++] = n->codepoint & 0xff; | |
477 | tree[off++] = (n->codepoint >> 8) & 0xff; | |
478 | tree[off] = (n->codepoint >> 16) & 0xff; | |
479 | if (n->child) | |
480 | tree[off] |= 0x80; | |
481 | if (!n->sibling) | |
482 | tree[off] |= 0x40; | |
483 | off++; | |
484 | } | |
485 | if (n->child) | |
486 | { | |
487 | write_uleb128 (&tree[off], n->child_off); | |
488 | off += sizeof_uleb128 (n->child_off); | |
489 | write_nodes (n->child, off + n->child_off); | |
490 | } | |
491 | if (n->codepoint == NO_VALUE | |
492 | && n->sibling == NULL) | |
493 | tree[off++] = 0xff; | |
494 | } | |
495 | assert (off <= tree_size); | |
496 | } | |
497 | ||
498 | static void | |
499 | build_radix_tree (void) | |
500 | { | |
501 | size_t i, j, k, key_idx; | |
502 | ||
503 | for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) | |
504 | if (generated_ranges[i].ok == INT_MAX) | |
505 | { | |
506 | if (generated_ranges[i].max_high - generated_ranges[i].high > 15UL) | |
507 | break; | |
508 | } | |
509 | else if (generated_ranges[i].ok == (generated_ranges[i].high | |
510 | - generated_ranges[i].low + 1)) | |
511 | { | |
512 | if (generated_ranges[i].max_high != generated_ranges[i].high) | |
513 | break; | |
514 | } | |
515 | else | |
516 | break; | |
517 | if (i < ARRAY_SIZE (generated_ranges)) | |
518 | fail ("uncovered generated range %s %lx %lx", | |
519 | generated_ranges[i].prefix, generated_ranges[i].low, | |
520 | generated_ranges[i].high); | |
521 | /* Sort entries alphabetically, node_add relies on that. */ | |
522 | qsort (entries, num_entries, sizeof (struct entry), entrycmp); | |
523 | for (i = 1; i < num_entries; ++i) | |
524 | if (i && strcmp (entries[i].name, entries[i - 1].name) == 0) | |
525 | fail ("multiple entries for name %s", entries[i].name); | |
526 | ||
527 | for (i = 0; i < num_entries; ++i) | |
528 | node_add (&root, entries[i].name, strlen (entries[i].name), | |
529 | entries[i].codepoint); | |
530 | ||
531 | nodes = (struct node **) xmalloc (num_nodes * sizeof (struct node *)); | |
532 | i = num_nodes; | |
533 | num_nodes = 0; | |
534 | append_nodes (root); | |
535 | assert (num_nodes == i); | |
536 | /* Sort node pointers by decreasing string length to handle substrings | |
537 | right. */ | |
538 | qsort (nodes, num_nodes, sizeof (struct node *), nodecmp); | |
539 | if (nodes[0]->key_len >= 64) | |
540 | /* We could actually encode even 64 and 65, as key_len 0 and 1 will | |
541 | never appear in the multiple letter key encodings, so could subtract | |
542 | 2. */ | |
543 | fail ("can't encode key length %d >= 64, so need to split some radix " | |
544 | "tree nodes to ensure length fits", nodes[0]->key_len); | |
545 | ||
546 | /* Verify a property charset.cc UAX44-LM2 matching relies on: | |
547 | if - is at the end of key of some node, then all its siblings | |
548 | start with alphanumeric characters. | |
549 | Only 2 character names and 1 alias have - followed by space: | |
550 | U+0F0A TIBETAN MARK BKA- SHOG YIG MGO | |
551 | U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN | |
552 | U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN | |
553 | so the KA- in there will always be followed at least by SHOG | |
554 | in the same node. | |
555 | If this changes, charset.cc needs to change. */ | |
556 | for (i = 0; i < num_nodes; ++i) | |
557 | if (nodes[i]->key[nodes[i]->key_len - 1] == '-' | |
558 | && nodes[i]->child) | |
559 | { | |
560 | struct node *n; | |
561 | ||
562 | for (n = nodes[i]->child; n; n = n->sibling) | |
563 | if (n->key[0] == ' ') | |
564 | fail ("node with key %.*s followed by node with key %.*s", | |
565 | (int) nodes[i]->key_len, nodes[i]->key, | |
566 | (int) n->key_len, n->key); | |
567 | } | |
568 | ||
569 | /* This is expensive, O(num_nodes * num_nodes * nodes[0]->key_len), but | |
570 | fortunately num_nodes is < 64K and key_len < 64. */ | |
571 | key_idx = 0; | |
572 | for (i = 0; i < num_nodes; ++i) | |
573 | { | |
574 | nodes[i]->key_idx = SIZE_MAX; | |
575 | nodes[i]->in_dict = false; | |
576 | if (nodes[i]->key_len > 1) | |
577 | { | |
578 | for (j = 0; j < i; ++j) | |
579 | /* Can't rely on memmem unfortunately. */ | |
580 | if (nodes[j]->in_dict) | |
581 | { | |
582 | for (k = 0; k <= nodes[j]->key_len - nodes[i]->key_len; ++k) | |
583 | if (nodes[j]->key[k] == nodes[i]->key[0] | |
584 | && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1, | |
585 | nodes[i]->key_len - 1) == 0) | |
586 | { | |
587 | nodes[i]->key_idx = nodes[j]->key_idx + k; | |
588 | j = i; | |
589 | break; | |
590 | } | |
591 | if (j == i) | |
592 | break; | |
593 | for (; k < nodes[j]->key_len; ++k) | |
594 | if (nodes[j]->key[k] == nodes[i]->key[0] | |
595 | && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1, | |
596 | nodes[j]->key_len - 1 - k) == 0) | |
597 | { | |
598 | size_t l; | |
599 | ||
600 | for (l = j + 1; l < i; ++l) | |
601 | if (nodes[l]->in_dict) | |
602 | break; | |
603 | if (l < i | |
604 | && memcmp (nodes[l]->key, | |
605 | nodes[i]->key + (nodes[j]->key_len - k), | |
606 | nodes[i]->key_len | |
607 | - (nodes[j]->key_len - k)) == 0) | |
608 | { | |
609 | nodes[i]->key_idx = nodes[j]->key_idx + k; | |
610 | j = i; | |
611 | } | |
612 | else | |
613 | j = l - 1; | |
614 | break; | |
615 | } | |
616 | } | |
617 | if (nodes[i]->key_idx == SIZE_MAX) | |
618 | { | |
619 | nodes[i]->key_idx = key_idx; | |
620 | nodes[i]->in_dict = true; | |
621 | key_idx += nodes[i]->key_len; | |
622 | } | |
623 | } | |
624 | } | |
625 | if (key_idx >= 65536) | |
626 | /* We only use 2 bytes for offsets into the dictionary. | |
627 | If it grows more, there is e.g. a possibility to replace | |
628 | most often seen words or substrings in the dictionary | |
629 | with characters other than [A-Z0-9 -] (say LETTER occurs | |
630 | in the dictionary almost 197 times and so by using a | |
631 | instead of LETTER we could save (6 - 1) * 197 bytes, | |
632 | with some on the side table mapping 'a' to "LETTER". */ | |
633 | fail ("too large dictionary %ld", (long) key_idx); | |
634 | dict_size = key_idx; | |
635 | ||
636 | size_nodes (root); | |
637 | ||
638 | dict = (char *) xmalloc (dict_size + 1); | |
639 | for (i = 0; i < num_nodes; ++i) | |
640 | if (nodes[i]->in_dict) | |
641 | memcpy (dict + nodes[i]->key_idx, nodes[i]->key, nodes[i]->key_len); | |
642 | dict[dict_size] = '\0'; | |
643 | ||
644 | tree = (unsigned char *) xmalloc (tree_size); | |
645 | memset (tree, 0, tree_size); | |
646 | write_nodes (root, 0); | |
647 | } | |
648 | ||
649 | /* Print out the huge copyright notice. */ | |
650 | ||
651 | static void | |
652 | write_copyright (void) | |
653 | { | |
654 | static const char copyright[] = "\ | |
655 | /* Unicode name to codepoint.\n\ | |
a945c346 | 656 | Copyright (C) 2005-2024 Free Software Foundation, Inc.\n\ |
eb4879ab JJ |
657 | \n\ |
658 | This program is free software; you can redistribute it and/or modify it\n\ | |
659 | under the terms of the GNU General Public License as published by the\n\ | |
660 | Free Software Foundation; either version 3, or (at your option) any\n\ | |
661 | later version.\n\ | |
662 | \n\ | |
663 | This program is distributed in the hope that it will be useful,\n\ | |
664 | but WITHOUT ANY WARRANTY; without even the implied warranty of\n\ | |
665 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\ | |
666 | GNU General Public License for more details.\n\ | |
667 | \n\ | |
668 | You should have received a copy of the GNU General Public License\n\ | |
669 | along with this program; see the file COPYING3. If not see\n\ | |
670 | <http://www.gnu.org/licenses/>.\n\ | |
671 | \n\ | |
672 | \n\ | |
d64b7c82 | 673 | Copyright (C) 1991-2023 Unicode, Inc. All rights reserved.\n\ |
eb4879ab JJ |
674 | Distributed under the Terms of Use in\n\ |
675 | http://www.unicode.org/copyright.html.\n\ | |
676 | \n\ | |
677 | Permission is hereby granted, free of charge, to any person\n\ | |
678 | obtaining a copy of the Unicode data files and any associated\n\ | |
679 | documentation (the \"Data Files\") or Unicode software and any\n\ | |
680 | associated documentation (the \"Software\") to deal in the Data Files\n\ | |
681 | or Software without restriction, including without limitation the\n\ | |
682 | rights to use, copy, modify, merge, publish, distribute, and/or\n\ | |
683 | sell copies of the Data Files or Software, and to permit persons to\n\ | |
684 | whom the Data Files or Software are furnished to do so, provided\n\ | |
685 | that (a) the above copyright notice(s) and this permission notice\n\ | |
686 | appear with all copies of the Data Files or Software, (b) both the\n\ | |
687 | above copyright notice(s) and this permission notice appear in\n\ | |
688 | associated documentation, and (c) there is clear notice in each\n\ | |
689 | modified Data File or in the Software as well as in the\n\ | |
690 | documentation associated with the Data File(s) or Software that the\n\ | |
691 | data or software has been modified.\n\ | |
692 | \n\ | |
693 | THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\ | |
694 | OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\ | |
695 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\ | |
696 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\ | |
697 | COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\ | |
698 | ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\ | |
699 | DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\ | |
700 | WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\ | |
701 | ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\ | |
702 | OF THE DATA FILES OR SOFTWARE.\n\ | |
703 | \n\ | |
704 | Except as contained in this notice, the name of a copyright holder\n\ | |
705 | shall not be used in advertising or otherwise to promote the sale,\n\ | |
706 | use or other dealings in these Data Files or Software without prior\n\ | |
707 | written authorization of the copyright holder. */\n"; | |
708 | ||
709 | puts (copyright); | |
710 | } | |
711 | ||
712 | static void | |
713 | write_dict (void) | |
714 | { | |
715 | size_t i; | |
716 | ||
717 | printf ("static const char uname2c_dict[%ld] =\n", (long) (dict_size + 1)); | |
718 | for (i = 0; i < dict_size; i += 77) | |
719 | printf ("\"%.77s\"%s\n", dict + i, i + 76 > dict_size ? ";" : ""); | |
720 | puts (""); | |
721 | } | |
722 | ||
723 | static void | |
724 | write_tree (void) | |
725 | { | |
726 | size_t i, j; | |
727 | ||
728 | printf ("static const unsigned char uname2c_tree[%ld] = {\n", | |
729 | (long) tree_size); | |
730 | for (i = 0, j = 0; i < tree_size; ++i) | |
731 | { | |
732 | printf ("%s0x%02x%s", j == 0 ? " " : "", tree[i], | |
733 | i == tree_size - 1 ? " };\n\n" : j == 11 ? ",\n" : ", "); | |
734 | if (j == 11) | |
735 | j = 0; | |
736 | else | |
737 | ++j; | |
738 | } | |
739 | } | |
740 | ||
741 | static void | |
742 | write_generated (void) | |
743 | { | |
744 | size_t i, j; | |
745 | ||
746 | puts ("static const cppchar_t uname2c_pairs[] = {"); | |
747 | for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) | |
748 | { | |
749 | if (i == 0) | |
750 | ; | |
751 | else if (generated_ranges[i - 1].idx != generated_ranges[i].idx) | |
752 | puts (", 0,"); | |
753 | else | |
754 | puts (","); | |
755 | printf (" 0x%lx, 0x%lx /* %s */", | |
756 | generated_ranges[i].low, | |
757 | generated_ranges[i].high, | |
758 | generated_ranges[i].prefix); | |
759 | } | |
760 | puts (", 0 };\n"); | |
761 | ||
762 | puts ("static const unsigned char uname2c_generated[] = {"); | |
763 | for (i = 0, j = -1; i < ARRAY_SIZE (generated_ranges); ++i) | |
764 | { | |
765 | if (i == 0 || generated_ranges[i - 1].idx != generated_ranges[i].idx) | |
766 | printf ("%s %d /* %s */", i ? ",\n" : "", | |
767 | ++j, generated_ranges[i].prefix); | |
768 | j += 2; | |
769 | } | |
770 | puts (" };\n"); | |
771 | } | |
772 | ||
773 | /* Main program. */ | |
774 | ||
775 | int | |
776 | main (int argc, char **argv) | |
777 | { | |
778 | size_t i; | |
779 | ||
780 | if (argc != 3) | |
781 | fail ("too few arguments to makeradixtree"); | |
782 | for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) | |
783 | if (!generated_ranges[i].max_high) | |
784 | generated_ranges[i].max_high = generated_ranges[i].high; | |
785 | read_table (argv[1], false); | |
786 | read_table (argv[2], true); | |
787 | build_radix_tree (); | |
788 | ||
789 | write_copyright (); | |
790 | write_dict (); | |
791 | write_tree (); | |
792 | write_generated (); | |
793 | printf ("static const unsigned int uname2c_max_name_len = %ld;\n\n", max_entry_len); | |
794 | return 0; | |
795 | } |