]>
Commit | Line | Data |
---|---|---|
eb4879ab | 1 | /* Make uname2c.h from various sources. |
83ffe9cd | 2 | Copyright (C) 2005-2023 Free Software Foundation, Inc. |
eb4879ab JJ |
3 | Contributed by Jakub Jelinek <jakub@redhat.com> |
4 | ||
5 | This program is free software; you can redistribute it and/or modify it | |
6 | under the terms of the GNU General Public License as published by the | |
7 | Free Software Foundation; either version 3, or (at your option) any | |
8 | later version. | |
9 | ||
10 | This program is distributed in the hope that it will be useful, | |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 | GNU General Public License for more details. | |
14 | ||
15 | You should have received a copy of the GNU General Public License | |
16 | along with this program; see the file COPYING3. If not see | |
17 | <http://www.gnu.org/licenses/>. */ | |
18 | ||
19 | /* Run this program as | |
20 | ./makeuname2c UnicodeData.txt NameAliases.txt > uname2c.h | |
21 | ||
22 | This program generates 2 big arrays and 2 small ones. | |
23 | The large ones are uname2c_dict, initialized by string literal | |
24 | representing dictionary, and uname2c_tree, which is a space optimized | |
25 | radix tree. | |
26 | The format of the radix tree is: | |
27 | byte 0 either 0x80 + (key[0] - ' ') (if key_len == 1) | |
28 | or key_len (otherwise) | |
29 | either of them ored with 0x40 if it has a codepoint | |
30 | byte 1 LSB of offset into uname2c_dict for key (only if key_len > 1) | |
31 | byte 2 MSB of offset into uname2c_dict for key (only if key_len > 1) | |
32 | if key_len == 1, the above 2 bytes are omitted | |
33 | byte 3 LSB of codepoint (only if it has a codepoint) | |
34 | byte 4 middle byte of codepoint (ditto) | |
35 | byte 5 MSB of codepoint (ditto), ored with 0x80 if node has children | |
36 | ored with 0x40 if it doesn't have siblings | |
37 | if it doesn't have a codepoint, the above 3 bytes are omitted | |
38 | and we assume that the node has children | |
39 | byte 6, 7, 8 uleb128 encoded offset to first child relative to the end | |
40 | of the uleb128 (only if node has children) | |
41 | byte 9 0xff (only if node doesn't have a codepoint and doesn't | |
42 | have siblings) | |
43 | ||
44 | For prefixes of Unicode NR1 or NR2 rule generated names, on a node | |
45 | representing end of the prefix codepoint is 0xd800 + index into | |
46 | uname2c_generated array with indexes into uname2c_pairs array of | |
47 | code points (low, high) of the ranges terminated by single 0. | |
48 | 0xd800 is NR1 rule (Hangul syllables), rest are NR2 rules. | |
49 | */ | |
50 | ||
51 | #include <assert.h> | |
52 | #include <stdio.h> | |
53 | #include <string.h> | |
54 | #include <stdint.h> | |
55 | #include <ctype.h> | |
56 | #include <limits.h> | |
57 | #include <stdarg.h> | |
58 | #include <stdbool.h> | |
59 | #include <stdlib.h> | |
60 | ||
61 | #define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0])) | |
62 | ||
63 | #define NUM_CODE_POINTS 0x110000 | |
64 | #define MAX_CODE_POINT 0x10ffff | |
65 | #define NO_VALUE 0xdc00 | |
66 | #define GENERATED 0xd800 | |
67 | ||
68 | struct entry { const char *name; unsigned long codepoint; }; | |
69 | static struct entry *entries; | |
70 | static unsigned long num_allocated, num_entries; | |
71 | ||
2662d537 | 72 | /* Unicode 15 Table 4-8. */ |
eb4879ab JJ |
73 | struct generated { |
74 | const char *prefix; | |
75 | /* max_high is a workaround for UnicodeData.txt inconsistencies | |
76 | on a few CJK UNIFIED IDEOGRAPH- ranges where the "*, Last>" | |
77 | entry is a few code points above the end of the range. */ | |
78 | unsigned long low, high, max_high; | |
79 | int idx, ok; | |
80 | }; | |
81 | static struct generated generated_ranges[] = | |
82 | { { "HANGUL SYLLABLE ", 0xac00, 0xd7a3, 0, 0, 0 }, /* NR1 rule */ | |
83 | { "CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4dbf, 0, 1, 0 }, /* NR2 rules */ | |
2662d537 JJ |
84 | { "CJK UNIFIED IDEOGRAPH-", 0x4e00, 0x9fff, 0, 1, 0 }, |
85 | { "CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2a6df, 0, 1, 0 }, | |
86 | { "CJK UNIFIED IDEOGRAPH-", 0x2a700, 0x2b739, 0, 1, 0 }, | |
eb4879ab JJ |
87 | { "CJK UNIFIED IDEOGRAPH-", 0x2b740, 0x2b81d, 0, 1, 0 }, |
88 | { "CJK UNIFIED IDEOGRAPH-", 0x2b820, 0x2cea1, 0, 1, 0 }, | |
89 | { "CJK UNIFIED IDEOGRAPH-", 0x2ceb0, 0x2ebe0, 0, 1, 0 }, | |
90 | { "CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134a, 0, 1, 0 }, | |
2662d537 | 91 | { "CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323af, 0, 1, 0 }, |
eb4879ab JJ |
92 | { "TANGUT IDEOGRAPH-", 0x17000, 0x187f7, 0, 2, 0 }, |
93 | { "TANGUT IDEOGRAPH-", 0x18d00, 0x18d08, 0, 2, 0 }, | |
94 | { "KHITAN SMALL SCRIPT CHARACTER-", 0x18b00, 0x18cd5, 0, 3, 0 }, | |
95 | { "NUSHU CHARACTER-", 0x1b170, 0x1b2fb, 0, 4, 0 }, | |
96 | { "CJK COMPATIBILITY IDEOGRAPH-", 0xf900, 0xfa6d, 0, 5, 0 }, | |
97 | { "CJK COMPATIBILITY IDEOGRAPH-", 0xfa70, 0xfad9, 0, 5, 0 }, | |
98 | { "CJK COMPATIBILITY IDEOGRAPH-", 0x2f800, 0x2fa1d, 0, 5, 0 } | |
99 | }; | |
100 | ||
101 | struct node { | |
102 | struct node *sibling, *child; | |
103 | const char *key; | |
104 | size_t key_len, key_idx, node_size, size_sum, child_off; | |
105 | unsigned long codepoint; | |
106 | bool in_dict; | |
107 | }; | |
108 | static struct node *root, **nodes; | |
109 | static unsigned long num_nodes; | |
110 | static size_t dict_size, tree_size, max_entry_len; | |
111 | static char *dict; | |
112 | static unsigned char *tree; | |
113 | ||
114 | /* Die! */ | |
115 | ||
116 | static void | |
117 | fail (const char *s, ...) | |
118 | { | |
119 | va_list ap; | |
120 | ||
121 | va_start (ap, s); | |
122 | vfprintf (stderr, s, ap); | |
123 | va_end (ap); | |
124 | fputc ('\n', stderr); | |
125 | exit (1); | |
126 | } | |
127 | ||
128 | static void * | |
129 | xmalloc (size_t size) | |
130 | { | |
131 | void *ret = malloc (size); | |
132 | ||
133 | if (ret == NULL) | |
134 | fail ("failed to allocate %ld bytes", (long) size); | |
135 | return ret; | |
136 | } | |
137 | ||
138 | static void * | |
139 | xrealloc (void *p, size_t size) | |
140 | { | |
141 | void *ret = p ? realloc (p, size) : malloc (size); | |
142 | ||
143 | if (ret == NULL) | |
144 | fail ("failed to allocate %ld bytes", (long) size); | |
145 | return ret; | |
146 | } | |
147 | ||
148 | static int | |
149 | entrycmp (const void *p1, const void *p2) | |
150 | { | |
151 | const struct entry *e1 = (const struct entry *) p1; | |
152 | const struct entry *e2 = (const struct entry *) p2; | |
153 | int ret = strcmp (e1->name, e2->name); | |
154 | ||
155 | if (ret != 0) | |
156 | return ret; | |
157 | if (e1->codepoint < e2->codepoint) | |
158 | return -1; | |
159 | if (e1->codepoint > e2->codepoint) | |
160 | return 1; | |
161 | return 0; | |
162 | } | |
163 | ||
164 | static int | |
165 | nodecmp (const void *p1, const void *p2) | |
166 | { | |
167 | const struct node *n1 = *(const struct node *const *) p1; | |
168 | const struct node *n2 = *(const struct node *const *) p2; | |
169 | if (n1->key_len > n2->key_len) | |
170 | return -1; | |
171 | if (n1->key_len < n2->key_len) | |
172 | return 1; | |
173 | return memcmp (n1->key, n2->key, n1->key_len); | |
174 | } | |
175 | ||
176 | /* Read UnicodeData.txt and fill in the 'decomp' table to be the | |
177 | decompositions of characters for which both the character | |
178 | decomposed and all the code points in the decomposition are valid | |
179 | for some supported language version, and the 'all_decomp' table to | |
180 | be the decompositions of all characters without those | |
181 | constraints. */ | |
182 | ||
183 | static void | |
184 | read_table (char *fname, bool aliases_p) | |
185 | { | |
186 | FILE *f = fopen (fname, "r"); | |
187 | const char *sname = aliases_p ? "NameAliases.txt" : "UnicodeData.txt"; | |
188 | ||
189 | if (!f) | |
190 | fail ("opening %s", sname); | |
191 | for (;;) | |
192 | { | |
193 | char line[256]; | |
194 | unsigned long codepoint; | |
195 | const char *name, *aname; | |
196 | char *l; | |
197 | size_t i; | |
198 | ||
199 | if (!fgets (line, sizeof (line), f)) | |
200 | break; | |
201 | codepoint = strtoul (line, &l, 16); | |
202 | if (l == line && aliases_p) | |
203 | { | |
204 | /* NameAliased.txt can contain comments and empty lines. */ | |
205 | if (*line == '#' || *line == '\n') | |
206 | continue; | |
207 | } | |
208 | if (l == line || *l != ';') | |
209 | fail ("parsing %s, reading code point", sname); | |
210 | if (codepoint > MAX_CODE_POINT) | |
211 | fail ("parsing %s, code point too large", sname); | |
212 | ||
213 | name = l + 1; | |
214 | do { | |
215 | ++l; | |
216 | } while (*l != ';'); | |
217 | ||
218 | aname = NULL; | |
219 | if (aliases_p) | |
220 | { | |
221 | /* Ignore figment and abbreviation aliases. */ | |
222 | if (strcmp (l + 1, "correction\n") != 0 | |
223 | && strcmp (l + 1, "control\n") != 0 | |
224 | && strcmp (l + 1, "alternate\n") != 0) | |
225 | continue; | |
226 | i = ARRAY_SIZE (generated_ranges); | |
227 | } | |
228 | else | |
229 | { | |
230 | for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) | |
231 | if (codepoint >= generated_ranges[i].low | |
232 | && codepoint <= generated_ranges[i].max_high) | |
233 | break; | |
234 | if (i != ARRAY_SIZE (generated_ranges)) | |
235 | { | |
236 | if (*name == '<' && l[-1] == '>') | |
237 | { | |
238 | if (codepoint == generated_ranges[i].low | |
239 | && l - name >= 9 | |
240 | && memcmp (l - 8, ", First>", 8) == 0 | |
241 | && generated_ranges[i].ok == 0) | |
242 | { | |
243 | generated_ranges[i].ok = INT_MAX - 1; | |
244 | aname = generated_ranges[i].prefix; | |
245 | codepoint = GENERATED + generated_ranges[i].idx; | |
246 | } | |
247 | /* Unfortunately, UnicodeData.txt isn't consistent | |
248 | with the Table 4-8 range endpoints in 3 cases, | |
249 | the ranges are longer there by a few codepoints. | |
250 | So use the max_high hack to avoid verification | |
251 | failures. */ | |
252 | else if (codepoint == generated_ranges[i].max_high | |
253 | && l - name >= 8 | |
254 | && memcmp (l - 7, ", Last>", 7) == 0 | |
255 | && generated_ranges[i].ok == INT_MAX - 1) | |
256 | { | |
257 | generated_ranges[i].ok = INT_MAX; | |
258 | continue; | |
259 | } | |
260 | else | |
261 | fail ("unexpected generated entry %lx %.*s", | |
262 | codepoint, (int) (l - name), name); | |
263 | } | |
264 | else if (codepoint | |
265 | == generated_ranges[i].low + generated_ranges[i].ok | |
266 | && l - name == (strlen (generated_ranges[i].prefix) | |
267 | + (name - 1 - line)) | |
268 | && memcmp (name, generated_ranges[i].prefix, | |
269 | strlen (generated_ranges[i].prefix)) == 0 | |
270 | && memcmp (name + strlen (generated_ranges[i].prefix), | |
271 | line, name - 1 - line) == 0) | |
272 | { | |
273 | ++generated_ranges[i].ok; | |
274 | if (codepoint != generated_ranges[i].low) | |
275 | continue; | |
276 | aname = generated_ranges[i].prefix; | |
277 | codepoint = GENERATED + generated_ranges[i].idx; | |
278 | } | |
279 | else | |
280 | fail ("unexpected generated entry %lx %.*s", | |
281 | codepoint, (int) (l - name), name); | |
282 | if (aname == generated_ranges[i].prefix) | |
283 | { | |
284 | size_t j; | |
285 | ||
286 | /* Don't add an entry for a generated range where the | |
287 | same prefix has been added already. */ | |
288 | for (j = 0; j < i; ++j) | |
289 | if (generated_ranges[j].idx == generated_ranges[i].idx | |
290 | && generated_ranges[j].ok != 0) | |
291 | break; | |
292 | if (j < i) | |
293 | continue; | |
294 | } | |
295 | } | |
296 | else if (*name == '<' && l[-1] == '>') | |
297 | continue; | |
298 | } | |
299 | ||
300 | if (num_entries == num_allocated) | |
301 | { | |
302 | num_allocated = num_allocated ? 2 * num_allocated : 65536; | |
303 | entries = (struct entry *) xrealloc (entries, num_allocated | |
304 | * sizeof (entries[0])); | |
305 | } | |
306 | ||
307 | if (aname == NULL) | |
308 | { | |
309 | char *a = (char *) xmalloc (l + 1 - name); | |
310 | if (l - name > max_entry_len) | |
311 | max_entry_len = l - name; | |
312 | memcpy (a, name, l - name); | |
313 | a[l - name] = '\0'; | |
314 | aname = a; | |
315 | } | |
316 | entries[num_entries].name = aname; | |
317 | entries[num_entries++].codepoint = codepoint; | |
318 | } | |
319 | if (ferror (f)) | |
320 | fail ("reading %s", sname); | |
321 | fclose (f); | |
322 | } | |
323 | ||
324 | /* Assumes nodes are added from sorted array, so we never | |
325 | add any node before existing one, only after it. */ | |
326 | ||
327 | static void | |
328 | node_add (struct node **p, const char *key, size_t key_len, | |
329 | unsigned long codepoint) | |
330 | { | |
331 | struct node *n; | |
332 | size_t i; | |
333 | ||
334 | do | |
335 | { | |
336 | if (*p == NULL) | |
337 | { | |
338 | *p = n = (struct node *) xmalloc (sizeof (struct node)); | |
339 | ++num_nodes; | |
340 | assert (key_len); | |
341 | n->sibling = NULL; | |
342 | n->child = NULL; | |
343 | n->key = key; | |
344 | n->key_len = key_len; | |
345 | n->codepoint = codepoint; | |
346 | return; | |
347 | } | |
348 | n = *p; | |
349 | for (i = 0; i < n->key_len && i < key_len; ++i) | |
350 | if (n->key[i] != key[i]) | |
351 | break; | |
352 | if (i == 0) | |
353 | { | |
354 | p = &n->sibling; | |
355 | continue; | |
356 | } | |
357 | if (i == n->key_len) | |
358 | { | |
359 | assert (key_len > n->key_len); | |
360 | p = &n->child; | |
361 | key += n->key_len; | |
362 | key_len -= n->key_len; | |
363 | continue; | |
364 | } | |
365 | /* Need to split the node. */ | |
366 | assert (i < key_len); | |
367 | n = (struct node *) xmalloc (sizeof (struct node)); | |
368 | ++num_nodes; | |
369 | n->sibling = NULL; | |
370 | n->child = (*p)->child; | |
371 | n->key = (*p)->key + i; | |
372 | n->key_len = (*p)->key_len - i; | |
373 | n->codepoint = (*p)->codepoint; | |
374 | (*p)->child = n; | |
375 | (*p)->key_len = i; | |
376 | (*p)->codepoint = NO_VALUE; | |
377 | key += i; | |
378 | key_len -= i; | |
379 | p = &n->sibling; | |
380 | } | |
381 | while (1); | |
382 | } | |
383 | ||
384 | static void | |
385 | append_nodes (struct node *n) | |
386 | { | |
387 | for (; n; n = n->sibling) | |
388 | { | |
389 | nodes[num_nodes++] = n; | |
390 | append_nodes (n->child); | |
391 | } | |
392 | } | |
393 | ||
394 | static size_t | |
395 | sizeof_uleb128 (size_t val) | |
396 | { | |
397 | size_t sz = 0; | |
398 | do | |
399 | { | |
400 | val >>= 7; | |
401 | sz += 1; | |
402 | } | |
403 | while (val != 0); | |
404 | return sz; | |
405 | } | |
406 | ||
407 | static void | |
408 | size_nodes (struct node *n) | |
409 | { | |
410 | if (n->child) | |
411 | size_nodes (n->child); | |
412 | if (n->sibling) | |
413 | size_nodes (n->sibling); | |
414 | n->node_size = 1 + (n->key_len > 1) * 2; | |
415 | if (n->codepoint != NO_VALUE) | |
416 | n->node_size += 3; | |
417 | else if (n->sibling == NULL) | |
418 | ++n->node_size; | |
419 | n->size_sum = 0; | |
420 | n->child_off = 0; | |
421 | if (n->sibling) | |
422 | n->size_sum += n->sibling->size_sum; | |
423 | if (n->child) | |
424 | { | |
425 | n->child_off = n->size_sum + (n->codepoint == NO_VALUE | |
426 | && n->sibling == NULL); | |
427 | n->node_size += sizeof_uleb128 (n->child_off); | |
428 | } | |
429 | n->size_sum += n->node_size; | |
430 | if (n->child) | |
431 | n->size_sum += n->child->size_sum; | |
432 | tree_size += n->node_size; | |
433 | } | |
434 | ||
435 | static void | |
436 | write_uleb128 (unsigned char *p, size_t val) | |
437 | { | |
438 | unsigned char c; | |
439 | do | |
440 | { | |
441 | c = val & 0x7f; | |
442 | val >>= 7; | |
443 | if (val) | |
444 | c |= 0x80; | |
445 | *p++ = c; | |
446 | } | |
447 | while (val); | |
448 | } | |
449 | ||
450 | static void | |
451 | write_nodes (struct node *n, size_t off) | |
452 | { | |
453 | for (; n; n = n->sibling) | |
454 | { | |
b3048b6f | 455 | assert (off < tree_size && tree[off] == 0); |
eb4879ab JJ |
456 | if (n->key_len > 1) |
457 | { | |
458 | assert (n->key_len < 64); | |
459 | tree[off] = n->key_len; | |
460 | } | |
461 | else | |
462 | tree[off] = (n->key[0] - ' ') | 0x80; | |
463 | assert ((tree[off] & 0x40) == 0); | |
464 | if (n->codepoint != NO_VALUE) | |
465 | tree[off] |= 0x40; | |
466 | off++; | |
467 | if (n->key_len > 1) | |
468 | { | |
469 | tree[off++] = n->key_idx & 0xff; | |
470 | tree[off++] = (n->key_idx >> 8) & 0xff; | |
471 | } | |
472 | if (n->codepoint != NO_VALUE) | |
473 | { | |
474 | assert (n->codepoint < (1L << 21)); | |
475 | tree[off++] = n->codepoint & 0xff; | |
476 | tree[off++] = (n->codepoint >> 8) & 0xff; | |
477 | tree[off] = (n->codepoint >> 16) & 0xff; | |
478 | if (n->child) | |
479 | tree[off] |= 0x80; | |
480 | if (!n->sibling) | |
481 | tree[off] |= 0x40; | |
482 | off++; | |
483 | } | |
484 | if (n->child) | |
485 | { | |
486 | write_uleb128 (&tree[off], n->child_off); | |
487 | off += sizeof_uleb128 (n->child_off); | |
488 | write_nodes (n->child, off + n->child_off); | |
489 | } | |
490 | if (n->codepoint == NO_VALUE | |
491 | && n->sibling == NULL) | |
492 | tree[off++] = 0xff; | |
493 | } | |
494 | assert (off <= tree_size); | |
495 | } | |
496 | ||
497 | static void | |
498 | build_radix_tree (void) | |
499 | { | |
500 | size_t i, j, k, key_idx; | |
501 | ||
502 | for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) | |
503 | if (generated_ranges[i].ok == INT_MAX) | |
504 | { | |
505 | if (generated_ranges[i].max_high - generated_ranges[i].high > 15UL) | |
506 | break; | |
507 | } | |
508 | else if (generated_ranges[i].ok == (generated_ranges[i].high | |
509 | - generated_ranges[i].low + 1)) | |
510 | { | |
511 | if (generated_ranges[i].max_high != generated_ranges[i].high) | |
512 | break; | |
513 | } | |
514 | else | |
515 | break; | |
516 | if (i < ARRAY_SIZE (generated_ranges)) | |
517 | fail ("uncovered generated range %s %lx %lx", | |
518 | generated_ranges[i].prefix, generated_ranges[i].low, | |
519 | generated_ranges[i].high); | |
520 | /* Sort entries alphabetically, node_add relies on that. */ | |
521 | qsort (entries, num_entries, sizeof (struct entry), entrycmp); | |
522 | for (i = 1; i < num_entries; ++i) | |
523 | if (i && strcmp (entries[i].name, entries[i - 1].name) == 0) | |
524 | fail ("multiple entries for name %s", entries[i].name); | |
525 | ||
526 | for (i = 0; i < num_entries; ++i) | |
527 | node_add (&root, entries[i].name, strlen (entries[i].name), | |
528 | entries[i].codepoint); | |
529 | ||
530 | nodes = (struct node **) xmalloc (num_nodes * sizeof (struct node *)); | |
531 | i = num_nodes; | |
532 | num_nodes = 0; | |
533 | append_nodes (root); | |
534 | assert (num_nodes == i); | |
535 | /* Sort node pointers by decreasing string length to handle substrings | |
536 | right. */ | |
537 | qsort (nodes, num_nodes, sizeof (struct node *), nodecmp); | |
538 | if (nodes[0]->key_len >= 64) | |
539 | /* We could actually encode even 64 and 65, as key_len 0 and 1 will | |
540 | never appear in the multiple letter key encodings, so could subtract | |
541 | 2. */ | |
542 | fail ("can't encode key length %d >= 64, so need to split some radix " | |
543 | "tree nodes to ensure length fits", nodes[0]->key_len); | |
544 | ||
545 | /* Verify a property charset.cc UAX44-LM2 matching relies on: | |
546 | if - is at the end of key of some node, then all its siblings | |
547 | start with alphanumeric characters. | |
548 | Only 2 character names and 1 alias have - followed by space: | |
549 | U+0F0A TIBETAN MARK BKA- SHOG YIG MGO | |
550 | U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN | |
551 | U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN | |
552 | so the KA- in there will always be followed at least by SHOG | |
553 | in the same node. | |
554 | If this changes, charset.cc needs to change. */ | |
555 | for (i = 0; i < num_nodes; ++i) | |
556 | if (nodes[i]->key[nodes[i]->key_len - 1] == '-' | |
557 | && nodes[i]->child) | |
558 | { | |
559 | struct node *n; | |
560 | ||
561 | for (n = nodes[i]->child; n; n = n->sibling) | |
562 | if (n->key[0] == ' ') | |
563 | fail ("node with key %.*s followed by node with key %.*s", | |
564 | (int) nodes[i]->key_len, nodes[i]->key, | |
565 | (int) n->key_len, n->key); | |
566 | } | |
567 | ||
568 | /* This is expensive, O(num_nodes * num_nodes * nodes[0]->key_len), but | |
569 | fortunately num_nodes is < 64K and key_len < 64. */ | |
570 | key_idx = 0; | |
571 | for (i = 0; i < num_nodes; ++i) | |
572 | { | |
573 | nodes[i]->key_idx = SIZE_MAX; | |
574 | nodes[i]->in_dict = false; | |
575 | if (nodes[i]->key_len > 1) | |
576 | { | |
577 | for (j = 0; j < i; ++j) | |
578 | /* Can't rely on memmem unfortunately. */ | |
579 | if (nodes[j]->in_dict) | |
580 | { | |
581 | for (k = 0; k <= nodes[j]->key_len - nodes[i]->key_len; ++k) | |
582 | if (nodes[j]->key[k] == nodes[i]->key[0] | |
583 | && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1, | |
584 | nodes[i]->key_len - 1) == 0) | |
585 | { | |
586 | nodes[i]->key_idx = nodes[j]->key_idx + k; | |
587 | j = i; | |
588 | break; | |
589 | } | |
590 | if (j == i) | |
591 | break; | |
592 | for (; k < nodes[j]->key_len; ++k) | |
593 | if (nodes[j]->key[k] == nodes[i]->key[0] | |
594 | && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1, | |
595 | nodes[j]->key_len - 1 - k) == 0) | |
596 | { | |
597 | size_t l; | |
598 | ||
599 | for (l = j + 1; l < i; ++l) | |
600 | if (nodes[l]->in_dict) | |
601 | break; | |
602 | if (l < i | |
603 | && memcmp (nodes[l]->key, | |
604 | nodes[i]->key + (nodes[j]->key_len - k), | |
605 | nodes[i]->key_len | |
606 | - (nodes[j]->key_len - k)) == 0) | |
607 | { | |
608 | nodes[i]->key_idx = nodes[j]->key_idx + k; | |
609 | j = i; | |
610 | } | |
611 | else | |
612 | j = l - 1; | |
613 | break; | |
614 | } | |
615 | } | |
616 | if (nodes[i]->key_idx == SIZE_MAX) | |
617 | { | |
618 | nodes[i]->key_idx = key_idx; | |
619 | nodes[i]->in_dict = true; | |
620 | key_idx += nodes[i]->key_len; | |
621 | } | |
622 | } | |
623 | } | |
624 | if (key_idx >= 65536) | |
625 | /* We only use 2 bytes for offsets into the dictionary. | |
626 | If it grows more, there is e.g. a possibility to replace | |
627 | most often seen words or substrings in the dictionary | |
628 | with characters other than [A-Z0-9 -] (say LETTER occurs | |
629 | in the dictionary almost 197 times and so by using a | |
630 | instead of LETTER we could save (6 - 1) * 197 bytes, | |
631 | with some on the side table mapping 'a' to "LETTER". */ | |
632 | fail ("too large dictionary %ld", (long) key_idx); | |
633 | dict_size = key_idx; | |
634 | ||
635 | size_nodes (root); | |
636 | ||
637 | dict = (char *) xmalloc (dict_size + 1); | |
638 | for (i = 0; i < num_nodes; ++i) | |
639 | if (nodes[i]->in_dict) | |
640 | memcpy (dict + nodes[i]->key_idx, nodes[i]->key, nodes[i]->key_len); | |
641 | dict[dict_size] = '\0'; | |
642 | ||
643 | tree = (unsigned char *) xmalloc (tree_size); | |
644 | memset (tree, 0, tree_size); | |
645 | write_nodes (root, 0); | |
646 | } | |
647 | ||
648 | /* Print out the huge copyright notice. */ | |
649 | ||
650 | static void | |
651 | write_copyright (void) | |
652 | { | |
653 | static const char copyright[] = "\ | |
654 | /* Unicode name to codepoint.\n\ | |
83ffe9cd | 655 | Copyright (C) 2005-2023 Free Software Foundation, Inc.\n\ |
eb4879ab JJ |
656 | \n\ |
657 | This program is free software; you can redistribute it and/or modify it\n\ | |
658 | under the terms of the GNU General Public License as published by the\n\ | |
659 | Free Software Foundation; either version 3, or (at your option) any\n\ | |
660 | later version.\n\ | |
661 | \n\ | |
662 | This program is distributed in the hope that it will be useful,\n\ | |
663 | but WITHOUT ANY WARRANTY; without even the implied warranty of\n\ | |
664 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\ | |
665 | GNU General Public License for more details.\n\ | |
666 | \n\ | |
667 | You should have received a copy of the GNU General Public License\n\ | |
668 | along with this program; see the file COPYING3. If not see\n\ | |
669 | <http://www.gnu.org/licenses/>.\n\ | |
670 | \n\ | |
671 | \n\ | |
672 | Copyright (C) 1991-2021 Unicode, Inc. All rights reserved.\n\ | |
673 | Distributed under the Terms of Use in\n\ | |
674 | http://www.unicode.org/copyright.html.\n\ | |
675 | \n\ | |
676 | Permission is hereby granted, free of charge, to any person\n\ | |
677 | obtaining a copy of the Unicode data files and any associated\n\ | |
678 | documentation (the \"Data Files\") or Unicode software and any\n\ | |
679 | associated documentation (the \"Software\") to deal in the Data Files\n\ | |
680 | or Software without restriction, including without limitation the\n\ | |
681 | rights to use, copy, modify, merge, publish, distribute, and/or\n\ | |
682 | sell copies of the Data Files or Software, and to permit persons to\n\ | |
683 | whom the Data Files or Software are furnished to do so, provided\n\ | |
684 | that (a) the above copyright notice(s) and this permission notice\n\ | |
685 | appear with all copies of the Data Files or Software, (b) both the\n\ | |
686 | above copyright notice(s) and this permission notice appear in\n\ | |
687 | associated documentation, and (c) there is clear notice in each\n\ | |
688 | modified Data File or in the Software as well as in the\n\ | |
689 | documentation associated with the Data File(s) or Software that the\n\ | |
690 | data or software has been modified.\n\ | |
691 | \n\ | |
692 | THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\ | |
693 | OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\ | |
694 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\ | |
695 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\ | |
696 | COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\ | |
697 | ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\ | |
698 | DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\ | |
699 | WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\ | |
700 | ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\ | |
701 | OF THE DATA FILES OR SOFTWARE.\n\ | |
702 | \n\ | |
703 | Except as contained in this notice, the name of a copyright holder\n\ | |
704 | shall not be used in advertising or otherwise to promote the sale,\n\ | |
705 | use or other dealings in these Data Files or Software without prior\n\ | |
706 | written authorization of the copyright holder. */\n"; | |
707 | ||
708 | puts (copyright); | |
709 | } | |
710 | ||
711 | static void | |
712 | write_dict (void) | |
713 | { | |
714 | size_t i; | |
715 | ||
716 | printf ("static const char uname2c_dict[%ld] =\n", (long) (dict_size + 1)); | |
717 | for (i = 0; i < dict_size; i += 77) | |
718 | printf ("\"%.77s\"%s\n", dict + i, i + 76 > dict_size ? ";" : ""); | |
719 | puts (""); | |
720 | } | |
721 | ||
722 | static void | |
723 | write_tree (void) | |
724 | { | |
725 | size_t i, j; | |
726 | ||
727 | printf ("static const unsigned char uname2c_tree[%ld] = {\n", | |
728 | (long) tree_size); | |
729 | for (i = 0, j = 0; i < tree_size; ++i) | |
730 | { | |
731 | printf ("%s0x%02x%s", j == 0 ? " " : "", tree[i], | |
732 | i == tree_size - 1 ? " };\n\n" : j == 11 ? ",\n" : ", "); | |
733 | if (j == 11) | |
734 | j = 0; | |
735 | else | |
736 | ++j; | |
737 | } | |
738 | } | |
739 | ||
740 | static void | |
741 | write_generated (void) | |
742 | { | |
743 | size_t i, j; | |
744 | ||
745 | puts ("static const cppchar_t uname2c_pairs[] = {"); | |
746 | for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) | |
747 | { | |
748 | if (i == 0) | |
749 | ; | |
750 | else if (generated_ranges[i - 1].idx != generated_ranges[i].idx) | |
751 | puts (", 0,"); | |
752 | else | |
753 | puts (","); | |
754 | printf (" 0x%lx, 0x%lx /* %s */", | |
755 | generated_ranges[i].low, | |
756 | generated_ranges[i].high, | |
757 | generated_ranges[i].prefix); | |
758 | } | |
759 | puts (", 0 };\n"); | |
760 | ||
761 | puts ("static const unsigned char uname2c_generated[] = {"); | |
762 | for (i = 0, j = -1; i < ARRAY_SIZE (generated_ranges); ++i) | |
763 | { | |
764 | if (i == 0 || generated_ranges[i - 1].idx != generated_ranges[i].idx) | |
765 | printf ("%s %d /* %s */", i ? ",\n" : "", | |
766 | ++j, generated_ranges[i].prefix); | |
767 | j += 2; | |
768 | } | |
769 | puts (" };\n"); | |
770 | } | |
771 | ||
772 | /* Main program. */ | |
773 | ||
774 | int | |
775 | main (int argc, char **argv) | |
776 | { | |
777 | size_t i; | |
778 | ||
779 | if (argc != 3) | |
780 | fail ("too few arguments to makeradixtree"); | |
781 | for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) | |
782 | if (!generated_ranges[i].max_high) | |
783 | generated_ranges[i].max_high = generated_ranges[i].high; | |
784 | read_table (argv[1], false); | |
785 | read_table (argv[2], true); | |
786 | build_radix_tree (); | |
787 | ||
788 | write_copyright (); | |
789 | write_dict (); | |
790 | write_tree (); | |
791 | write_generated (); | |
792 | printf ("static const unsigned int uname2c_max_name_len = %ld;\n\n", max_entry_len); | |
793 | return 0; | |
794 | } |