]>
Commit | Line | Data |
---|---|---|
bce47149 | 1 | /* Make ucnid.h from various sources. |
fbd26352 | 2 | Copyright (C) 2005-2019 Free Software Foundation, Inc. |
bce47149 | 3 | |
4 | This program is free software; you can redistribute it and/or modify it | |
5 | under the terms of the GNU General Public License as published by the | |
6bc9506f | 6 | Free Software Foundation; either version 3, or (at your option) any |
bce47149 | 7 | later version. |
8 | ||
9 | This program is distributed in the hope that it will be useful, | |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | GNU General Public License for more details. | |
13 | ||
14 | You should have received a copy of the GNU General Public License | |
6bc9506f | 15 | along with this program; see the file COPYING3. If not see |
16 | <http://www.gnu.org/licenses/>. */ | |
bce47149 | 17 | |
18 | /* Run this program as | |
19 | ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \ | |
20 | > ucnid.h | |
21 | */ | |
22 | ||
23 | #include <stdio.h> | |
24 | #include <string.h> | |
25 | #include <ctype.h> | |
26 | #include <stdbool.h> | |
27 | #include <stdlib.h> | |
28 | ||
/* Bits stored per code point in flags[].

   C99, CXX and C11 are set from the matching sections of ucnid.tab;
   N99 is set together with C99 for the [C99DIG] section and N11
   together with C11 for the [C11NOSTART] section.  The not_NFC,
   not_NFKC and maybe_not_NFC bits are set from the NFC_QC / NFKC_QC
   properties in DerivedNormalizationProps.txt.  all_languages is the
   mask of the three language bits.  */
enum {
  C99 = 1 << 0,
  CXX = 1 << 1,
  N99 = 1 << 2,
  C11 = 1 << 3,
  N11 = 1 << 4,
  all_languages = C99 | CXX | C11,
  not_NFC = 1 << 5,
  not_NFKC = 1 << 6,
  maybe_not_NFC = 1 << 7
};
40 | ||
/* Number of entries in each per-code-point table, and the largest
   index that is valid in them.  */
#define NUM_CODE_POINTS 0x110000
#define MAX_CODE_POINT (NUM_CODE_POINTS - 1)

/* Flag bits (see the enum above) for every code point.  */
static unsigned int flags[NUM_CODE_POINTS];

/* Canonical decomposition (at most two code points) of every code
   point, recorded without regard to identifier validity.  */
static unsigned int all_decomp[NUM_CODE_POINTS][2];

/* As all_decomp, but only filled in when the code point and all of
   its decomposition are flagged for some language version.  */
static unsigned int decomp[NUM_CODE_POINTS][2];

/* Canonical combining class of every code point.  */
static unsigned char combining_value[NUM_CODE_POINTS];
bce47149 | 48 | |
/* Print the message S followed by a newline on stderr, then terminate
   the program with exit status 1.  Does not return.  */

static void
fail (const char *s)
{
  fputs (s, stderr);
  fputc ('\n', stderr);
  exit (1);
}
57 | ||
460f52aa | 58 | /* Read ucnid.tab and set the flags for language versions in header[]. */ |
bce47149 | 59 | |
60 | static void | |
61 | read_ucnid (const char *fname) | |
62 | { | |
63 | FILE *f = fopen (fname, "r"); | |
64 | unsigned fl = 0; | |
65 | ||
66 | if (!f) | |
67 | fail ("opening ucnid.tab"); | |
68 | for (;;) | |
69 | { | |
70 | char line[256]; | |
71 | ||
72 | if (!fgets (line, sizeof (line), f)) | |
73 | break; | |
74 | if (strcmp (line, "[C99]\n") == 0) | |
75 | fl = C99; | |
460f52aa | 76 | else if (strcmp (line, "[C99DIG]\n") == 0) |
77 | fl = C99|N99; | |
bce47149 | 78 | else if (strcmp (line, "[CXX]\n") == 0) |
79 | fl = CXX; | |
460f52aa | 80 | else if (strcmp (line, "[C11]\n") == 0) |
81 | fl = C11; | |
82 | else if (strcmp (line, "[C11NOSTART]\n") == 0) | |
83 | fl = C11|N11; | |
bce47149 | 84 | else if (isxdigit (line[0])) |
85 | { | |
86 | char *l = line; | |
87 | while (*l) | |
88 | { | |
89 | unsigned long start, end; | |
90 | char *endptr; | |
91 | start = strtoul (l, &endptr, 16); | |
92 | if (endptr == l || (*endptr != '-' && ! isspace (*endptr))) | |
93 | fail ("parsing ucnid.tab [1]"); | |
94 | l = endptr; | |
95 | if (*l != '-') | |
96 | end = start; | |
97 | else | |
98 | { | |
99 | end = strtoul (l + 1, &endptr, 16); | |
100 | if (end < start) | |
101 | fail ("parsing ucnid.tab, end before start"); | |
102 | l = endptr; | |
103 | if (! isspace (*l)) | |
104 | fail ("parsing ucnid.tab, junk after range"); | |
105 | } | |
106 | while (isspace (*l)) | |
107 | l++; | |
460f52aa | 108 | if (end > MAX_CODE_POINT) |
bce47149 | 109 | fail ("parsing ucnid.tab, end too large"); |
110 | while (start <= end) | |
111 | flags[start++] |= fl; | |
112 | } | |
113 | } | |
114 | } | |
115 | if (ferror (f)) | |
116 | fail ("reading ucnid.tab"); | |
117 | fclose (f); | |
118 | } | |
119 | ||
ceaeebbe | 120 | /* Read UnicodeData.txt and fill in the 'decomp' table to be the |
121 | decompositions of characters for which both the character | |
460f52aa | 122 | decomposed and all the code points in the decomposition are valid |
123 | for some supported language version, and the 'all_decomp' table to | |
124 | be the decompositions of all characters without those | |
125 | constraints. */ | |
bce47149 | 126 | |
127 | static void | |
128 | read_table (char *fname) | |
129 | { | |
130 | FILE * f = fopen (fname, "r"); | |
131 | ||
132 | if (!f) | |
133 | fail ("opening UnicodeData.txt"); | |
134 | for (;;) | |
135 | { | |
136 | char line[256]; | |
137 | unsigned long codepoint, this_decomp[4]; | |
138 | char *l; | |
460f52aa | 139 | int i, j; |
bce47149 | 140 | int decomp_useful; |
141 | ||
142 | if (!fgets (line, sizeof (line), f)) | |
143 | break; | |
144 | codepoint = strtoul (line, &l, 16); | |
145 | if (l == line || *l != ';') | |
146 | fail ("parsing UnicodeData.txt, reading code point"); | |
460f52aa | 147 | if (codepoint > MAX_CODE_POINT) |
148 | fail ("parsing UnicodeData.txt, code point too large"); | |
bce47149 | 149 | |
150 | do { | |
151 | l++; | |
152 | } while (*l != ';'); | |
ceaeebbe | 153 | /* Category value. */ |
bce47149 | 154 | do { |
155 | l++; | |
156 | } while (*l != ';'); | |
157 | /* Canonical combining class; in NFC/NFKC, they must be increasing | |
158 | (or zero). */ | |
159 | if (! isdigit (*++l)) | |
160 | fail ("parsing UnicodeData.txt, combining class not number"); | |
161 | combining_value[codepoint] = strtoul (l, &l, 10); | |
162 | if (*l++ != ';') | |
163 | fail ("parsing UnicodeData.txt, junk after combining class"); | |
164 | ||
165 | /* Skip over bidi value. */ | |
166 | do { | |
167 | l++; | |
168 | } while (*l != ';'); | |
169 | ||
170 | /* Decomposition mapping. */ | |
171 | decomp_useful = flags[codepoint]; | |
172 | if (*++l == '<') /* Compatibility mapping. */ | |
173 | continue; | |
174 | for (i = 0; i < 4; i++) | |
175 | { | |
176 | if (*l == ';') | |
177 | break; | |
178 | if (!isxdigit (*l)) | |
179 | fail ("parsing UnicodeData.txt, decomposition format"); | |
180 | this_decomp[i] = strtoul (l, &l, 16); | |
181 | decomp_useful &= flags[this_decomp[i]]; | |
182 | while (isspace (*l)) | |
183 | l++; | |
184 | } | |
185 | if (i > 2) /* Decomposition too long. */ | |
186 | fail ("parsing UnicodeData.txt, decomposition too long"); | |
460f52aa | 187 | for (j = 0; j < i; j++) |
188 | all_decomp[codepoint][j] = this_decomp[j]; | |
189 | if ((flags[codepoint] & all_languages) && decomp_useful) | |
bce47149 | 190 | while (--i >= 0) |
191 | decomp[codepoint][i] = this_decomp[i]; | |
192 | } | |
193 | if (ferror (f)) | |
194 | fail ("reading UnicodeData.txt"); | |
195 | fclose (f); | |
196 | } | |
197 | ||
198 | /* Read DerivedNormalizationProps.txt and set the flags that say whether | |
199 | a character is in NFC, NFKC, or is context-dependent. */ | |
200 | ||
201 | static void | |
202 | read_derived (const char *fname) | |
203 | { | |
204 | FILE * f = fopen (fname, "r"); | |
205 | ||
206 | if (!f) | |
207 | fail ("opening DerivedNormalizationProps.txt"); | |
208 | for (;;) | |
209 | { | |
210 | char line[256]; | |
211 | unsigned long start, end; | |
212 | char *l; | |
213 | bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p; | |
214 | ||
215 | if (!fgets (line, sizeof (line), f)) | |
216 | break; | |
217 | not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL); | |
218 | not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL); | |
219 | maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL); | |
220 | if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p) | |
221 | continue; | |
222 | ||
223 | start = strtoul (line, &l, 16); | |
224 | if (l == line) | |
225 | fail ("parsing DerivedNormalizationProps.txt, reading start"); | |
460f52aa | 226 | if (start > MAX_CODE_POINT) |
227 | fail ("parsing DerivedNormalizationProps.txt, code point too large"); | |
bce47149 | 228 | if (*l == '.' && l[1] == '.') |
229 | end = strtoul (l + 2, &l, 16); | |
230 | else | |
231 | end = start; | |
232 | ||
233 | while (start <= end) | |
234 | flags[start++] |= ((not_NFC_p ? not_NFC : 0) | |
235 | | (not_NFKC_p ? not_NFKC : 0) | |
236 | | (maybe_not_NFC_p ? maybe_not_NFC : 0) | |
237 | ); | |
238 | } | |
239 | if (ferror (f)) | |
240 | fail ("reading DerivedNormalizationProps.txt"); | |
241 | fclose (f); | |
242 | } | |
243 | ||
244 | /* Write out the table. | |
245 | The table consists of two words per entry. The first word is the flags | |
246 | for the unicode code points up to and including the second word. */ | |
247 | ||
248 | static void | |
249 | write_table (void) | |
250 | { | |
251 | unsigned i; | |
252 | unsigned last_flag = flags[0]; | |
253 | bool really_safe = decomp[0][0] == 0; | |
254 | unsigned char last_combine = combining_value[0]; | |
460f52aa | 255 | |
256 | printf ("static const struct ucnrange ucnranges[] = {\n"); | |
bce47149 | 257 | |
460f52aa | 258 | for (i = 1; i <= NUM_CODE_POINTS; i++) |
259 | if (i == NUM_CODE_POINTS | |
260 | || (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages)) | |
bce47149 | 261 | || really_safe != (decomp[i][0] == 0) |
262 | || combining_value[i] != last_combine) | |
263 | { | |
460f52aa | 264 | printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", |
bce47149 | 265 | last_flag & C99 ? "C99" : " 0", |
460f52aa | 266 | last_flag & N99 ? "N99" : " 0", |
bce47149 | 267 | last_flag & CXX ? "CXX" : " 0", |
460f52aa | 268 | last_flag & C11 ? "C11" : " 0", |
269 | last_flag & N11 ? "N11" : " 0", | |
bce47149 | 270 | really_safe ? "CID" : " 0", |
271 | last_flag & not_NFC ? " 0" : "NFC", | |
272 | last_flag & not_NFKC ? " 0" : "NKC", | |
273 | last_flag & maybe_not_NFC ? "CTX" : " 0", | |
274 | combining_value[i - 1], | |
275 | i - 1); | |
276 | last_flag = flags[i]; | |
277 | last_combine = combining_value[0]; | |
278 | really_safe = decomp[i][0] == 0; | |
279 | } | |
460f52aa | 280 | |
281 | printf ("};\n"); | |
282 | } | |
283 | ||
284 | /* Return whether a given character is valid in an identifier for some | |
285 | supported language, either as itself or as a UCN. */ | |
286 | ||
287 | static bool | |
288 | char_id_valid (unsigned int c) | |
289 | { | |
290 | return ((flags[c] & all_languages) | |
291 | || (c == 0x24) | |
292 | || (c >= 0x30 && c <= 0x39) | |
293 | || (c >= 0x41 && c <= 0x5a) | |
294 | || (c >= 0x61 && c <= 0x7a)); | |
295 | } | |
296 | ||
297 | /* Write out the switch statement over characters for which it is | |
298 | context-dependent whether they are in NFC. */ | |
299 | ||
300 | static void | |
301 | write_context_switch (void) | |
302 | { | |
303 | unsigned i; | |
304 | printf ("static bool\n" | |
305 | "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n" | |
306 | "{\n" | |
307 | " switch (c)\n" | |
308 | " {\n"); | |
309 | for (i = 0; i < NUM_CODE_POINTS; i++) | |
310 | { | |
311 | bool found_case = false; | |
312 | unsigned j; | |
313 | if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC)) | |
314 | continue; | |
315 | if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2)) | |
316 | continue; /* Hangul handled algorithmically. */ | |
317 | printf (" case %#06x:\n" | |
318 | " switch (p)\n" | |
319 | "\t{\n", i); | |
320 | /* If an NFC starter character decomposes with this character I | |
321 | as the second character and an NFC starter character S as the | |
322 | first character, that latter character as a previous | |
323 | character means this character is not NFC. Furthermore, any | |
324 | NFC starter character K made by a series of compositions of S | |
325 | with combining characters whose combining class is greater | |
326 | than that of I also means this character is not NFC. */ | |
327 | for (j = 0; j < NUM_CODE_POINTS; j++) | |
328 | { | |
329 | unsigned s, k; | |
330 | if (all_decomp[j][1] != i) | |
331 | continue; | |
332 | s = all_decomp[j][0]; | |
333 | if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0) | |
334 | continue; | |
335 | if (char_id_valid (s)) | |
336 | { | |
337 | found_case = true; | |
338 | printf ("\tcase %#06x:\n", s); | |
339 | } | |
340 | for (k = 0; k < NUM_CODE_POINTS; k++) | |
341 | { | |
342 | unsigned t = k; | |
343 | if (k == s || !char_id_valid (k)) | |
344 | continue; | |
345 | while (all_decomp[t][1] != 0 | |
346 | && combining_value[all_decomp[t][1]] > combining_value[i]) | |
347 | { | |
348 | if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0) | |
349 | break; | |
350 | t = all_decomp[t][0]; | |
351 | } | |
352 | if (t == s) | |
353 | { | |
354 | found_case = true; | |
355 | printf ("\tcase %#06x:\n", k); | |
356 | } | |
357 | } | |
358 | } | |
359 | if (found_case) | |
360 | printf ("\t return false;\n"); | |
361 | else | |
362 | printf ("\t/* Non-NFC cases not applicable to C/C++. */\n"); | |
363 | printf ("\tdefault:\n" | |
364 | "\t return true;\n" | |
365 | "\t}\n\n"); | |
366 | } | |
367 | printf (" default:\n" | |
368 | " cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n" | |
369 | " return true;\n" | |
370 | " }\n" | |
371 | "}\n"); | |
bce47149 | 372 | } |
373 | ||
/* Print the copyright notice (the GPL notice followed by the Unicode
   data-file licence) as a C comment on stdout, followed by one extra
   newline.  */

static void
write_copyright (void)
{
  static const char copyright[] =
    "/* Unicode characters and various properties.\n"
    "Copyright (C) 2003-2019 Free Software Foundation, Inc.\n"
    "\n"
    "This program is free software; you can redistribute it and/or modify it\n"
    "under the terms of the GNU General Public License as published by the\n"
    "Free Software Foundation; either version 3, or (at your option) any\n"
    "later version.\n"
    "\n"
    "This program is distributed in the hope that it will be useful,\n"
    "but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    "GNU General Public License for more details.\n"
    "\n"
    "You should have received a copy of the GNU General Public License\n"
    "along with this program; see the file COPYING3. If not see\n"
    "<http://www.gnu.org/licenses/>.\n"
    "\n"
    "\n"
    "Copyright (C) 1991-2005 Unicode, Inc. All rights reserved.\n"
    "Distributed under the Terms of Use in\n"
    "http://www.unicode.org/copyright.html.\n"
    "\n"
    "Permission is hereby granted, free of charge, to any person\n"
    "obtaining a copy of the Unicode data files and any associated\n"
    "documentation (the \"Data Files\") or Unicode software and any\n"
    "associated documentation (the \"Software\") to deal in the Data Files\n"
    "or Software without restriction, including without limitation the\n"
    "rights to use, copy, modify, merge, publish, distribute, and/or\n"
    "sell copies of the Data Files or Software, and to permit persons to\n"
    "whom the Data Files or Software are furnished to do so, provided\n"
    "that (a) the above copyright notice(s) and this permission notice\n"
    "appear with all copies of the Data Files or Software, (b) both the\n"
    "above copyright notice(s) and this permission notice appear in\n"
    "associated documentation, and (c) there is clear notice in each\n"
    "modified Data File or in the Software as well as in the\n"
    "documentation associated with the Data File(s) or Software that the\n"
    "data or software has been modified.\n"
    "\n"
    "THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    "OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    "WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    "NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    "COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    "ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    "DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    "WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    "ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    "OF THE DATA FILES OR SOFTWARE.\n"
    "\n"
    "Except as contained in this notice, the name of a copyright holder\n"
    "shall not be used in advertising or otherwise to promote the sale,\n"
    "use or other dealings in these Data Files or Software without prior\n"
    "written authorization of the copyright holder. */\n";

  fputs (copyright, stdout);
  putchar ('\n');
}
436 | ||
/* Main program.  Expects exactly three arguments: the paths of
   ucnid.tab, UnicodeData.txt and DerivedNormalizationProps.txt, in
   that order.  The generated header is written to stdout.  Returns 0
   on success; every error terminates through fail ().  */

int
main (int argc, char **argv)
{
  /* Both too few and too many arguments end up here, so show the
     expected command line instead of guessing which it was.  */
  if (argc != 4)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
          " DerivedNormalizationProps.txt > ucnid.h");
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();
  write_context_switch ();

  /* stdout is normally redirected to ucnid.h; do not report success
     if the header could not be written completely.  */
  if (fflush (stdout) != 0 || ferror (stdout))
    fail ("writing output");
  return 0;
}