]> git.ipfire.org Git - thirdparty/gcc.git/blame - libcpp/makeucnid.c
Update copyright years.
[thirdparty/gcc.git] / libcpp / makeucnid.c
CommitLineData
/* Make ucnid.h from various sources.
   Copyright (C) 2005-2018 Free Software Foundation, Inc.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
17
/* Run this program as
   ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
       > ucnid.h
*/
22
#include <ctype.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
28
/* Property bits kept per code point in flags[].

   C99, CXX and C11 are set by read_ucnid for code points listed under
   the matching section of ucnid.tab; N99 and N11 are set in addition
   for the [C99DIG] and [C11NOSTART] sections respectively.
   all_languages is the mask of the three language bits.
   not_NFC, not_NFKC and maybe_not_NFC are set by read_derived from the
   NFC_QC / NFKC_QC entries of DerivedNormalizationProps.txt.  */
enum {
  C99 = 1 << 0,
  CXX = 1 << 1,
  N99 = 1 << 2,
  C11 = 1 << 3,
  N11 = 1 << 4,
  all_languages = C99 | CXX | C11,
  not_NFC = 1 << 5,
  not_NFKC = 1 << 6,
  maybe_not_NFC = 1 << 7
};
40
d3f4ff8b
JM
/* Code points handled by this program run from 0 to MAX_CODE_POINT
   inclusive, so every per-code-point table has NUM_CODE_POINTS
   entries.  */
#define MAX_CODE_POINT 0x10ffff
#define NUM_CODE_POINTS (MAX_CODE_POINT + 1)

/* Property bits (see the enum above) for each code point.  */
static unsigned int flags[NUM_CODE_POINTS];
/* Decomposition (at most two code points, zero when absent) of every
   code point read from UnicodeData.txt.  */
static unsigned all_decomp[NUM_CODE_POINTS][2];
/* Like all_decomp, but only filled in when read_table considers the
   decomposition useful for some supported language version.  */
static unsigned decomp[NUM_CODE_POINTS][2];
/* Canonical combining class of each code point.  */
static unsigned char combining_value[NUM_CODE_POINTS];
50668cf6
GK
48
/* Print the message S, followed by a newline, on stderr and terminate
   the program with exit status 1.  Does not return.  */

static void
fail (const char *s)
{
  fputs (s, stderr);
  fputc ('\n', stderr);
  exit (1);
}
57
d3f4ff8b 58/* Read ucnid.tab and set the flags for language versions in header[]. */
50668cf6
GK
59
60static void
61read_ucnid (const char *fname)
62{
63 FILE *f = fopen (fname, "r");
64 unsigned fl = 0;
65
66 if (!f)
67 fail ("opening ucnid.tab");
68 for (;;)
69 {
70 char line[256];
71
72 if (!fgets (line, sizeof (line), f))
73 break;
74 if (strcmp (line, "[C99]\n") == 0)
75 fl = C99;
d3f4ff8b
JM
76 else if (strcmp (line, "[C99DIG]\n") == 0)
77 fl = C99|N99;
50668cf6
GK
78 else if (strcmp (line, "[CXX]\n") == 0)
79 fl = CXX;
d3f4ff8b
JM
80 else if (strcmp (line, "[C11]\n") == 0)
81 fl = C11;
82 else if (strcmp (line, "[C11NOSTART]\n") == 0)
83 fl = C11|N11;
50668cf6
GK
84 else if (isxdigit (line[0]))
85 {
86 char *l = line;
87 while (*l)
88 {
89 unsigned long start, end;
90 char *endptr;
91 start = strtoul (l, &endptr, 16);
92 if (endptr == l || (*endptr != '-' && ! isspace (*endptr)))
93 fail ("parsing ucnid.tab [1]");
94 l = endptr;
95 if (*l != '-')
96 end = start;
97 else
98 {
99 end = strtoul (l + 1, &endptr, 16);
100 if (end < start)
101 fail ("parsing ucnid.tab, end before start");
102 l = endptr;
103 if (! isspace (*l))
104 fail ("parsing ucnid.tab, junk after range");
105 }
106 while (isspace (*l))
107 l++;
d3f4ff8b 108 if (end > MAX_CODE_POINT)
50668cf6
GK
109 fail ("parsing ucnid.tab, end too large");
110 while (start <= end)
111 flags[start++] |= fl;
112 }
113 }
114 }
115 if (ferror (f))
116 fail ("reading ucnid.tab");
117 fclose (f);
118}
119
54848ff8
JM
120/* Read UnicodeData.txt and fill in the 'decomp' table to be the
121 decompositions of characters for which both the character
d3f4ff8b
JM
122 decomposed and all the code points in the decomposition are valid
123 for some supported language version, and the 'all_decomp' table to
124 be the decompositions of all characters without those
125 constraints. */
50668cf6
GK
126
127static void
128read_table (char *fname)
129{
130 FILE * f = fopen (fname, "r");
131
132 if (!f)
133 fail ("opening UnicodeData.txt");
134 for (;;)
135 {
136 char line[256];
137 unsigned long codepoint, this_decomp[4];
138 char *l;
d3f4ff8b 139 int i, j;
50668cf6
GK
140 int decomp_useful;
141
142 if (!fgets (line, sizeof (line), f))
143 break;
144 codepoint = strtoul (line, &l, 16);
145 if (l == line || *l != ';')
146 fail ("parsing UnicodeData.txt, reading code point");
d3f4ff8b
JM
147 if (codepoint > MAX_CODE_POINT)
148 fail ("parsing UnicodeData.txt, code point too large");
50668cf6
GK
149
150 do {
151 l++;
152 } while (*l != ';');
54848ff8 153 /* Category value. */
50668cf6
GK
154 do {
155 l++;
156 } while (*l != ';');
157 /* Canonical combining class; in NFC/NFKC, they must be increasing
158 (or zero). */
159 if (! isdigit (*++l))
160 fail ("parsing UnicodeData.txt, combining class not number");
161 combining_value[codepoint] = strtoul (l, &l, 10);
162 if (*l++ != ';')
163 fail ("parsing UnicodeData.txt, junk after combining class");
164
165 /* Skip over bidi value. */
166 do {
167 l++;
168 } while (*l != ';');
169
170 /* Decomposition mapping. */
171 decomp_useful = flags[codepoint];
172 if (*++l == '<') /* Compatibility mapping. */
173 continue;
174 for (i = 0; i < 4; i++)
175 {
176 if (*l == ';')
177 break;
178 if (!isxdigit (*l))
179 fail ("parsing UnicodeData.txt, decomposition format");
180 this_decomp[i] = strtoul (l, &l, 16);
181 decomp_useful &= flags[this_decomp[i]];
182 while (isspace (*l))
183 l++;
184 }
185 if (i > 2) /* Decomposition too long. */
186 fail ("parsing UnicodeData.txt, decomposition too long");
d3f4ff8b
JM
187 for (j = 0; j < i; j++)
188 all_decomp[codepoint][j] = this_decomp[j];
189 if ((flags[codepoint] & all_languages) && decomp_useful)
50668cf6
GK
190 while (--i >= 0)
191 decomp[codepoint][i] = this_decomp[i];
192 }
193 if (ferror (f))
194 fail ("reading UnicodeData.txt");
195 fclose (f);
196}
197
198/* Read DerivedNormalizationProps.txt and set the flags that say whether
199 a character is in NFC, NFKC, or is context-dependent. */
200
201static void
202read_derived (const char *fname)
203{
204 FILE * f = fopen (fname, "r");
205
206 if (!f)
207 fail ("opening DerivedNormalizationProps.txt");
208 for (;;)
209 {
210 char line[256];
211 unsigned long start, end;
212 char *l;
213 bool not_NFC_p, not_NFKC_p, maybe_not_NFC_p;
214
215 if (!fgets (line, sizeof (line), f))
216 break;
217 not_NFC_p = (strstr (line, "; NFC_QC; N") != NULL);
218 not_NFKC_p = (strstr (line, "; NFKC_QC; N") != NULL);
219 maybe_not_NFC_p = (strstr (line, "; NFC_QC; M") != NULL);
220 if (! not_NFC_p && ! not_NFKC_p && ! maybe_not_NFC_p)
221 continue;
222
223 start = strtoul (line, &l, 16);
224 if (l == line)
225 fail ("parsing DerivedNormalizationProps.txt, reading start");
d3f4ff8b
JM
226 if (start > MAX_CODE_POINT)
227 fail ("parsing DerivedNormalizationProps.txt, code point too large");
50668cf6
GK
228 if (*l == '.' && l[1] == '.')
229 end = strtoul (l + 2, &l, 16);
230 else
231 end = start;
232
233 while (start <= end)
234 flags[start++] |= ((not_NFC_p ? not_NFC : 0)
235 | (not_NFKC_p ? not_NFKC : 0)
236 | (maybe_not_NFC_p ? maybe_not_NFC : 0)
237 );
238 }
239 if (ferror (f))
240 fail ("reading DerivedNormalizationProps.txt");
241 fclose (f);
242}
243
244/* Write out the table.
245 The table consists of two words per entry. The first word is the flags
246 for the unicode code points up to and including the second word. */
247
248static void
249write_table (void)
250{
251 unsigned i;
252 unsigned last_flag = flags[0];
253 bool really_safe = decomp[0][0] == 0;
254 unsigned char last_combine = combining_value[0];
d3f4ff8b
JM
255
256 printf ("static const struct ucnrange ucnranges[] = {\n");
50668cf6 257
d3f4ff8b
JM
258 for (i = 1; i <= NUM_CODE_POINTS; i++)
259 if (i == NUM_CODE_POINTS
260 || (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
50668cf6
GK
261 || really_safe != (decomp[i][0] == 0)
262 || combining_value[i] != last_combine)
263 {
d3f4ff8b 264 printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
50668cf6 265 last_flag & C99 ? "C99" : " 0",
d3f4ff8b 266 last_flag & N99 ? "N99" : " 0",
50668cf6 267 last_flag & CXX ? "CXX" : " 0",
d3f4ff8b
JM
268 last_flag & C11 ? "C11" : " 0",
269 last_flag & N11 ? "N11" : " 0",
50668cf6
GK
270 really_safe ? "CID" : " 0",
271 last_flag & not_NFC ? " 0" : "NFC",
272 last_flag & not_NFKC ? " 0" : "NKC",
273 last_flag & maybe_not_NFC ? "CTX" : " 0",
274 combining_value[i - 1],
275 i - 1);
276 last_flag = flags[i];
277 last_combine = combining_value[0];
278 really_safe = decomp[i][0] == 0;
279 }
d3f4ff8b
JM
280
281 printf ("};\n");
282}
283
284/* Return whether a given character is valid in an identifier for some
285 supported language, either as itself or as a UCN. */
286
287static bool
288char_id_valid (unsigned int c)
289{
290 return ((flags[c] & all_languages)
291 || (c == 0x24)
292 || (c >= 0x30 && c <= 0x39)
293 || (c >= 0x41 && c <= 0x5a)
294 || (c >= 0x61 && c <= 0x7a));
295}
296
297/* Write out the switch statement over characters for which it is
298 context-dependent whether they are in NFC. */
299
300static void
301write_context_switch (void)
302{
303 unsigned i;
304 printf ("static bool\n"
305 "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
306 "{\n"
307 " switch (c)\n"
308 " {\n");
309 for (i = 0; i < NUM_CODE_POINTS; i++)
310 {
311 bool found_case = false;
312 unsigned j;
313 if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC))
314 continue;
315 if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
316 continue; /* Hangul handled algorithmically. */
317 printf (" case %#06x:\n"
318 " switch (p)\n"
319 "\t{\n", i);
320 /* If an NFC starter character decomposes with this character I
321 as the second character and an NFC starter character S as the
322 first character, that latter character as a previous
323 character means this character is not NFC. Furthermore, any
324 NFC starter character K made by a series of compositions of S
325 with combining characters whose combining class is greater
326 than that of I also means this character is not NFC. */
327 for (j = 0; j < NUM_CODE_POINTS; j++)
328 {
329 unsigned s, k;
330 if (all_decomp[j][1] != i)
331 continue;
332 s = all_decomp[j][0];
333 if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0)
334 continue;
335 if (char_id_valid (s))
336 {
337 found_case = true;
338 printf ("\tcase %#06x:\n", s);
339 }
340 for (k = 0; k < NUM_CODE_POINTS; k++)
341 {
342 unsigned t = k;
343 if (k == s || !char_id_valid (k))
344 continue;
345 while (all_decomp[t][1] != 0
346 && combining_value[all_decomp[t][1]] > combining_value[i])
347 {
348 if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
349 break;
350 t = all_decomp[t][0];
351 }
352 if (t == s)
353 {
354 found_case = true;
355 printf ("\tcase %#06x:\n", k);
356 }
357 }
358 }
359 if (found_case)
360 printf ("\t return false;\n");
361 else
362 printf ("\t/* Non-NFC cases not applicable to C/C++. */\n");
363 printf ("\tdefault:\n"
364 "\t return true;\n"
365 "\t}\n\n");
366 }
367 printf (" default:\n"
368 " cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
369 " return true;\n"
370 " }\n"
371 "}\n");
50668cf6
GK
372}
373
/* Write the copyright and licence comment that heads the generated
   file to stdout, followed by one extra newline (added by puts).  */

static void
write_copyright (void)
{
  static const char copyright[] =
    "/* Unicode characters and various properties.\n"
    " Copyright (C) 2003-2018 Free Software Foundation, Inc.\n"
    "\n"
    " This program is free software; you can redistribute it and/or modify it\n"
    " under the terms of the GNU General Public License as published by the\n"
    " Free Software Foundation; either version 3, or (at your option) any\n"
    " later version.\n"
    "\n"
    " This program is distributed in the hope that it will be useful,\n"
    " but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
    " MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n"
    " GNU General Public License for more details.\n"
    "\n"
    " You should have received a copy of the GNU General Public License\n"
    " along with this program; see the file COPYING3. If not see\n"
    " <http://www.gnu.org/licenses/>.\n"
    "\n"
    "\n"
    " Copyright (C) 1991-2005 Unicode, Inc. All rights reserved.\n"
    " Distributed under the Terms of Use in\n"
    " http://www.unicode.org/copyright.html.\n"
    "\n"
    " Permission is hereby granted, free of charge, to any person\n"
    " obtaining a copy of the Unicode data files and any associated\n"
    " documentation (the \"Data Files\") or Unicode software and any\n"
    " associated documentation (the \"Software\") to deal in the Data Files\n"
    " or Software without restriction, including without limitation the\n"
    " rights to use, copy, modify, merge, publish, distribute, and/or\n"
    " sell copies of the Data Files or Software, and to permit persons to\n"
    " whom the Data Files or Software are furnished to do so, provided\n"
    " that (a) the above copyright notice(s) and this permission notice\n"
    " appear with all copies of the Data Files or Software, (b) both the\n"
    " above copyright notice(s) and this permission notice appear in\n"
    " associated documentation, and (c) there is clear notice in each\n"
    " modified Data File or in the Software as well as in the\n"
    " documentation associated with the Data File(s) or Software that the\n"
    " data or software has been modified.\n"
    "\n"
    " THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n"
    " OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n"
    " WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n"
    " NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n"
    " COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n"
    " ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n"
    " DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n"
    " WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n"
    " ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n"
    " OF THE DATA FILES OR SOFTWARE.\n"
    "\n"
    " Except as contained in this notice, the name of a copyright holder\n"
    " shall not be used in advertising or otherwise to promote the sale,\n"
    " use or other dealings in these Data Files or Software without prior\n"
    " written authorization of the copyright holder. */\n";

  puts (copyright);
}
436
/* Main program.  Expects exactly three arguments: the paths of
   ucnid.tab, UnicodeData.txt and DerivedNormalizationProps.txt, in
   that order.  Writes the generated output to stdout and returns 0;
   any error terminates the program through fail ().  */

int
main (int argc, char **argv)
{
  /* The count can be wrong in either direction, so report the expected
     usage rather than claiming there were too few arguments.  */
  if (argc != 4)
    fail ("usage: makeucnid ucnid.tab UnicodeData.txt"
          " DerivedNormalizationProps.txt");

  /* read_table consults the language bits that read_ucnid stores in
     flags[], so ucnid.tab must be read first.  */
  read_ucnid (argv[1]);
  read_table (argv[2]);
  read_derived (argv[3]);

  write_copyright ();
  write_table ();
  write_context_switch ();
  return 0;
}