]> git.ipfire.org Git - thirdparty/glibc.git/blame - localedata/gen-unicode-ctype.c
(CFLAGS-tst-align.c): Add -mpreferred-stack-boundary=4.
[thirdparty/glibc.git] / localedata / gen-unicode-ctype.c
CommitLineData
/* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
   Copyright (C) 2000-2001 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */
20
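/* Usage example:
     $ gen-unicode-ctype /usr/local/share/Unidata/UnicodeData.txt 3.1
   The first argument is the UnicodeData.txt file to read, the second is
   the Unicode version string that is copied into the generated file.
 */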
24
25#include <stdio.h>
26#include <stdlib.h>
27#include <stdbool.h>
28#include <string.h>
29#include <time.h>
30
/* One record of UnicodeData.txt, i.e. the properties of a single code
   point.  A record whose NAME is NULL denotes an unassigned code point.  */
struct unicode_attribute
{
  /* Character name.  */
  const char *name;
  /* General category.  */
  const char *category;
  /* Canonical combining classes.  */
  const char *combining;
  /* Bidirectional category.  */
  const char *bidi;
  /* Character decomposition mapping.  */
  const char *decomposition;
  /* Decimal digit value.  */
  const char *decdigit;
  /* Digit value.  */
  const char *digit;
  /* Numeric value.  */
  const char *numeric;
  /* Nonzero if the character is mirrored.  */
  int mirrored;
  /* Old Unicode 1.0 name.  */
  const char *oldname;
  /* Comment.  */
  const char *comment;
  /* Uppercase mapping, or NONE.  */
  unsigned int upper;
  /* Lowercase mapping, or NONE.  */
  unsigned int lower;
  /* Titlecase mapping, or NONE.  */
  unsigned int title;
};

/* A missing string field is stored as "", a missing character mapping
   as NONE.  */
#define NONE (~(unsigned int)0)

/* In-memory image of UnicodeData.txt, indexed by code point.  */
struct unicode_attribute unicode_attributes [0x110000];
0b95971d
UD
56
57/* Stores in unicode_attributes[i] the values from the given fields. */
58static void
59fill_attribute (unsigned int i,
60 const char *field1, const char *field2,
61 const char *field3, const char *field4,
62 const char *field5, const char *field6,
63 const char *field7, const char *field8,
64 const char *field9, const char *field10,
65 const char *field11, const char *field12,
66 const char *field13, const char *field14)
67{
68 struct unicode_attribute * uni;
69
601d2942 70 if (i >= 0x110000)
0b95971d
UD
71 {
72 fprintf (stderr, "index too large\n");
73 exit (1);
74 }
601d2942
UD
75 if (strcmp (field2, "Cs") == 0)
76 /* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
77 return;
0b95971d
UD
78 uni = &unicode_attributes[i];
79 /* Copy the strings. */
80 uni->name = strdup (field1);
81 uni->category = (field2[0] == '\0' ? "" : strdup (field2));
82 uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
83 uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
84 uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
85 uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
86 uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
87 uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
88 uni->mirrored = (field9[0] == 'Y');
89 uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
90 uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
91 uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
92 uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
93 uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
94}
95
/* Size of every field buffer used while parsing UnicodeData.txt.  */
#define FIELDLEN 120

/* Read one field from STREAM into BUFFER, which must be FIELDLEN bytes
   long.  Characters are consumed up to and including DELIM; DELIM itself
   is not stored.  Carriage returns are dropped.
   Returns 1 if DELIM was found (BUFFER is then NUL-terminated) and 0 if
   the end of the stream was hit first (BUFFER is then NOT terminated).
   A field that does not fit terminates the program with status 1.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  char *out = buffer;
  int stored = 0;

  for (;;)
    {
      int ch = getc (stream);

      if (ch == EOF)
	return 0;
      if (ch == delim)
	break;

      /* unicode.org ships UnicodeData.txt with CR/LF line ends; treat
	 them like plain LF by ignoring the CR.  */
      if (ch == '\r')
	continue;

      stored++;
      if (stored >= FIELDLEN - 1)
	{
	  fprintf (stderr, "field too long\n");
	  exit (1);
	}
      *out++ = ch;
    }

  *out = '\0';
  return 1;
}
130
131/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
132 file. */
133static void
134fill_attributes (const char *unicodedata_filename)
135{
136 unsigned int i, j;
137 FILE *stream;
138 char field0[FIELDLEN];
139 char field1[FIELDLEN];
140 char field2[FIELDLEN];
141 char field3[FIELDLEN];
142 char field4[FIELDLEN];
143 char field5[FIELDLEN];
144 char field6[FIELDLEN];
145 char field7[FIELDLEN];
146 char field8[FIELDLEN];
147 char field9[FIELDLEN];
148 char field10[FIELDLEN];
149 char field11[FIELDLEN];
150 char field12[FIELDLEN];
151 char field13[FIELDLEN];
152 char field14[FIELDLEN];
153 int lineno = 0;
154
601d2942 155 for (i = 0; i < 0x110000; i++)
0b95971d
UD
156 unicode_attributes[i].name = NULL;
157
158 stream = fopen (unicodedata_filename, "r");
159 if (stream == NULL)
160 {
161 fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
162 exit (1);
163 }
164
165 for (;;)
166 {
167 int n;
168
169 lineno++;
f00f95d1
UD
170 n = getfield (stream, field0, ';');
171 n += getfield (stream, field1, ';');
172 n += getfield (stream, field2, ';');
173 n += getfield (stream, field3, ';');
174 n += getfield (stream, field4, ';');
175 n += getfield (stream, field5, ';');
176 n += getfield (stream, field6, ';');
177 n += getfield (stream, field7, ';');
178 n += getfield (stream, field8, ';');
179 n += getfield (stream, field9, ';');
180 n += getfield (stream, field10, ';');
181 n += getfield (stream, field11, ';');
182 n += getfield (stream, field12, ';');
183 n += getfield (stream, field13, ';');
184 n += getfield (stream, field14, '\n');
0b95971d
UD
185 if (n == 0)
186 break;
187 if (n != 15)
188 {
189 fprintf (stderr, "short line in'%s':%d\n",
190 unicodedata_filename, lineno);
191 exit (1);
192 }
193 i = strtoul (field0, NULL, 16);
194 if (field1[0] == '<'
195 && strlen (field1) >= 9
196 && !strcmp (field1 + strlen(field1) - 8, ", First>"))
197 {
198 /* Deal with a range. */
199 lineno++;
f00f95d1
UD
200 n = getfield (stream, field0, ';');
201 n += getfield (stream, field1, ';');
202 n += getfield (stream, field2, ';');
203 n += getfield (stream, field3, ';');
204 n += getfield (stream, field4, ';');
205 n += getfield (stream, field5, ';');
206 n += getfield (stream, field6, ';');
207 n += getfield (stream, field7, ';');
208 n += getfield (stream, field8, ';');
209 n += getfield (stream, field9, ';');
210 n += getfield (stream, field10, ';');
211 n += getfield (stream, field11, ';');
212 n += getfield (stream, field12, ';');
213 n += getfield (stream, field13, ';');
214 n += getfield (stream, field14, '\n');
0b95971d
UD
215 if (n != 15)
216 {
217 fprintf (stderr, "missing end range in '%s':%d\n",
218 unicodedata_filename, lineno);
219 exit (1);
220 }
221 if (!(field1[0] == '<'
222 && strlen (field1) >= 8
223 && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
224 {
225 fprintf (stderr, "missing end range in '%s':%d\n",
226 unicodedata_filename, lineno);
227 exit (1);
228 }
229 field1[strlen (field1) - 7] = '\0';
230 j = strtoul (field0, NULL, 16);
231 for (; i <= j; i++)
232 fill_attribute (i, field1+1, field2, field3, field4, field5,
233 field6, field7, field8, field9, field10,
234 field11, field12, field13, field14);
235 }
236 else
237 {
238 /* Single character line */
239 fill_attribute (i, field1, field2, field3, field4, field5,
240 field6, field7, field8, field9, field10,
241 field11, field12, field13, field14);
242 }
243 }
244 if (ferror (stream) || fclose (stream))
245 {
246 fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
247 exit (1);
248 }
249}
250
0b95971d
UD
251/* Character mappings. */
252
253static unsigned int
254to_upper (unsigned int ch)
255{
256 if (unicode_attributes[ch].name != NULL
257 && unicode_attributes[ch].upper != NONE)
258 return unicode_attributes[ch].upper;
259 else
260 return ch;
261}
262
263static unsigned int
264to_lower (unsigned int ch)
265{
266 if (unicode_attributes[ch].name != NULL
267 && unicode_attributes[ch].lower != NONE)
268 return unicode_attributes[ch].lower;
269 else
270 return ch;
271}
272
273static unsigned int
274to_title (unsigned int ch)
275{
276 if (unicode_attributes[ch].name != NULL
277 && unicode_attributes[ch].title != NONE)
278 return unicode_attributes[ch].title;
279 else
280 return ch;
281}
282
/* Character class properties.  */

/* CH counts as uppercase when lowercasing it yields a different
   character.  */
static bool
is_upper (unsigned int ch)
{
  unsigned int lowered = to_lower (ch);

  return lowered != ch;
}
290
/* CH counts as lowercase when uppercasing it yields a different
   character.  */
static bool
is_lower (unsigned int ch)
{
  /* <U00DF> is lowercase, but without simple to_upper mapping.  */
  if (ch == 0x00DF)
    return true;

  return to_upper (ch) != ch;
}
298
299static bool
300is_alpha (unsigned int ch)
301{
302 return (unicode_attributes[ch].name != NULL
601d2942
UD
303 && ((unicode_attributes[ch].category[0] == 'L'
304 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
305 <U0E2F>, <U0E46> should belong to is_punct. */
306 && (ch != 0x0E2F) && (ch != 0x0E46))
307 /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
308 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
309 || (ch == 0x0E31)
310 || (ch >= 0x0E34 && ch <= 0x0E3A)
311 || (ch >= 0x0E47 && ch <= 0x0E4E)
0b95971d
UD
312 /* Avoid warning for <U0345>. */
313 || (ch == 0x0345)
314 /* Avoid warnings for <U2160>..<U217F>. */
315 || (unicode_attributes[ch].category[0] == 'N'
316 && unicode_attributes[ch].category[1] == 'l')
317 /* Avoid warnings for <U24B6>..<U24E9>. */
318 || (unicode_attributes[ch].category[0] == 'S'
319 && unicode_attributes[ch].category[1] == 'o'
320 && strstr (unicode_attributes[ch].name, " LETTER ")
f00f95d1
UD
321 != NULL)
322 /* Consider all the non-ASCII digits as alphabetic.
323 ISO C 99 forbids us to have them in category "digit",
324 but we want iswalnum to return true on them. */
325 || (unicode_attributes[ch].category[0] == 'N'
326 && unicode_attributes[ch].category[1] == 'd'
327 && !(ch >= 0x0030 && ch <= 0x0039))));
0b95971d
UD
328}
329
/* Decide whether CH belongs to the "digit" class: exactly the ten ASCII
   digits.

   SUSV2 gives us some freedom for the "digit" category, but ISO C 99
   takes it away:
   7.25.2.1.5:
      The iswdigit function tests for any wide character that corresponds
      to a decimal-digit character (as defined in 5.2.1).
   5.2.1:
      the 10 decimal digits 0 1 2 3 4 5 6 7 8 9

   The alternative that is not used here would be every assigned character
   of category Nd.  Note that U+0BE7..U+0BEF and U+1369..U+1371 are digit
   systems without a zero; a <0> would have to be added in front of them
   by hand.  */
static bool
is_digit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
351
/* Decide whether CH belongs to the "outdigit" class: the ten ASCII
   digits.  */
static bool
is_outdigit (unsigned int ch)
{
  if (ch < 0x0030)
    return false;
  return ch <= 0x0039;
}
357
358static bool
359is_blank (unsigned int ch)
360{
361 return (ch == 0x0009 /* '\t' */
362 /* Category Zs without mention of "<noBreak>" */
363 || (unicode_attributes[ch].name != NULL
364 && unicode_attributes[ch].category[0] == 'Z'
365 && unicode_attributes[ch].category[1] == 's'
366 && !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
367}
368
369static bool
370is_space (unsigned int ch)
371{
372 /* Don't make U+00A0 a space. Non-breaking space means that all programs
373 should treat it like a punctuation character, not like a space. */
374 return (ch == 0x0020 /* ' ' */
375 || ch == 0x000C /* '\f' */
376 || ch == 0x000A /* '\n' */
377 || ch == 0x000D /* '\r' */
378 || ch == 0x0009 /* '\t' */
379 || ch == 0x000B /* '\v' */
380 /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
381 || (unicode_attributes[ch].name != NULL
382 && unicode_attributes[ch].category[0] == 'Z'
383 && (unicode_attributes[ch].category[1] == 'l'
384 || unicode_attributes[ch].category[1] == 'p'
385 || (unicode_attributes[ch].category[1] == 's'
386 && !strstr (unicode_attributes[ch].decomposition,
387 "<noBreak>")))));
388}
389
390static bool
391is_cntrl (unsigned int ch)
392{
393 return (unicode_attributes[ch].name != NULL
394 && (!strcmp (unicode_attributes[ch].name, "<control>")
395 /* Categories Zl and Zp */
396 || (unicode_attributes[ch].category[0] == 'Z'
397 && (unicode_attributes[ch].category[1] == 'l'
398 || unicode_attributes[ch].category[1] == 'p'))));
399}
400
/* Decide whether CH belongs to the "xdigit" class: 0-9, A-F and a-f.

   SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
   takes it away:
   7.25.2.1.12:
      The iswxdigit function tests for any wide character that corresponds
      to a hexadecimal-digit character (as defined in 6.4.4.1).
   6.4.4.1:
      hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F

   The alternative that is not used here would build the class from
   is_digit plus the two letter ranges.  */
static bool
is_xdigit (unsigned int ch)
{
  if (ch >= 0x0030 && ch <= 0x0039)
    return true;
  if (ch >= 0x0041 && ch <= 0x0046)
    return true;
  if (ch >= 0x0061 && ch <= 0x0066)
    return true;
  return false;
}
422
423static bool
424is_graph (unsigned int ch)
425{
426 return (unicode_attributes[ch].name != NULL
427 && strcmp (unicode_attributes[ch].name, "<control>")
428 && !is_space (ch));
429}
430
431static bool
432is_print (unsigned int ch)
433{
434 return (unicode_attributes[ch].name != NULL
435 && strcmp (unicode_attributes[ch].name, "<control>")
436 /* Categories Zl and Zp */
437 && !(unicode_attributes[ch].name != NULL
438 && unicode_attributes[ch].category[0] == 'Z'
439 && (unicode_attributes[ch].category[1] == 'l'
440 || unicode_attributes[ch].category[1] == 'p')));
441}
442
/* Decide whether CH belongs to the "punct" class.

   The traditional POSIX definition of punctuation is every graphic,
   non-alphanumeric character.  That definition is used here; the
   alternative, every assigned character whose general category starts
   with 'P', is not.  */
static bool
is_punct (unsigned int ch)
{
  if (!is_graph (ch))
    return false;
  return !(is_alpha (ch) || is_digit (ch));
}
455
456static bool
457is_combining (unsigned int ch)
458{
601d2942
UD
459 /* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
460 file. In 3.0.1 it was identical to the union of the general categories
461 "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
462 PropList.txt file, so we take the latter definition. */
0b95971d 463 return (unicode_attributes[ch].name != NULL
601d2942
UD
464 && unicode_attributes[ch].category[0] == 'M'
465 && (unicode_attributes[ch].category[1] == 'n'
466 || unicode_attributes[ch].category[1] == 'c'
467 || unicode_attributes[ch].category[1] == 'e'));
0b95971d
UD
468}
469
470static bool
471is_combining_level3 (unsigned int ch)
472{
473 return is_combining (ch)
474 && !(unicode_attributes[ch].combining[0] != '\0'
475 && unicode_attributes[ch].combining[0] != '0'
476 && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
477}
478
601d2942
UD
/* Return the UCS symbol string for the Unicode character I: "<Uxxxx>"
   with four hex digits below 0x10000, "<Uxxxxxxxx>" with eight otherwise.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  /* Room for the longer form, "<U" + 8 digits + ">", and the NUL.  */
  static char symbol[11+1];

  if (i < 0x10000)
    sprintf (symbol, "<U%04X>", i);
  else
    sprintf (symbol, "<U%08X>", i);

  return symbol;
}
488
/* Return the UCS symbol range string "LOW..HIGH" for an interval of
   Unicode characters.  The result lives in a static buffer that the next
   call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  /* Two symbols of at most 11 characters, the two dots, and the NUL.  */
  static char range[24+1];
  int used;

  /* ucs_symbol reuses one buffer, so the first symbol has to be copied
     out before the second one is requested.  */
  used = sprintf (range, "%s..", ucs_symbol (low));
  sprintf (range + used, "%s", ucs_symbol (high));

  return range;
}
500
0b95971d
UD
/* Output a character class (= property) table.

   Writes CLASSNAME followed by the members of the class to STREAM as a
   ';'-separated list.  FUNC decides membership for each code point below
   0x110000; consecutive members are merged into one "<Ulow>..<Uhigh>"
   item.  Lines are continued with the escape character '/' once an item
   would pass column 75.

   FUNC is queried directly while scanning, so no per-code-point table is
   needed (the former 0x110000-byte array on the stack is gone).  FUNC must
   therefore return the same answer every time it is asked about a given
   code point.  */

static void
output_charclass (FILE *stream, const char *classname,
		  bool (*func) (unsigned int))
{
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  fprintf (stream, "%s ", classname);
  need_semicolon = false;
  /* Start beyond max_column so that the first item opens a new line.  */
  column = 1000;
  for (i = 0; i < 0x110000; )
    {
      unsigned int low, high;
      const char *item;
      int item_len;

      if (!func (i))
	{
	  i++;
	  continue;
	}

      /* Extend the run of members starting at I as far as it goes.  */
      low = i;
      do
	i++;
      while (i < 0x110000 && func (i));
      high = i - 1;

      /* Both helpers return static storage; it stays valid because no
	 further symbol is formatted before ITEM has been written.  */
      if (low == high)
	item = ucs_symbol (low);
      else
	item = ucs_symbol_range (low, high);
      item_len = (int) strlen (item);

      if (need_semicolon)
	{
	  fprintf (stream, ";");
	  column++;
	}

      if (column + item_len > max_column)
	{
	  /* NOTE(review): column restarts at 3 while the continuation
	     string as seen here carries a single space - confirm the
	     intended indentation against the upstream file.  */
	  fprintf (stream, "/\n ");
	  column = 3;
	}

      fprintf (stream, "%s", item);
      column += item_len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
558
/* Output a character mapping table.

   Writes MAPNAME followed by one "(<Ufrom>,<Uto>)" pair for every code
   point below 0x110000 that FUNC maps to something other than itself, as
   a ';'-separated list on STREAM.  Lines are continued with the escape
   character '/' once a pair would pass column 75.

   FUNC is called once per code point and its result is reused, so neither
   a per-code-point table (formerly a 0x110000-byte array on the stack) nor
   a second call is needed.  */

static void
output_charmap (FILE *stream, const char *mapname,
		unsigned int (*func) (unsigned int))
{
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  fprintf (stream, "%s ", mapname);
  need_semicolon = false;
  /* Start beyond max_column so that the first pair opens a new line.  */
  column = 1000;
  for (i = 0; i < 0x110000; i++)
    {
      unsigned int mapped = func (i);
      /* "(" + symbol + "," + symbol + ")" with symbols of at most 11
	 characters, plus the NUL.  */
      char buf[25+1];
      int buf_len;

      if (mapped == i)
	continue;

      /* ucs_symbol reuses one static buffer, hence the step-by-step
	 concatenation.  */
      strcpy (buf, "(");
      strcat (buf, ucs_symbol (i));
      strcat (buf, ",");
      strcat (buf, ucs_symbol (mapped));
      strcat (buf, ")");
      buf_len = (int) strlen (buf);

      if (need_semicolon)
	{
	  fprintf (stream, ";");
	  column++;
	}

      if (column + buf_len > max_column)
	{
	  /* NOTE(review): column restarts at 3 while the continuation
	     string as seen here carries a single space - confirm the
	     intended indentation against the upstream file.  */
	  fprintf (stream, "/\n ");
	  column = 3;
	}

      fprintf (stream, "%s", buf);
      column += buf_len;
      need_semicolon = true;
    }
  fprintf (stream, "\n");
}
606
/* Output the width table.  This is a placeholder: nothing is written to
   STREAM.  */

static void
output_widthmap (FILE *stream)
{
  /* Intentionally empty; the parameter is unused.  */
  (void) stream;
}
613
614/* Output the tables to the given file. */
615
616static void
617output_tables (const char *filename, const char *version)
618{
619 FILE *stream;
620 unsigned int ch;
621
622 stream = fopen (filename, "w");
623 if (stream == NULL)
624 {
625 fprintf (stderr, "cannot open '%s' for writing\n", filename);
626 exit (1);
627 }
628
629 fprintf (stream, "escape_char /\n");
630 fprintf (stream, "comment_char %%\n");
631 fprintf (stream, "\n");
601d2942 632 fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
0b95971d
UD
633 version);
634 fprintf (stream, "\n");
635
636 fprintf (stream, "LC_IDENTIFICATION\n");
637 fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
638 fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
639 fprintf (stream, "address \"\"\n");
640 fprintf (stream, "contact \"\"\n");
a334319f 641 fprintf (stream, "email \"bug-glibc@gnu.org\"\n");
0b95971d
UD
642 fprintf (stream, "tel \"\"\n");
643 fprintf (stream, "fax \"\"\n");
644 fprintf (stream, "language \"\"\n");
645 fprintf (stream, "territory \"Earth\"\n");
646 fprintf (stream, "revision \"%s\"\n", version);
647 {
648 time_t now;
649 char date[11];
650 now = time (NULL);
651 strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
652 fprintf (stream, "date \"%s\"\n", date);
653 }
601d2942 654 fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
0b95971d
UD
655 fprintf (stream, "END LC_IDENTIFICATION\n");
656 fprintf (stream, "\n");
657
658 /* Verifications. */
601d2942 659 for (ch = 0; ch < 0x110000; ch++)
0b95971d
UD
660 {
661 /* toupper restriction: "Only characters specified for the keywords
662 lower and upper shall be specified. */
663 if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
664 fprintf (stderr,
601d2942
UD
665 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
666 ucs_symbol (ch), ch, to_upper (ch));
0b95971d
UD
667
668 /* tolower restriction: "Only characters specified for the keywords
669 lower and upper shall be specified. */
670 if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
671 fprintf (stderr,
601d2942
UD
672 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
673 ucs_symbol (ch), ch, to_lower (ch));
0b95971d
UD
674
675 /* alpha restriction: "Characters classified as either upper or lower
676 shall automatically belong to this class. */
677 if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
601d2942 678 fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
0b95971d
UD
679
680 /* alpha restriction: "No character specified for the keywords cntrl,
681 digit, punct or space shall be specified." */
682 if (is_alpha (ch) && is_cntrl (ch))
601d2942 683 fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
0b95971d 684 if (is_alpha (ch) && is_digit (ch))
601d2942 685 fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
0b95971d 686 if (is_alpha (ch) && is_punct (ch))
601d2942 687 fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
0b95971d 688 if (is_alpha (ch) && is_space (ch))
601d2942 689 fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
0b95971d
UD
690
691 /* space restriction: "No character specified for the keywords upper,
692 lower, alpha, digit, graph or xdigit shall be specified."
693 upper, lower, alpha already checked above. */
694 if (is_space (ch) && is_digit (ch))
601d2942 695 fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
0b95971d 696 if (is_space (ch) && is_graph (ch))
601d2942 697 fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
0b95971d 698 if (is_space (ch) && is_xdigit (ch))
601d2942 699 fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
0b95971d
UD
700
701 /* cntrl restriction: "No character specified for the keywords upper,
702 lower, alpha, digit, punct, graph, print or xdigit shall be
703 specified." upper, lower, alpha already checked above. */
704 if (is_cntrl (ch) && is_digit (ch))
601d2942 705 fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
0b95971d 706 if (is_cntrl (ch) && is_punct (ch))
601d2942 707 fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
0b95971d 708 if (is_cntrl (ch) && is_graph (ch))
601d2942 709 fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
0b95971d 710 if (is_cntrl (ch) && is_print (ch))
601d2942 711 fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
0b95971d 712 if (is_cntrl (ch) && is_xdigit (ch))
601d2942 713 fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
0b95971d
UD
714
715 /* punct restriction: "No character specified for the keywords upper,
716 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
717 be specified." upper, lower, alpha, cntrl already checked above. */
718 if (is_punct (ch) && is_digit (ch))
601d2942 719 fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
0b95971d 720 if (is_punct (ch) && is_xdigit (ch))
601d2942 721 fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
0b95971d 722 if (is_punct (ch) && (ch == 0x0020))
601d2942 723 fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
0b95971d
UD
724
725 /* graph restriction: "No character specified for the keyword cntrl
726 shall be specified." Already checked above. */
727
728 /* print restriction: "No character specified for the keyword cntrl
729 shall be specified." Already checked above. */
730
731 /* graph - print relation: differ only in the <space> character.
732 How is this possible if there are more than one space character?!
733 I think susv2/xbd/locale.html should speak of "space characters",
734 not "space character". */
735 if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
601d2942
UD
736 fprintf (stderr,
737 "%s is print but not graph|<space>\n", ucs_symbol (ch));
0b95971d 738 if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
601d2942
UD
739 fprintf (stderr,
740 "%s is graph|<space> but not print\n", ucs_symbol (ch));
0b95971d
UD
741 }
742
743 fprintf (stream, "LC_CTYPE\n");
744 output_charclass (stream, "upper", is_upper);
745 output_charclass (stream, "lower", is_lower);
746 output_charclass (stream, "alpha", is_alpha);
747 output_charclass (stream, "digit", is_digit);
748 output_charclass (stream, "outdigit", is_outdigit);
749 output_charclass (stream, "blank", is_blank);
750 output_charclass (stream, "space", is_space);
751 output_charclass (stream, "cntrl", is_cntrl);
752 output_charclass (stream, "punct", is_punct);
753 output_charclass (stream, "xdigit", is_xdigit);
754 output_charclass (stream, "graph", is_graph);
755 output_charclass (stream, "print", is_print);
756 output_charclass (stream, "class \"combining\";", is_combining);
757 output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
758 output_charmap (stream, "toupper", to_upper);
759 output_charmap (stream, "tolower", to_lower);
760 output_charmap (stream, "map \"totitle\";", to_title);
761 output_widthmap (stream);
762 fprintf (stream, "END LC_CTYPE\n");
763
764 if (ferror (stream) || fclose (stream))
765 {
766 fprintf (stderr, "error writing to '%s'\n", filename);
767 exit (1);
768 }
769}
770
/* Entry point.  Expects exactly two arguments, the path of the
   UnicodeData.txt file and the Unicode version string.  Reads the data
   and writes the generated locale definition to the file "unicode" in the
   current directory.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_path;
  const char *unicode_version;

  if (argc != 3)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
      exit (1);
    }

  unicodedata_path = argv[1];
  unicode_version = argv[2];

  fill_attributes (unicodedata_path);
  output_tables ("unicode", unicode_version);

  return 0;
}
785}