]>
Commit | Line | Data |
---|---|---|
dd8e8e54 CD |
1 | # Utilities to generate Unicode data for glibc from upstream Unicode data. |
2 | # | |
bfff8b1b | 3 | # Copyright (C) 2014-2017 Free Software Foundation, Inc. |
dd8e8e54 CD |
4 | # This file is part of the GNU C Library. |
5 | # | |
6 | # The GNU C Library is free software; you can redistribute it and/or | |
7 | # modify it under the terms of the GNU Lesser General Public | |
8 | # License as published by the Free Software Foundation; either | |
9 | # version 2.1 of the License, or (at your option) any later version. | |
10 | # | |
11 | # The GNU C Library is distributed in the hope that it will be useful, | |
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | # Lesser General Public License for more details. | |
15 | # | |
16 | # You should have received a copy of the GNU Lesser General Public | |
17 | # License along with the GNU C Library; if not, see | |
18 | # <http://www.gnu.org/licenses/>. | |
19 | ||
20 | ''' | |
21 | This module contains utilities used by the scripts to generate | |
22 | Unicode data for glibc from upstream Unicode data files. | |
23 | ''' | |
24 | ||
25 | import sys | |
26 | import re | |
27 | ||
277da2ab MF |
28 | |
# Common locale header.
#
# Multi-line notice in which every text line starts with '%'.  The
# string begins and ends with a newline.  Presumably written at the top
# of generated locale files -- the consumers are not visible in this
# module.
COMMENT_HEADER = """
% This file is part of the GNU C Library and contains locale data.
% The Free Software Foundation does not claim any copyright interest
% in the locale data contained in this file. The foregoing does not
% affect the license of the GNU C Library as a whole. It does not
% exempt you from the conditions of the license if your use would
% otherwise be governed by that license.
"""
38 | ||
dd8e8e54 CD |
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Keys are integer code points; each value is a dictionary with one
# entry per UnicodeData.txt field, as stored by fill_attribute().
# The 'upper', 'lower' and 'title' entries are integer code points,
# or None when the corresponding field is empty; every other entry
# is the field text as read from the file.
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}
60 | ||
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Keys are integer code points; each value is the list of property
# names recorded for that code point by fill_derived_core_properties().
# Code points without any derived property have no entry at all.
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
70 | ||
# Dictionary holding the entire contents of the EastAsianWidths.txt file
#
# NOTE(review): the upstream Unicode file is presumably named
# "EastAsianWidth.txt" (singular) -- confirm; the loader takes the
# file name as a parameter, so only the comments are affected.
#
# Keys are integer code points; each value is the width property
# string stored by fill_east_asian_widths().
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
77 | ||
def fill_attribute(code_point, fields):
    '''Record one UnicodeData.txt entry in UNICODE_ATTRIBUTES.

    code_point is the integer key to store under; fields is the list
    of the 15 semicolon-separated fields of one UnicodeData.txt line
    (fields[0], the code point text, is not used here).

    The three case-mapping fields are converted from hexadecimal text
    to integers, or to None when empty; all other fields are stored
    unchanged.
    '''
    # Keys for fields[1] .. fields[11], in file order.
    text_keys = (
        'name',           # Character name
        'category',       # General category
        'combining',      # Canonical combining classes
        'bidi',           # Bidirectional category
        'decomposition',  # Character decomposition mapping
        'decdigit',       # Decimal digit value
        'digit',          # Digit value
        'numeric',        # Numeric value
        'mirrored',       # mirrored
        'oldname',        # Old Unicode 1.0 name
        'comment',        # comment
    )

    def case_mapping(index):
        # Hexadecimal code point text -> int; empty field -> None.
        raw = fields[index]
        return int(raw, 16) if raw else None

    entry = {}
    for index, key in enumerate(text_keys, start=1):
        entry[key] = fields[index]
    entry['upper'] = case_mapping(12)  # Uppercase mapping
    entry['lower'] = case_mapping(13)  # Lowercase mapping
    entry['title'] = case_mapping(14)  # Titlecase mapping

    UNICODE_ATTRIBUTES[code_point] = entry
104 | ||
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    For such a pair, every code point from the first to the last one
    (inclusive) gets an entry whose name is the text between '<' and
    the comma (e.g. 'CJK Ideograph').  Lines of general category 'Cs'
    (surrogates) are skipped.

    Writes a message to sys.stderr and raises SystemExit(1) when a
    line does not have exactly 15 fields, or when a ', Last>' line
    does not match the preceding ', First>' line.
    '''
    # The encoding is given explicitly so that parsing does not depend
    # on the locale the script happens to be run in.
    with open(filename, mode='r', encoding='utf-8') as unicode_data_file:
        # Fields of a pending '<..., First>' line; empty when no range
        # is open.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                # sys.exit rather than the exit() helper, which only
                # exists when the site module has been loaded.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                # '<CJK Ideograph, First>' -> 'CJK Ideograph'
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both lines of a
                # range must agree.  This also rejects a 'Last' line
                # without a pending 'First' line (fields_start == []).
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
152 | ||
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR

    Lines that match neither form (comments, blank lines) are ignored.
    For every code point covered by a line, the property name is
    appended to that code point's list in DERIVED_CORE_PROPERTIES.
    '''
    # Compiled once instead of being rebuilt for every line.
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    # The encoding is given explicitly so that parsing does not depend
    # on the locale the script happens to be run in.
    with open(filename, mode='r',
              encoding='utf-8') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = line_pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            # A single code point is a range of length one.
            end = match.group('codepoint2') or start
            prop = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                DERIVED_CORE_PROPERTIES.setdefault(
                    code_point, []).append(prop)
186 | ||
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W # Lm YI SYLLABLE WU

    Lines that match neither form (comments, blank lines) are ignored.
    A later line overwrites the value stored by an earlier line for
    the same code point.
    '''
    # Compiled once instead of being rebuilt for every line.
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z]+)')
    # The encoding is given explicitly so that parsing does not depend
    # on the locale the script happens to be run in.
    with open(filename, mode='r', encoding='utf-8') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = line_pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            # A single code point is a range of length one.
            end = match.group('codepoint2') or start
            width = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = width
215 | ||
def to_upper(code_point):
    '''Map a code point to its uppercase counterpart.

    Returns the 'upper' entry recorded in UNICODE_ATTRIBUTES when the
    character has a name and such a mapping; otherwise the code point
    is returned unchanged.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['upper']:
        return attributes['upper']
    return code_point
224 | ||
def to_lower(code_point):
    '''Map a code point to its lowercase counterpart.

    Returns the 'lower' entry recorded in UNICODE_ATTRIBUTES when the
    character has a name and such a mapping; otherwise the code point
    is returned unchanged.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['lower']:
        return attributes['lower']
    return code_point
233 | ||
85bafe6f JM |
def to_upper_turkish(code_point):
    '''Map a code point to its uppercase counterpart, Turkish rules.

    U+0069 maps to U+0130; every other code point is handled by
    to_upper().
    '''
    return 0x0130 if code_point == 0x0069 else to_upper(code_point)
240 | ||
def to_lower_turkish(code_point):
    '''Map a code point to its lowercase counterpart, Turkish rules.

    U+0049 maps to U+0131; every other code point is handled by
    to_lower().
    '''
    return 0x0131 if code_point == 0x0049 else to_lower(code_point)
247 | ||
dd8e8e54 CD |
def to_title(code_point):
    '''Map a code point to its titlecase counterpart.

    Returns the 'title' entry recorded in UNICODE_ATTRIBUTES when the
    character has a name and such a mapping; otherwise the code point
    is returned unchanged.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['title']:
        return attributes['title']
    return code_point
256 | ||
def is_upper(code_point):
    '''Tell whether the character with this code point is uppercase.

    A character counts as uppercase when it has a lowercase mapping,
    or when DERIVED_CORE_PROPERTIES lists 'Uppercase' for it.
    '''
    if to_lower(code_point) != code_point:
        return True
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
262 | ||
def is_lower(code_point):
    '''Tell whether the character with this code point is lowercase.

    A character counts as lowercase when it has an uppercase mapping,
    when it is U+00DF, or when DERIVED_CORE_PROPERTIES lists
    'Lowercase' for it.
    '''
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if to_upper(code_point) != code_point or code_point == 0x00DF:
        return True
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
274 | ||
def is_alpha(code_point):
    '''Tell whether the character with this code point is alphabetic.

    True when DERIVED_CORE_PROPERTIES lists 'Alphabetic' for the code
    point, or when its general category is 'Nd' and it lies outside
    U+0030..U+0039.
    '''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
            and not 0x0030 <= code_point <= 0x0039)
286 | ||
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit.

    Returns True only for U+0030..U+0039 (the ten ASCII decimal
    digits), False for every other code point.
    '''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # The alternative that used to sit here in a disabled branch --
    # accepting every named character of general category 'Nd' -- is
    # therefore not implemented.  Note for anyone reviving it:
    # U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without a
    # zero; <0> would have to be added in front of them by hand.
    return 0x0030 <= code_point <= 0x0039
303 | ||
def is_outdigit(code_point):
    '''Tell whether the character with this code point is an outdigit.

    True exactly for U+0030..U+0039.
    '''
    return 0x0030 <= code_point <= 0x0039
307 | ||
def is_blank(code_point):
    '''Tell whether the character with this code point is blank.

    Truthy for U+0009 and for named characters of general category
    'Zs' whose decomposition does not mention '<noBreak>'.
    '''
    if code_point == 0x0009:  # '\t'
        return True
    # Category Zs without mention of '<noBreak>'
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
316 | ||
def is_space(code_point):
    '''Tell whether the character with this code point is a space.

    Truthy for the six ASCII whitespace characters, for named
    characters of general category 'Zl' or 'Zp', and for named
    characters of category 'Zs' whose decomposition does not mention
    '<noBreak>'.
    '''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    ascii_spaces = (
        0x0020,  # ' '
        0x000C,  # '\f'
        0x000A,  # '\n'
        0x000D,  # '\r'
        0x0009,  # '\t'
        0x000B,  # '\v'
    )
    if code_point in ascii_spaces:
        return True
    # Categories Zl, Zp, and Zs without mention of "<noBreak>"
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    if not name:
        return name
    category = attributes['category']
    if category in ['Zl', 'Zp']:
        return True
    return (category in ['Zs']
            and '<noBreak>' not in attributes['decomposition'])
336 | ||
def is_cntrl(code_point):
    '''Tell whether the character with this code point is
    a control character.

    Truthy for characters named '<control>' and for named characters
    of general category 'Zl' or 'Zp'.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    return name and (name == '<control>'
                     or attributes['category'] in ['Zl', 'Zp'])
344 | ||
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit.

    Returns True only for U+0030..U+0039 ('0'-'9'),
    U+0041..U+0046 ('A'-'F') and U+0061..U+0066 ('a'-'f').
    '''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    #
    # The ranges are spelled out here rather than derived from
    # is_digit(), so the result stays fixed by the text quoted above.
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
365 | ||
def is_graph(code_point):
    '''Tell whether the character with this code point is
    a graphical character.

    Truthy for named characters that are neither '<control>' nor a
    space according to is_space().
    '''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    return name and name != '<control>' and not is_space(code_point)
372 | ||
def is_print(code_point):
    '''Tell whether the character with this code point is printable.

    Truthy for named characters that are not '<control>' and whose
    general category is neither 'Zl' nor 'Zp'.
    '''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    return (name
            and name != '<control>'
            and attributes['category'] not in ['Zl', 'Zp'])
378 | ||
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation.

    Truthy for every character that is_graph() accepts and that
    neither is_alpha() nor is_digit() accepts.
    '''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.
    #
    # The alternative that used to sit here in a disabled branch --
    # accepting named characters whose general category starts with
    # 'P' -- is deliberately not used.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
390 | ||
def is_combining(code_point):
    '''Tell whether the character with this code point is
    a combining character.

    Truthy for named characters of general category 'Mn', 'Mc'
    or 'Me'.
    '''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['category'] in ['Mn', 'Mc', 'Me'])
401 | ||
def is_combining_level3(code_point):
    '''Tell whether the character with this code point is
    a combining level3 character.

    Truthy for combining characters (see is_combining) whose
    'combining' value, read as an integer, lies in 0..199.
    '''
    combining = is_combining(code_point)
    if not combining:
        return combining
    combining_class = int(UNICODE_ATTRIBUTES[code_point]['combining'])
    return 0 <= combining_class < 200
408 | ||
def ucs_symbol(code_point):
    '''Build the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with four uppercase
    hexadecimal digits (<U0041>), all others with eight (<U0001F600>).
    '''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{:0{}X}>'.format(code_point, digits)
415 | ||
def ucs_symbol_range(code_point_low, code_point_high):
    '''Build the UCS symbol string for a code point range.

    The two symbols produced by ucs_symbol() are joined with '..'.

    Example:

    <U0041>..<U005A>
    '''
    return '{}..{}'.format(
        ucs_symbol(code_point_low), ucs_symbol(code_point_high))
424 | ||
def verifications():
    '''Tests whether the is_* functions observe the known restrictions.

    Every code point in UNICODE_ATTRIBUTES is classified with the is_*
    functions, and one line is written to sys.stderr for each pair of
    classes that must not overlap but does.  Nothing is returned, and
    a violation does not raise.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        # Classify once; the is_* functions only read the module-level
        # tables, so the results can be reused for every check below.
        upper = is_upper(code_point)
        lower = is_lower(code_point)
        alpha = is_alpha(code_point)
        digit = is_digit(code_point)
        xdigit = is_xdigit(code_point)
        space = is_space(code_point)
        cntrl = is_cntrl(code_point)
        punct = is_punct(code_point)
        graph = is_graph(code_point)
        printable = is_print(code_point)
        cased = lower or upper

        # toupper / tolower restriction: "Only characters specified for
        # the keywords lower and upper shall be specified.
        for function_name, mapped in (('toupper', to_upper(code_point)),
                                      ('tolower', to_lower(code_point))):
            if mapped != code_point and not cased:
                sys.stderr.write(
                    ('%(sym)s is not upper|lower '
                     + 'but %(fn)s(0x%(c)04X) = 0x%(uc)04X\n') %{
                         'sym': sym,
                         'fn': function_name,
                         'c': code_point,
                         'uc': mapped})

        violations = (
            # alpha restriction: "Characters classified as either upper
            # or lower shall automatically belong to this class.
            (cased and not alpha, 'upper|lower but not alpha'),
            # alpha restriction: “No character specified for the
            # keywords cntrl, digit, punct or space shall be specified.”
            (alpha and cntrl, 'alpha and cntrl'),
            (alpha and digit, 'alpha and digit'),
            (alpha and punct, 'alpha and punct'),
            (alpha and space, 'alpha and space'),
            # space restriction: “No character specified for the
            # keywords upper, lower, alpha, digit, graph or xdigit shall
            # be specified.”  upper, lower, alpha already checked above.
            (space and digit, 'space and digit'),
            (space and graph, 'space and graph'),
            (space and xdigit, 'space and xdigit'),
            # cntrl restriction: “No character specified for the
            # keywords upper, lower, alpha, digit, punct, graph, print
            # or xdigit shall be specified.”  upper, lower, alpha
            # already checked above.
            (cntrl and digit, 'cntrl and digit'),
            (cntrl and punct, 'cntrl and punct'),
            (cntrl and graph, 'cntrl and graph'),
            (cntrl and printable, 'cntrl and print'),
            (cntrl and xdigit, 'cntrl and xdigit'),
            # punct restriction: “No character specified for the
            # keywords upper, lower, alpha, digit, cntrl, xdigit or as
            # the <space> character shall be specified.”  upper, lower,
            # alpha, cntrl already checked above.
            (punct and digit, 'punct and digit'),
            (punct and xdigit, 'punct and xdigit'),
            (punct and code_point == 0x0020, 'punct'),
            # graph restriction: “No character specified for the keyword
            # cntrl shall be specified.”  Already checked above.
            #
            # print restriction: “No character specified for the keyword
            # cntrl shall be specified.”  Already checked above.
            #
            # graph - print relation: differ only in the <space>
            # character.  How is this possible if there are more than
            # one space character?!  I think susv2/xbd/locale.html
            # should speak of “space characters”, not “space character”.
            (printable and not (graph or space),
             'print but not graph|<space>'),
            (not printable and (graph or code_point == 0x0020),
             'graph|<space> but not print'),
        )
        for violated, description in violations:
            if violated:
                # ucs_symbol() lives in this module; going through a
                # 'unicode_utils.' prefix here would raise NameError.
                sys.stderr.write('%(sym)s is %(what)s\n' %{
                    'sym': sym, 'what': description})