]> git.ipfire.org Git - thirdparty/glibc.git/blob - localedata/unicode-gen/unicode_utils.py
Prefer https to http for gnu.org and fsf.org URLs
[thirdparty/glibc.git] / localedata / unicode-gen / unicode_utils.py
1 # Utilities to generate Unicode data for glibc from upstream Unicode data.
2 #
3 # Copyright (C) 2014-2019 Free Software Foundation, Inc.
4 # This file is part of the GNU C Library.
5 #
6 # The GNU C Library is free software; you can redistribute it and/or
7 # modify it under the terms of the GNU Lesser General Public
8 # License as published by the Free Software Foundation; either
9 # version 2.1 of the License, or (at your option) any later version.
10 #
11 # The GNU C Library is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # Lesser General Public License for more details.
15 #
16 # You should have received a copy of the GNU Lesser General Public
17 # License along with the GNU C Library; if not, see
18 # <https://www.gnu.org/licenses/>.
19
20 '''
21 This module contains utilities used by the scripts to generate
22 Unicode data for glibc from upstream Unicode data files.
23 '''
24
25 import sys
26 import re
27
28
29 # Common locale header.
30 COMMENT_HEADER = """
31 % This file is part of the GNU C Library and contains locale data.
32 % The Free Software Foundation does not claim any copyright interest
33 % in the locale data contained in this file. The foregoing does not
34 % affect the license of the GNU C Library as a whole. It does not
35 % exempt you from the conditions of the license if your use would
36 % otherwise be governed by that license.
37 """
38
# Dictionary holding the entire contents of the UnicodeData.txt file,
# keyed by integer code point.  It starts out empty and is populated
# by fill_attributes() (one entry per code point, ranges expanded).
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt
# file, keyed by integer code point.  Each value is the list of property
# names seen for that code point.  It starts out empty and is populated
# by fill_derived_core_properties().
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}

# Dictionary holding the entire contents of the EastAsianWidth.txt file,
# keyed by integer code point.  Each value is the width property string.
# It starts out empty and is populated by fill_east_asian_widths().
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
77
def fill_attribute(code_point, fields):
    '''Record one UnicodeData.txt line in UNICODE_ATTRIBUTES.

    code_point is the integer key to store the record under and fields
    is the list of the semicolon separated columns of the line.  Column
    0 (the code point itself) is not stored.  Columns 1 to 11 are kept
    as strings; columns 12 to 14 are hexadecimal case mappings which are
    converted to integers, or to None when the column is empty.

    '''
    # Columns copied verbatim, in file order starting at column 1.
    text_columns = (
        'name',           # Character name
        'category',       # General category
        'combining',      # Canonical combining classes
        'bidi',           # Bidirectional category
        'decomposition',  # Character decomposition mapping
        'decdigit',       # Decimal digit value
        'digit',          # Digit value
        'numeric',        # Numeric value
        'mirrored',       # mirrored
        'oldname',        # Old Unicode 1.0 name
        'comment',        # comment
    )
    # Case mapping columns, in file order starting at column 12.
    case_columns = (
        'upper',          # Uppercase mapping
        'lower',          # Lowercase mapping
        'title',          # Titlecase mapping
    )

    record = {}
    for index, key in enumerate(text_columns, start=1):
        record[key] = fields[index]
    for index, key in enumerate(case_columns, start=12):
        hex_text = fields[index]
        record[key] = int(hex_text, 16) if hex_text else None
    UNICODE_ATTRIBUTES[code_point] = record
104
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    filename is the path of the UnicodeData.txt file to read.  The file
    is decoded as UTF-8 regardless of the current locale.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range gets its own entry, named after
    the range (for example “CJK Ideograph”).  Surrogates (category Cs)
    are skipped.

    A line which does not have exactly 15 fields, or a “Last” line
    whose attributes do not match the preceding “First” line, is
    reported on stderr and terminates the program with exit status 1
    (SystemExit is raised).
    '''
    with open(filename, mode='r', encoding='utf-8') as unicode_data_file:
        # Fields of the pending “, First>” line, or [] when no range
        # is currently open.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                # sys.exit() rather than the builtin exit(): the latter
                # is only installed by the site module.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # Strip the leading '<' and the ', First>' suffix so
                # that only the range name remains, then remember the
                # line until the matching “Last” line shows up.
                fields[1] = fields[1].split(',')[0][1:]
                fields_start = fields
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both ends of a
                # range must carry identical attributes.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                first = int(fields_start[0], 16)
                last = int(fields[0], 16)
                for code_point in range(first, last + 1):
                    fill_attribute(code_point, fields)
            else:
                fill_attribute(int(fields[0], 16), fields)
            fields_start = []
152
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    filename is the path of the DerivedCoreProperties.txt file to read.
    The file is decoded as UTF-8 regardless of the current locale.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR

    Lines which match neither form (comments, blank lines) are skipped.
    For every code point covered by a line, the property name is
    appended to the list stored under that code point.

    '''
    # Built once; the pattern does not depend on the line being parsed.
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
    with open(filename, mode='r',
              encoding='utf-8') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = line_pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            # A single code point is a range of length one.
            end = match.group('codepoint2') or start
            prop = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                DERIVED_CORE_PROPERTIES.setdefault(
                    code_point, []).append(prop)
186
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidth.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    filename is the path of the EastAsianWidth.txt file to read.
    The file is decoded as UTF-8 regardless of the current locale.

    Lines in EastAsianWidth.txt are either a code point range like
    this:

    9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W # Lm YI SYLLABLE WU

    Lines which match neither form (comments, blank lines) are skipped.
    If a code point is covered by several lines, the last one wins.
    '''
    # Built once; the pattern does not depend on the line being parsed.
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        +r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r', encoding='utf-8') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = line_pattern.match(line)
            if not match:
                continue
            start = match.group('codepoint1')
            # A single code point is a range of length one.
            end = match.group('codepoint2') or start
            width = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                EAST_ASIAN_WIDTHS[code_point] = width
215
def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point.

    The code point itself is returned when its UNICODE_ATTRIBUTES entry
    has an empty name or no uppercase mapping.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['upper']:
        return attributes['upper']
    return code_point
224
def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point.

    The code point itself is returned when its UNICODE_ATTRIBUTES entry
    has an empty name or no lowercase mapping.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['lower']:
        return attributes['lower']
    return code_point
233
def to_upper_turkish(code_point):
    '''Returns the code point of the Turkish uppercase version
    of the given code point.

    U+0069 maps to U+0130; every other code point is handled by
    to_upper().'''
    return 0x0130 if code_point == 0x0069 else to_upper(code_point)
240
def to_lower_turkish(code_point):
    '''Returns the code point of the Turkish lowercase version
    of the given code point.

    U+0049 maps to U+0131; every other code point is handled by
    to_lower().'''
    return 0x0131 if code_point == 0x0049 else to_lower(code_point)
247
def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point.

    The code point itself is returned when its UNICODE_ATTRIBUTES entry
    has an empty name or no titlecase mapping.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    if attributes['name'] and attributes['title']:
        return attributes['title']
    return code_point
256
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase.

    A character counts as uppercase when it has a lowercase mapping
    different from itself or is listed as “Uppercase” in
    DERIVED_CORE_PROPERTIES.'''
    if to_lower(code_point) != code_point:
        return True
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
262
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase.

    A character counts as lowercase when it has an uppercase mapping
    different from itself, is U+00DF, or is listed as “Lowercase” in
    DERIVED_CORE_PROPERTIES.'''
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if to_upper(code_point) != code_point or code_point == 0x00DF:
        return True
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
274
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic.

    That is the case when it is listed as “Alphabetic” in
    DERIVED_CORE_PROPERTIES, or when it is a decimal digit (category
    Nd) outside of the ASCII digits U+0030..U+0039.'''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
            and not (0x0030 <= code_point <= 0x0039))
286
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit.

    Only the ASCII digits U+0030..U+0039 qualify.'''
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    #
    # The alternative of accepting every character of general category
    # Nd was therefore never enabled.  Note for anyone reviving it:
    # U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
    # a zero; <0> would have to be added in front of them by hand.
    return 0x0030 <= code_point <= 0x0039
303
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit.

    Only the ASCII digits U+0030..U+0039 qualify.'''
    return 0x0030 <= code_point <= 0x0039
307
def is_blank(code_point):
    '''Checks whether the character with this code point is blank.

    Blank characters are the horizontal tab and every named character
    of category Zs whose decomposition does not mention '<noBreak>'.'''
    if code_point == 0x0009: # '\t'
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Category Zs without mention of '<noBreak>'
    return (attributes['name']
            and attributes['category'] == 'Zs'
            and '<noBreak>' not in attributes['decomposition'])
316
def is_space(code_point):
    '''Checks whether the character with this code point is a space.

    Space characters are the six ASCII whitespace characters plus every
    named character of category Zl or Zp, or of category Zs when its
    decomposition does not mention '<noBreak>'.'''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    ascii_whitespace = (
        0x0020, # ' '
        0x000C, # '\f'
        0x000A, # '\n'
        0x000D, # '\r'
        0x0009, # '\t'
        0x000B, # '\v'
    )
    if code_point in ascii_whitespace:
        return True
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Categories Zl, Zp, and Zs without mention of "<noBreak>"
    return (attributes['name']
            and (attributes['category'] in ('Zl', 'Zp')
                 or (attributes['category'] in ('Zs',)
                     and '<noBreak>' not in attributes['decomposition'])))
336
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character.

    That is the case for characters named '<control>' and for named
    characters of category Zl or Zp.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    return (name
            and (name == '<control>'
                 or attributes['category'] in ('Zl', 'Zp')))
344
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit.

    Only the ASCII characters 0-9, A-F and a-f qualify.'''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
365
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character.

    That is the case for named characters which are neither named
    '<control>' nor classified as space by is_space().'''
    name = UNICODE_ATTRIBUTES[code_point]['name']
    return (name
            and name != '<control>'
            and not is_space(code_point))
372
def is_print(code_point):
    '''Checks whether the character with this code point is printable.

    That is the case for named characters which are neither named
    '<control>' nor of category Zl or Zp.'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    name = attributes['name']
    return (name
            and name != '<control>'
            and attributes['category'] not in ('Zl', 'Zp'))
378
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation.

    Punctuation is every character for which is_graph() holds while
    neither is_alpha() nor is_digit() does.'''
    # The traditional POSIX definition of punctuation is every graphic,
    # non-alphanumeric character.  The alternative of using the Unicode
    # general categories starting with “P” was never enabled.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
390
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character.

    That is the case for named characters of the general categories
    Mn, Mc and Me.'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and attributes['category'] in ('Mn', 'Mc', 'Me'))
401
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character.

    That is the case for combining characters (see is_combining())
    whose canonical combining class is in the range 0..199.'''
    return (is_combining(code_point)
            and 0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
408
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 are written with at least four
    hexadecimal digits, all others with at least eight, for example
    <U0041> or <U0001F600>.'''
    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
    return template.format(code_point)
415
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns the UCS symbol string for a code point range.

    Both ends are formatted with ucs_symbol() and joined by two dots.

    Example:

    <U0041>..<U005A>
    '''
    endpoints = (ucs_symbol(code_point_low), ucs_symbol(code_point_high))
    return '..'.join(endpoints)
424
def verifications():
    '''Tests whether the is_* functions observe the known restrictions.

    Every code point in UNICODE_ATTRIBUTES is checked against the
    restrictions on character classes quoted in the comments below.
    Each violation is reported as one line on stderr; nothing is
    returned and no exception is raised for a violation.
    '''
    # Pairs of classes which must not overlap, in reporting order.
    #
    # alpha restriction: “No character specified for the keywords cntrl,
    # digit, punct or space shall be specified.”
    #
    # space restriction: “No character specified for the keywords upper,
    # lower, alpha, digit, graph or xdigit shall be specified.”
    # upper, lower, alpha already checked above.
    #
    # cntrl restriction: “No character specified for the keywords upper,
    # lower, alpha, digit, punct, graph, print or xdigit shall be
    # specified.”  upper, lower, alpha already checked above.
    #
    # punct restriction: “No character specified for the keywords upper,
    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
    # be specified.”  upper, lower, alpha, cntrl already checked above;
    # the <space> character is handled separately below.
    #
    # graph and print restrictions: “No character specified for the
    # keyword cntrl shall be specified.”  Covered by the cntrl pairs.
    exclusive_pairs = (
        ('alpha', 'cntrl'),
        ('alpha', 'digit'),
        ('alpha', 'punct'),
        ('alpha', 'space'),
        ('space', 'digit'),
        ('space', 'graph'),
        ('space', 'xdigit'),
        ('cntrl', 'digit'),
        ('cntrl', 'punct'),
        ('cntrl', 'graph'),
        ('cntrl', 'print'),
        ('cntrl', 'xdigit'),
        ('punct', 'digit'),
        ('punct', 'xdigit'),
    )
    report = sys.stderr.write

    for code_point in sorted(UNICODE_ATTRIBUTES):
        sym = ucs_symbol(code_point)
        # Evaluate every predicate once per code point instead of once
        # per check.
        upper_mapping = to_upper(code_point)
        lower_mapping = to_lower(code_point)
        cased = is_lower(code_point) or is_upper(code_point)
        member = {
            'alpha': is_alpha(code_point),
            'cntrl': is_cntrl(code_point),
            'digit': is_digit(code_point),
            'graph': is_graph(code_point),
            'print': is_print(code_point),
            'punct': is_punct(code_point),
            'space': is_space(code_point),
            'xdigit': is_xdigit(code_point),
        }

        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if upper_mapping != code_point and not cased:
            report(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': sym,
                     'c': code_point,
                     'uc': upper_mapping})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if lower_mapping != code_point and not cased:
            report(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': sym,
                     'c': code_point,
                     'uc': lower_mapping})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class.
        if cased and not member['alpha']:
            report('%(sym)s is upper|lower but not alpha\n' %{'sym': sym})

        for first, second in exclusive_pairs:
            if member[first] and member[second]:
                report('%(sym)s is %(a)s and %(b)s\n' %{
                    'sym': sym, 'a': first, 'b': second})

        # punct restriction, <space> character part.
        if member['punct'] and code_point == 0x0020:
            report('%(sym)s is punct\n' %{'sym': sym})

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        #
        # ucs_symbol() is called directly here (via sym): this module is
        # unicode_utils itself, so the qualified name
        # unicode_utils.ucs_symbol is not defined inside it.
        if (member['print']
                and not (member['graph'] or member['space'])):
            report('%(sym)s is print but not graph|<space>\n' %{'sym': sym})
        if (not member['print']
                and (member['graph'] or code_point == 0x0020)):
            report('%(sym)s is graph|<space> but not print\n' %{'sym': sym})