]> git.ipfire.org Git - thirdparty/glibc.git/blame - localedata/unicode-gen/ctype_compatibility.py
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / localedata / unicode-gen / ctype_compatibility.py
CommitLineData
4a4839c9
AO
1#!/usr/bin/python3
2# -*- coding: utf-8 -*-
2b778ceb 3# Copyright (C) 2014-2021 Free Software Foundation, Inc.
4a4839c9
AO
4# This file is part of the GNU C Library.
5#
6# The GNU C Library is free software; you can redistribute it and/or
7# modify it under the terms of the GNU Lesser General Public
8# License as published by the Free Software Foundation; either
9# version 2.1 of the License, or (at your option) any later version.
10#
11# The GNU C Library is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14# Lesser General Public License for more details.
15#
16# You should have received a copy of the GNU Lesser General Public
17# License along with the GNU C Library; if not, see
5a82c748 18# <https://www.gnu.org/licenses/>.
4a4839c9
AO
19
20'''
This script is useful for checking the differences between
an old LC_CTYPE file /usr/share/i18n/locales/i18n and a
new one generated by gen_unicode_ctype.py
24
25To see how it is used, call it with the “-h” option:
26
27 $ ./ctype_compatibility.py -h
28 … prints usage message …
29'''
30
31import sys
32import re
33import unicodedata
34import argparse
35
36from ctype_compatibility_test_cases import TEST_CASES
37
def get_lines_from_file(filename):
    '''Yield the logical, comment-free lines of an i18n file

    Everything from the first “%” to the end of a physical line is
    dropped as a comment.  A physical line which ends in “/” is
    continued on the next line; all parts of such a continued line are
    stripped of surrounding white space and merged into one single
    logical line before it is yielded.

    The file is always decoded as UTF-8 so that the result does not
    depend on the preferred encoding of the locale the script happens
    to run in.
    '''
    with open(filename, encoding='utf-8') as i18n_file:
        current_line = ''
        for line in i18n_file:
            line = line.strip('\n')
            comment_start = line.find('%')
            if comment_start != -1:
                # Cut off the comment, but keep a continuation marker
                # which stands at the very end of the physical line.
                is_continued = line.endswith('/')
                line = line[:comment_start]
                if is_continued:
                    line += '/'
            line = line.strip()
            if line.endswith('/'):
                current_line += line[:-1]
            else:
                yield current_line + line
                current_line = ''
        if current_line: # file ends with a continuation line
            yield current_line
61
def extract_character_classes(filename):
    '''Get all Unicode code points for each character class from a file

    Store these code points in a dictionary using the character classes
    as keys and the list of code points in this character class as values.

    In case of the character classes “toupper”, “tolower”, and “totitle”,
    these are actually pairs of code points.

    A logical line belongs to a character class if it starts either
    with the bare class name followed by white space or with
    “class "name";” / “map "name";” followed by white space.  The rest
    of such a line is handed to process_chars().
    '''
    char_classes = [
        'upper',
        'lower',
        'alpha',
        'digit',
        'outdigit',
        'space',
        'cntrl',
        'punct',
        'graph',
        'print',
        'xdigit',
        'blank',
        'combining',
        'combining_level3',
        'toupper',
        'tolower',
        'totitle']
    # The patterns do not depend on the line, so compile them only once.
    # Raw strings keep “\s” from being read as a (invalid) string escape.
    line_prefixes = [
        (char_class,
         re.compile(
             r'^('
             + r'(?:(?:class|map)\s+")'
             + re.escape(char_class)
             + r'(?:";)\s+'
             + r'|'
             + re.escape(char_class) + r'\s+'
             + r')'))
        for char_class in char_classes]

    ctype_dict = {}
    for line in get_lines_from_file(filename):
        for char_class, line_prefix in line_prefixes:
            match = line_prefix.match(line)
            if match:
                # Everything after the matched prefix is the list of
                # code points of this class.
                process_chars(
                    ctype_dict.setdefault(char_class, []),
                    line[match.end():])
    return ctype_dict
105
# The four notations process_chars() understands.  Raw strings are used
# so that “\.” and “\(” reach the regexp engine unchanged.
_SINGLE_CODE_POINT_RE = re.compile(
    r'^<U(?P<codepoint>[0-9A-F]{4,8})>$')
_CODE_POINT_RANGE_RE = re.compile(
    r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
    r'\.\.'
    r'<U(?P<codepoint2>[0-9A-F]{4,8})>$')
_CODE_POINT_RANGE_STEP_2_RE = re.compile(
    r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
    r'\.\.\(2\)\.\.'
    r'<U(?P<codepoint2>[0-9A-F]{4,8})>$')
_CODE_POINT_PAIR_RE = re.compile(
    r'^\('
    r'<U(?P<codepoint1>[0-9A-F]{4,8})>'
    r','
    r'<U(?P<codepoint2>[0-9A-F]{4,8})>'
    r'\)$')

def process_chars(char_class_list, code_point_line):
    '''
    Extract Unicode values from code_point_line
    and add to the list of code points in a character class

    code_point_line is a “;” separated list whose items have one of
    these forms:

        <Uxxxx>                 a single code point, appended as integer
        <Uxxxx>..<Uxxxx>        an inclusive range, appended as integers
        <Uxxxx>..(2)..<Uxxxx>   an inclusive range with step 2
        (<Uxxxx>,<Uxxxx>)       a mapping, appended as a pair of integers

    char_class_list is modified in place.  If an item has none of
    these forms, a message is written to stderr and the script
    terminates with exit status 1.
    '''
    for code_points in code_point_line.split(';'):
        code_points = code_points.strip()
        match = _SINGLE_CODE_POINT_RE.match(code_points)
        if match: # <Uxxxx>
            char_class_list.append(
                int(match.group('codepoint'), 16))
            continue
        match = _CODE_POINT_RANGE_RE.match(code_points)
        if match: # <Uxxxx>..<Uxxxx>
            char_class_list.extend(range(
                int(match.group('codepoint1'), 16),
                int(match.group('codepoint2'), 16) + 1))
            continue
        match = _CODE_POINT_RANGE_STEP_2_RE.match(code_points)
        if match: # <Uxxxx>..(2)..<Uxxxx>
            char_class_list.extend(range(
                int(match.group('codepoint1'), 16),
                int(match.group('codepoint2'), 16) + 1,
                2))
            continue
        match = _CODE_POINT_PAIR_RE.match(code_points)
        if match: # (<Uxxxx>,<Uxxxx>)
            char_class_list.append((
                int(match.group('codepoint1'), 16),
                int(match.group('codepoint2'), 16)))
            continue
        sys.stderr.write(
            ('None of the regexps matched '
             + 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
                 'cp': code_points,
                 'cpl': code_point_line
             })
        # sys.exit() instead of the “exit” helper of the site module,
        # which is not guaranteed to exist in every interpreter setup.
        sys.exit(1)
160
def compare_lists(old_ctype_dict, new_ctype_dict):
    '''Compare character classes in the old and the new LC_CTYPE

    First print the names of the character classes which occur in only
    one of the two dictionaries.  Then print, for every character
    class of the old dictionary, the number of entries in old and new
    and let report() print the differences.

    A character class which does not exist in the new dictionary is
    compared against an empty list, i.e. all its entries count as
    missing in the new ctype.
    '''
    print('****************************************************')
    print('Character classes which are only in the new '
          + 'or only in the old file:')
    for char_class in sorted(old_ctype_dict):
        if char_class not in new_ctype_dict:
            print('Character class %s is in old ctype but not in new ctype'
                  %char_class)
    for char_class in sorted(new_ctype_dict):
        if char_class not in old_ctype_dict:
            print('Character class %s is in new ctype but not in old ctype'
                  %char_class)
    for char_class in sorted(old_ctype_dict):
        old_list = old_ctype_dict[char_class]
        # Indexing new_ctype_dict directly would raise KeyError for a
        # class which was just reported as “in old but not in new”.
        new_list = new_ctype_dict.get(char_class, [])
        print("****************************************************")
        print("%s: %d chars in old ctype and %d chars in new ctype" %(
            char_class,
            len(old_list),
            len(new_list)))
        print("----------------------------------------------------")
        report(char_class,
               old_list,
               new_list)
184
def report_code_points(char_class, code_point_list, text=''):
    '''Print one line for each entry of code_point_list, in sorted order

    An entry is either a single code point (an integer) or a pair of
    code points as used by the mapping classes.  For every code point
    the character itself, its hexadecimal value and its Unicode name
    (“name unknown” if unicodedata has none) are printed, prefixed by
    char_class and text.
    '''
    def unicode_name(value):
        return unicodedata.name(chr(value), 'name unknown')

    for entry in sorted(code_point_list):
        if type(entry) is not int:
            # A mapping: report source → target.
            source = entry[0]
            fields = {
                'text': text,
                'char_class': char_class,
                'char0': chr(source),
                'code_point0': hex(source),
                'name0': unicode_name(source),
            }
            target = entry[1]
            fields['char1'] = chr(target)
            fields['code_point1'] = hex(target)
            fields['name1'] = unicode_name(target)
            print(('%(char_class)s: %(text)s: '
                   '%(char0)s → %(char1)s '
                   '%(code_point0)s → %(code_point1)s '
                   '%(name0)s → %(name1)s') % fields)
            continue
        fields = {
            'text': text,
            'char': chr(entry),
            'char_class': char_class,
            'code_point': hex(entry),
            'name': unicode_name(entry),
        }
        print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
              % fields)
211
def report(char_class, old_list, new_list):
    '''Print what changed in one LC_CTYPE character class

    Prints how many entries of old_list are not in new_list and how
    many entries of new_list are not in old_list.  The entries
    themselves are listed with report_code_points() only if the
    options ARGS.show_missing_characters respectively
    ARGS.show_added_characters are set.
    '''
    old_members = set(old_list)
    new_members = set(new_list)

    missing_chars = list(old_members.difference(new_members))
    missing_summary = ('%(char_class)s: Missing %(number)d characters '
                       'of old ctype in new ctype ')
    print(missing_summary
          % {'number': len(missing_chars), 'char_class': char_class})
    if ARGS.show_missing_characters:
        report_code_points(char_class, missing_chars, 'Missing')

    added_chars = list(new_members.difference(old_members))
    added_summary = ('%(char_class)s: Added %(number)d characters '
                     'in new ctype which were not in old ctype')
    print(added_summary
          % {'number': len(added_chars), 'char_class': char_class})
    if ARGS.show_added_characters:
        report_code_points(char_class, added_chars, 'Added')
228
229
def cperror(error_message, errorcounter=0):
    '''Print error_message and return errorcounter incremented by one'''
    print(error_message)
    errorcounter += 1
    return errorcounter
234
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
            errorcounter=0):
    '''Check the class membership of a list of code points

    “code_point_list_with_ranges” is a list of integers or pairs of
    integers, for example:

        [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]

    A pair stands for all code points from its first to its second
    integer, both included.

    “char_classes” is a sequence whose items hold a character class
    name at index 0 and, at index 1, whether the code points are
    expected to be in that class.  For every code point whose actual
    membership in ctype_dict differs from the expectation an error
    containing “reason” is reported with cperror().

    Returns errorcounter increased by the number of errors found.
    '''
    for entry in code_point_list_with_ranges:
        if type(entry) is int:
            code_points = [entry]
        else:
            code_points = range(entry[0], entry[1]+1)
        for code_point in code_points:
            for char_class_tuple in char_classes:
                char_class, in_char_class = (char_class_tuple[0],
                                             char_class_tuple[1])
                is_member = code_point in ctype_dict[char_class]
                if is_member == in_char_class:
                    continue
                message = ('error: %(code_point)s %(char)s '
                           '%(char_class)s %(in)s: %(reason)s') %{
                               'code_point': hex(code_point),
                               'char': chr(code_point),
                               'char_class': char_class,
                               'in': not in_char_class,
                               'reason': reason}
                errorcounter = cperror(message, errorcounter)
    return errorcounter
265
# Pairs of character classes which must not have a code point in common,
# each with the message reported for an offending code point.  The order
# of the entries is the order in which violations are reported.
_MUTUALLY_EXCLUSIVE_CLASSES = (
    # alpha restriction: "No character specified for the keywords cntrl,
    # digit, punct or space shall be specified."
    ('alpha', 'cntrl', 'error: %(char)s %(cp)s is alpha and cntrl'),
    ('alpha', 'digit', 'error: %(char)s %(cp)s is alpha and digit'),
    ('alpha', 'punct', 'error: %(char)s %(cp)s is alpha and punct'),
    ('alpha', 'space', 'error: %(char)s %(cp)s is alpha and space'),
    # space restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, graph or xdigit shall be specified."
    # upper, lower, alpha are covered by the alpha checks.
    ('space', 'digit', 'error: %(char)s %(cp)s is space and digit'),
    ('space', 'graph', 'error: %(char)s %(cp)s is space and graph'),
    ('space', 'xdigit', 'error: %(char)s %(cp)s is space and xdigit'),
    # cntrl restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, punct, graph, print or xdigit shall be
    # specified."  upper, lower, alpha are covered by the alpha checks.
    # This also covers the graph and the print restriction: "No
    # character specified for the keyword cntrl shall be specified."
    ('cntrl', 'digit', 'error: %(char)s %(cp)s is cntrl and digit'),
    ('cntrl', 'punct', 'error: %(char)s %(cp)s is cntrl and punct'),
    ('cntrl', 'graph', 'error: %(char)s %(cp)s is cntrl and graph'),
    ('cntrl', 'print', 'error: %(char)s %(cp)s is cntrl and print'),
    ('cntrl', 'xdigit', 'error: %(char)s %(cp)s is cntrl and xdigit'),
    # punct restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
    # be specified."  upper, lower, alpha, cntrl are covered above, the
    # <space> character is checked separately in tests().
    ('punct', 'digit', 'error: %(char)s %(cp)s is punct and digit'),
    ('punct', 'xdigit', 'error: %(char)s %(cp)s is punct and xdigit'),
)

def tests(ctype_dict, errorcounter=0):
    '''Test a LC_CTYPE character class dictionary for known errors

    ctype_dict maps character class names to lists of code points (or
    lists of code point pairs for the mapping classes), as returned by
    extract_character_classes().

    First all TEST_CASES are checked with cpcheck(), then the
    restrictions between the character classes are checked for each
    code point.  Every violation is reported with cperror().  A class
    which does not occur in ctype_dict is treated as empty by the
    restriction checks.

    Returns errorcounter increased by the number of errors found.
    '''
    # Copy the information from ctype_dict (which contains lists) into
    # a new dictionary ctype_dict2 (which contains dictionaries).
    # Membership tests are much cheaper with that type of data structure.
    ctype_dict2 = {}
    for key, values in ctype_dict.items():
        if not values:
            ctype_dict2[key] = {}
        elif isinstance(values[0], int):
            ctype_dict2[key] = dict.fromkeys(values, 1)
        else: # key is 'toupper', 'tolower', or 'totitle'
            ctype_dict2[key] = {value[0]: value[1] for value in values}

    for test_case in TEST_CASES:
        errorcounter = cpcheck(ctype_dict2,
                               test_case[0],
                               test_case[1],
                               test_case[2],
                               errorcounter=errorcounter)

    def members(char_class):
        # A class which is not in the file has no members.
        return ctype_dict2.get(char_class, {})

    # Look the classes up once instead of once per code point.
    lower = members('lower')
    upper = members('upper')
    alpha = members('alpha')
    punct = members('punct')
    graph = members('graph')
    space = members('space')
    printable = members('print')
    case_mappings = (
        (members('toupper'),
         'error: %(char1)s is not upper|lower '
         'but toupper(%(cp1)s)=%(cp2)s (%(char2)s)'),
        (members('tolower'),
         'error: %(char1)s is not upper|lower '
         'but tolower(%(cp1)s)=%(cp2)s (%(char2)s)'),
    )
    exclusive_classes = [
        (members(first), members(second), message)
        for first, second, message in _MUTUALLY_EXCLUSIVE_CLASSES]

    # Each check below can only fail for a code point which is a member
    # of at least one class, or for U+0020.  Visiting just these, in
    # ascending order, reports the same errors in the same order as
    # visiting every code point from 0 to 0x10FFFF.
    candidates = {0x0020}
    for class_members in ctype_dict2.values():
        candidates.update(class_members)
    candidates = sorted(
        code_point for code_point in candidates
        if isinstance(code_point, int) and 0 <= code_point < 0x110000)

    for code_point in candidates:
        info = {'char': chr(code_point), 'cp': hex(code_point)}
        is_upper_or_lower = code_point in lower or code_point in upper
        # toupper and tolower restriction: "Only characters specified for
        # the keywords lower and upper shall be specified."
        for mapping, message in case_mappings:
            if (code_point in mapping
                    and code_point != mapping[code_point]
                    and not is_upper_or_lower):
                errorcounter = cperror(
                    message %{
                        'char1': info['char'],
                        'cp1': info['cp'],
                        'cp2': hex(mapping[code_point]),
                        'char2': chr(mapping[code_point])
                    },
                    errorcounter)
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class."
        if is_upper_or_lower and code_point not in alpha:
            errorcounter = cperror(
                'error: %(char)s %(cp)s is upper|lower but not alpha' %info,
                errorcounter)
        for first, second, message in exclusive_classes:
            if code_point in first and code_point in second:
                errorcounter = cperror(message %info, errorcounter)
        # punct restriction: the <space> character shall not be specified.
        if code_point in punct and code_point == 0x0020:
            errorcounter = cperror(
                'error: %(char)s %(cp)s is punct.' %info,
                errorcounter)
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of "space characters",
        # not "space character".
        if (code_point in printable
                and not (code_point in graph or code_point in space)):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is print but not graph|space' %info,
                errorcounter)
        if (code_point not in printable
                and (code_point in graph or code_point == 0x0020)):
            errorcounter = cperror(
                'error: %(char)s %(cp)s graph|space but not print' %info,
                errorcounter)
    return errorcounter
492
# Command line entry point: compare the two ctype files, run the error
# checks on both of them and exit with status 1 if the new file has errors.
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    PARSER.add_argument(
        '-o', '--old_ctype_file',
        nargs='?',
        type=str,
        default='i18n',
        help='The old ctype file, default: %(default)s')
    PARSER.add_argument(
        '-n', '--new_ctype_file',
        nargs='?',
        type=str,
        default='unicode-ctype',
        help='The new ctype file, default: %(default)s')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help=('Show characters which were added to each '
              + 'character class in detail.'))
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help=('Show characters which were removed from each '
              + 'character class in detail.'))
    # ARGS is read as a module level global by report().
    ARGS = PARSER.parse_args()

    OLD_CTYPE_DICT = extract_character_classes(
        ARGS.old_ctype_file)
    NEW_CTYPE_DICT = extract_character_classes(
        ARGS.new_ctype_file)
    compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
    print('============================================================')
    print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('Old file = %s' %ARGS.old_ctype_file)
    print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
    print('------------------------------------------------------------')
    print('============================================================')
    print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('New file = %s' %ARGS.new_ctype_file)
    print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
    print('------------------------------------------------------------')
    # Only errors in the new file decide the exit status; errors in the
    # old file are merely reported.  sys.exit() is used because the
    # “exit” helper of the site module is not meant for programs.
    sys.exit(1 if NUMBER_OF_ERRORS_IN_NEW_FILE > 0 else 0)