[thirdparty/glibc.git] / localedata / unicode-gen / ctype_compatibility.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2021 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
This script is useful for checking the differences between
an old LC_CTYPE file /usr/share/i18n/locale/i18n and a
new one generated by gen_unicode_ctype.py

To see how it is used, call it with the “-h” option:

    $ ./ctype_compatibility.py -h
    … prints usage message …
'''

import sys
import re
import unicodedata
import argparse

from ctype_compatibility_test_cases import TEST_CASES

def get_lines_from_file(filename):
    '''Yield the logical, comment-free lines of an i18n file

    Everything from the first “%” up to the end of a physical line is
    dropped as a comment.  A physical line which ends in “/” is continued
    on the next line; such lines are joined (without the “/”) and
    yielded as one single logical line.  Empty lines are yielded as
    empty strings.
    '''
    pending = ''
    with open(filename) as i18n_file:
        for raw_line in i18n_file:
            text = raw_line.strip('\n')
            if '%' in text:
                # A trailing “/” located inside the comment still marks a
                # continuation, so remember it before cutting the comment.
                continued = text.endswith('/')
                text = text[:text.find('%')]
                if continued:
                    text += '/'
            text = text.strip()
            if text.endswith('/'):
                pending += text[:-1]
                continue
            yield pending + text
            pending = ''
    if pending: # file ends with a continuation line
        yield pending

def extract_character_classes(filename):
    '''Get all Unicode code points for each character class from a file

    Returns a dictionary using the character classes as keys and the
    list of code points in this character class as values.  Only
    classes which actually occur in the file get a key.

    In case of the character classes “toupper”, “tolower”, and “totitle”,
    the list entries are actually pairs of code points.
    '''
    char_classes = [
        'upper',
        'lower',
        'alpha',
        'digit',
        'outdigit',
        'space',
        'cntrl',
        'punct',
        'graph',
        'print',
        'xdigit',
        'blank',
        'combining',
        'combining_level3',
        'toupper',
        'tolower',
        'totitle']
    # Build one anchored pattern per class, once, instead of once per
    # line.  A logical line starts a character class either as
    #     class "name"; …    or    map "name"; …
    # or in the bare keyword form
    #     name …
    # Raw strings are used so that “\s” reaches the regexp engine without
    # triggering Python’s invalid escape sequence warning.
    patterns = [
        (char_class,
         re.compile(r'^('
                    + r'(?:(?:class|map)\s+")'
                    + re.escape(char_class)
                    + r'(?:";)\s+'
                    + r'|'
                    + re.escape(char_class) + r'\s+'
                    + r')'))
        for char_class in char_classes]
    ctype_dict = {}
    for line in get_lines_from_file(filename):
        for char_class, pattern in patterns:
            match = pattern.match(line)
            if match:
                # Everything after the class introducer is the list of
                # code points belonging to this class.
                process_chars(
                    ctype_dict.setdefault(char_class, []),
                    line[match.end():])
    return ctype_dict

# Regular expressions for the four notations accepted by process_chars().
# They are compiled once at import time; all fragments are raw strings so
# that the backslashes reach the regexp engine unchanged.
_CODE_POINT = r'<U([0-9A-F]{4,8})>'
_SINGLE_RE = re.compile(r'^' + _CODE_POINT + r'$')
_RANGE_RE = re.compile(r'^' + _CODE_POINT + r'\.\.' + _CODE_POINT + r'$')
_STEP2_RANGE_RE = re.compile(
    r'^' + _CODE_POINT + r'\.\.\(2\)\.\.' + _CODE_POINT + r'$')
_PAIR_RE = re.compile(r'^\(' + _CODE_POINT + r',' + _CODE_POINT + r'\)$')

def process_chars(char_class_list, code_point_line):
    '''
    Extract Unicode values from code_point_line
    and add to the list of code points in a character class

    code_point_line is a “;” separated list whose items have one of
    these forms:

        <Uxxxx>                 a single code point
        <Uxxxx>..<Uyyyy>        all code points of the inclusive range
        <Uxxxx>..(2)..<Uyyyy>   every second code point of the range
        (<Uxxxx>,<Uyyyy>)       a mapping, appended as a pair of integers

    The results are appended to char_class_list in place.  If an item
    matches none of these forms, a message is written to stderr and
    the program exits with status 1.
    '''
    for code_points in code_point_line.split(';'):
        code_points = code_points.strip()
        match = _SINGLE_RE.match(code_points)
        if match: # <Uxxxx>
            char_class_list.append(int(match.group(1), 16))
            continue
        match = _RANGE_RE.match(code_points)
        if match: # <Uxxxx>..<Uxxxx>
            char_class_list.extend(range(
                int(match.group(1), 16),
                int(match.group(2), 16) + 1))
            continue
        match = _STEP2_RANGE_RE.match(code_points)
        if match: # <Uxxxx>..(2)..<Uxxxx>
            char_class_list.extend(range(
                int(match.group(1), 16),
                int(match.group(2), 16) + 1,
                2))
            continue
        match = _PAIR_RE.match(code_points)
        if match: # (<Uxxxx>,<Uxxxx>)
            char_class_list.append((
                int(match.group(1), 16),
                int(match.group(2), 16)))
            continue
        sys.stderr.write(
            ('None of the regexps matched '
             + 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
            'cp': code_points,
            'cpl': code_point_line
        })
        # sys.exit() instead of the “exit” helper which the site module
        # only provides for interactive use.
        sys.exit(1)

def compare_lists(old_ctype_dict, new_ctype_dict):
    '''Compare character classes in the old and the new LC_CTYPE

    First lists the character classes which exist in only one of the
    two dictionaries, then prints for every character class the number
    of entries on both sides followed by the detailed report.

    A class which is missing on one side is compared against an empty
    list, so that a class existing in only one file shows up as
    completely missing or completely added instead of raising KeyError.
    '''
    print('****************************************************')
    print('Character classes which are only in the new '
          + 'or only in the old file:')
    for char_class in sorted(old_ctype_dict):
        if char_class not in new_ctype_dict:
            print('Character class %s is in old ctype but not in new ctype'
                  %char_class)
    for char_class in sorted(new_ctype_dict):
        if char_class not in old_ctype_dict:
            print('Character class %s is in new ctype but not in old ctype'
                  %char_class)
    # Walk over the union of both key sets; when both files define the
    # same classes this is exactly the sorted list of the old classes.
    for char_class in sorted(set(old_ctype_dict) | set(new_ctype_dict)):
        old_list = old_ctype_dict.get(char_class, [])
        new_list = new_ctype_dict.get(char_class, [])
        print("****************************************************")
        print("%s: %d chars in old ctype and %d chars in new ctype" %(
            char_class,
            len(old_list),
            len(new_list)))
        print("----------------------------------------------------")
        report(char_class, old_list, new_list)

def report_code_points(char_class, code_point_list, text=''):
    '''Report all code points which have been added to or removed from a
    character class.

    code_point_list contains either integers (plain character classes)
    or pairs of integers (the mappings “toupper”, “tolower”, “totitle”).
    One line is printed per entry, in sorted order, showing the
    character class, the given text, the character(s), the code
    point(s) in hexadecimal and the Unicode name(s); “name unknown” is
    printed for code points unicodedata has no name for.
    '''
    for code_point in sorted(code_point_list):
        if isinstance(code_point, int):
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  %{'text': text,
                    'char': chr(code_point),
                    'char_class': char_class,
                    'code_point': hex(code_point),
                    'name': unicodedata.name(chr(code_point), 'name unknown')})
        else:
            # A mapping entry: (source code point, target code point)
            source = code_point[0]
            target = code_point[1]
            print(('%(char_class)s: %(text)s: '
                   + '%(char0)s → %(char1)s '
                   + '%(code_point0)s → %(code_point1)s '
                   + '%(name0)s → %(name1)s') %{
                'text': text,
                'char_class': char_class,
                'char0': chr(source),
                'code_point0': hex(source),
                'name0': unicodedata.name(chr(source), 'name unknown'),
                'char1': chr(target),
                'code_point1': hex(target),
                'name1': unicodedata.name(chr(target), 'name unknown')
            })

def report(char_class, old_list, new_list):
    '''Print how one LC_CTYPE character class changed from old to new

    Prints the number of entries which are in old_list but not in
    new_list (“Missing”) and the number of entries which are in
    new_list but not in old_list (“Added”).  The entries themselves
    are listed only if the global command line options
    ARGS.show_missing_characters or ARGS.show_added_characters
    request it.
    '''
    old_set = set(old_list)
    new_set = set(new_list)

    missing_chars = list(old_set - new_set)
    missing_summary = ('%(char_class)s: Missing %(number)d characters '
                       + 'of old ctype in new ctype ')
    print(missing_summary
          %{'char_class': char_class, 'number': len(missing_chars)})
    if ARGS.show_missing_characters:
        report_code_points(char_class, missing_chars, 'Missing')

    added_chars = list(new_set - old_set)
    added_summary = ('%(char_class)s: Added %(number)d characters '
                     + 'in new ctype which were not in old ctype')
    print(added_summary
          %{'char_class': char_class, 'number': len(added_chars)})
    if ARGS.show_added_characters:
        report_code_points(char_class, added_chars, 'Added')


def cperror(error_message, errorcounter=0):
    '''Print error_message and return errorcounter incremented by one'''
    print(error_message)
    errorcounter += 1
    return errorcounter

def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
            errorcounter=0):
    '''Check the class membership of some code points

    The parameter “code_point_list_with_ranges” is a list of
    integers or pairs of integers, for example:

    [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]

    where the pairs of integers stand for all the code points in the range
    of the two integers given, including the two integers of the pair.

    The parameter “char_classes” is a list of tuples whose first item
    is the name of a character class and whose second item is True if
    the code points are expected to be in that class, False if they
    are expected not to be in it.  A class which has no key in
    “ctype_dict” is treated as empty.

    Every deviation from the expectation is reported together with
    “reason” through cperror().  Returns “errorcounter” increased by
    the number of deviations found.
    '''
    for code_point_range in code_point_list_with_ranges:
        if isinstance(code_point_range, int):
            code_points = [code_point_range]
        else:
            code_points = range(code_point_range[0],
                                code_point_range[1]+1)
        for code_point in code_points:
            for char_class_tuple in char_classes:
                char_class = char_class_tuple[0]
                in_char_class = char_class_tuple[1]
                # A class absent from the input contains no code points.
                members = ctype_dict.get(char_class, ())
                if (code_point in members) != in_char_class:
                    errorcounter = cperror(
                        ('error: %(code_point)s %(char)s '
                         + '%(char_class)s %(in)s: %(reason)s') %{
                             'code_point': hex(code_point),
                             'char': chr(code_point),
                             'char_class': char_class,
                             'in': not in_char_class,
                             'reason': reason},
                        errorcounter)
    return errorcounter

# Pairs of character classes which must not have any code point in common,
# in the order in which violations are reported for each code point.
_DISJOINT_CLASS_PAIRS = (
    # alpha restriction: "No character specified for the keywords cntrl,
    # digit, punct or space shall be specified."
    ('alpha', 'cntrl'),
    ('alpha', 'digit'),
    ('alpha', 'punct'),
    ('alpha', 'space'),
    # space restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, graph or xdigit shall be specified."
    # upper, lower, alpha are covered by the alpha checks.
    ('space', 'digit'),
    ('space', 'graph'),
    ('space', 'xdigit'),
    # cntrl restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, punct, graph, print or xdigit shall be
    # specified."  upper, lower, alpha are covered by the alpha checks.
    # This also covers the graph and print restrictions: "No character
    # specified for the keyword cntrl shall be specified."
    ('cntrl', 'digit'),
    ('cntrl', 'punct'),
    ('cntrl', 'graph'),
    ('cntrl', 'print'),
    ('cntrl', 'xdigit'),
    # punct restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
    # be specified."  upper, lower, alpha, cntrl are covered above, the
    # <space> character is checked separately in tests().
    ('punct', 'digit'),
    ('punct', 'xdigit'),
)

def tests(ctype_dict, errorcounter = 0):
    '''Test a LC_CTYPE character class dictionary for known errors

    “ctype_dict” maps character class names to lists of code points
    (or, for “toupper”, “tolower” and “totitle”, lists of pairs of
    code points).  First the checks listed in TEST_CASES are run, then
    every code point from 0 to 0x10FFFF is checked against the
    restrictions on the relations between the character classes.
    Character classes which are absent from “ctype_dict” are treated
    as empty by these per code point checks.

    Every violation is reported through cperror().  Returns
    “errorcounter” increased by the number of violations found.
    '''
    # copy the information from ctype_dict (which contains lists) in
    # a new dictionary ctype_dict2 (which contains dictionaries).
    # The checks below are easier with that type of data structure.
    ctype_dict2 = {}
    for key, values in ctype_dict.items():
        if values and not isinstance(values[0], int):
            # key is 'toupper', 'tolower', or 'totitle'
            ctype_dict2[key] = {value[0]: value[1] for value in values}
        else:
            ctype_dict2[key] = {value: 1 for value in values}

    for test_case in TEST_CASES:
        errorcounter = cpcheck(ctype_dict2,
                               test_case[0],
                               test_case[1],
                               test_case[2],
                               errorcounter = errorcounter)

    # Resolve all class lookups once, outside of the loop over the code
    # points.
    lower = ctype_dict2.get('lower', {})
    upper = ctype_dict2.get('upper', {})
    alpha = ctype_dict2.get('alpha', {})
    space = ctype_dict2.get('space', {})
    punct = ctype_dict2.get('punct', {})
    graph = ctype_dict2.get('graph', {})
    print_class = ctype_dict2.get('print', {})
    case_mappings = [
        (map_name, ctype_dict2.get(map_name, {}))
        for map_name in ('toupper', 'tolower')]
    disjoint_pairs = [
        (first, second,
         ctype_dict2.get(first, {}), ctype_dict2.get(second, {}))
        for first, second in _DISJOINT_CLASS_PAIRS]

    for code_point in range(0, 0x110000):
        is_cased = code_point in lower or code_point in upper
        # toupper and tolower restriction: "Only characters specified for
        # the keywords lower and upper shall be specified."  Mapping a
        # code point to itself is tolerated.
        for map_name, mapping in case_mappings:
            if (code_point in mapping
                    and code_point != mapping[code_point]
                    and not is_cased):
                errorcounter = cperror(
                    ('error: %(char1)s is not upper|lower '
                     + 'but %(map)s(%(cp1)s)=%(cp2)s (%(char2)s)') %{
                         'char1': chr(code_point),
                         'map': map_name,
                         'cp1': hex(code_point),
                         'cp2': hex(mapping[code_point]),
                         'char2': chr(mapping[code_point])
                     },
                    errorcounter)
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class."
        if is_cased and code_point not in alpha:
            errorcounter = cperror(
                'error: %(char)s %(cp)s is upper|lower but not alpha' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        # Classes which are not allowed to overlap.
        for first, second, first_members, second_members in disjoint_pairs:
            if code_point in first_members and code_point in second_members:
                errorcounter = cperror(
                    'error: %(char)s %(cp)s is %(first)s and %(second)s' %{
                        'char': chr(code_point),
                        'cp': hex(code_point),
                        'first': first,
                        'second': second
                    },
                    errorcounter)
        # punct restriction: the <space> character must not be punct.
        if code_point in punct and code_point == 0x0020:
            errorcounter = cperror(
                'error: %(char)s %(cp)s is punct.' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of "space characters",
        # not "space character".
        if (code_point in print_class
                and not (code_point in graph or code_point in space)):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is print but not graph|space' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        if (code_point not in print_class
                and (code_point in graph or code_point == 0x0020)):
            errorcounter = cperror(
                'error: %(char)s %(cp)s graph|space but not print' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
    return errorcounter

if __name__ == "__main__":
    # Command line interface: the two LC_CTYPE files to compare and two
    # switches selecting how detailed the comparison is printed.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    PARSER.add_argument(
        '-o', '--old_ctype_file',
        nargs='?',
        type=str,
        default='i18n',
        help='The old ctype file, default: %(default)s')
    PARSER.add_argument(
        '-n', '--new_ctype_file',
        nargs='?',
        type=str,
        default='unicode-ctype',
        help='The new ctype file, default: %(default)s')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help=('Show characters which were added to each '
              + 'character class in detail.'))
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help=('Show characters which were removed from each '
              + 'character class in detail.'))
    # ARGS has to stay a module level name: report() reads
    # ARGS.show_missing_characters and ARGS.show_added_characters.
    ARGS = PARSER.parse_args()

    OLD_CTYPE_DICT = extract_character_classes(
        ARGS.old_ctype_file)
    NEW_CTYPE_DICT = extract_character_classes(
        ARGS.new_ctype_file)
    compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
    print('============================================================')
    print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('Old file = %s' %ARGS.old_ctype_file)
    print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
    print('------------------------------------------------------------')
    print('============================================================')
    print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('New file = %s' %ARGS.new_ctype_file)
    print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
    print('------------------------------------------------------------')
    # Only errors in the new file decide about the exit status; errors in
    # the old file are merely reported.  sys.exit() is used instead of the
    # “exit” helper which the site module only provides for interactive use.
    sys.exit(1 if NUMBER_OF_ERRORS_IN_NEW_FILE > 0 else 0)
Commit	Line	Data
4a4839c9 AO	1	#!/usr/bin/python3
4a4839c9 AO	2	# -- coding: utf-8 --
2b778ceb	3	# Copyright (C) 2014-2021 Free Software Foundation, Inc.
4a4839c9 AO	4	# This file is part of the GNU C Library.
	5	#
	6	# The GNU C Library is free software; you can redistribute it and/or
	7	# modify it under the terms of the GNU Lesser General Public
	8	# License as published by the Free Software Foundation; either
	9	# version 2.1 of the License, or (at your option) any later version.
	10	#
	11	# The GNU C Library is distributed in the hope that it will be useful,
	12	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	# Lesser General Public License for more details.
	15	#
	16	# You should have received a copy of the GNU Lesser General Public
	17	# License along with the GNU C Library; if not, see
5a82c748	18	# <https://www.gnu.org/licenses/>.
4a4839c9 AO	19
	20	'''
	21	This script is useful for checking the differences between
	22	an old LC_CTYPE file /usr/share/i18n/locale/i18n and a
	23	new one generated by gen_unicode_ctype.py
	24
	25	To see how it is used, call it with the “-h” option:
	26
	27	$ ./ctype_compatibility.py -h
	28	… prints usage message …
	29	'''
	30
	31	import sys
	32	import re
	33	import unicodedata
	34	import argparse
	35
	36	from ctype_compatibility_test_cases import TEST_CASES
	37
	38	def get_lines_from_file(filename):
	39	'''Get all non-comment lines from a i18n file
	40
	41	Also merge all lines which are continued on the next line because
	42	they end in “/” into a single line.
	43	'''
	44	with open(filename) as i18n_file:
	45	current_line = ''
	46	for line in i18n_file:
	47	line = line.strip('\n')
	48	if '%' in line:
	49	if line.endswith('/'):
	50	line = line[0:line.find('%')] + '/'
	51	else:
	52	line = line[0:line.find('%')]
	53	line = line.strip()
	54	if line.endswith('/'):
	55	current_line += line[:-1]
	56	else:
	57	yield current_line + line
	58	current_line = ''
	59	if current_line: # file ends with a continuation line
	60	yield current_line
	61
	62	def extract_character_classes(filename):
	63	'''Get all Unicode code points for each character class from a file
	64
	65	Store these code points in a dictionary using the character classes
	66	as keys and the list of code points in this character class as values.
	67
	68	In case of the character classes “toupper”, “tolower”, and “totitle”,
	69	these area actually pairs of code points
	70	'''
	71	ctype_dict = {}
	72	for line in get_lines_from_file(filename):
	73	for char_class in [
	74	'upper',
	75	'lower',
	76	'alpha',
	77	'digit',
	78	'outdigit',
	79	'space',
	80	'cntrl',
	81	'punct',
	82	'graph',
83	'print',
84	'xdigit',
85	'blank',
86	'combining',
87	'combining_level3',
88	'toupper',
89	'tolower',
90	'totitle']:
91	match = re.match(r'^('
92	+'(?:(?:class\|map)\s+")'
93	+re.escape(char_class)+
94	'(?:";)\s+'
95	+'\|'
96	+re.escape(char_class)+'\s+'
97	+')', line)
98	if match:
99	if char_class not in ctype_dict:
100	ctype_dict[char_class] = []
101	process_chars(
102	ctype_dict[char_class],
103	line[match.end():])
104	return ctype_dict
105
106	def process_chars(char_class_list, code_point_line):
107	'''
108	Extract Unicode values from code_point_line
109	and add to the list of code points in a character class
110	'''
111	for code_points in code_point_line.split(';'):
112	code_points = code_points.strip()
113	match = re.match(r'^<U(?P<codepoint>[0-9A-F]{4,8})>$', code_points)
114	if match: # <Uxxxx>
115	char_class_list.append(
116	int(match.group('codepoint'), 16))
117	continue
118	match = re.match(
119	r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
120	+'\.\.'+
121	'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
122	code_points)
123	if match: # <Uxxxx>..<Uxxxx>
124	for codepoint in range(
125	int(match.group('codepoint1'), 16),
126	int(match.group('codepoint2'), 16) + 1):
127	char_class_list.append(codepoint)
128	continue
129	match = re.match(
130	r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
131	+'\.\.\(2\)\.\.'+
132	'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
133	code_points)
134	if match: # <Uxxxx>..(2)..<Uxxxx>
135	for codepoint in range(
136	int(match.group('codepoint1'), 16),
137	int(match.group('codepoint2'), 16) + 1,
138	2):
139	char_class_list.append(codepoint)
140	continue
141	match = re.match(
142	r'^\('
143	+'<U(?P<codepoint1>[0-9A-F]{4,8})>'
144	+','+
145	'<U(?P<codepoint2>[0-9A-F]{4,8})>'
146	+'\)$',
147	code_points)
148	if match: # (<Uxxxx>,<Uxxxx>)
149	char_class_list.append((
150	int(match.group('codepoint1'), 16),
151	int(match.group('codepoint2'), 16)))
152	continue
153	sys.stderr.write(
154	('None of the regexps matched '
155	+ 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
156	'cp': code_points,
157	'cpl': code_point_line
158	})
159	exit(1)
160
161	def compare_lists(old_ctype_dict, new_ctype_dict):
162	'''Compare character classes in the old and the new LC_CTYPE'''
163	print('****************************************************')
164	print('Character classes which are only in the new '
165	+ 'or only in the old file:')
166	for char_class in sorted(old_ctype_dict):
167	if char_class not in new_ctype_dict:
168	print('Character class %s is in old ctype but not in new ctype'
169	%char_class)
170	for char_class in sorted(new_ctype_dict):
171	if char_class not in old_ctype_dict:
172	print('Character class %s is in new ctype but not in old ctype'
173	%char_class)
174	for char_class in sorted(old_ctype_dict):
175	print("****************************************************")
176	print("%s: %d chars in old ctype and %d chars in new ctype" %(
177	char_class,
178	len(old_ctype_dict[char_class]),
179	len(new_ctype_dict[char_class])))
180	print("----------------------------------------------------")
181	report(char_class,
182	old_ctype_dict[char_class],
183	new_ctype_dict[char_class])
184
185	def report_code_points(char_class, code_point_list, text=''):
186	'''Report all code points which have been added to or removed from a
187	character class.
188	'''
189	for code_point in sorted(code_point_list):
190	if type(code_point) == type(int()):
191	print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
192	%{'text': text,
193	'char': chr(code_point),
194	'char_class': char_class,
195	'code_point': hex(code_point),
196	'name': unicodedata.name(chr(code_point), 'name unknown')})
197	else:
198	print(('%(char_class)s: %(text)s: '
199	+ '%(char0)s → %(char1)s '
200	+ '%(code_point0)s → %(code_point1)s '
201	+ '%(name0)s → %(name1)s') %{
202	'text': text,
203	'char_class': char_class,
204	'char0': chr(code_point[0]),
205	'code_point0': hex(code_point[0]),
206	'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
207	'char1': chr(code_point[1]),
208	'code_point1': hex(code_point[1]),
209	'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
210	})
211
212	def report(char_class, old_list, new_list):
213	'''Report the differences for a certain LC_CTYPE character class
214	between the old and the newly generated state
215	'''
216	missing_chars = list(set(old_list)-set(new_list))
217	print(('%(char_class)s: Missing %(number)d characters '
218	+ 'of old ctype in new ctype ')
219	%{'char_class': char_class, 'number': len(missing_chars)})
220	if ARGS.show_missing_characters:
221	report_code_points(char_class, missing_chars, 'Missing')
222	added_chars = list(set(new_list)-set(old_list))
223	print(('%(char_class)s: Added %(number)d characters '
224	+ 'in new ctype which were not in old ctype')
225	%{'char_class': char_class, 'number': len(added_chars)})
226	if ARGS.show_added_characters:
227	report_code_points(char_class, added_chars, 'Added')
228
229
230	def cperror(error_message, errorcounter=0):
231	'''Increase number of errors by one and print an error message'''
232	print(error_message)
233	return errorcounter + 1
234
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
            errorcounter=0):
    '''Check the membership of code points in character classes

    “ctype_dict” maps a character class name to a container which is
    queried with the “in” operator for each code point.

    The parameter “code_point_list_with_ranges” is a list of
    integers or pairs of integers, for example:

    [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]

    where the pairs of integers stand for all the code points in the range
    of the two integers given, including the two integers of the pair.

    “char_classes” is a sequence of tuples; the first item of each
    tuple is a character class name and the second item is the
    expected result of the membership test for that class.

    “reason” is appended to every error message which is printed.

    Returns “errorcounter” increased by one for every code point and
    character class combination where the actual membership differs
    from the expected one.
    '''
    for code_point_range in code_point_list_with_ranges:
        if isinstance(code_point_range, int):
            # A single code point
            code_points = [code_point_range]
        else:
            # A pair (first, last), both ends are included
            code_points = range(code_point_range[0],
                                code_point_range[1] + 1)
        for code_point in code_points:
            for char_class_tuple in char_classes:
                char_class = char_class_tuple[0]
                in_char_class = char_class_tuple[1]
                if (code_point in ctype_dict[char_class]) != in_char_class:
                    # “in” reports the actual membership, which is the
                    # opposite of the expected one here.
                    errorcounter = cperror(
                        ('error: %(code_point)s %(char)s '
                         + '%(char_class)s %(in)s: %(reason)s') %{
                             'code_point': hex(code_point),
                             'char': chr(code_point),
                             'char_class': char_class,
                             'in': not in_char_class,
                             'reason': reason},
                        errorcounter)
    return errorcounter
265
# Pairs of character classes which must not have any code point in
# common.  The order of the pairs is the order in which the errors are
# reported for a code point.
_MUTUALLY_EXCLUSIVE_CLASSES = (
    # alpha restriction: "No character specified for the keywords cntrl,
    # digit, punct or space shall be specified."
    ('alpha', 'cntrl'),
    ('alpha', 'digit'),
    ('alpha', 'punct'),
    ('alpha', 'space'),
    # space restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, graph or xdigit shall be specified."
    # upper, lower, alpha are covered by the alpha checks.
    ('space', 'digit'),
    ('space', 'graph'),
    ('space', 'xdigit'),
    # cntrl restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, punct, graph, print or xdigit shall be
    # specified." upper, lower, alpha are covered by the alpha checks.
    ('cntrl', 'digit'),
    ('cntrl', 'punct'),
    ('cntrl', 'graph'),
    ('cntrl', 'print'),
    ('cntrl', 'xdigit'),
    # punct restriction: "No character specified for the keywords upper,
    # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
    # be specified." upper, lower, alpha, cntrl are covered above, the
    # <space> character is checked separately in tests().
    ('punct', 'digit'),
    ('punct', 'xdigit'),
)


def _code_point_error(template, code_point, errorcounter, **extra):
    '''Report an error about “code_point” and return the new error count

    “template” is a %-format string which may use the keys “char” (the
    character itself), “cp” (the code point in hexadecimal) and any
    additional keys passed in “extra”.
    '''
    return cperror(
        template % dict(extra, char=chr(code_point), cp=hex(code_point)),
        errorcounter)


def tests(ctype_dict, errorcounter=0):
    '''Test a LC_CTYPE character class dictionary for known errors

    “ctype_dict” maps a character class name to a list.  The lists
    contain either code points (integers) or, for 'toupper', 'tolower',
    and 'totitle', pairs where the first item is mapped to the second.

    First all entries of TEST_CASES are checked with cpcheck(), then
    every code point from 0 to 0x10FFFF is checked against the
    restrictions on the character classes quoted in the comments.

    Returns “errorcounter” increased by the number of errors found;
    every error is also printed.
    '''
    # copy the information from ctype_dict (which contains lists) in
    # a new dictionary ctype_dict2 (which contains dictionaries).
    # The checks below are easier with that type of data structure.
    ctype_dict2 = {}
    for key, values in ctype_dict.items():
        if values and isinstance(values[0], int):
            ctype_dict2[key] = dict.fromkeys(values, 1)
        else: # empty, or key is 'toupper', 'tolower', or 'totitle'
            ctype_dict2[key] = {value[0]: value[1] for value in values}

    for test_case in TEST_CASES:
        errorcounter = cpcheck(ctype_dict2,
                               test_case[0],
                               test_case[1],
                               test_case[2],
                               errorcounter=errorcounter)

    for code_point in range(0, 0x110000):
        # toupper and tolower restriction: "Only characters specified
        # for the keywords lower and upper shall be specified."
        # Mappings of a code point to itself are not reported.
        for mapping in ('toupper', 'tolower'):
            if (code_point in ctype_dict2[mapping]
                    and code_point != ctype_dict2[mapping][code_point]
                    and not (code_point in ctype_dict2['lower']
                             or code_point in ctype_dict2['upper'])):
                mapped = ctype_dict2[mapping][code_point]
                errorcounter = cperror(
                    ('error: %(char1)s is not upper\\|lower '
                     + 'but %(mapping)s(%(cp1)s)=%(cp2)s (%(char2)s)') %{
                         'char1': chr(code_point),
                         'mapping': mapping,
                         'cp1': hex(code_point),
                         'cp2': hex(mapped),
                         'char2': chr(mapped)
                     },
                    errorcounter)
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class."
        if ((code_point in ctype_dict2['lower']
             or code_point in ctype_dict2['upper'])
                and code_point not in ctype_dict2['alpha']):
            errorcounter = _code_point_error(
                'error: %(char)s %(cp)s is upper\\|lower but not alpha',
                code_point, errorcounter)
        # Character classes which exclude each other
        for first, second in _MUTUALLY_EXCLUSIVE_CLASSES:
            if (code_point in ctype_dict2[first]
                    and code_point in ctype_dict2[second]):
                errorcounter = _code_point_error(
                    'error: %(char)s %(cp)s is %(first)s and %(second)s',
                    code_point, errorcounter, first=first, second=second)
        # punct restriction, continued: the <space> character must not
        # be specified for punct.
        if (code_point in ctype_dict2['punct']
                and code_point == 0x0020):
            errorcounter = _code_point_error(
                'error: %(char)s %(cp)s is punct.',
                code_point, errorcounter)
        # graph restriction: "No character specified for the keyword cntrl
        # shall be specified." Already checked above.

        # print restriction: "No character specified for the keyword cntrl
        # shall be specified." Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of "space characters",
        # not "space character".
        if (code_point in ctype_dict2['print']
                and not (code_point in ctype_dict2['graph']
                         or code_point in ctype_dict2['space'])):
            errorcounter = _code_point_error(
                'error: %(char)s %(cp)s is print but not graph\\|space',
                code_point, errorcounter)
        if (code_point not in ctype_dict2['print']
                and (code_point in ctype_dict2['graph']
                     or code_point == 0x0020)):
            errorcounter = _code_point_error(
                'error: %(char)s %(cp)s graph\\|space but not print',
                code_point, errorcounter)
    return errorcounter
492
if __name__ == "__main__":
    # Script entry point: compare the character classes of the old and
    # the new ctype file, then check both files for known errors.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    PARSER.add_argument(
        '-o', '--old_ctype_file',
        nargs='?',
        type=str,
        default='i18n',
        help='The old ctype file, default: %(default)s')
    PARSER.add_argument(
        '-n', '--new_ctype_file',
        nargs='?',
        type=str,
        default='unicode-ctype',
        help='The new ctype file, default: %(default)s')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help=('Show characters which were added to each '
              + 'character class in detail.'))
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help=('Show characters which were removed from each '
              + 'character class in detail.'))
    # ARGS has to stay a module level name, the reporting code of this
    # file reads ARGS.show_missing_characters and
    # ARGS.show_added_characters.
    ARGS = PARSER.parse_args()

    OLD_CTYPE_DICT = extract_character_classes(
        ARGS.old_ctype_file)
    NEW_CTYPE_DICT = extract_character_classes(
        ARGS.new_ctype_file)
    compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
    print('============================================================')
    print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('Old file = %s' %ARGS.old_ctype_file)
    print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
    print('------------------------------------------------------------')
    print('============================================================')
    print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('New file = %s' %ARGS.new_ctype_file)
    print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
    print('------------------------------------------------------------')
    # Only errors in the new file make the script fail; errors in the
    # old file are reported but do not influence the exit status.
    # sys.exit() is used instead of the exit() helper which is only
    # available when the “site” module has been loaded.
    if NUMBER_OF_ERRORS_IN_NEW_FILE > 0:
        sys.exit(1)
    else:
        sys.exit(0)