#!/usr/bin/python3
#
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
# Copyright (C) 2014-2020 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
DerivedCoreProperties.txt files.

To see how this script is used, call it with the “-h” option:

    $ ./gen_unicode_ctype.py -h
    … prints usage message …
'''

import argparse
import time
import re
import unicode_utils

def code_point_ranges(is_class_function, code_points=None):
    '''Return a list of ranges of code points for which is_class_function
    returns True.

    is_class_function: a predicate taking an integer code point.
    code_points: an optional iterable of code points to test, which must
        be sorted in ascending order.  When omitted (the default, and the
        original behavior), all code points which have Unicode attributes
        (sorted(unicode_utils.UNICODE_ATTRIBUTES)) are tested.

    A two-element range [first, last] is inclusive at both ends; an
    isolated code point is represented as a one-element list.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    if code_points is None:
        # Default: every code point known to unicode_utils, ascending.
        code_points = sorted(unicode_utils.UNICODE_ATTRIBUTES)
    cp_ranges = []
    for code_point in code_points:
        if is_class_function(code_point):
            if (cp_ranges
                and cp_ranges[-1][-1] == code_point - 1):
                # Contiguous with the previous range: extend it.
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                # Gap before this code point: start a new range.
                cp_ranges.append([code_point])
    return cp_ranges
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    i18n_file: open output file to write the section to
    class_name: the keyword written before the list, e.g. “upper”
    is_class_function: predicate selecting the code points of the class

    If no code point satisfies is_class_function, nothing is written.

    Example:

    upper /
    <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
    <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
    <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        # Lines are wrapped at max_column; a wrapped line ends in the
        # locale-source continuation marker “/”.
        max_column = 75
        prefix = '   '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            # A non-empty line already carries a range: separate the
            # next one with “;”.
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                # Isolated code point, e.g. “<U0100>”.
                range_string = unicode_utils.ucs_symbol(code_point_range[0])
            else:
                # Inclusive range, e.g. “<U0041>..<U005A>”.
                range_string = unicode_utils.ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            if len(line+range_string) > max_column:
                # Appending would overflow: flush the current line with a
                # continuation marker and start a fresh indented line.
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        # Flush the final (partial) line, then a blank separator line.
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    i18n_file: open output file to write the section to
    map_name: the keyword written before the list, e.g. “toupper”
    map_function: maps a code point to its converted code point;
        identity mappings are omitted from the output

    Example:

    toupper /
    (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
    (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
    (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    # Lines are wrapped at max_column; a wrapped line ends in the
    # locale-source continuation marker “/”.
    max_column = 75
    prefix = '   '
    line = prefix
    map_string = ''
    # Unlike output_charclass, the section header is written
    # unconditionally, even if the map turns out to be empty.
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        # Only code points which actually change are listed.
        if code_point != mapped:
            # A non-empty line already carries a pair: separate the
            # next one with “;”.
            if line.strip():
                line += ';'
            # One “(<source>,<target>)” pair.
            map_string = '(' \
                         + unicode_utils.ucs_symbol(code_point) \
                         + ',' \
                         + unicode_utils.ucs_symbol(mapped) \
                         + ')'
            if len(line+map_string) > max_column:
                # Appending would overflow: flush with a continuation
                # marker and start a fresh indented line.
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    # Flush the final (partial) line, then a blank separator line.
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
def read_input_file(filename):
    '''Read the original glibc i18n file and return its head and tail.

    Only the character classes in LC_CTYPE and the date stamp are meant
    to be regenerated; everything else in the i18n file must survive
    unchanged.  Reading the original file here makes it possible to emit
    a complete result file instead of forcing a manual cut-and-paste.

    Returns a tuple (head, tail): “head” is everything up to and
    including the “LC_CTYPE” line (with the LC_IDENTIFICATION date stamp
    replaced by today's date), “tail” is everything from the
    “translit_start” line to the end of the file.
    '''
    date_line_pattern = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    head_lines = []
    tail_lines = []
    with open(filename, mode='r') as i18n_file:
        lines = iter(i18n_file)
        # Collect the head, rewriting the date stamp on the way.
        for line in lines:
            matched = date_line_pattern.match(line)
            if matched:
                line = matched.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head_lines.append(line)
            if line.startswith('LC_CTYPE'):
                break
        # Skip the old LC_CTYPE body up to “translit_start”.
        for line in lines:
            if line.startswith('translit_start'):
                tail_lines.append(line)
                break
        # Everything after that belongs to the tail verbatim.
        tail_lines.extend(lines)
    return (''.join(head_lines), ''.join(tail_lines))
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    If an original i18n file was given on the command line and its head
    was read, that head is copied verbatim.  Otherwise a minimal file
    preamble with an LC_IDENTIFICATION section is generated from scratch.
    '''
    if ARGS.input_file and head:
        i18n_file.write(head)
        return
    # No original head available: generate a fresh preamble.
    i18n_file.write('escape_char /\n')
    i18n_file.write('comment_char %\n')
    i18n_file.write('\n')
    i18n_file.write('% Generated automatically by '
                    + 'gen_unicode_ctype.py '
                    + 'for Unicode {:s}.\n'.format(unicode_version))
    i18n_file.write('\n')
    i18n_file.write('LC_IDENTIFICATION\n')
    # Keyword/value pairs of the LC_IDENTIFICATION section; the values
    # are aligned at a common column as in the locale source files.
    identification = (
        ('title', '"Unicode {:s} FDCC-set"'.format(unicode_version)),
        ('source', '"UnicodeData.txt, DerivedCoreProperties.txt"'),
        ('address', '""'),
        ('contact', '""'),
        ('email', '"bug-glibc-locales@gnu.org"'),
        ('tel', '""'),
        ('fax', '""'),
        ('language', '""'),
        ('territory', '"Earth"'),
        ('revision', '"{:s}"'.format(unicode_version)),
        ('date', '"{:s}"'.format(time.strftime('%Y-%m-%d'))),
        ('category', '"i18n:2012";LC_CTYPE'),
    )
    for keyword, value in identification:
        i18n_file.write('{:<9s} {:s}\n'.format(keyword, value))
    i18n_file.write('END LC_IDENTIFICATION\n')
    i18n_file.write('\n')
    i18n_file.write('LC_CTYPE\n')
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    The tail read from an original i18n file is copied verbatim when one
    was given on the command line; otherwise only the closing
    “END LC_CTYPE” line is emitted.
    '''
    if not (ARGS.input_file and tail):
        i18n_file.write('END LC_CTYPE\n')
        return
    i18n_file.write(tail)
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file.

    i18n_file: open output file, positioned inside the LC_CTYPE section
    unicode_version: Unicode version string, quoted in the comments
    turkish: if True, use the Turkish case conversions
        (dotted/dotless “i”) for toupper/tolower
    '''
    # Introductory comment block of the LC_CTYPE section.
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    # “outdigit” is deliberately left commented out: localedef supplies
    # the default "0".."9", and emitting it here would prevent locales
    # which copy this file from defining their own values.
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    # Case-conversion maps: Turkish locales map i↔İ and ı↔I, so they
    # get their own conversion tables.
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
if __name__ == "__main__":
    # Command-line interface: input data files, optional original i18n
    # file, output file, and the Unicode version (required).
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    ARGS = PARSER.parse_args()

    # Load the Unicode data the output functions rely on, and run the
    # consistency checks of unicode_utils before writing anything.
    unicode_utils.fill_attributes(
        ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()
    # When an original i18n file is given, preserve its head and tail
    # around the regenerated LC_CTYPE tables.
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)