]>
Commit | Line | Data |
---|---|---|
dd8e8e54 CD |
1 | #!/usr/bin/python3 |
2 | # -*- coding: utf-8 -*- | |
3 | # | |
4 | # Generate a translit_compat file from a UnicodeData file. | |
dff8da6b | 5 | # Copyright (C) 2015-2024 Free Software Foundation, Inc. |
dd8e8e54 CD |
6 | # This file is part of the GNU C Library. |
7 | # | |
8 | # The GNU C Library is free software; you can redistribute it and/or | |
9 | # modify it under the terms of the GNU Lesser General Public | |
10 | # License as published by the Free Software Foundation; either | |
11 | # version 2.1 of the License, or (at your option) any later version. | |
12 | # | |
13 | # The GNU C Library is distributed in the hope that it will be useful, | |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
16 | # Lesser General Public License for more details. | |
17 | # | |
18 | # You should have received a copy of the GNU Lesser General Public | |
19 | # License along with the GNU C Library; if not, see | |
5a82c748 | 20 | # <https://www.gnu.org/licenses/>. |
dd8e8e54 CD |
21 | |
22 | ''' | |
23 | Generate a translit_compat file from UnicodeData.txt | |
24 | ||
25 | To see how this script is used, call it with the “-h” option: | |
26 | ||
$ ./gen_translit_compat.py -h
28 | … prints usage message … | |
29 | ''' | |
30 | ||
31 | import argparse | |
32 | import time | |
33 | import unicode_utils | |
34 | ||
def read_input_file(filename):
    '''Read the original glibc translit_compat file.

    Returns a (head, tail) pair of strings: “head” is everything up to
    and including the “translit_start” line, “tail” is everything from
    the “translit_end” line to the end of the file.  Only the part of
    the file between those two markers is regenerated by this script.
    '''
    head_lines = []
    tail_lines = []
    with open(filename, mode='r') as translit_file:
        line_iter = iter(translit_file)
        for line in line_iter:
            head_lines.append(line)
            if line.startswith('translit_start'):
                break
        # Skip the generated middle section until the end marker.
        for line in line_iter:
            if line.startswith('translit_end'):
                tail_lines.append(line)
                break
        # Everything after the end marker belongs to the tail.
        tail_lines.extend(line_iter)
    return (''.join(head_lines), ''.join(tail_lines))
55 | ||
def output_head(translit_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. everything up to and
    including the “translit_start” line.

    If an original file was given on the command line and a non-empty
    head was read from it, that head is copied verbatim; otherwise a
    fresh header is generated.
    '''
    if ARGS.input_file and head:
        translit_file.write(head)
        return
    generated_line = (
        '% Generated automatically from UnicodeData.txt '
        'by gen_translit_compat.py '
        'on {:s} for Unicode {:s}.\n'.format(
            time.strftime('%Y-%m-%d'), unicode_version))
    for text in ('escape_char /\n',
                 'comment_char %\n',
                 unicode_utils.COMMENT_HEADER,
                 '\n',
                 '% Transliterations of compatibility characters ',
                 'and ligatures.\n',
                 generated_line,
                 '\n',
                 'LC_CTYPE\n',
                 '\n',
                 'translit_start\n'):
        translit_file.write(text)
77 | ||
def output_tail(translit_file, tail=''):
    '''Write the tail of the output file, i.e. everything from the
    “translit_end” line to the end.

    A tail read from an original input file is copied verbatim;
    otherwise a default tail is generated.
    '''
    if ARGS.input_file and tail:
        translit_file.write(tail)
        return
    for text in ('translit_end\n', '\n', 'END LC_CTYPE\n'):
        translit_file.write(text)
86 | ||
def compatibility_decompose(code_point):
    '''Return the recursive compatibility decomposition of a code point,
    or an empty list if there is none that is useful here.

    See
    http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings:

    “The compatibility decomposition is formed by recursively applying
    the canonical and compatibility mappings, then applying the
    Canonical Ordering Algorithm.”

    The canonical decomposition is intentionally not done here, it is
    handled by gen_translit_combining.py when generating
    translit_combining.

    Several compatibility formatting tags are also ignored because they
    are handled by other translit_* files, not translit_compat:

        <font>:     translit_font
        <circle>:   translit_circle
        <wide>:     translit_wide
        <narrow>:   translit_narrow
        <square>:   translit_cjk_compat
        <fraction>: translit_fraction

    And <noBreak>, <initial>, <medial>, <final>, <isolated> are ignored
    because they seem to be not useful for transliteration.
    '''
    decomposition = unicode_utils.UNICODE_ATTRIBUTES[
        code_point]['decomposition']
    for tag in ('<compat>', '<super>', '<sub>', '<vertical>'):
        if not decomposition.startswith(tag):
            continue
        # Strip the tag and the following space, then parse the
        # remaining hexadecimal code points.
        fields = decomposition[len(tag) + 1:].split(' ')
        decomposed = [int(field, 16) for field in fields]
        if (len(decomposed) > 1
                and decomposed[0] == 0x0020
                and 0x0300 <= decomposed[1] <= 0x03FF):
            # Decomposes into a space followed by a combining
            # character.  This is not useful for transliteration.
            return []
        result = []
        for sub_point in decomposed:
            # Recurse; keep the code point itself when it does not
            # decompose any further.
            result += compatibility_decompose(sub_point) or [sub_point]
        return result
    return []
141 | ||
def special_decompose(code_point_list):
    '''Apply hand-written decompositions that are not in UnicodeData.txt.

    These rules were used in the original translit_compat file in glibc
    and seem to make sense; they are kept here so that the regenerated
    file stays close to the spirit of the original one.

    Returns the decomposed code point list if a special rule matches,
    otherwise the input list unchanged.
    '''
    special_rules = {
        (0x03BC,): [0x0075], # μ → u
        (0x02BC,): [0x0027], # ʼ → '
    }
    return special_rules.get(tuple(code_point_list), code_point_list)
158 | ||
def special_ligature_decompose(code_point):
    '''Decompose a ligature using hand-written rules.

    These ligature decompositions are not in UnicodeData.txt at all but
    were used in the original translit_compat file in glibc and seem to
    make sense; they are kept here so that the regenerated file stays
    close to the spirit of the original one.

    Returns the decomposition as a list of code points, or the
    one-element list [code_point] if no rule matches.
    '''
    ligature_rules = {
        0x00E6: [0x0061, 0x0065], # æ → ae
        0x00C6: [0x0041, 0x0045], # Æ → AE
        # These following 5 special ligature decompositions were
        # in the original glibc/localedata/locales/translit_compat file
        0x0152: [0x004F, 0x0045], # Œ → OE
        0x0153: [0x006F, 0x0065], # œ → oe
        0x05F0: [0x05D5, 0x05D5], # װ → וו
        0x05F1: [0x05D5, 0x05D9], # ױ → וי
        0x05F2: [0x05D9, 0x05D9], # ײ → יי
        # The following special ligature decompositions were
        # not in the original glibc/localedata/locales/translit_compat file
        # U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
        # → U+041D CYRILLIC CAPITAL LETTER EN,
        #   U+0413 CYRILLIC CAPITAL LETTER GHE
        0x04A4: [0x041D, 0x0413], # Ҥ → НГ
        # U+04A5 CYRILLIC SMALL LIGATURE EN GHE
        # → U+043D CYRILLIC SMALL LETTER EN,
        #   U+0433 CYRILLIC SMALL LETTER GHE
        0x04A5: [0x043D, 0x0433], # ҥ → нг
        # U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
        # → U+0422 CYRILLIC CAPITAL LETTER TE,
        #   U+0426 CYRILLIC CAPITAL LETTER TSE
        0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
        # U+04B5 CYRILLIC SMALL LIGATURE TE TSE
        # → U+0442 CYRILLIC SMALL LETTER TE,
        #   U+0446 CYRILLIC SMALL LETTER TSE
        0x04B5: [0x0442, 0x0446], # ҵ → тц
        # U+04D4 CYRILLIC CAPITAL LIGATURE A IE
        # → U+0410 CYRILLIC CAPITAL LETTER A,
        #   U+0415 CYRILLIC CAPITAL LETTER IE
        0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
        # U+04D5 CYRILLIC SMALL LIGATURE A IE
        # → U+0430 CYRILLIC SMALL LETTER A,
        #   U+0435 CYRILLIC SMALL LETTER IE
        0x04D5: [0x0430, 0x0435], # ӕ → ае
        # It is unclear whether the following ligatures should be
        # decomposed at all, therefore no rules are given for them:
        # U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
        # U+06D6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
        # U+06D7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
        # U+FDFD ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
        # U+FE20 COMBINING LIGATURE LEFT HALF
        # U+FE21 COMBINING LIGATURE RIGHT HALF
        # U+FE27 COMBINING LIGATURE LEFT HALF BELOW
        # U+FE28 COMBINING LIGATURE RIGHT HALF BELOW
        # U+11176 MAHAJANI LIGATURE SHRI
        # U+1F670 SCRIPT LIGATURE ET ORNAMENT
        # U+1F671 HEAVY SCRIPT LIGATURE ET ORNAMENT
        # U+1F672 LIGATURE OPEN ET ORNAMENT
        # U+1F673 HEAVY LIGATURE OPEN ET ORNAMENT
    }
    return ligature_rules.get(code_point, [code_point])
224 | ||
def output_transliteration(translit_file):
    '''Write the new transliteration rules to the output file.

    For every code point in unicode_utils.UNICODE_ATTRIBUTES, write one
    rule of the form

        <UXXXX> "<UYYYY><UZZZZ>";"<U....>"

    where each double-quoted group is one fallback alternative, built
    from the compatibility decomposition refined by the special_decompose
    rules.  Ligatures (except Arabic ones) without a compatibility
    decomposition are handled via special_ligature_decompose.
    '''
    translit_file.write('\n')
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        # decomposed_code_points is a list of decomposition “stages”;
        # each stage is itself a list of code points and each stage is
        # written as one ";"-separated alternative in the output rule.
        decomposed_code_points = [compatibility_decompose(code_point)]
        if not decomposed_code_points[0]:
            # No compatibility decomposition: a special_decompose rule
            # may still apply directly to this single code point.
            if special_decompose([code_point]) != [code_point]:
                decomposed_code_points[0] = special_decompose([code_point])
        else:
            # Refine the compatibility decomposition to a fixed point:
            # first try special_decompose on the whole last stage, then
            # element-wise on each code point of the last stage, and
            # append every new stage produced, until nothing changes.
            special_decomposed_code_points = []
            while True:
                special_decomposed_code_points = special_decompose(
                    decomposed_code_points[-1])
                if (special_decomposed_code_points
                        != decomposed_code_points[-1]):
                    decomposed_code_points.append(
                        special_decomposed_code_points)
                    continue
                special_decomposed_code_points = []
                for decomposed_code_point in decomposed_code_points[-1]:
                    special_decomposed_code_points += special_decompose(
                        [decomposed_code_point])
                if (special_decomposed_code_points
                        == decomposed_code_points[-1]):
                    break
                decomposed_code_points.append(
                    special_decomposed_code_points)
        if decomposed_code_points[0]:
            # Write one alternative per decomposition stage, each
            # enclosed in double quotes and separated by ";".
            translit_file.write('% {:s}\n'.format(name))
            translit_file.write('{:s} '.format(
                unicode_utils.ucs_symbol(code_point)))
            for index in range(0, len(decomposed_code_points)):
                if index > 0:
                    translit_file.write(';')
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points[index]:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
            translit_file.write('\n')
        elif 'LIGATURE' in name and 'ARABIC' not in name:
            # Ligature without a usable compatibility decomposition:
            # fall back to the hand-written ligature rules.
            decomposed_code_points = special_ligature_decompose(code_point)
            if decomposed_code_points[0] != code_point:
                translit_file.write('% {:s}\n'.format(name))
                translit_file.write('{:s} '.format(
                    unicode_utils.ucs_symbol(code_point)))
                translit_file.write('"')
                for decomposed_code_point in decomposed_code_points:
                    translit_file.write('{:s}'.format(
                        unicode_utils.ucs_symbol(decomposed_code_point)))
                translit_file.write('"')
                translit_file.write('\n')
            else:
                # No rule exists for this ligature; flag it so a new
                # Unicode version adding ligatures gets noticed.
                print('Warning: unhandled ligature: {:x} {:s}'.format(
                    code_point, name))
    translit_file.write('\n')
282 | ||
if __name__ == "__main__":
    # Command-line interface.  ARGS is intentionally a module-level
    # name: output_head() and output_tail() read ARGS.input_file.
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a translit_compat file from UnicodeData.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help=''' The original glibc/localedata/locales/translit_compat
        file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='translit_compat.new',
        help='''The new translit_compat file, default: %(default)s. If the
        original glibc/localedata/locales/translit_compat file has
        been given as an option, the header up to the
        “translit_start” line and the tail from the “translit_end”
        line to the end of the file will be copied unchanged into the
        output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # Load the Unicode character database once; the generator functions
    # read it through unicode_utils.UNICODE_ATTRIBUTES.
    unicode_utils.fill_attributes(ARGS.unicode_data_file)
    HEAD = TAIL = ''
    if ARGS.input_file:
        # Preserve the original file's framing around the generated part.
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
        output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
        output_transliteration(TRANSLIT_FILE)
        output_tail(TRANSLIT_FILE, tail=TAIL)