#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2019 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt \
           -p PropList.txt --unicode_version <version>

It will output the UTF-8 file.
'''

import argparse
import sys
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24 Bruno Haible <haible@clisp.cons.org>
        # * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        # so they become printable and carry a width. Comment out surrogate
        # ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
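        # A worked example of the index arithmetic used below
        # (a sketch; U+AC00 is the first Hangul syllable):
        #     divmod(0xAC00 - 0xAC00, 28) == (0, 0)
        #     divmod(0, 21) == (0, 0)
        # which selects initial 'G', medial 'A' and final '', giving
        # the name 'HANGUL SYLLABLE GA'.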
        for i in range(int(start, 16), int(end, 16)+1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
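    #
    # The last chunk of such a range is truncated at the range's real
    # end (the <U4D80>..<U4DB5> line above), which is what the
    # early-exit check in the loop below takes care of.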
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))

def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP

    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
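    # Example (a sketch; the bytes are standard UTF-8):
    #     >>> convert_to_hex(0x20AC)   # U+20AC EURO SIGN
    #     '/xe2/x82/xac'
    # Lone surrogates such as 0xD800 cannot go through str.encode()
    # and are looked up in the table below instead; the six table
    # entries are exactly the First/Last code points of the three
    # surrogate ranges listed in UnicodeData.txt.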
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode '
                  + '{:s}.\n'.format(unicode_version))
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
# Not needed, covered by Cf:
#    outfile.write("% - Zero width characters have width 0; generated from\n")
#    outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    '''
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2

    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
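        # (These characters are all in category Cf, so the loop over
        # ulines above set their width to 0; deleting the entry here
        # restores the default width of 1.)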
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            del width_dict[key] # default width is 1

    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key] # default width is 1
    for key in list(range(0x1160, 0x1200)):
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)
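
    # For example (illustrative): the code points 0x1160..0x11FF all
    # get width 0 above and are consecutive, so they form a single
    # run and the loop below emits one '<U1160>...<U11FF>' line with
    # width 0 instead of 160 separate lines.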
    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt,
        EastAsianWidth.txt, and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        nargs='?',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
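            #
            # Such a reserved line looks, for example, like this
            # (illustrative; the exact code points and counts vary
            # between Unicode versions):
            #
            # 9FF0..9FFF;W  # Cn [16] <reserved-9FF0>..<reserved-9FFF>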
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP section to the
        # UTF-8 file.
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process EastAsianWidth.txt and write the WIDTH section to the
        # UTF-8 file.
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")