localedata/unicode-gen/utf8_gen.py

   1 #!/usr/bin/python3
   2 # -*- coding: utf-8 -*-
   3 # Copyright (C) 2014-2019 Free Software Foundation, Inc.
   4 # This file is part of the GNU C Library.
   5 #
   6 # The GNU C Library is free software; you can redistribute it and/or
   7 # modify it under the terms of the GNU Lesser General Public
   8 # License as published by the Free Software Foundation; either
   9 # version 2.1 of the License, or (at your option) any later version.
  10 #
  11 # The GNU C Library is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14 # Lesser General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU Lesser General Public
  17 # License along with the GNU C Library; if not, see
  18 # <http://www.gnu.org/licenses/>.
  19
  20 '''glibc/localedata/charmaps/UTF-8 file generator script
  21
  22 This script generates a glibc/localedata/charmaps/UTF-8 file
  23 from Unicode data.
  24
  25 Usage: python3 utf8_gen.py --unicode_version VERSION [-u FILE] [-e FILE] [-p FILE]
  26
  27 It writes its output to a file named UTF-8 in the current directory.
  28 '''
  29
  30 import argparse
  31 import sys
  32 import re
  33 import unicode_utils
  34
  35 # Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
  36 # sections 3.11 and 4.4.
  37
  38 JAMO_INITIAL_SHORT_NAME = (
  39     'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
  40     'C', 'K', 'T', 'P', 'H'
  41 )
  42
  43 JAMO_MEDIAL_SHORT_NAME = (
  44     'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
  45     'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
  46 )
  47
  48 JAMO_FINAL_SHORT_NAME = (
  49     '', 'G', 'GG', 'GS', 'N', 'NI', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
  50     'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
  51     'P', 'H'
  52 )
  53
  54 def process_range(start, end, outfile, name):
  55     '''Writes a range of code points into the CHARMAP section of the
  56     output file
  57
  58     '''
  59     if 'Hangul Syllable' in name:
  60         # from glibc/localedata/ChangeLog:
  61         #
  62         #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
  63         #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
  64         #  so they become printable and carry a width. Comment out surrogate
  65         #  ranges. Add a WIDTH table
  66         #
  67         # So we expand the Hangul Syllables here:
  68         for i in range(int(start, 16), int(end, 16)+1 ):
  69             index2, index3 = divmod(i - 0xaC00, 28)
  70             index1, index2 = divmod(index2, 21)
  71             hangul_syllable_name = 'HANGUL SYLLABLE ' \
  72                                    + JAMO_INITIAL_SHORT_NAME[index1] \
  73                                    + JAMO_MEDIAL_SHORT_NAME[index2] \
  74                                    + JAMO_FINAL_SHORT_NAME[index3]
  75             outfile.write('{:<11s} {:<12s} {:s}\n'.format(
  76                 unicode_utils.ucs_symbol(i), convert_to_hex(i),
  77                 hangul_syllable_name))
  78         return
  79     # UnicodeData.txt file has contains code point ranges like this:
  80     #
  81     # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
  82     # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
  83     #
  84     # The glibc UTF-8 file splits ranges like these into shorter
  85     # ranges of 64 code points each:
  86     #
  87     # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
  88     # …
  89     # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
  90     for i in range(int(start, 16), int(end, 16), 64 ):
  91         if i > (int(end, 16)-64):
  92             outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
  93                     unicode_utils.ucs_symbol(i),
  94                     unicode_utils.ucs_symbol(int(end,16)),
  95                     convert_to_hex(i),
  96                     name))
  97             break
  98         outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
  99                 unicode_utils.ucs_symbol(i),
 100                 unicode_utils.ucs_symbol(i+63),
 101                 convert_to_hex(i),
 102                 name))
 103
 104 def process_charmap(flines, outfile):
 105     '''This function takes an array which contains *all* lines of
 106     of UnicodeData.txt and write lines to outfile as used in the
 107
 108     CHARMAP
 109     …
 110     END CHARMAP
 111
 112     section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.
 113
 114     Samples for input lines:
 115
 116     0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
 117     3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 118     4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 119     D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
 120     DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
 121     100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
 122     10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
 123
 124     Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):
 125
 126     <U0010>     /x10 DATA LINK ESCAPE
 127     <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
 128     %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
 129     %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
 130     <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>
 131
 132     '''
 133     fields_start = []
 134     for line in flines:
 135         fields = line.split(";")
 136          # Some characters have “<control>” as their name. We try to
 137          # use the “Unicode 1.0 Name” (10th field in
 138          # UnicodeData.txt) for them.
 139          #
 140          # The Characters U+0080, U+0081, U+0084 and U+0099 have
 141          # “<control>” as their name but do not even have aa
 142          # ”Unicode 1.0 Name”. We could write code to take their
 143          # alternate names from NameAliases.txt.
 144         if fields[1] == "<control>" and fields[10]:
 145             fields[1] = fields[10]
 146         # Handling code point ranges like:
 147         #
 148         # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
 149         # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
 150         if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
 151             fields_start = fields
 152             continue
 153         if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
 154             process_range(fields_start[0], fields[0],
 155                           outfile, fields[1][:-7]+'>')
 156             fields_start = []
 157             continue
 158         fields_start = []
 159         if 'Surrogate,' in fields[1]:
 160             # Comment out the surrogates in the UTF-8 file.
 161             # One could of course skip them completely but
 162             # the original UTF-8 file in glibc had them as
 163             # comments, so we keep these comment lines.
 164             outfile.write('%')
 165         outfile.write('{:<11s} {:<12s} {:s}\n'.format(
 166                 unicode_utils.ucs_symbol(int(fields[0], 16)),
 167                 convert_to_hex(int(fields[0], 16)),
 168                 fields[1]))
 169
 170 def convert_to_hex(code_point):
 171     '''Converts a code point to a hexadecimal UTF-8 representation
 172     like /x**/x**/x**.'''
 173     # Getting UTF8 of Unicode characters.
 174     # In Python3, .encode('UTF-8') does not work for
 175     # surrogates. Therefore, we use this conversion table
 176     surrogates = {
 177         0xD800: '/xed/xa0/x80',
 178         0xDB7F: '/xed/xad/xbf',
 179         0xDB80: '/xed/xae/x80',
 180         0xDBFF: '/xed/xaf/xbf',
 181         0xDC00: '/xed/xb0/x80',
 182         0xDFFF: '/xed/xbf/xbf',
 183     }
 184     if code_point in surrogates:
 185         return surrogates[code_point]
 186     return ''.join([
 187         '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
 188     ])
 189
 190 def write_header_charmap(outfile):
 191     '''Write the header on top of the CHARMAP section to the output file'''
 192     outfile.write("<code_set_name> UTF-8\n")
 193     outfile.write("<comment_char> %\n")
 194     outfile.write("<escape_char> /\n")
 195     outfile.write("<mb_cur_min> 1\n")
 196     outfile.write("<mb_cur_max> 6\n\n")
 197     outfile.write("% CHARMAP generated using utf8_gen.py\n")
 198     outfile.write("% alias ISO-10646/UTF-8\n")
 199     outfile.write("CHARMAP\n")
 200
 201 def write_header_width(outfile, unicode_version):
 202     '''Writes the header on top of the WIDTH section to the output file'''
 203     outfile.write('% Character width according to Unicode '
 204                   + '{:s}.\n'.format(unicode_version))
 205     outfile.write('% - Default width is 1.\n')
 206     outfile.write('% - Double-width characters have width 2; generated from\n')
 207     outfile.write('%        "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
 208     outfile.write('% - Non-spacing characters have width 0; '
 209                   + 'generated from PropList.txt or\n')
 210     outfile.write('%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
 211                   + 'UnicodeData.txt"\n')
 212     outfile.write('% - Format control characters have width 0; '
 213                   + 'generated from\n')
 214     outfile.write("%   \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
 215 #   Not needed covered by Cf
 216 #    outfile.write("% - Zero width characters have width 0; generated from\n")
 217 #    outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
 218     outfile.write("WIDTH\n")
 219
 220 def process_width(outfile, ulines, elines, plines):
 221     '''ulines are lines from UnicodeData.txt, elines are lines from
 222     EastAsianWidth.txt containing characters with width “W” or “F”,
 223     plines are lines from PropList.txt which contain characters
 224     with the property “Prepended_Concatenation_Mark”.
 225
 226     '''
 227     width_dict = {}
 228     for line in elines:
 229         fields = line.split(";")
 230         if not '..' in fields[0]:
 231             code_points = (fields[0], fields[0])
 232         else:
 233             code_points = fields[0].split("..")
 234         for key in range(int(code_points[0], 16),
 235                          int(code_points[1], 16)+1):
 236             width_dict[key] = 2
 237
 238     for line in ulines:
 239         fields = line.split(";")
 240         if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
 241             width_dict[int(fields[0], 16)] = 0
 242
 243     for line in plines:
 244         # Characters with the property “Prepended_Concatenation_Mark”
 245         # should have the width 1:
 246         fields = line.split(";")
 247         if not '..' in fields[0]:
 248             code_points = (fields[0], fields[0])
 249         else:
 250             code_points = fields[0].split("..")
 251         for key in range(int(code_points[0], 16),
 252                          int(code_points[1], 16)+1):
 253             del width_dict[key] # default width is 1
 254
 255     # handle special cases for compatibility
 256     for key in list((0x00AD,)):
 257         # https://www.cs.tut.fi/~jkorpela/shy.html
 258         if key in width_dict:
 259             del width_dict[key] # default width is 1
 260     for key in list(range(0x1160, 0x1200)):
 261         width_dict[key] = 0
 262     for key in list(range(0x3248, 0x3250)):
 263         # These are “A” which means we can decide whether to treat them
 264         # as “W” or “N” based on context:
 265         # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
 266         # For us, “W” seems better.
 267         width_dict[key] = 2
 268     for key in list(range(0x4DC0, 0x4E00)):
 269         width_dict[key] = 2
 270
 271     same_width_lists = []
 272     current_width_list = []
 273     for key in sorted(width_dict):
 274         if not current_width_list:
 275             current_width_list = [key]
 276         elif (key == current_width_list[-1] + 1
 277               and width_dict[key] == width_dict[current_width_list[0]]):
 278             current_width_list.append(key)
 279         else:
 280             same_width_lists.append(current_width_list)
 281             current_width_list = [key]
 282     if current_width_list:
 283         same_width_lists.append(current_width_list)
 284
 285     for same_width_list in same_width_lists:
 286         if len(same_width_list) == 1:
 287             outfile.write('{:s}\t{:d}\n'.format(
 288                 unicode_utils.ucs_symbol(same_width_list[0]),
 289                 width_dict[same_width_list[0]]))
 290         else:
 291             outfile.write('{:s}...{:s}\t{:d}\n'.format(
 292                 unicode_utils.ucs_symbol(same_width_list[0]),
 293                 unicode_utils.ucs_symbol(same_width_list[-1]),
 294                 width_dict[same_width_list[0]]))
 295
 296 if __name__ == "__main__":
 297     PARSER = argparse.ArgumentParser(
 298         description='''
 299         Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
 300         ''')
 301     PARSER.add_argument(
 302         '-u', '--unicode_data_file',
 303         nargs='?',
 304         type=str,
 305         default='UnicodeData.txt',
 306         help=('The UnicodeData.txt file to read, '
 307               + 'default: %(default)s'))
 308     PARSER.add_argument(
 309         '-e', '--east_asian_with_file',
 310         nargs='?',
 311         type=str,
 312         default='EastAsianWidth.txt',
 313         help=('The EastAsianWidth.txt file to read, '
 314               + 'default: %(default)s'))
 315     PARSER.add_argument(
 316         '-p', '--prop_list_file',
 317         nargs='?',
 318         type=str,
 319         default='PropList.txt',
 320         help=('The PropList.txt file to read, '
 321               + 'default: %(default)s'))
 322     PARSER.add_argument(
 323         '--unicode_version',
 324         nargs='?',
 325         required=True,
 326         type=str,
 327         help='The Unicode version of the input files used.')
 328     ARGS = PARSER.parse_args()
 329
 330     with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
 331         UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
 332     with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
 333         EAST_ASIAN_WIDTH_LINES = []
 334         for LINE in EAST_ASIAN_WIDTH_FILE:
 335             # If characters from EastAasianWidth.txt which are from
 336             # from reserved ranges (i.e. not yet assigned code points)
 337             # are added to the WIDTH section of the UTF-8 file, then
 338             # “make check” produces “Unknown Character” errors for
 339             # these code points because such unassigned code points
 340             # are not in the CHARMAP section of the UTF-8 file.
 341             #
 342             # Therefore, we skip all reserved code points when reading
 343             # the EastAsianWidth.txt file.
 344             if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
 345                 continue
 346             if re.match(r'^[^;]*;[WF]', LINE):
 347                 EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
 348     with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
 349         PROP_LIST_LINES = []
 350         for LINE in PROP_LIST_FILE:
 351             if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
 352                 PROP_LIST_LINES.append(LINE.strip())
 353     with open('UTF-8', mode='w') as OUTFILE:
 354         # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
 355         write_header_charmap(OUTFILE)
 356         process_charmap(UNICODE_DATA_LINES, OUTFILE)
 357         OUTFILE.write("END CHARMAP\n\n")
 358         # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
 359         write_header_width(OUTFILE, ARGS.unicode_version)
 360         process_width(OUTFILE,
 361                       UNICODE_DATA_LINES,
 362                       EAST_ASIAN_WIDTH_LINES,
 363                       PROP_LIST_LINES)
 364         OUTFILE.write("END WIDTH\n")