#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2019 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''glibc/localedata/charmaps/UTF-8 file generator script

This script generates a glibc/localedata/charmaps/UTF-8 file
from Unicode data.

Usage: python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt \
           -p PropList.txt --unicode_version <version>

It will output the UTF-8 file.
'''

import argparse
import sys
import re
import unicode_utils

# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)

def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24 Bruno Haible <haible@clisp.cons.org>
        # * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        # so they become printable and carry a width. Comment out surrogate
        # ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
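        # A worked example of the index arithmetic used below
        # (a sketch; U+AC00 is the first Hangul syllable):
        #     divmod(0xAC00 - 0xAC00, 28) == (0, 0)
        #     divmod(0, 21) == (0, 0)
        # which selects initial 'G', medial 'A' and final '', giving
        # the name 'HANGUL SYLLABLE GA'.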
        for i in range(int(start, 16), int(end, 16)+1):
            index2, index3 = divmod(i - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + JAMO_INITIAL_SHORT_NAME[index1] \
                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
                                   + JAMO_FINAL_SHORT_NAME[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
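    #
    # The last chunk of such a range is truncated at the range's real
    # end (the <U4D80>..<U4DB5> line above), which is what the
    # early-exit check in the loop below takes care of.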
    for i in range(int(start, 16), int(end, 16), 64):
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(i),
                unicode_utils.ucs_symbol(int(end, 16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(i),
            unicode_utils.ucs_symbol(i+63),
            convert_to_hex(i),
            name))

def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP

    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))

def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
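    # Example (a sketch; the bytes are standard UTF-8):
    #     >>> convert_to_hex(0x20AC)   # U+20AC EURO SIGN
    #     '/xe2/x82/xac'
    # Lone surrogates such as 0xD800 cannot go through str.encode()
    # and are looked up in the table below instead; the six table
    # entries are exactly the First/Last code points of the three
    # surrogate ranges listed in UnicodeData.txt.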
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for
    # surrogates. Therefore, we use this conversion table
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    if code_point in surrogates:
        return surrogates[code_point]
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])

def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    outfile.write("<code_set_name> UTF-8\n")
    outfile.write("<comment_char> %\n")
    outfile.write("<escape_char> /\n")
    outfile.write("<mb_cur_min> 1\n")
    outfile.write("<mb_cur_max> 6\n\n")
    outfile.write("% CHARMAP generated using utf8_gen.py\n")
    outfile.write("% alias ISO-10646/UTF-8\n")
    outfile.write("CHARMAP\n")

def write_header_width(outfile, unicode_version):
    '''Writes the header on top of the WIDTH section to the output file'''
    outfile.write('% Character width according to Unicode '
                  + '{:s}.\n'.format(unicode_version))
    outfile.write('% - Default width is 1.\n')
    outfile.write('% - Double-width characters have width 2; generated from\n')
    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
    outfile.write('% - Non-spacing characters have width 0; '
                  + 'generated from PropList.txt or\n')
    outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
                  + 'UnicodeData.txt"\n')
    outfile.write('% - Format control characters have width 0; '
                  + 'generated from\n')
    outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
# Not needed, covered by Cf:
#    outfile.write("% - Zero width characters have width 0; generated from\n")
#    outfile.write("%   \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
    outfile.write("WIDTH\n")

def process_width(outfile, ulines, elines, plines):
    '''ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    '''
    width_dict = {}
    for line in elines:
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            width_dict[key] = 2

    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
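        # (These characters are all in category Cf, so the loop over
        # ulines above set their width to 0; deleting the entry here
        # restores the default width of 1.)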
        fields = line.split(";")
        if not '..' in fields[0]:
            code_points = (fields[0], fields[0])
        else:
            code_points = fields[0].split("..")
        for key in range(int(code_points[0], 16),
                         int(code_points[1], 16)+1):
            del width_dict[key] # default width is 1

    # handle special cases for compatibility
    for key in list((0x00AD,)):
        # https://www.cs.tut.fi/~jkorpela/shy.html
        if key in width_dict:
            del width_dict[key] # default width is 1
    for key in list(range(0x1160, 0x1200)):
        width_dict[key] = 0
    for key in list(range(0x3248, 0x3250)):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in list(range(0x4DC0, 0x4E00)):
        width_dict[key] = 2

    same_width_lists = []
    current_width_list = []
    for key in sorted(width_dict):
        if not current_width_list:
            current_width_list = [key]
        elif (key == current_width_list[-1] + 1
              and width_dict[key] == width_dict[current_width_list[0]]):
            current_width_list.append(key)
        else:
            same_width_lists.append(current_width_list)
            current_width_list = [key]
    if current_width_list:
        same_width_lists.append(current_width_list)
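
    # For example (illustrative): the code points 0x1160..0x11FF all
    # get width 0 above and are consecutive, so they form a single
    # run and the loop below emits one '<U1160>...<U11FF>' line with
    # width 0 instead of 160 separate lines.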
    for same_width_list in same_width_lists:
        if len(same_width_list) == 1:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                width_dict[same_width_list[0]]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(same_width_list[0]),
                unicode_utils.ucs_symbol(same_width_list[-1]),
                width_dict[same_width_list[0]]))

if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt,
        EastAsianWidth.txt, and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        nargs='?',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_width_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
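            #
            # Such a reserved line looks, for example, like this
            # (illustrative; the exact code points and counts vary
            # between Unicode versions):
            #
            # 9FF0..9FFF;W  # Cn [16] <reserved-9FF0>..<reserved-9FFF>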
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w') as OUTFILE:
        # Process UnicodeData.txt and write the CHARMAP section to the
        # UTF-8 file.
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Process EastAsianWidth.txt and write the WIDTH section to the
        # UTF-8 file.
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")