]>
Commit | Line | Data |
---|---|---|
4a4839c9 AO |
1 | #!/usr/bin/python3 |
2 | # -*- coding: utf-8 -*- | |
04277e02 | 3 | # Copyright (C) 2014-2019 Free Software Foundation, Inc. |
4a4839c9 AO |
4 | # This file is part of the GNU C Library. |
5 | # | |
6 | # The GNU C Library is free software; you can redistribute it and/or | |
7 | # modify it under the terms of the GNU Lesser General Public | |
8 | # License as published by the Free Software Foundation; either | |
9 | # version 2.1 of the License, or (at your option) any later version. | |
10 | # | |
11 | # The GNU C Library is distributed in the hope that it will be useful, | |
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | # Lesser General Public License for more details. | |
15 | # | |
16 | # You should have received a copy of the GNU Lesser General Public | |
17 | # License along with the GNU C Library; if not, see | |
5a82c748 | 18 | # <https://www.gnu.org/licenses/>. |
4a4839c9 AO |
19 | |
20 | '''glibc/localedata/charmaps/UTF-8 file generator script | |
21 | ||
22 | This script generates a glibc/localedata/charmaps/UTF-8 file | |
23 | from Unicode data. | |
24 | ||
25 | Usage: python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt -p PropList.txt --unicode_version VERSION | |
26 | ||
27 | It writes its output to a file named UTF-8 in the current directory. | |
28 | ''' | |
29 | ||
4beefeeb | 30 | import argparse |
4a4839c9 AO |
31 | import sys |
32 | import re | |
dd8e8e54 | 33 | import unicode_utils |
4a4839c9 AO |
34 | |
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.
#
# The name of a precomposed Hangul syllable is 'HANGUL SYLLABLE '
# followed by one entry from each of the three tables below.  The
# tables are indexed by the values obtained by decomposing the offset
# of the syllable from U+AC00 (see process_range()).

# 19 entries, indexed by (offset // 28) // 21.
JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

# 21 entries, indexed by (offset // 28) % 21.
JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

# 28 entries, indexed by offset % 28.  Index 0 is the empty string.
# Index 5 is 'NJ' (not 'NI'): e.g. U+C549 is HANGUL SYLLABLE ANJ.
JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
4a4839c9 | 53 | |
4a4839c9 AO |
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    start and end are the first and the last code point of the range
    (both inclusive), given as hexadecimal strings.  name is written
    into the last column of every output line, except for Hangul
    syllables, which get an individual name each.

    '''
    first = int(start, 16)
    last = int(end, 16)
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for code_point in range(first, last + 1):
            # Decompose the offset from U+AC00 into the indexes of the
            # initial, medial and final part of the syllable.
            rest, final = divmod(code_point - 0xAC00, 28)
            initial, medial = divmod(rest, 21)
            hangul_syllable_name = ('HANGUL SYLLABLE '
                                    + JAMO_INITIAL_SHORT_NAME[initial]
                                    + JAMO_MEDIAL_SHORT_NAME[medial]
                                    + JAMO_FINAL_SHORT_NAME[final])
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(code_point),
                convert_to_hex(code_point),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    #
    # The upper bound of range() is last + 1 so that the last code
    # point is not lost when the length of the range is 1 or of the
    # form 64 * n + 1; the final chunk is clipped to end at last.
    for chunk_start in range(first, last + 1, 64):
        chunk_end = min(chunk_start + 63, last)
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(chunk_start),
            unicode_utils.ucs_symbol(chunk_end),
            convert_to_hex(chunk_start),
            name))
103 | ||
def process_charmap(flines, outfile):
    '''Writes the lines of the CHARMAP section to the output file

    flines is a sequence which contains *all* lines of UnicodeData.txt.
    The lines written to outfile are those found between

    CHARMAP
    …
    END CHARMAP

    in the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    Raises ValueError if a “…, Last>” line is not directly preceded
    by the matching “…, First>” line.

    '''
    # First code point (hexadecimal string) of a range whose
    # “…, First>” line was seen on the previous line, otherwise None.
    range_start = None
    for line in flines:
        if not line.strip():
            # Blank lines (for example at the end of the file) carry
            # no data and have no name field to look at.
            continue
        fields = line.split(";")
        name = fields[1]
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The Characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if name == "<control>" and fields[10]:
            name = fields[10]
        is_surrogate = 'Surrogate,' in name
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if not is_surrogate and name.endswith(', First>'):
            range_start = fields[0]
            continue
        if not is_surrogate and name.endswith(', Last>'):
            if range_start is None:
                raise ValueError(
                    'End of range without matching start: ' + line.strip())
            # Strip “, Last>” (7 characters) and close the bracket again.
            process_range(range_start, fields[0], outfile, name[:-7] + '>')
            range_start = None
            continue
        range_start = None
        if is_surrogate:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        code_point = int(fields[0], 16)
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(code_point),
            convert_to_hex(code_point),
            name))
169 | ||
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point is an integer.  The return value is a string with one
    “/x” followed by two lowercase hexadecimal digits for each byte of
    the UTF-8 encoding of the code point.
    '''
    # In Python3, a plain .encode('UTF-8') refuses to encode
    # surrogates (U+D800..U+DFFF).  The “surrogatepass” error handler
    # encodes them like any other code point of that size, which works
    # for every surrogate and not only for the range boundaries.
    return ''.join(
        '/x{:02x}'.format(byte)
        for byte in chr(code_point).encode('UTF-8', 'surrogatepass'))
189 | ||
def write_header_charmap(outfile):
    '''Writes the lines which precede the entries of the CHARMAP
    section, up to and including the “CHARMAP” keyword, to outfile'''
    header_lines = (
        "<code_set_name> UTF-8\n",
        "<comment_char> %\n",
        "<escape_char> /\n",
        "<mb_cur_min> 1\n",
        "<mb_cur_max> 6\n\n",
        "% CHARMAP generated using utf8_gen.py\n",
        "% alias ISO-10646/UTF-8\n",
        "CHARMAP\n",
    )
    for header_line in header_lines:
        outfile.write(header_line)
200 | ||
def write_header_width(outfile, unicode_version):
    '''Writes the comment lines which explain the WIDTH section,
    followed by the “WIDTH” keyword, to outfile

    unicode_version is the version string mentioned in the first
    comment line.
    '''
    # There is no separate entry for “ZERO WIDTH …” characters: they
    # are already covered by the general category Cf.
    header_lines = (
        '% Character width according to Unicode {:s}.\n'.format(
            unicode_version),
        '% - Default width is 1.\n',
        '% - Double-width characters have width 2; generated from\n',
        '% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n',
        '% - Non-spacing characters have width 0; '
        'generated from PropList.txt or\n',
        '% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' UnicodeData.txt"\n',
        '% - Format control characters have width 0; generated from\n',
        "% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n",
        "WIDTH\n",
    )
    for header_line in header_lines:
        outfile.write(header_line)
219 | ||
def _parse_code_point_range(field):
    '''Returns (first, last) as integers for a field which is either a
    single hexadecimal code point or a range like “0600..0605”.'''
    if '..' in field:
        first, last = field.split('..')[:2]
    else:
        first = last = field
    return int(first, 16), int(last, 16)

def process_width(outfile, ulines, elines, plines):
    '''Writes the entries of the WIDTH section to outfile

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    Only code points whose width differs from the default width 1 are
    written; consecutive code points of equal width are written as
    one range.

    '''
    # Maps code point -> width (0 or 2).  Code points which are not in
    # this dictionary have the default width 1.
    width_dict = {}
    for line in elines:
        first, last = _parse_code_point_range(line.split(";")[0])
        for key in range(first, last + 1):
            width_dict[key] = 2

    for line in ulines:
        if not line.strip():
            # Blank lines carry no data.
            continue
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        first, last = _parse_code_point_range(line.split(";")[0])
        for key in range(first, last + 1):
            # pop() instead of del: the code point might not have got
            # a non-default width above.
            width_dict.pop(key, None) # default width is 1

    # handle special cases for compatibility
    # https://www.cs.tut.fi/~jkorpela/shy.html
    width_dict.pop(0x00AD, None) # default width is 1
    for key in range(0x1160, 0x1200):
        width_dict[key] = 0
    for key in range(0x3248, 0x3250):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in range(0x4DC0, 0x4E00):
        width_dict[key] = 2

    # Collect runs of consecutive code points which share a width as
    # [first, last] pairs.
    runs = []
    for key in sorted(width_dict):
        if (runs
                and key == runs[-1][1] + 1
                and width_dict[key] == width_dict[runs[-1][0]]):
            runs[-1][1] = key
        else:
            runs.append([key, key])

    for first, last in runs:
        if first == last:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first),
                width_dict[first]))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first),
                unicode_utils.ucs_symbol(last),
                width_dict[first]))
4a4839c9 AO |
295 | |
if __name__ == "__main__":
    # Reads UnicodeData.txt, EastAsianWidth.txt and PropList.txt and
    # writes the CHARMAP and the WIDTH section to a file named “UTF-8”
    # in the current directory.
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    # NOTE: “with” in the option name is a misspelling of “width”; it
    # is kept because the option name is part of the command line
    # interface.
    PARSER.add_argument(
        '-e', '--east_asian_with_file',
        nargs='?',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        nargs='?',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    RESERVED_RANGE_RE = re.compile(r'.*<reserved-.+>\.\.<reserved-.+>.*')
    WIDE_OR_FULLWIDTH_RE = re.compile(r'^[^;]*;[WF]')
    PREPENDED_MARK_RE = re.compile(
        r'^[^;]*;[\s]*Prepended_Concatenation_Mark')

    # The encoding is given explicitly so that reading and writing
    # does not depend on the locale the script is run in.
    with open(ARGS.unicode_data_file, mode='r',
              encoding='utf-8') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_with_file, mode='r',
              encoding='utf-8') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
            if RESERVED_RANGE_RE.match(LINE):
                continue
            if WIDE_OR_FULLWIDTH_RE.match(LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r',
              encoding='utf-8') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if PREPENDED_MARK_RE.match(LINE):
                PROP_LIST_LINES.append(LINE.strip())
    with open('UTF-8', mode='w', encoding='utf-8') as OUTFILE:
        # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")