#!/usr/bin/python3
#
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
# Copyright (C) 2014-2019 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
DerivedCoreProperties.txt files.

To see how this script is used, call it with the “-h” option:

    $ ./gen_unicode_ctype.py -h
    … prints usage message …
'''
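
# A typical invocation might look like this (a sketch only: the input file
# names are just the argparse defaults declared below, and the version
# string is an illustrative placeholder):
#
#     $ ./gen_unicode_ctype.py -u UnicodeData.txt \
#           -d DerivedCoreProperties.txt \
#           -i i18n -o i18n.new \
#           --unicode_version 12.0.0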

import argparse
import time
import re
import unicode_utils

def code_point_ranges(is_class_function):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    cp_ranges = []
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        if is_class_function(code_point):
            if (cp_ranges
                and cp_ranges[-1][-1] == code_point - 1):
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                cp_ranges.append([code_point])
    return cp_ranges

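# A small usage sketch for code_point_ranges(), assuming the Unicode
# attributes have already been loaded (the file name below is only an
# example; it is whatever -u points to):
#
#     unicode_utils.fill_attributes('UnicodeData.txt')
#     ranges = code_point_ranges(unicode_utils.is_upper)
#     # ranges[0] == [0x41, 0x5A] (BASIC LATIN A..Z), and isolated code
#     # points such as U+0100 show up as one-element lists like [0x100].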
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        max_column = 75
        prefix = '   '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                range_string = unicode_utils.ucs_symbol(code_point_range[0])
            else:
                range_string = unicode_utils.ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            if len(line+range_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')

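# A sketch of how output_charclass() is driven; sys.stdout stands in for the
# generated i18n file here, purely for illustration:
#
#     import sys
#     output_charclass(sys.stdout, 'upper', unicode_utils.is_upper)
#     # prints “upper /” followed by ;-separated <Uxxxx> symbols and ranges,
#     # wrapped so each continued line stays near 75 columns and ends in “/”.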
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75
    prefix = '  '
    line = prefix
    map_string = ''
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        if code_point != mapped:
            if line.strip():
                line += ';'
            map_string = '(' \
                         + unicode_utils.ucs_symbol(code_point) \
                         + ',' \
                         + unicode_utils.ucs_symbol(mapped) \
                         + ')'
            if len(line+map_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')

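# output_charmap() follows the same wrapping scheme but emits (<from>,<to>)
# pairs and skips code points that map to themselves.  A sketch (sys.stdout
# again stands in for the output file):
#
#     import sys
#     output_charmap(sys.stdout, 'toupper', unicode_utils.to_upper)
#     # the first pair written is (<U0061>,<U0041>), i.e. ‘a’ → ‘A’.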
def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp. All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.
    '''
    head = tail = ''
    with open(filename, mode='r') as i18n_file:
        for line in i18n_file:
            match = re.match(
                r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")',
                line)
            if match:
                line = match.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head = head + line
            if line.startswith('LC_CTYPE'):
                break
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail = line
                break
        for line in i18n_file:
            tail = tail + line
    return (head, tail)

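# A sketch of how read_input_file() combines with the helpers below (the
# path is an assumption; it is whatever -i points to):
#
#     head, tail = read_input_file('locales/i18n')
#     # “head” is everything up to and including the “LC_CTYPE” line, with
#     # the LC_IDENTIFICATION date stamp replaced by today’s date;
#     # “tail” is everything from the “translit_start” line to end of file.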
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.
    '''
    if ARGS.input_file and head:
        i18n_file.write(head)
    else:
        i18n_file.write('escape_char /\n')
        i18n_file.write('comment_char %\n')
        i18n_file.write('\n')
        i18n_file.write('% Generated automatically by '
                        + 'gen_unicode_ctype.py '
                        + 'for Unicode {:s}.\n'.format(unicode_version))
        i18n_file.write('\n')
        i18n_file.write('LC_IDENTIFICATION\n')
        i18n_file.write('title "Unicode {:s} FDCC-set"\n'.format(
            unicode_version))
        i18n_file.write('source "UnicodeData.txt, '
                        + 'DerivedCoreProperties.txt"\n')
        i18n_file.write('address ""\n')
        i18n_file.write('contact ""\n')
        i18n_file.write('email "bug-glibc-locales@gnu.org"\n')
        i18n_file.write('tel ""\n')
        i18n_file.write('fax ""\n')
        i18n_file.write('language ""\n')
        i18n_file.write('territory "Earth"\n')
        i18n_file.write('revision "{:s}"\n'.format(unicode_version))
        i18n_file.write('date "{:s}"\n'.format(
            time.strftime('%Y-%m-%d')))
        i18n_file.write('category "i18n:2012";LC_CTYPE\n')
        i18n_file.write('END LC_IDENTIFICATION\n')
        i18n_file.write('\n')
        i18n_file.write('LC_CTYPE\n')

def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.
    '''
    if ARGS.input_file and tail:
        i18n_file.write(tail)
    else:
        i18n_file.write('END LC_CTYPE\n')

def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file'''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)

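# With --turkish, the case maps come from unicode_utils.to_upper_turkish and
# unicode_utils.to_lower_turkish, which (following Turkish conventions) pair
# dotted and dotless I, e.g. ‘i’ → ‘İ’ (U+0130) and ‘I’ → ‘ı’ (U+0131),
# instead of the plain ‘i’ ↔ ‘I’ mapping.  A sketch of a direct call (the
# output file name is made up for illustration):
#
#     with open('tr_i18n.new', mode='w') as i18n_file:
#         output_tables(i18n_file, '12.0.0', turkish=True)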
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    ARGS = PARSER.parse_args()

    unicode_utils.fill_attributes(
        ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)