]> git.ipfire.org Git - thirdparty/glibc.git/blob - localedata/unicode-gen/gen_unicode_ctype.py
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / localedata / unicode-gen / gen_unicode_ctype.py
1 #!/usr/bin/python3
2 #
3 # Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
4 # Copyright (C) 2014-2017 Free Software Foundation, Inc.
5 # This file is part of the GNU C Library.
6 # Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
7 #
8 # The GNU C Library is free software; you can redistribute it and/or
9 # modify it under the terms of the GNU Lesser General Public
10 # License as published by the Free Software Foundation; either
11 # version 2.1 of the License, or (at your option) any later version.
12 #
13 # The GNU C Library is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 # Lesser General Public License for more details.
17 #
18 # You should have received a copy of the GNU Lesser General Public
19 # License along with the GNU C Library; if not, see
20 # <http://www.gnu.org/licenses/>.
21
22 '''
23 Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
24 DerivedCoreProperties.txt files.
25
26 To see how this script is used, call it with the “-h” option:
27
28 $ ./gen_unicode_ctype.py -h
29 … prints usage message …
30 '''
31
32 import argparse
33 import time
34 import re
35 import unicode_utils
36
def code_point_ranges(is_class_function, code_points=None):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    Each range is a list of one element (an isolated code point) or two
    elements (the inclusive endpoints of a contiguous run).

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]

    is_class_function -- predicate called with each code point.
    code_points       -- optional iterable of code points to scan, in
                         ascending order; defaults to all code points in
                         unicode_utils.UNICODE_ATTRIBUTES.  Making this a
                         parameter keeps the range-compression logic pure
                         and testable.
    '''
    if code_points is None:
        code_points = sorted(unicode_utils.UNICODE_ATTRIBUTES)
    cp_ranges = []
    for code_point in code_points:
        if is_class_function(code_point):
            # Extend the previous range when this code point is adjacent
            # to it, otherwise start a new single-element range.
            if (cp_ranges
                    and cp_ranges[-1][-1] == code_point - 1):
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                cp_ranges.append([code_point])
    return cp_ranges
57
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    Writes the class name followed by all matching code points as
    <Uxxxx> symbols / symbol ranges, semicolon-separated, wrapped at
    max_column with a '/' continuation character.

    i18n_file         -- open file object to write to.
    class_name        -- class keyword written before the data
                         (e.g. 'upper' or 'class "combining";').
    is_class_function -- predicate selecting the code points.

    Example:

    upper /
    <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
    <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
    <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    # If the class is empty, write nothing at all (not even the keyword).
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        max_column = 75
        prefix = ' '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            # A non-empty line already holds a previous entry, so a
            # separator is needed before appending the next one.
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                range_string = unicode_utils.ucs_symbol(code_point_range[0])
            else:
                range_string = unicode_utils.ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            # Flush the current line (with continuation marker) before
            # the new entry would push it past max_column.
            if len(line+range_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        # Flush the final partially-filled line, if any.
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')
91
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    Writes the map name followed by (<from>,<to>) pairs for every code
    point that map_function changes, semicolon-separated, wrapped at
    max_column with a '/' continuation character.

    i18n_file    -- open file object to write to.
    map_name     -- map keyword written before the data
                    (e.g. 'toupper' or 'map "totitle";').
    map_function -- mapping applied to each code point; identity
                    mappings are omitted from the output.

    Example:

    toupper /
    (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
    (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
    (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75
    prefix = ' '
    line = prefix
    map_string = ''
    # Unlike output_charclass, the keyword line is written even if the
    # map turns out to be empty.
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        # Only emit pairs where the mapping actually changes the code point.
        if code_point != mapped:
            if line.strip():
                line += ';'
            map_string = '(' \
                         + unicode_utils.ucs_symbol(code_point) \
                         + ',' \
                         + unicode_utils.ucs_symbol(mapped) \
                         + ')'
            # Flush the current line (with continuation marker) before
            # the new pair would push it past max_column.
            if len(line+map_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    # Flush the final partially-filled line, if any.
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
125
def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp.  All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.

    Returns a (head, tail) pair of strings: everything up to and
    including the "LC_CTYPE" line (with the date stamp refreshed), and
    everything from the "translit_start" line to the end of the file.
    '''
    head_lines = []
    tail_lines = []
    today = time.strftime('%Y-%m-%d')
    date_pattern = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    with open(filename, mode='r') as i18n_file:
        lines = iter(i18n_file)
        # Phase 1: collect the head, refreshing the date stamp,
        # up to and including the LC_CTYPE line.
        for line in lines:
            stamped = date_pattern.match(line)
            if stamped:
                line = stamped.group('key') + '"{:s}"\n'.format(today)
            head_lines.append(line)
            if line.startswith('LC_CTYPE'):
                break
        # Phase 2: skip the old character class data until the
        # translit_start line, which begins the tail.
        for line in lines:
            if line.startswith('translit_start'):
                tail_lines.append(line)
                break
        # Phase 3: the rest of the file belongs to the tail verbatim.
        tail_lines.extend(lines)
    return (''.join(head_lines), ''.join(tail_lines))
155
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    i18n_file       -- open file object to write to.
    unicode_version -- Unicode version string used in the generated
                       comments and LC_IDENTIFICATION fields.
    head            -- head captured from an original i18n file by
                       read_input_file(); when non-empty it is written
                       unchanged, otherwise a minimal default header is
                       generated.
    '''
    # NOTE: the original tested "ARGS.input_file and head", coupling this
    # function to the module-level ARGS global.  head is only ever
    # non-empty when an input file was read, so testing head alone is
    # equivalent and keeps the function usable without CLI parsing.
    if head:
        i18n_file.write(head)
    else:
        i18n_file.write('escape_char /\n')
        i18n_file.write('comment_char %\n')
        i18n_file.write('\n')
        i18n_file.write('% Generated automatically by '
                        + 'gen_unicode_ctype.py '
                        + 'for Unicode {:s}.\n'.format(unicode_version))
        i18n_file.write('\n')
        i18n_file.write('LC_IDENTIFICATION\n')
        i18n_file.write('title "Unicode {:s} FDCC-set"\n'.format(
            unicode_version))
        i18n_file.write('source "UnicodeData.txt, '
                        + 'DerivedCoreProperties.txt"\n')
        i18n_file.write('address ""\n')
        i18n_file.write('contact ""\n')
        i18n_file.write('email "bug-glibc-locales@gnu.org"\n')
        i18n_file.write('tel ""\n')
        i18n_file.write('fax ""\n')
        i18n_file.write('language ""\n')
        i18n_file.write('territory "Earth"\n')
        i18n_file.write('revision "{:s}"\n'.format(unicode_version))
        i18n_file.write('date "{:s}"\n'.format(
            time.strftime('%Y-%m-%d')))
        i18n_file.write('category "unicode:2014";LC_CTYPE\n')
        i18n_file.write('END LC_IDENTIFICATION\n')
        i18n_file.write('\n')
        i18n_file.write('LC_CTYPE\n')
189
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    i18n_file -- open file object to write to.
    tail      -- tail captured from an original i18n file by
                 read_input_file(); when non-empty it is written
                 unchanged, otherwise the closing END line is generated.
    '''
    # NOTE: the original tested "ARGS.input_file and tail", coupling this
    # function to the module-level ARGS global.  tail is only ever
    # non-empty when an input file was read, so testing tail alone is
    # equivalent and keeps the function usable without CLI parsing.
    if tail:
        i18n_file.write(tail)
    else:
        i18n_file.write('END LC_CTYPE\n')
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file

    i18n_file       -- open file object to write to.
    unicode_version -- Unicode version string for the generated comments.
    turkish         -- if True, use the Turkish case conversions
                       (dotted/dotless i handling) for toupper/tolower.
    '''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    # "outdigit" is deliberately left to localedef's default ("0".."9");
    # emitting it here would prevent locales that copy this file from
    # defining their own values.
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
262
if __name__ == "__main__":
    # Command-line driver: parse options, load the Unicode data files
    # into unicode_utils, and write the generated i18n file.
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    ARGS = PARSER.parse_args()

    # Populate the unicode_utils module-level tables from the data files
    # and run its internal consistency checks before generating output.
    unicode_utils.fill_attributes(
        ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()
    HEAD = TAIL = ''
    # When an original i18n file is given, preserve everything outside
    # the LC_CTYPE character classes (see read_input_file).
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)