#!/usr/bin/python3
#
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
# Copyright (C) 2014-2020 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

'''
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
DerivedCoreProperties.txt files.

To see how this script is used, call it with the “-h” option:

    $ ./gen_unicode_ctype.py -h
    … prints usage message …
'''

import argparse
import time
import re
import unicode_utils

def code_point_ranges(is_class_function, code_points=None):
    '''Return a list of ranges of code points for which is_class_function
    returns True.

    is_class_function: a predicate taking an integer code point.
    code_points: an optional iterable of code points to test, which must
        be sorted in ascending order.  When omitted (the default, and the
        original behavior), all code points which have Unicode attributes
        (sorted(unicode_utils.UNICODE_ATTRIBUTES)) are tested.

    A two-element range [first, last] is inclusive at both ends; an
    isolated code point is represented as a one-element list.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    if code_points is None:
        # Default: every code point known to unicode_utils, ascending.
        code_points = sorted(unicode_utils.UNICODE_ATTRIBUTES)
    cp_ranges = []
    for code_point in code_points:
        if is_class_function(code_point):
            if (cp_ranges
                and cp_ranges[-1][-1] == code_point - 1):
                # Contiguous with the previous range: extend it.
                if len(cp_ranges[-1]) == 1:
                    cp_ranges[-1].append(code_point)
                else:
                    cp_ranges[-1][-1] = code_point
            else:
                # Gap before this code point: start a new range.
                cp_ranges.append([code_point])
    return cp_ranges
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    i18n_file: open output file to write the section to
    class_name: the keyword written before the list, e.g. “upper”
    is_class_function: predicate selecting the code points of the class

    If no code point satisfies is_class_function, nothing is written.

    Example:

    upper /
    <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
    <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
    <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        # Lines are wrapped at max_column; a wrapped line ends in the
        # locale-source continuation marker “/”.
        max_column = 75
        prefix = '   '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            # A non-empty line already carries a range: separate the
            # next one with “;”.
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                # Isolated code point, e.g. “<U0100>”.
                range_string = unicode_utils.ucs_symbol(code_point_range[0])
            else:
                # Inclusive range, e.g. “<U0041>..<U005A>”.
                range_string = unicode_utils.ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            if len(line+range_string) > max_column:
                # Appending would overflow: flush the current line with a
                # continuation marker and start a fresh indented line.
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        # Flush the final (partial) line, then a blank separator line.
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    i18n_file: open output file to write the section to
    map_name: the keyword written before the list, e.g. “toupper”
    map_function: maps a code point to its converted code point;
        identity mappings are omitted from the output

    Example:

    toupper /
    (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
    (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
    (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    # Lines are wrapped at max_column; a wrapped line ends in the
    # locale-source continuation marker “/”.
    max_column = 75
    prefix = '   '
    line = prefix
    map_string = ''
    # Unlike output_charclass, the section header is written
    # unconditionally, even if the map turns out to be empty.
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        # Only code points which actually change are listed.
        if code_point != mapped:
            # A non-empty line already carries a pair: separate the
            # next one with “;”.
            if line.strip():
                line += ';'
            # One “(<source>,<target>)” pair.
            map_string = '(' \
                         + unicode_utils.ucs_symbol(code_point) \
                         + ',' \
                         + unicode_utils.ucs_symbol(mapped) \
                         + ')'
            if len(line+map_string) > max_column:
                # Appending would overflow: flush with a continuation
                # marker and start a fresh indented line.
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    # Flush the final (partial) line, then a blank separator line.
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
def read_input_file(filename):
    '''Read the original glibc i18n file and return its head and tail.

    Only the character classes in LC_CTYPE and the date stamp are meant
    to be regenerated; everything else in the i18n file must survive
    unchanged.  Reading the original file here makes it possible to emit
    a complete result file instead of forcing a manual cut-and-paste.

    Returns a tuple (head, tail): “head” is everything up to and
    including the “LC_CTYPE” line (with the LC_IDENTIFICATION date stamp
    replaced by today's date), “tail” is everything from the
    “translit_start” line to the end of the file.
    '''
    date_line_pattern = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    head_lines = []
    tail_lines = []
    with open(filename, mode='r') as i18n_file:
        lines = iter(i18n_file)
        # Collect the head, rewriting the date stamp on the way.
        for line in lines:
            matched = date_line_pattern.match(line)
            if matched:
                line = matched.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head_lines.append(line)
            if line.startswith('LC_CTYPE'):
                break
        # Skip the old LC_CTYPE body up to “translit_start”.
        for line in lines:
            if line.startswith('translit_start'):
                tail_lines.append(line)
                break
        # Everything after that belongs to the tail verbatim.
        tail_lines.extend(lines)
    return (''.join(head_lines), ''.join(tail_lines))
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    If an original i18n file was given on the command line and its head
    was read, that head is copied verbatim.  Otherwise a minimal file
    preamble with an LC_IDENTIFICATION section is generated from scratch.
    '''
    if ARGS.input_file and head:
        i18n_file.write(head)
        return
    # No original head available: generate a fresh preamble.
    i18n_file.write('escape_char /\n')
    i18n_file.write('comment_char %\n')
    i18n_file.write('\n')
    i18n_file.write('% Generated automatically by '
                    + 'gen_unicode_ctype.py '
                    + 'for Unicode {:s}.\n'.format(unicode_version))
    i18n_file.write('\n')
    i18n_file.write('LC_IDENTIFICATION\n')
    # Keyword/value pairs of the LC_IDENTIFICATION section; the values
    # are aligned at a common column as in the locale source files.
    identification = (
        ('title', '"Unicode {:s} FDCC-set"'.format(unicode_version)),
        ('source', '"UnicodeData.txt, DerivedCoreProperties.txt"'),
        ('address', '""'),
        ('contact', '""'),
        ('email', '"bug-glibc-locales@gnu.org"'),
        ('tel', '""'),
        ('fax', '""'),
        ('language', '""'),
        ('territory', '"Earth"'),
        ('revision', '"{:s}"'.format(unicode_version)),
        ('date', '"{:s}"'.format(time.strftime('%Y-%m-%d'))),
        ('category', '"i18n:2012";LC_CTYPE'),
    )
    for keyword, value in identification:
        i18n_file.write('{:<9s} {:s}\n'.format(keyword, value))
    i18n_file.write('END LC_IDENTIFICATION\n')
    i18n_file.write('\n')
    i18n_file.write('LC_CTYPE\n')
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    The tail read from an original i18n file is copied verbatim when one
    was given on the command line; otherwise only the closing
    “END LC_CTYPE” line is emitted.
    '''
    if not (ARGS.input_file and tail):
        i18n_file.write('END LC_CTYPE\n')
        return
    i18n_file.write(tail)
def output_tables(i18n_file, unicode_version, turkish):
    '''Write the new LC_CTYPE character classes to the output file.

    i18n_file: open output file, positioned inside the LC_CTYPE section
    unicode_version: Unicode version string, quoted in the comments
    turkish: if True, use the Turkish case conversions
        (dotted/dotless “i”) for toupper/tolower
    '''
    # Introductory comment block of the LC_CTYPE section.
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
    # “outdigit” is deliberately left commented out: localedef supplies
    # the default "0".."9", and emitting it here would prevent locales
    # which copy this file from defining their own values.
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', unicode_utils.is_space)
    output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
    output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
    output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
    output_charclass(i18n_file, 'print', unicode_utils.is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
    output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
    # Case-conversion maps: Turkish locales map i↔İ and ı↔I, so they
    # get their own conversion tables.
    if turkish:
        i18n_file.write('% The case conversions reflect '
                        + 'Turkish conventions.\n')
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper_turkish)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower_turkish)
    else:
        output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
        output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
    output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";',
                     unicode_utils.is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file, 'class "combining_level3";',
                     unicode_utils.is_combining_level3)
if __name__ == "__main__":
    # Command-line interface: input data files, optional original i18n
    # file, output file, and the Unicode version (required).
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a Unicode conforming LC_CTYPE category from
        UnicodeData.txt and DerivedCoreProperties.txt files.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    PARSER.add_argument(
        '--turkish',
        action='store_true',
        help='Use Turkish case conversions.')
    ARGS = PARSER.parse_args()

    # Load the Unicode data the output functions rely on, and run the
    # consistency checks of unicode_utils before writing anything.
    unicode_utils.fill_attributes(
        ARGS.unicode_data_file)
    unicode_utils.fill_derived_core_properties(
        ARGS.derived_core_properties_file)
    unicode_utils.verifications()
    # When an original i18n file is given, preserve its head and tail
    # around the regenerated LC_CTYPE tables.
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version, ARGS.turkish)
        output_tail(I18N_FILE, tail=TAIL)