]>
Commit | Line | Data |
---|---|---|
4a4839c9 AO |
1 | #!/usr/bin/python3 |
2 | # -*- coding: utf-8 -*- | |
04277e02 | 3 | # Copyright (C) 2014-2019 Free Software Foundation, Inc. |
4a4839c9 AO |
4 | # This file is part of the GNU C Library. |
5 | # | |
6 | # The GNU C Library is free software; you can redistribute it and/or | |
7 | # modify it under the terms of the GNU Lesser General Public | |
8 | # License as published by the Free Software Foundation; either | |
9 | # version 2.1 of the License, or (at your option) any later version. | |
10 | # | |
11 | # The GNU C Library is distributed in the hope that it will be useful, | |
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | # Lesser General Public License for more details. | |
15 | # | |
16 | # You should have received a copy of the GNU Lesser General Public | |
17 | # License along with the GNU C Library; if not, see | |
18 | # <http://www.gnu.org/licenses/>. | |
19 | ||
20 | ''' | |
21 | This script is useful for checking the backward compatibility of a newly | |
22 | generated UTF-8 file produced by the utf8_gen.py script. | |
23 | ||
24 | To see how this script is used, call it with the “-h” option: | |
25 | ||
26 | $ ./utf8_compatibility.py -h | |
27 | … prints usage message … | |
28 | ''' | |
29 | ||
30 | import sys | |
31 | import re | |
32 | import argparse | |
dd8e8e54 | 33 | import unicode_utils |
4a4839c9 AO |
34 | |
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    Lines between the first line starting with “CHARMAP” and the line
    starting with “END CHARMAP” are parsed.  Lines starting with “%” and
    lines which do not look like “<Uxxxx> /xNN…” or
    “<Uxxxx>..<Uyyyy> /xNN…” are skipped.

    file_name: path of the charmap file to read.

    Returns a dictionary mapping each code point (as an integer) to the
    byte sequence string (e.g. “/xe3/x90/x80”) found on its line.  For a
    range, every code point of the range gets the string of that line.

    If no complete CHARMAP section is found, a message is written to
    stderr and the script exits with status 1.
    '''
    # “(?:…)” is a non-capturing group; the range part is optional.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r', encoding='utf-8') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to and including the “CHARMAP” line.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        # Continue on the same iterator, i.e. right after “CHARMAP”.
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            match = entry_pattern.match(line)
            if not match:
                continue
            first = int(match.group('codepoint1'), 16)
            # A single code point is treated as a range of length one.
            last = int(match.group('codepoint2')
                       or match.group('codepoint1'), 16)
            hexutf8 = match.group('hexutf8')
            for i in range(first, last + 1):
                charmap_dictionary[i] = hexutf8
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     %file_name)
    # sys.exit() instead of exit(): the latter is only provided by the
    # “site” module and is not guaranteed to exist.
    sys.exit(1)
66 | ||
def check_charmap(original_file_name, new_file_name):
    '''Print a report on how the CHARMAP section of the new file differs
    from the one of the old file

    The report lists the number of removed, changed and added code
    points.  The individual code points are listed as well when the
    corresponding ARGS.show_*_characters option is set.
    '''
    rule = '------------------------------------------------------------'

    def name_of(code_point):
        '''Character name from UNICODE_ATTRIBUTES, 'None' if unknown'''
        attributes = unicode_utils.UNICODE_ATTRIBUTES
        if code_point not in attributes:
            return 'None'
        return attributes[code_point]['name']

    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)

    # Code points which exist only in the old file.
    print(rule)
    removed = set(ocharmap) - set(ncharmap)
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(removed))
    if ARGS.show_missing_characters:
        for code_point in sorted(removed):
            print('removed: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                ocharmap[code_point],
                name_of(code_point)))

    # Code points which exist in both files but map differently.
    print(rule)
    changed_charmap = {
        code_point: (ocharmap[code_point], ncharmap[code_point])
        for code_point in set(ocharmap) & set(ncharmap)
        if ocharmap[code_point] != ncharmap[code_point]
    }
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed_charmap))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed_charmap):
            old_bytes, new_bytes = changed_charmap[code_point]
            print('changed: {:s}     {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                old_bytes,
                new_bytes,
                name_of(code_point)))

    # Code points which exist only in the new file.
    print(rule)
    added = set(ncharmap) - set(ocharmap)
    print('Total added characters in newly generated CHARMAP: %d'
          %len(added))
    if ARGS.show_added_characters:
        for code_point in sorted(added):
            print('added: {:s}     {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                ncharmap[code_point],
                name_of(code_point)))
4a4839c9 AO |
110 | |
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    Lines between the first line starting with “WIDTH” and the line
    starting with “END WIDTH” are parsed.  Only lines of the form
    “<Uxxxx> W” or “<Uxxxx>...<Uyyyy> W” with W being 0 or 2 are used,
    all other lines are skipped.

    file_name: path of the charmap file to read.

    Returns a dictionary mapping each code point (as an integer) to its
    width (as an integer).  For a range, every code point of the range
    gets the width of that line.

    If no complete WIDTH section is found, a message is written to
    stderr and the script exits with status 1, like
    create_charmap_dictionary() does for the CHARMAP section.
    '''
    # “(?:…)” is a non-capturing group; the range part is optional.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<width>[02])')
    with open(file_name, mode='r', encoding='utf-8') as utf8_file:
        width_dictionary = {}
        # Skip everything up to and including the “WIDTH” line.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        # Continue on the same iterator, i.e. right after “WIDTH”.
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = entry_pattern.match(line)
            if not match:
                continue
            first = int(match.group('codepoint1'), 16)
            # A single code point is treated as a range of length one.
            last = int(match.group('codepoint2')
                       or match.group('codepoint1'), 16)
            width = int(match.group('width'))
            for i in range(first, last + 1):
                width_dictionary[i] = width
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     %file_name)
    # Without a dictionary the caller cannot produce a report, so stop
    # here instead of implicitly returning None.
    sys.exit(1)
138 | ||
def check_width(original_file_name, new_file_name):
    '''Print a report on how the WIDTH section of the new file differs
    from the one of the old file

    The report lists the number of removed, changed and added code
    points.  The individual code points are listed as well, together
    with some of their Unicode properties, when the corresponding
    ARGS.show_*_characters option is set.
    '''
    rule = '------------------------------------------------------------'

    def properties(code_point):
        '''Format East Asian width, category, bidi class and name of a
        code point; properties which are not known are shown as 'None'
        '''
        widths = unicode_utils.EAST_ASIAN_WIDTHS
        attributes = unicode_utils.UNICODE_ATTRIBUTES
        eaw = widths[code_point] if code_point in widths else 'None'
        if code_point in attributes:
            entry = attributes[code_point]
            category = entry['category']
            bidi = entry['bidi']
            name = entry['name']
        else:
            category = bidi = name = 'None'
        return ('eaw={:s} '.format(eaw)
                + 'category={:2s} '.format(category)
                + 'bidi={:3s} '.format(bidi)
                + 'name={:s}'.format(name))

    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)

    # Code points which have a WIDTH entry only in the old file.
    print(rule)
    removed = set(owidth) - set(nwidth)
    print('Total removed characters in newly generated WIDTH: %d'
          %len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for code_point in sorted(removed):
            print('removed: {:s} '.format(
                unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(owidth[code_point])
                  + properties(code_point))

    # Code points which have a WIDTH entry in both files but with
    # different values.
    print(rule)
    changed_width = {
        code_point: (owidth[code_point], nwidth[code_point])
        for code_point in set(owidth) & set(nwidth)
        if owidth[code_point] != nwidth[code_point]
    }
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed_width))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed_width):
            old_width, new_width = changed_width[code_point]
            print('changed width: {:s} '.format(
                unicode_utils.ucs_symbol(code_point))
                  + '{:d}->{:d} : '.format(old_width, new_width)
                  + properties(code_point))

    # Code points which have a WIDTH entry only in the new file.
    print(rule)
    added = set(nwidth) - set(owidth)
    print('Total added characters in newly generated WIDTH: %d'
          %len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for code_point in sorted(added):
            print('added: {:s} '.format(
                unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(nwidth[code_point])
                  + properties(code_point))
4a4839c9 AO |
213 | |
if __name__ == "__main__":
    # Command line entry point: parse the options, optionally load the
    # Unicode data files, then print the CHARMAP and WIDTH reports.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 charmap files
        and report removed, changed and added characters.
        ''')
    # The two file options are mandatory and need a value.  They do not
    # use nargs='?' because that would accept a bare “-o” or “-n” and
    # store None, which only fails later when the file is opened.
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters which were changed in detail.')
    # ARGS is read as a global by check_charmap() and check_width().
    ARGS = PARSER.parse_args()

    # The Unicode data files are optional; without them the detailed
    # listings show 'None' for the properties they would provide.
    if ARGS.unicode_data_file:
        unicode_utils.fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)