]> git.ipfire.org Git - thirdparty/glibc.git/blame - localedata/unicode-gen/utf8_compatibility.py
Update copyright dates with scripts/update-copyrights.
[thirdparty/glibc.git] / localedata / unicode-gen / utf8_compatibility.py
CommitLineData
4a4839c9
AO
1#!/usr/bin/python3
2# -*- coding: utf-8 -*-
04277e02 3# Copyright (C) 2014-2019 Free Software Foundation, Inc.
4a4839c9
AO
4# This file is part of the GNU C Library.
5#
6# The GNU C Library is free software; you can redistribute it and/or
7# modify it under the terms of the GNU Lesser General Public
8# License as published by the Free Software Foundation; either
9# version 2.1 of the License, or (at your option) any later version.
10#
11# The GNU C Library is distributed in the hope that it will be useful,
12# but WITHOUT ANY WARRANTY; without even the implied warranty of
13# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14# Lesser General Public License for more details.
15#
16# You should have received a copy of the GNU Lesser General Public
17# License along with the GNU C Library; if not, see
18# <http://www.gnu.org/licenses/>.
19
20'''
21This script is useful for checking backward compatibility of newly
22generated UTF-8 file from utf8_gen.py script
23
24To see how this script is used, call it with the “-h” option:
25
26 $ ./utf8_compatibility.py -h
27 … prints usage message …
28'''
29
30import sys
31import re
32import argparse
dd8e8e54 33import unicode_utils
4a4839c9
AO
34
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    Only the lines between the line starting with “CHARMAP” and the
    line starting with “END CHARMAP” are parsed.  Lines starting with
    “%” and lines which match neither “<Uxxxx> /xNN…” nor
    “<Uxxxx>..<Uyyyy> /xNN…” are skipped.

    Returns a dictionary mapping each code point (as an int) to the
    byte sequence string found in the file (e.g. “/xe3/x90/x80”).
    Every code point of a range gets the byte sequence given on the
    range line.

    Writes a message to stderr and exits with status 1 if the
    “CHARMAP” line or the “END CHARMAP” line is missing.
    '''
    # “(?:…)” is a non-capturing group (the former “(:?…)” was a typo
    # which accepted a stray colon), and the code point classes must
    # not accept “-”, otherwise int(…, 16) below raises ValueError.
    charmap_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to and including the “CHARMAP” line.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            match = charmap_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            hexutf8 = match.group('hexutf8')
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = hexutf8
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     %file_name)
    # sys.exit() instead of the builtin exit(), which is only injected
    # by the site module and is meant for interactive use.
    sys.exit(1)
66
def check_charmap(original_file_name, new_file_name):
    '''Print a report on how the CHARMAP section of the new file
    differs from the one of the old file

    The report gives the number of removed, changed, and added code
    points.  The individual code points of each group are listed only
    if the matching ARGS.show_missing_characters,
    ARGS.show_changed_characters or ARGS.show_added_characters flag
    is set.
    '''
    separator = '------------------------------------------------------------'

    def character_name(code_point):
        # Code points without an entry in UNICODE_ATTRIBUTES are
        # reported with the literal string 'None'.
        if code_point in unicode_utils.UNICODE_ATTRIBUTES:
            return unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
        return 'None'

    print('************************************************************')
    print('Report on CHARMAP:')
    old_map = create_charmap_dictionary(original_file_name)
    new_map = create_charmap_dictionary(new_file_name)
    old_keys = set(old_map)
    new_keys = set(new_map)

    # Code points which are only in the old file.
    print(separator)
    removed = sorted(old_keys - new_keys)
    print('Total removed characters in newly generated CHARMAP: %d'
          % len(removed))
    if ARGS.show_missing_characters:
        for code_point in removed:
            print('removed: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                old_map[code_point],
                character_name(code_point)))

    # Code points which are in both files but with different bytes.
    print(separator)
    changed = {
        code_point: (old_map[code_point], new_map[code_point])
        for code_point in old_keys & new_keys
        if old_map[code_point] != new_map[code_point]
    }
    print('Total changed characters in newly generated CHARMAP: %d'
          % len(changed))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed):
            old_bytes, new_bytes = changed[code_point]
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                old_bytes,
                new_bytes,
                character_name(code_point)))

    # Code points which are only in the new file.
    print(separator)
    added = sorted(new_keys - old_keys)
    print('Total added characters in newly generated CHARMAP: %d'
          % len(added))
    if ARGS.show_added_characters:
        for code_point in added:
            print('added: {:s} {:s} {:s}'.format(
                unicode_utils.ucs_symbol(code_point),
                new_map[code_point],
                character_name(code_point)))
4a4839c9
AO
110
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    Only the lines between the line starting with “WIDTH” and the line
    starting with “END WIDTH” are parsed.  Lines which match neither
    “<Uxxxx> W” nor “<Uxxxx>...<Uyyyy> W” (with W being 0 or 2) are
    skipped.

    Returns a dictionary mapping each code point (as an int) to its
    width (the int 0 or 2).  Every code point of a range gets the
    width given on the range line.

    Writes a message to stderr and exits with status 1 if the “WIDTH”
    line or the “END WIDTH” line is missing, like
    create_charmap_dictionary() does for the CHARMAP section.
    '''
    # “(?:…)” is a non-capturing group (the former “(:?…)” was a typo
    # which accepted a stray colon), and the code point classes must
    # not accept “-”, otherwise int(…, 16) below raises ValueError.
    width_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<width>[02])')
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything up to and including the “WIDTH” line.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = width_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            # A single code point is treated as a range of length one.
            codepoint2 = match.group('codepoint2') or codepoint1
            width = int(match.group('width'))
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = width
    # The message used to be formatted with the undefined name “file”,
    # which raised NameError instead of reporting the problem.
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     %file_name)
    # Do not fall through and return None: the caller builds sets from
    # the returned dictionary.
    sys.exit(1)
138
def check_width(original_file_name, new_file_name):
    '''Print a report on how the WIDTH section of the new file differs
    from the one of the old file

    The report gives the number of removed, changed, and added code
    points.  The individual code points of each group, together with
    their East Asian width, category, bidi class and name, are listed
    only if the matching ARGS.show_missing_characters,
    ARGS.show_changed_characters or ARGS.show_added_characters flag
    is set.
    '''
    separator = '------------------------------------------------------------'

    def attribute(code_point, attribute_name):
        # Code points without an entry in UNICODE_ATTRIBUTES are
        # reported with the literal string 'None'.
        if code_point in unicode_utils.UNICODE_ATTRIBUTES:
            return unicode_utils.UNICODE_ATTRIBUTES[code_point][attribute_name]
        return 'None'

    def properties(code_point):
        # The part of a detail line which is common to the removed,
        # changed and added listings.
        if code_point in unicode_utils.EAST_ASIAN_WIDTHS:
            east_asian_width = unicode_utils.EAST_ASIAN_WIDTHS[code_point]
        else:
            east_asian_width = 'None'
        return ('eaw={:s} '.format(east_asian_width)
                + 'category={:2s} '.format(attribute(code_point, 'category'))
                + 'bidi={:3s} '.format(attribute(code_point, 'bidi'))
                + 'name={:s}'.format(attribute(code_point, 'name')))

    print('************************************************************')
    print('Report on WIDTH:')
    old_widths = create_width_dictionary(original_file_name)
    new_widths = create_width_dictionary(new_file_name)
    old_keys = set(old_widths)
    new_keys = set(new_widths)

    # Code points which are only in the old file.
    print(separator)
    removed = sorted(old_keys - new_keys)
    print('Total removed characters in newly generated WIDTH: %d'
          % len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for code_point in removed:
            print('removed: {:s} '.format(
                unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(old_widths[code_point])
                  + properties(code_point))

    # Code points which are in both files but with different widths.
    print(separator)
    changed = {
        code_point: (old_widths[code_point], new_widths[code_point])
        for code_point in old_keys & new_keys
        if old_widths[code_point] != new_widths[code_point]
    }
    print('Total changed characters in newly generated WIDTH: %d'
          % len(changed))
    if ARGS.show_changed_characters:
        for code_point in sorted(changed):
            print('changed width: {:s} '.format(
                unicode_utils.ucs_symbol(code_point))
                  + '{:d}->{:d} : '.format(changed[code_point][0],
                                           changed[code_point][1])
                  + properties(code_point))

    # Code points which are only in the new file.
    print(separator)
    added = sorted(new_keys - old_keys)
    print('Total added characters in newly generated WIDTH: %d'
          % len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for code_point in added:
            print('added: {:s} '.format(
                unicode_utils.ucs_symbol(code_point))
                  + '{:d} : '.format(new_widths[code_point])
                  + properties(code_point))
4a4839c9
AO
213
if __name__ == "__main__":
    # ARGS is deliberately a module level name: check_charmap() and
    # check_width() read the ARGS.show_* flags from it.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 files and
        report the differences.
        ''')
    # The two file options are mandatory and need a value; with
    # nargs='?' a bare “-o” was accepted and stored None, which then
    # failed only later in open().
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters which were changed in detail.')
    ARGS = PARSER.parse_args()

    # The Unicode data files are optional; the reports print 'None'
    # for attributes which are not available.
    # NOTE(review): presumably these calls populate
    # unicode_utils.UNICODE_ATTRIBUTES and
    # unicode_utils.EAST_ASIAN_WIDTHS; confirm in unicode_utils.
    if ARGS.unicode_data_file:
        unicode_utils.fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
260 check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)