]> git.ipfire.org Git - thirdparty/gcc.git/blob - contrib/check-internal-format-escaping.py
analyzer: fix ICE due to corrupt MEM_REFs [PR113505]
[thirdparty/gcc.git] / contrib / check-internal-format-escaping.py
1 #!/usr/bin/env python3
2 #
3 # Check gcc.pot file for stylistic issues as described in
4 # https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
5 # especially in gcc-internal-format messages.
6 #
7 # This file is part of GCC.
8 #
9 # GCC is free software; you can redistribute it and/or modify it under
10 # the terms of the GNU General Public License as published by the Free
11 # Software Foundation; either version 3, or (at your option) any later
12 # version.
13 #
14 # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 # WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 # FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 # for more details.
18 #
19 # You should have received a copy of the GNU General Public License
20 # along with GCC; see the file COPYING3. If not see
21 # <http://www.gnu.org/licenses/>.
22
23 import argparse
24 import re
25 from collections import Counter
26 from typing import Dict, Match
27
28 import polib
29
# Number of reported warnings per diagnostic text.  warn() increments
# it, main() prints it as the summary.
seen_warnings: Counter = Counter()
31
32
def location(msg: polib.POEntry):
    """
    Returns where the message comes from, as 'file:line' of its first
    recorded occurrence, or '<unknown location>' if the message has no
    occurrences at all.
    """

    if not msg.occurrences:
        return '<unknown location>'

    first = msg.occurrences[0]
    return f'{first[0]}:{first[1]}'
38
39
def warn(msg: polib.POEntry,
         diagnostic_id: str, diagnostic: str, include_msgid=True):
    """
    Prints one diagnostic for the given message, prefixed with the
    location of the message, and counts it in seen_warnings under the
    diagnostic text.  If include_msgid is true, the repr of the msgid
    is appended to the printed line.

    To suppress a warning for a particular message,
    add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
    """

    suppression_flag = f'gcclint:ignore:{diagnostic_id}'
    if suppression_flag in msg.flags:
        return

    seen_warnings[diagnostic] += 1

    report = f'{location(msg)}: {diagnostic}'
    if include_msgid:
        report += f' in {repr(msg.msgid)}'
    print(report)
56
57
def lint_gcc_internal_format(msg: polib.POEntry):
    """
    Checks a single message that has the gcc-internal-format. These
    messages use a variety of placeholders like %qs, %<quotes%> and
    %q#E.

    All findings are reported through warn(); nothing is returned.
    """

    msgid: str = msg.msgid

    def outside_quotes(m: Match[str]):
        """
        Tells whether the match starts outside of %<quotes%>, judged by
        whether as many %< as %> occur before the start of the match.
        """

        before = msgid[:m.start(0)]
        return before.count('%<') == before.count('%>')

    def lint_matching_placeholders():
        """
        Warns when literal values in placeholders are not exactly equal
        in the translation. This can happen when doing copy-and-paste
        translations of similar messages.

        To avoid these mismatches in the first place,
        structurally equal messages are found by
        lint_diagnostics_differing_only_in_placeholders.

        This check only applies when checking a finished translation
        such as de.po, not gcc.pot.
        """

        if not msg.translated():
            return

        in_msgid = re.findall('%<[^%]+%>', msgid)
        in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)

        # Compared as sets, so neither the order nor the repetition of
        # the quoted literals matters.
        if set(in_msgid) != set(in_msgstr):
            warn(msg,
                 'placeholder-mismatch',
                 f'placeholder mismatch: msgid has {in_msgid}, '
                 f'msgstr has {in_msgstr}',
                 include_msgid=False)

    def lint_option_outside_quotes():
        """
        Warns about whitespace-separated words outside of %<quotes%>
        that look like a command line option (a dash followed by a
        letter) or that start with __builtin_. The word -INF is not
        treated as an option.
        """

        for match in re.finditer(r'\S+', msgid):
            part = match.group()
            if not outside_quotes(match):
                continue

            if part.startswith('-'):
                if len(part) >= 2 and part[1].isalpha():
                    if part == '-INF':
                        continue

                    warn(msg,
                         'option-outside-quotes',
                         'command line option outside %<quotes%>')

            if part.startswith('__builtin_'):
                warn(msg,
                     'builtin-outside-quotes',
                     'builtin function outside %<quotes%>')

    def lint_plain_apostrophe():
        """
        Warns about each apostrophe outside of %<quotes%> that is not
        preceded by a percent sign.
        """

        # The lookbehind keeps the match on the apostrophe itself.
        # Consuming the preceding character instead would miss an
        # apostrophe at the very start of the message and would judge
        # the quoting state one character too early, misclassifying
        # apostrophes that directly follow %< or %>.
        for match in re.finditer(r"(?<!%)'", msgid):
            if outside_quotes(match):
                warn(msg, 'apostrophe', 'apostrophe without leading %')

    def lint_space_before_quote():
        """
        A missing space before %< is often the result of string
        literals that are joined by the C compiler and neither literal
        has a space to separate the words.
        """

        # group(1) is the letter or digit right before %<, together
        # with the character preceding it, if any. A plain %s is not
        # reported, since its letter belongs to the directive.
        for match in re.finditer('(.?[a-zA-Z0-9])%<', msgid):
            if match.group(1) != '%s':
                warn(msg,
                     'no-space-before-quote',
                     '%< directly following a letter or digit')

    def lint_underscore_outside_quotes():
        """
        An underscore outside of quotes is used in several contexts,
        and many of them violate the GCC Guidelines for Diagnostics:

        * names of GCC-internal compiler functions
        * names of GCC-internal data structures
        * static_cast and the like (which are legitimate)

        Only the first such underscore of a message is reported.
        """

        for match in re.finditer('_', msgid):
            if outside_quotes(match):
                warn(msg,
                     'underscore-outside-quotes',
                     'underscore outside of %<quotes%>')
                return

    def lint_may_not():
        """
        The term "may not" may either mean "it could be the case"
        or "should not". These two different meanings are sometimes
        hard to tell apart.
        """

        if re.search(r'\bmay not\b', msgid):
            warn(msg,
                 'ambiguous-may-not',
                 'the term "may not" is ambiguous')

    def lint_unbalanced_quotes():
        """
        Warns when the number of %< differs from the number of %>,
        in the msgid and, for translated messages, in the msgstr.
        """

        if msgid.count('%<') != msgid.count('%>'):
            warn(msg,
                 'unbalanced-quotes',
                 'unbalanced %< and %> quotes')

        if msg.translated():
            if msg.msgstr.count('%<') != msg.msgstr.count('%>'):
                warn(msg,
                     'unbalanced-quotes',
                     'unbalanced %< and %> quotes')

    def lint_single_space_after_sentence():
        """
        After a sentence there should be two spaces.
        """

        if re.search(r'[.] [A-Z]', msgid):
            warn(msg,
                 'single-space-after-sentence',
                 'single space after sentence')

    def lint_non_canonical_quotes():
        """
        Catches %<%s%> and %s enclosed in plain quotation marks,
        which can be written in the shorter form %qs. Only the first
        occurrence in a message is reported.
        """

        match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
        if match:
            warn(msg,
                 'non-canonical-quotes',
                 f'placeholder {match.group()} should be written as %qs')

    lint_option_outside_quotes()
    lint_plain_apostrophe()
    lint_space_before_quote()
    lint_underscore_outside_quotes()
    lint_may_not()
    lint_unbalanced_quotes()
    lint_matching_placeholders()
    lint_single_space_after_sentence()
    lint_non_canonical_quotes()
206
207
def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
    """
    Finds messages that have the same structure and only differ in
    the plain strings they put inside %<quotes%>. Merging such
    messages prevents copy-and-paste mistakes by the translators.

    For the comparison, every %<literal%> is replaced with %qs. The
    first message of each resulting pattern is remembered, and each
    later message with the same pattern is reported together with
    that first one.

    See bug 90119.
    """

    first_by_pattern: Dict[str, polib.POEntry] = {}

    for msg in po:
        msg: polib.POEntry
        pattern = re.sub('%<[^%]+%>', '%qs', msg.msgid)

        if pattern in first_by_pattern:
            earlier = first_by_pattern[pattern]
            warn(msg,
                 'same-pattern',
                 f'same pattern for {repr(msg.msgid)} and '
                 f'{repr(earlier.msgid)} in {location(earlier)}',
                 include_msgid=False)
        else:
            # Remember the message under its pattern as well as under
            # its unmodified msgid.
            first_by_pattern[pattern] = msg
            first_by_pattern[msg.msgid] = msg
236
237
def lint_file(po: polib.POFile):
    """
    Runs all checks on the given file: first the per-message checks
    on every message flagged as gcc-internal-format that is neither
    obsolete nor fuzzy, then the check for messages that differ only
    in their placeholders, which looks at the file as a whole.
    """

    for msg in po:
        msg: polib.POEntry

        if msg.obsolete or msg.fuzzy:
            continue
        if 'gcc-internal-format' not in msg.flags:
            continue

        lint_gcc_internal_format(msg)

    lint_diagnostics_differing_only_in_placeholders(po)
247
248
def main():
    """
    Lints the file named on the command line and then prints a
    summary that lists, most frequent first, each diagnostic text
    that was reported more than once, together with its count.
    """

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('file', help='pot file')
    args = parser.parse_args()

    lint_file(polib.pofile(args.file))

    print()
    print('summary:')
    for diagnostic, count in seen_warnings.most_common():
        # Diagnostics that occurred only once are left out.
        if count > 1:
            print(f'{count}\t{diagnostic}')


if __name__ == '__main__':
    main()