#! /usr/bin/python3
# Copyright (C) 2019-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

"""Verifies that installed headers do not use any obsolete constructs:
 * legacy BSD typedefs superseded by <stdint.h>:
   ushort uint ulong u_char u_short u_int u_long u_intNN_t quad_t u_quad_t
   (sys/types.h is allowed to _define_ these types, but not to use them
    to define anything else).
"""
25 | ||
import argparse
import collections
import re
import sys
30 | ||
# Simplified lexical analyzer for C preprocessing tokens.
# Does not implement trigraphs.
# Does not implement backslash-newline in the middle of any lexical
# item other than a string literal.
# Does not implement universal-character-names in identifiers.
# Treats prefixed strings (e.g. L"...") as two tokens (L and "...")
# Accepts non-ASCII characters only within comments and strings.

# Caution: The order of the outermost alternation matters.
# STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
# BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
# be last.
# Caution: There should be no capturing groups other than the named
# captures in the outermost alternation.

# For reference, these are all of the C punctuators as of C11:
#   [ ] ( ) { } , ; ? ~
#   ! != * *= / /= ^ ^= = ==
#   # ##
#   % %= %> %: %:%:
#   & &= &&
#   | |= ||
#   + += ++
#   - -= -- ->
#   . ...
#   : :>
#   < <% <: << <<= <=
#   > >= >> >>=

# The BAD_* tokens are not part of the official definition of pp-tokens;
# they match unclosed strings, character constants, and block comments,
# so that the regex engine doesn't have to backtrack all the way to the
# beginning of a broken construct and then emit dozens of junk tokens.
# One alternation branch per token kind; after a successful .match(),
# Match.lastgroup names the kind that fired (the branch order caveats
# are documented above).  Used only via .match() at a given offset by
# tokenize_c, never via .search().
PP_TOKEN_RE_ = re.compile(r"""
    (?P<STRING>        \"(?:[^\"\\\r\n]|\\(?:[\r\n -~]|\r\n))*\")
   |(?P<BAD_STRING>    \"(?:[^\"\\\r\n]|\\[ -~])*)
   |(?P<CHARCONST>     \'(?:[^\'\\\r\n]|\\(?:[\r\n -~]|\r\n))*\')
   |(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]|\\[ -~])*)
   |(?P<BLOCK_COMMENT> /\*(?:\*(?!/)|[^*])*\*/)
   |(?P<BAD_BLOCK_COM> /\*(?:\*(?!/)|[^*])*\*?)
   |(?P<LINE_COMMENT>  //[^\r\n]*)
   |(?P<IDENT>         [_a-zA-Z][_a-zA-Z0-9]*)
   |(?P<PP_NUMBER>     \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]|[eEpP][+-]?)*)
   |(?P<PUNCTUATOR>
         [,;?~(){}\[\]]
       | [!*/^=]=?
       | \#\#?
       | %(?:[=>]|:(?:%:)?)?
       | &[=&]?
       |\|[=|]?
       |\+[=+]?
       | -[=->]?
       |\.(?:\.\.)?
       | :>?
       | <(?:[%:]|<(?:=|<=?)?)?
       | >(?:=|>=?)?)
   |(?P<ESCNL>         \\(?:\r|\n|\r\n))
   |(?P<WHITESPACE>    [ \t\n\r\v\f]+)
   |(?P<OTHER>         .)
""", re.DOTALL | re.VERBOSE)
92 | ||
# The two forms of a header-name token, <...> and "...".  Header names
# are only recognized immediately after '#include' (see tokenize_c);
# elsewhere the same characters lex as ordinary pp-tokens.
HEADER_NAME_RE_ = re.compile(r"""
    < [^>\r\n]+ >
  | " [^"\r\n]+ "
""", re.DOTALL | re.VERBOSE)
97 | ||
98 | ENDLINE_RE_ = re.compile(r"""\r|\n|\r\n""") | |
99 | ||
# based on the sample code in the Python re documentation
Token_ = collections.namedtuple(
    "Token", ["kind", "text", "line", "column", "context"])
Token_.__doc__ = """
    A single C preprocessing token, comment, or whitespace run.

    'kind' is the token's category, one of: STRING, CHARCONST,
       BLOCK_COMMENT, LINE_COMMENT, IDENT, PP_NUMBER, PUNCTUATOR,
       ESCNL, WHITESPACE, HEADER_NAME, or OTHER.  (The BAD_*
       alternatives of PP_TOKEN_RE_ never escape tokenize_c, which
       rewrites them into their well-formed counterparts.)

    'text' is the raw source-character sequence of the token;
       no decoding of any sort has been applied.

    'line' and 'column' locate the token's first character in the
       source file; both are 1-based.

    'context' records the token's relationship to preprocessing
       directives: None for running text, '<null>' for the leading
       '#' of a directive line (a bare '#' is a "null directive"),
       or the directive's name for every token of a directive line,
       beginning with the name IDENT itself.
"""
124 | ||
def tokenize_c(file_contents, reporter):
    """Yield a series of Token objects, one for each preprocessing
    token, comment, or chunk of whitespace within FILE_CONTENTS.
    The REPORTER object is expected to have one method,
    reporter.error(token, message), which will be called to
    indicate a lexical error at the position of TOKEN.
    If MESSAGE contains the four-character sequence '{!r}', that
    is expected to be replaced by repr(token.text).
    """

    # Bind module-level names to locals once; this loop visits every
    # character of every header scanned.
    Token = Token_
    PP_TOKEN_RE = PP_TOKEN_RE_
    ENDLINE_RE = ENDLINE_RE_
    HEADER_NAME_RE = HEADER_NAME_RE_

    line_num = 1          # 1-based line of the next token
    line_start = 0        # offset of the start of the current line
    pos = 0               # scan position within file_contents
    limit = len(file_contents)
    directive = None      # see Token_.__doc__ for the protocol
    at_bol = True         # only a '#' at beginning of line opens a directive
    while pos < limit:
        # Immediately after '#include', try to lex a header-name token
        # (<...> or "..."), which the generic pp-token regex cannot
        # recognize.
        if directive == "include":
            mo = HEADER_NAME_RE.match(file_contents, pos)
            if mo:
                kind = "HEADER_NAME"
                directive = "after_include"
            else:
                mo = PP_TOKEN_RE.match(file_contents, pos)
                kind = mo.lastgroup
                if kind != "WHITESPACE":
                    directive = "after_include"
        else:
            mo = PP_TOKEN_RE.match(file_contents, pos)
            kind = mo.lastgroup

        text = mo.group()
        line = line_num
        column = mo.start() - line_start
        adj_line_start = 0
        # only these kinds can contain a newline
        if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
                    "STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
            for tmo in ENDLINE_RE.finditer(text):
                line_num += 1
                adj_line_start = tmo.end()
            if adj_line_start:
                line_start = mo.start() + adj_line_start

        # Track whether or not we are scanning a preprocessing directive.
        if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
            at_bol = True
            directive = None
        else:
            if kind == "PUNCTUATOR" and text == "#" and at_bol:
                directive = "<null>"
            elif kind == "IDENT" and directive == "<null>":
                directive = text
            at_bol = False

        # Report ill-formed tokens and rewrite them as their well-formed
        # equivalents, so downstream processing doesn't have to know about
        # them.  (Rewriting instead of discarding provides better error
        # recovery.)
        if kind == "BAD_BLOCK_COM":
            reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
                           "unclosed block comment")
            text += "*/"
            kind = "BLOCK_COMMENT"
        elif kind == "BAD_STRING":
            reporter.error(Token("BAD_STRING", "", line, column+1, ""),
                           "unclosed string")
            text += "\""
            kind = "STRING"
        elif kind == "BAD_CHARCONST":
            reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
                           "unclosed char constant")
            text += "'"
            kind = "CHARCONST"

        tok = Token(kind, text, line, column+1,
                    "include" if directive == "after_include" else directive)
        # Do not complain about OTHER tokens inside macro definitions.
        # $ and @ appear in macros defined by headers intended to be
        # included from assembly language, e.g. sysdeps/mips/sys/asm.h.
        if kind == "OTHER" and directive != "define":
            # Bug fix: this is a free function, not a method, so there is
            # no 'self'; the previous 'self.error(...)' raised NameError
            # on the first stray character.  Diagnostics go through
            # REPORTER like every other error above.
            reporter.error(tok, "stray {!r} in program")

        yield tok
        pos = mo.end()
214 | ||
#
# Base and generic classes for individual checks.
#
218 | ||
class ConstructChecker:
    """Base class for passes that scan a stream of C preprocessing
    tokens and possibly report problems with them.

    The REPORTER object passed to __init__ exposes a single method,
    reporter.error(token, message), to be invoked when a problem is
    detected at the position of TOKEN.  If MESSAGE contains the
    four-character sequence '{!r}', it will be replaced with a
    textual representation of TOKEN.
    """
    def __init__(self, reporter):
        self.reporter = reporter

    def examine(self, tok):
        """Inspect one token of a header file.  Subclasses must
        override this; implementations call self.reporter.error
        when they detect a problem."""
        raise NotImplementedError

    def eof(self):
        """Hook invoked once after the last token.  The default does
        nothing, so only checkers that buffer state need to
        override it."""
        pass
240 | ||
class NoCheck(ConstructChecker):
    """Checker that accepts everything.  Substituted for a real
    checker when a particular check should be skipped for some
    file."""

    def examine(self, tok):
        # Deliberately ignore every token.
        pass
248 | ||
#
# Check for obsolete type names.
#

# The obsolete type names we're looking for:
# Anchored match over a whole identifier (\A...\Z, used with .match()).
# Group 1 captures an optional '__' prefix, letting callers distinguish
# the private (__quad_t) from the public (quad_t) spelling; group 2 is
# the base name: quad_t, ushort/uint/ulong, or the u_* family,
# including u_intN_t for any run of digits N.
OBSOLETE_TYPE_RE_ = re.compile(r"""\A
  (__)?
  (   quad_t
    | u(?:   short | int | long
         | _(?: char | short | int(?:[0-9]+_t)? | long | quad_t )))
\Z""", re.VERBOSE)
260 | ||
class ObsoleteNotAllowed(ConstructChecker):
    """Reject every use of the obsolete typedefs, whether or not
    they carry the '__' prefix."""

    def examine(self, tok):
        # Only identifier-shaped token texts can match; everything
        # else falls through silently.
        if OBSOLETE_TYPE_RE_.match(tok.text) is not None:
            self.reporter.error(tok, "use of {!r}")
266 | ||
class ObsoletePrivateDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the private versions of the
    obsolete typedefs; that is, 'typedef [anything] __obsolete;'
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        # True while scanning between a 'typedef' keyword and its ';'.
        self.in_typedef = False
        # The most recently examined token (any kind).  At a typedef's
        # terminating ';' this is the name being defined, which is the
        # one position where a (private) obsolete name is permitted.
        self.prev_token = None

    def examine(self, tok):
        # bits/types.h hides 'typedef' in a macro sometimes.
        if (tok.kind == "IDENT"
            and tok.text in ("typedef", "__STD_TYPE")
            and tok.context is None):
            self.in_typedef = True
        elif tok.kind == "PUNCTUATOR" and tok.text == ";" and self.in_typedef:
            self.in_typedef = False
            if self.prev_token.kind == "IDENT":
                # The defined name may be an obsolete name only in its
                # double-underscore (private) form; group 1 of the
                # match is the '__' prefix when present.
                m = OBSOLETE_TYPE_RE_.match(self.prev_token.text)
                if m and m.group(1) != "__":
                    self.reporter.error(self.prev_token, "use of {!r}")
            self.prev_token = None
        else:
            # In any other position, an obsolete name (public or
            # private) is a violation.
            self._check_prev()

        self.prev_token = tok

    def eof(self):
        # Flush the final buffered token.
        self._check_prev()

    def _check_prev(self):
        # Report the remembered token if it is an obsolete type name.
        if (self.prev_token is not None
            and self.prev_token.kind == "IDENT"
            and OBSOLETE_TYPE_RE_.match(self.prev_token.text)):
            self.reporter.error(self.prev_token, "use of {!r}")
302 | ||
class ObsoletePublicDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the public versions of the obsolete
    typedefs.  Only specific forms of definition are allowed:

        typedef __obsolete obsolete;  // identifiers must agree
        typedef __uintN_t u_intN_t;   // N must agree
        typedef unsigned long int ulong;
        typedef unsigned short int ushort;
        typedef unsigned int uint;
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        # Non-whitespace tokens of the typedef currently being
        # collected; empty when not inside a typedef.
        self.typedef_tokens = []

    def examine(self, tok):
        # Buffer every significant token of a typedef declaration and
        # judge the whole declaration when its ';' arrives.
        if tok.kind in ("WHITESPACE", "BLOCK_COMMENT",
                        "LINE_COMMENT", "NL", "ESCNL"):
            pass

        elif (tok.kind == "IDENT" and tok.text == "typedef"
              and tok.context is None):
            if self.typedef_tokens:
                self.reporter.error(tok, "typedef inside typedef")
                self._reset()
            self.typedef_tokens.append(tok)

        elif tok.kind == "PUNCTUATOR" and tok.text == ";":
            self._finish()

        elif self.typedef_tokens:
            self.typedef_tokens.append(tok)

    def eof(self):
        # Flush (and thereby check) anything still buffered.
        self._reset()

    def _reset(self):
        # Drain the buffer, reporting every obsolete name in it.
        while self.typedef_tokens:
            tok = self.typedef_tokens.pop(0)
            if tok.kind == "IDENT" and OBSOLETE_TYPE_RE_.match(tok.text):
                self.reporter.error(tok, "use of {!r}")

    def _finish(self):
        # Called at the ';' closing a typedef.  If the declaration is
        # one of the permitted public definitions, discard it wholesale;
        # otherwise _reset reports each obsolete name it contains.
        if not self.typedef_tokens: return
        if self.typedef_tokens[-1].kind == "IDENT":
            m = OBSOLETE_TYPE_RE_.match(self.typedef_tokens[-1].text)
            if m:
                if self._permissible_public_definition(m):
                    self.typedef_tokens.clear()
        self._reset()

    def _permissible_public_definition(self, m):
        # M is the match of OBSOLETE_TYPE_RE_ against the name being
        # defined (the last buffered token).
        if m.group(1) == "__": return False
        name = m.group(2)
        toks = self.typedef_tokens
        ntok = len(toks)
        # Three tokens: 'typedef <defn> <name>' — the form where the
        # definition must be the matching __-prefixed private name.
        if ntok == 3 and toks[1].kind == "IDENT":
            defn = toks[1].text
            n = OBSOLETE_TYPE_RE_.match(defn)
            if n and n.group(1) == "__" and n.group(2) == name:
                return True

            # typedef __uintN_t u_intN_t; with matching digit runs N.
            if (name[:5] == "u_int" and name[-2:] == "_t"
                and defn[:6] == "__uint" and defn[-2:] == "_t"
                and name[5:-2] == defn[6:-2]):
                return True

            return False

        # The three fixed 'unsigned ...' forms from the class docstring.
        if (name == "ulong" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "long"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "ushort" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "short"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "uint" and ntok == 4
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "int"):
            return True

        return False
389 | ||
def ObsoleteTypedefChecker(reporter, fname):
    """Factory: produce an instance of the appropriate
    obsolete-typedef checker for FNAME."""

    # The obsolete rpc/ and rpcsvc/ headers may keep using the obsolete
    # types: we intend to stop installing them eventually anyway, so
    # scrubbing them would be more trouble than it's worth.
    for prefix in ("rpc/", "rpcsvc/"):
        if fname.startswith(prefix) or ("/" + prefix) in fname:
            return NoCheck(reporter)

    # bits/types.h is allowed to define the __-prefixed versions of
    # the obsolete types.
    if fname == "bits/types.h" or fname.endswith("/bits/types.h"):
        return ObsoletePrivateDefinitionsAllowed(reporter)

    # sys/types.h is allowed to use the __-prefixed versions of the
    # obsolete types, but only to define the unprefixed versions.
    if fname == "sys/types.h" or fname.endswith("/sys/types.h"):
        return ObsoletePublicDefinitionsAllowed(reporter)

    return ObsoleteNotAllowed(reporter)
417 | ||
#
# Master control
#
421 | ||
class HeaderChecker:
    """Perform all of the checks on each header.  This is also the
    "reporter" object expected by tokenize_c and ConstructChecker.
    """
    def __init__(self):
        # Name of the file currently being checked (used in messages).
        self.fname = None
        # Process exit status: 0 until the first error is reported.
        self.status = 0

    def error(self, tok, message):
        """Report a problem at TOK's position in the current file and
        record failure.  A '{!r}' in MESSAGE is replaced with the
        repr of the token's text."""
        self.status = 1
        if '{!r}' in message:
            message = message.format(tok.text)
        sys.stderr.write("{}:{}:{}: error: {}\n".format(
            self.fname, tok.line, tok.column, message))

    def check(self, fname):
        """Run all checks on the single header file FNAME."""
        self.fname = fname
        try:
            with open(fname, "rt", encoding="utf-8") as fp:
                contents = fp.read()
        except OSError as e:
            sys.stderr.write("{}: {}\n".format(fname, e.strerror))
            self.status = 1
            return

        typedef_checker = ObsoleteTypedefChecker(self, self.fname)

        for tok in tokenize_c(contents, self):
            typedef_checker.examine(tok)

        # Bug fix: the ConstructChecker contract says eof() is invoked
        # once at the end of the token stream, and stateful checkers
        # (e.g. ObsoletePrivateDefinitionsAllowed, which buffers
        # prev_token) rely on it to flush pending state.  Without this
        # call, a violation in a file's final tokens could go
        # unreported.
        typedef_checker.eof()
451 | ||
def main():
    """Parse the command line and check every named header."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("headers", metavar="header", nargs="+",
                        help="one or more headers to scan for obsolete constructs")
    opts = parser.parse_args()

    checker = HeaderChecker()
    for fname in opts.headers:
        # Headers whose installed name begins with "finclude/" contain
        # Fortran, not C, and this program should completely ignore them.
        if fname.startswith("finclude/") or "/finclude/" in fname:
            continue
        checker.check(fname)
    sys.exit(checker.status)
465 | ||
466 | main() |