]> git.ipfire.org Git - thirdparty/glibc.git/blame - scripts/check-obsolete-constructs.py
Update copyright dates with scripts/update-copyrights
[thirdparty/glibc.git] / scripts / check-obsolete-constructs.py
CommitLineData
#! /usr/bin/python3
# Copyright (C) 2019-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

19"""Verifies that installed headers do not use any obsolete constructs:
20 * legacy BSD typedefs superseded by <stdint.h>:
21 ushort uint ulong u_char u_short u_int u_long u_intNN_t quad_t u_quad_t
22 (sys/types.h is allowed to _define_ these types, but not to use them
23 to define anything else).
24"""
25
26import argparse
27import collections
28import re
29import sys
30
# Simplified lexical analyzer for C preprocessing tokens.
# Does not implement trigraphs.
# Does not implement backslash-newline in the middle of any lexical
#   item other than a string literal.
# Does not implement universal-character-names in identifiers.
# Treats prefixed strings (e.g. L"...") as two tokens (L and "...").
# Accepts non-ASCII characters only within comments and strings.

# Caution: The order of the outermost alternation matters.
# STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
# BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
# be last.
# Caution: There should be no capturing groups other than the named
# captures in the outermost alternation.

# For reference, these are all of the C punctuators as of C11:
#   [ ] ( ) { } , ; ? ~
#   ! != * *= / /= ^ ^= = ==
#   # ##
#   % %= %> %: %:%:
#   & &= &&
#   | |= ||
#   + += ++
#   - -= -- ->
#   . ...
#   : :>
#   < <% <: << <<= <=
#   > >= >> >>=

# The BAD_* tokens are not part of the official definition of pp-tokens;
# they match unclosed strings, character constants, and block comments,
# so that the regex engine doesn't have to backtrack all the way to the
# beginning of a broken construct and then emit dozens of junk tokens.
64
# Master pattern for one C preprocessing token, comment, or run of
# whitespace.  Exactly one named group matches per token; the name of that
# group (match.lastgroup) is the token kind.  OTHER matches any single
# character, so with re.DOTALL a match attempt never fails.
#
# Fixes relative to the previous version of this pattern:
#  * '-[=->]?' contained the character range '=' through '>', which does
#    not include '-', so '--' was lexed as two '-' tokens.  The hyphen is
#    now last in the class, where it is literal.
#  * The '<' alternative could not match '<=' and accepted the
#    non-punctuator '<<<'.  It now matches exactly < <% <: <= << <<=.
#  * ESCNL tried '\r' before '\r\n', so a backslash followed by CRLF was
#    split into ESCNL + WHITESPACE.  The two-character ending is now tried
#    first.
PP_TOKEN_RE_ = re.compile(r"""
    (?P<STRING>        \"(?:[^\"\\\r\n]|\\(?:[\r\n -~]|\r\n))*\")
   |(?P<BAD_STRING>    \"(?:[^\"\\\r\n]|\\[ -~])*)
   |(?P<CHARCONST>     \'(?:[^\'\\\r\n]|\\(?:[\r\n -~]|\r\n))*\')
   |(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]|\\[ -~])*)
   |(?P<BLOCK_COMMENT> /\*(?:\*(?!/)|[^*])*\*/)
   |(?P<BAD_BLOCK_COM> /\*(?:\*(?!/)|[^*])*\*?)
   |(?P<LINE_COMMENT>  //[^\r\n]*)
   |(?P<IDENT>         [_a-zA-Z][_a-zA-Z0-9]*)
   |(?P<PP_NUMBER>     \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]|[eEpP][+-]?)*)
   |(?P<PUNCTUATOR>
       [,;?~(){}\[\]]
     | [!*/^=]=?
     | \#\#?
     | %(?:[=>]|:(?:%:)?)?
     | &[=&]?
     |\|[=|]?
     |\+[=+]?
     | -[=>-]?
     |\.(?:\.\.)?
     | :>?
     | <(?:[%:=]|<=?)?
     | >(?:=|>=?)?)
   |(?P<ESCNL>         \\(?:\r\n|\r|\n))
   |(?P<WHITESPACE>    [ \t\n\r\v\f]+)
   |(?P<OTHER>         .)
""", re.DOTALL | re.VERBOSE)
92
# Matches the operand of an #include directive: either <...> or "...",
# non-empty and confined to a single line.
HEADER_NAME_RE_ = re.compile(r"""
    < [^>\r\n]+ >
  | " [^"\r\n]+ "
""", re.DOTALL | re.VERBOSE)

# Matches one line terminator.  The two-character CRLF form must be listed
# first: alternation is ordered, so with '\r' first the '\r\n' branch could
# never match and every CRLF would be counted as two line endings.
ENDLINE_RE_ = re.compile(r"""\r\n|\r|\n""")
99
100# based on the sample code in the Python re documentation
Token_ = collections.namedtuple(
    "Token", ["kind", "text", "line", "column", "context"])
Token_.__doc__ = """
    One C preprocessing token, comment, or chunk of whitespace.
    'kind' identifies the token type, which will be one of:
        STRING, CHARCONST, BLOCK_COMMENT, LINE_COMMENT, IDENT,
        PP_NUMBER, PUNCTUATOR, ESCNL, WHITESPACE, HEADER_NAME,
        or OTHER.  The BAD_* alternatives in PP_TOKEN_RE_ are
        handled within tokenize_c, below.

    'text' is the sequence of source characters making up the token;
        no decoding whatsoever is performed.

    'line' and 'column' give the position of the first character of the
       token within the source file.  They are both 1-based.

    'context' indicates whether or not this token occurred within a
       preprocessing directive; it will be None for running text,
       '<null>' for the leading '#' of a directive line (because '#'
       all by itself on a line is a "null directive"), or the name of
       the directive for tokens within a directive line, starting with
       the IDENT for the name itself.
"""
124
def tokenize_c(file_contents, reporter):
    """Yield a series of Token objects, one for each preprocessing
       token, comment, or chunk of whitespace within FILE_CONTENTS.
       The REPORTER object is expected to have one method,
       reporter.error(token, message), which will be called to
       indicate a lexical error at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}', that
       is expected to be replaced by repr(token.text).

       Unclosed strings, character constants and block comments are
       reported through REPORTER and then yielded as if they had been
       closed.  Characters that match no other token kind are yielded
       as OTHER tokens and reported unless they occur inside a #define.
    """

    # Bind the module-level helpers to locals for the scanning loop.
    Token = Token_
    PP_TOKEN_RE = PP_TOKEN_RE_
    ENDLINE_RE = ENDLINE_RE_
    HEADER_NAME_RE = HEADER_NAME_RE_

    line_num = 1        # 1-based line of the next token
    line_start = 0      # offset of the first character of that line
    pos = 0
    limit = len(file_contents)
    directive = None    # None, "<null>", a directive name, or "after_include"
    at_bol = True       # no token seen yet on the current line
    while pos < limit:
        if directive == "include":
            # Immediately after '#include' a header-name takes priority
            # over ordinary tokenization, so <stdio.h> is a single token.
            mo = HEADER_NAME_RE.match(file_contents, pos)
            if mo:
                kind = "HEADER_NAME"
                directive = "after_include"
            else:
                mo = PP_TOKEN_RE.match(file_contents, pos)
                kind = mo.lastgroup
                # Whitespace may separate 'include' from its operand;
                # anything else ends the header-name window.
                if kind != "WHITESPACE":
                    directive = "after_include"
        else:
            mo = PP_TOKEN_RE.match(file_contents, pos)
            kind = mo.lastgroup

        text = mo.group()
        line = line_num
        column = mo.start() - line_start
        adj_line_start = 0
        # only these kinds can contain a newline
        if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
                    "STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
            for tmo in ENDLINE_RE.finditer(text):
                line_num += 1
                adj_line_start = tmo.end()
            if adj_line_start:
                line_start = mo.start() + adj_line_start

        # Track whether or not we are scanning a preprocessing directive.
        # NOTE(review): any other token, including horizontal whitespace,
        # clears at_bol, so a '#' preceded by indentation is not treated
        # as the start of a directive -- confirm this is intended.
        if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
            at_bol = True
            directive = None
        else:
            if kind == "PUNCTUATOR" and text == "#" and at_bol:
                directive = "<null>"
            elif kind == "IDENT" and directive == "<null>":
                directive = text
            at_bol = False

        # Report ill-formed tokens and rewrite them as their well-formed
        # equivalents, so downstream processing doesn't have to know about
        # them.  (Rewriting instead of discarding provides better error
        # recovery.)
        if kind == "BAD_BLOCK_COM":
            reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
                           "unclosed block comment")
            text += "*/"
            kind = "BLOCK_COMMENT"
        elif kind == "BAD_STRING":
            reporter.error(Token("BAD_STRING", "", line, column+1, ""),
                           "unclosed string")
            text += "\""
            kind = "STRING"
        elif kind == "BAD_CHARCONST":
            reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
                           "unclosed char constant")
            text += "'"
            kind = "CHARCONST"

        tok = Token(kind, text, line, column+1,
                    "include" if directive == "after_include" else directive)
        # Do not complain about OTHER tokens inside macro definitions.
        # $ and @ appear in macros defined by headers intended to be
        # included from assembly language, e.g. sysdeps/mips/sys/asm.h.
        if kind == "OTHER" and directive != "define":
            # This is a plain function, so there is no 'self'; the error
            # sink is the REPORTER argument.  (The previous 'self.error'
            # raised NameError on the first stray character.)
            reporter.error(tok, "stray {!r} in program")

        yield tok
        pos = mo.end()
214
215#
216# Base and generic classes for individual checks.
217#
218
class ConstructChecker:
    """Base class for checks that consume a stream of C preprocessing
       tokens and may report problems with them.

       The REPORTER handed to the constructor provides a single method,
       reporter.error(token, message), to be called when a problem is
       found at the position of TOKEN.  A '{!r}' sequence inside
       MESSAGE will be replaced with a textual representation of TOKEN.
    """

    def __init__(self, reporter):
        # Kept so that subclasses can report through self.reporter.
        self.reporter = reporter

    def examine(self, tok):
        """Inspect one token of a header file; invoked once per token.
           Implementations call self.reporter.error on finding a problem.
           The base class has no behavior of its own and always raises
           NotImplementedError.
        """
        raise NotImplementedError

    def eof(self):
        """Invoked once after the final token of the stream.  The default
           does nothing; override only when end-of-input needs handling.
        """
        return None
240
class NoCheck(ConstructChecker):
    """Checker that accepts every token without complaint.  Use it in
       place of a real checker for files that are exempt from a check.
    """

    def examine(self, tok):
        # Deliberately ignore the token.
        return None
248
249#
250# Check for obsolete type names.
251#
252
253# The obsolete type names we're looking for:
# Anchored at both ends, so it only matches a complete identifier.
# Group 1 is the optional '__' prefix (None when absent); group 2 is the
# obsolete type name without that prefix.
OBSOLETE_TYPE_RE_ = re.compile(
    r"""\A
  (__)?
  (   quad_t
    | u(?: short | int | long
         | _(?: char | short | int(?:[0-9]+_t)? | long | quad_t )))
\Z""",
    re.X)
260
class ObsoleteNotAllowed(ConstructChecker):
    """Reject every occurrence of an obsolete typedef name."""

    def examine(self, tok):
        # Only the token text is tested; the token kind is not consulted.
        if OBSOLETE_TYPE_RE_.match(tok.text) is not None:
            self.reporter.error(tok, "use of {!r}")
266
class ObsoletePrivateDefinitionsAllowed(ConstructChecker):
    """Permit the '__'-prefixed obsolete typedefs to be defined, i.e.
       'typedef [anything] __obsolete;', and report every other
       appearance of an obsolete type name.
    """

    def __init__(self, reporter):
        super().__init__(reporter)
        # True between a typedef keyword and the ';' that ends it.
        self.in_typedef = False
        # The token passed to the previous examine() call, if any.
        self.prev_token = None

    def examine(self, tok):
        # bits/types.h hides 'typedef' in a macro sometimes.
        opens_typedef = (tok.kind == "IDENT"
                         and tok.text in ("typedef", "__STD_TYPE")
                         and tok.context is None)
        closes_typedef = (self.in_typedef
                          and tok.kind == "PUNCTUATOR"
                          and tok.text == ";")

        if opens_typedef:
            self.in_typedef = True
        elif closes_typedef:
            self.in_typedef = False
            # The token just before ';' is the name being defined; only
            # the un-prefixed spelling is an error in that position.
            defined = self.prev_token
            if defined.kind == "IDENT":
                found = OBSOLETE_TYPE_RE_.match(defined.text)
                if found and found.group(1) != "__":
                    self.reporter.error(defined, "use of {!r}")
        else:
            self._check_prev()

        self.prev_token = tok

    def eof(self):
        # The final token has not been followed by another examine() call,
        # so it is checked here.
        self._check_prev()

    def _check_prev(self):
        """Report the previous token if it is an obsolete type name."""
        prev = self.prev_token
        if prev is None or prev.kind != "IDENT":
            return
        if OBSOLETE_TYPE_RE_.match(prev.text):
            self.reporter.error(prev, "use of {!r}")
302
class ObsoletePublicDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the public versions of the obsolete
       typedefs.  Only specific forms of definition are allowed:

           typedef __obsolete obsolete;  // identifiers must agree
           typedef __uintN_t u_intN_t;   // N must agree
           typedef unsigned long int ulong;
           typedef unsigned short int ushort;
           typedef unsigned int uint;

       Tokens are collected from a 'typedef' keyword up to the next ';'.
       When the collected typedef is not one of the permitted forms,
       every obsolete type name in it is reported.
    """

    # The only spelled-out definitions accepted for the unsigned
    # shorthands: the words that must appear, in order, between 'typedef'
    # and the name being defined.
    _SPELLED_OUT = {
        "ulong": ("unsigned", "long", "int"),
        "ushort": ("unsigned", "short", "int"),
        "uint": ("unsigned", "int"),
    }

    def __init__(self, reporter):
        super().__init__(reporter)
        # Significant tokens of the typedef being collected, beginning
        # with the 'typedef' keyword; empty when outside a typedef.
        self.typedef_tokens = []

    def examine(self, tok):
        # Whitespace and comments never take part in the comparison.
        if tok.kind in ("WHITESPACE", "BLOCK_COMMENT",
                        "LINE_COMMENT", "NL", "ESCNL"):
            return

        if (tok.kind == "IDENT" and tok.text == "typedef"
                and tok.context is None):
            if self.typedef_tokens:
                self.reporter.error(tok, "typedef inside typedef")
                self._reset()
            self.typedef_tokens.append(tok)

        elif tok.kind == "PUNCTUATOR" and tok.text == ";":
            self._finish()

        elif self.typedef_tokens:
            self.typedef_tokens.append(tok)

    def eof(self):
        # A typedef still open at end of input cannot be a permitted
        # definition, so its tokens are checked like any other use.
        self._reset()

    def _reset(self):
        """Report each obsolete type name among the collected tokens, then
           discard them.  A single pass followed by clear() replaces the
           former repeated list.pop(0), which shifted the whole list on
           every call.
        """
        for tok in self.typedef_tokens:
            if tok.kind == "IDENT" and OBSOLETE_TYPE_RE_.match(tok.text):
                self.reporter.error(tok, "use of {!r}")
        self.typedef_tokens.clear()

    def _finish(self):
        """Handle the ';' ending a typedef: drop the collected tokens
           silently if they form a permitted definition, otherwise report
           the obsolete names they contain.
        """
        if not self.typedef_tokens:
            return
        last = self.typedef_tokens[-1]
        if last.kind == "IDENT":
            m = OBSOLETE_TYPE_RE_.match(last.text)
            if m and self._permissible_public_definition(m):
                self.typedef_tokens.clear()
        self._reset()

    def _permissible_public_definition(self, m):
        """Return True if the collected typedef is one of the permitted
           definitions.  M is the OBSOLETE_TYPE_RE_ match for the last
           collected token, i.e. the name being defined.
        """
        # Only the public (un-prefixed) names may be defined here.
        if m.group(1) == "__":
            return False
        name = m.group(2)
        toks = self.typedef_tokens
        ntok = len(toks)

        # 'typedef X name': X must be '__name', or '__uintN_t' when the
        # name is 'u_intN_t' with the same N.
        if ntok == 3 and toks[1].kind == "IDENT":
            defn = toks[1].text
            n = OBSOLETE_TYPE_RE_.match(defn)
            if n and n.group(1) == "__" and n.group(2) == name:
                return True

            return (name[:5] == "u_int" and name[-2:] == "_t"
                    and defn[:6] == "__uint" and defn[-2:] == "_t"
                    and name[5:-2] == defn[6:-2])

        # 'typedef unsigned ... name' for ulong, ushort and uint.  The
        # token count must be exact: 'typedef', the words, and the name.
        words = self._SPELLED_OUT.get(name)
        if words is None or ntok != len(words) + 2:
            return False
        return all(t.kind == "IDENT" and t.text == word
                   for t, word in zip(toks[1:-1], words))
389
def ObsoleteTypedefChecker(reporter, fname):
    """Factory: return a new obsolete-typedef checker of the class that
       applies to the header named FNAME, reporting through REPORTER.
    """

    # The obsolete rpc/ and rpcsvc/ headers are allowed to use the
    # obsolete types, because it would be more trouble than it's
    # worth to remove them from headers that we intend to stop
    # installing eventually anyway.
    if (fname.startswith(("rpc/", "rpcsvc/"))
            or any(part in fname for part in ("/rpc/", "/rpcsvc/"))):
        checker_class = NoCheck

    # bits/types.h is allowed to define the __-versions of the
    # obsolete types.
    elif fname == "bits/types.h" or fname.endswith("/bits/types.h"):
        checker_class = ObsoletePrivateDefinitionsAllowed

    # sys/types.h is allowed to use the __-versions of the
    # obsolete types, but only to define the unprefixed versions.
    elif fname == "sys/types.h" or fname.endswith("/sys/types.h"):
        checker_class = ObsoletePublicDefinitionsAllowed

    # Every other header may not mention the obsolete types at all.
    else:
        checker_class = ObsoleteNotAllowed

    return checker_class(reporter)
417
418#
419# Master control
420#
421
class HeaderChecker:
    """Perform all of the checks on each header.  This is also the
       "reporter" object expected by tokenize_c and ConstructChecker.

       'status' starts at 0 and becomes 1 as soon as any error is
       reported or any header cannot be read.  'fname' is the header
       most recently passed to check().
    """
    def __init__(self):
        self.fname = None
        self.status = 0

    def error(self, tok, message):
        """Record a failure and print MESSAGE to stderr, prefixed with
           the current file name and the line and column of TOK.  A
           '{!r}' in MESSAGE is replaced with repr(tok.text).
        """
        self.status = 1
        if '{!r}' in message:
            message = message.format(tok.text)
        sys.stderr.write("{}:{}:{}: error: {}\n".format(
            self.fname, tok.line, tok.column, message))

    def check(self, fname):
        """Read the header FNAME as UTF-8 and run the obsolete-typedef
           check over its tokens.  A file that cannot be opened or
           decoded is reported on stderr and sets status to 1.
        """
        self.fname = fname
        try:
            with open(fname, "rt", encoding="utf-8") as fp:
                contents = fp.read()
        except OSError as e:
            sys.stderr.write("{}: {}\n".format(fname, e.strerror))
            self.status = 1
            return
        except UnicodeDecodeError as e:
            # Not an OSError: without this clause a header that is not
            # valid UTF-8 would abort the whole run with a traceback.
            sys.stderr.write("{}: {}\n".format(fname, e))
            self.status = 1
            return

        typedef_checker = ObsoleteTypedefChecker(self, self.fname)

        for tok in tokenize_c(contents, self):
            typedef_checker.examine(tok)
        # Checkers defer some work to end of input (the last token, or a
        # typedef that was never terminated), so eof() must be called.
        typedef_checker.eof()
451
def main():
    """Command-line entry point: check every header named on the command
       line and exit with status 1 if any problem was reported, else 0.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("headers", metavar="header", nargs="+",
                    help="one or more headers to scan for obsolete constructs")
    args = ap.parse_args()

    checker = HeaderChecker()
    for fname in args.headers:
        # Headers whose installed name begins with "finclude/" contain
        # Fortran, not C, and this program should completely ignore them.
        if not (fname.startswith("finclude/") or "/finclude/" in fname):
            checker.check(fname)
    sys.exit(checker.status)


# Run only when executed as a script, so that importing this module does
# not parse sys.argv and exit the importing process.
if __name__ == "__main__":
    main()