[thirdparty/glibc.git] / scripts / check-obsolete-constructs.py

#! /usr/bin/python3
# Copyright (C) 2019-2022 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <https://www.gnu.org/licenses/>.

"""Verifies that installed headers do not use any obsolete constructs:
 * legacy BSD typedefs superseded by <stdint.h>:
   ushort uint ulong u_char u_short u_int u_long u_intNN_t quad_t u_quad_t
   (sys/types.h is allowed to _define_ these types, but not to use them
    to define anything else).
"""

import argparse
import collections
import re
import sys

# Simplified lexical analyzer for C preprocessing tokens.
# Does not implement trigraphs.
# Does not implement backslash-newline in the middle of any lexical
#   item other than a string literal.
# Does not implement universal-character-names in identifiers.
# Treats prefixed strings (e.g. L"...") as two tokens (L and "...")
# Accepts non-ASCII characters only within comments and strings.

# Caution: The order of the outermost alternation matters.
# STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
# BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
# be last.
# Caution: There should be no capturing groups other than the named
# captures in the outermost alternation.

# For reference, these are all of the C punctuators as of C11:
#   [ ] ( ) { } , ; ? ~
#   ! != * *= / /= ^ ^= = ==
#   # ##
#   % %= %> %: %:%:
#   & &= &&
#   | |= ||
#   + += ++
#   - -= -- ->
#   . ...
#   : :>
#   < <% <: << <<= <=
#   > >= >> >>=

# The BAD_* tokens are not part of the official definition of pp-tokens;
# they match unclosed strings, character constants, and block comments,
# so that the regex engine doesn't have to backtrack all the way to the
# beginning of a broken construct and then emit dozens of junk tokens.

PP_TOKEN_RE_ = re.compile(r"""
    (?P<STRING>        \"(?:[^\"\\\r\n]|\\(?:[\r\n -~]|\r\n))*\")
   |(?P<BAD_STRING>    \"(?:[^\"\\\r\n]|\\[ -~])*)
   |(?P<CHARCONST>     \'(?:[^\'\\\r\n]|\\(?:[\r\n -~]|\r\n))*\')
   |(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]|\\[ -~])*)
   |(?P<BLOCK_COMMENT> /\*(?:\*(?!/)|[^*])*\*/)
   |(?P<BAD_BLOCK_COM> /\*(?:\*(?!/)|[^*])*\*?)
   |(?P<LINE_COMMENT>  //[^\r\n]*)
   |(?P<IDENT>         [_a-zA-Z][_a-zA-Z0-9]*)
   |(?P<PP_NUMBER>     \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]|[eEpP][+-]?)*)
   |(?P<PUNCTUATOR>
       [,;?~(){}\[\]]
     | [!*/^=]=?
     | \#\#?
     | %(?:[=>]|:(?:%:)?)?
     | &[=&]?
     |\|[=|]?
     |\+[=+]?
     | -[=->]?
     |\.(?:\.\.)?
     | :>?
     | <(?:[%:]|<(?:=|<=?)?)?
     | >(?:=|>=?)?)
   |(?P<ESCNL>         \\(?:\r|\n|\r\n))
   |(?P<WHITESPACE>    [ \t\n\r\v\f]+)
   |(?P<OTHER>         .)
""", re.DOTALL | re.VERBOSE)

HEADER_NAME_RE_ = re.compile(r"""
    < [^>\r\n]+ >
  | " [^"\r\n]+ "
""", re.DOTALL | re.VERBOSE)

ENDLINE_RE_ = re.compile(r"""\r|\n|\r\n""")

# based on the sample code in the Python re documentation
Token_ = collections.namedtuple("Token", (
    "kind", "text", "line", "column", "context"))
Token_.__doc__ = """
   One C preprocessing token, comment, or chunk of whitespace.
   'kind' identifies the token type, which will be one of:
       STRING, CHARCONST, BLOCK_COMMENT, LINE_COMMENT, IDENT,
       PP_NUMBER, PUNCTUATOR, ESCNL, WHITESPACE, HEADER_NAME,
       or OTHER.  The BAD_* alternatives in PP_TOKEN_RE_ are
       handled within tokenize_c, below.

   'text' is the sequence of source characters making up the token;
       no decoding whatsoever is performed.

   'line' and 'column' give the position of the first character of the
      token within the source file.  They are both 1-based.

   'context' indicates whether or not this token occurred within a
      preprocessing directive; it will be None for running text,
      '<null>' for the leading '#' of a directive line (because '#'
      all by itself on a line is a "null directive"), or the name of
      the directive for tokens within a directive line, starting with
      the IDENT for the name itself.
"""

def tokenize_c(file_contents, reporter):
    """Yield a series of Token objects, one for each preprocessing
       token, comment, or chunk of whitespace within FILE_CONTENTS.
       The REPORTER object is expected to have one method,
       reporter.error(token, message), which will be called to
       indicate a lexical error at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}', that
       is expected to be replaced by repr(token.text).
    """

    Token = Token_
    PP_TOKEN_RE = PP_TOKEN_RE_
    ENDLINE_RE = ENDLINE_RE_
    HEADER_NAME_RE = HEADER_NAME_RE_

    line_num = 1
    line_start = 0
    pos = 0
    limit = len(file_contents)
    directive = None
    at_bol = True
    while pos < limit:
        if directive == "include":
            mo = HEADER_NAME_RE.match(file_contents, pos)
            if mo:
                kind = "HEADER_NAME"
                directive = "after_include"
            else:
                mo = PP_TOKEN_RE.match(file_contents, pos)
                kind = mo.lastgroup
                if kind != "WHITESPACE":
                    directive = "after_include"
        else:
            mo = PP_TOKEN_RE.match(file_contents, pos)
            kind = mo.lastgroup

        text = mo.group()
        line = line_num
        column = mo.start() - line_start
        adj_line_start = 0
        # only these kinds can contain a newline
        if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
                    "STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
            for tmo in ENDLINE_RE.finditer(text):
                line_num += 1
                adj_line_start = tmo.end()
            if adj_line_start:
                line_start = mo.start() + adj_line_start

        # Track whether or not we are scanning a preprocessing directive.
        if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
            at_bol = True
            directive = None
        else:
            if kind == "PUNCTUATOR" and text == "#" and at_bol:
                directive = "<null>"
            elif kind == "IDENT" and directive == "<null>":
                directive = text
            at_bol = False

        # Report ill-formed tokens and rewrite them as their well-formed
        # equivalents, so downstream processing doesn't have to know about them.
        # (Rewriting instead of discarding provides better error recovery.)
        if kind == "BAD_BLOCK_COM":
            reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
                           "unclosed block comment")
            text += "*/"
            kind = "BLOCK_COMMENT"
        elif kind == "BAD_STRING":
            reporter.error(Token("BAD_STRING", "", line, column+1, ""),
                           "unclosed string")
            text += "\""
            kind = "STRING"
        elif kind == "BAD_CHARCONST":
            reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
                           "unclosed char constant")
            text += "'"
            kind = "CHARCONST"

        tok = Token(kind, text, line, column+1,
                    "include" if directive == "after_include" else directive)
        # Do not complain about OTHER tokens inside macro definitions.
        # $ and @ appear in macros defined by headers intended to be
        # included from assembly language, e.g. sysdeps/mips/sys/asm.h.
        if kind == "OTHER" and directive != "define":
            self.error(tok, "stray {!r} in program")

        yield tok
        pos = mo.end()

#
# Base and generic classes for individual checks.
#

class ConstructChecker:
    """Scan a stream of C preprocessing tokens and possibly report
       problems with them.  The REPORTER object passed to __init__ has
       one method, reporter.error(token, message), which should be
       called to indicate a problem detected at the position of TOKEN.
       If MESSAGE contains the four-character sequence '{!r}' then that
       will be replaced with a textual representation of TOKEN.
    """
    def __init__(self, reporter):
        self.reporter = reporter

    def examine(self, tok):
        """Called once for each token in a header file.
           Call self.reporter.error if a problem is detected.
        """
        raise NotImplementedError

    def eof(self):
        """Called once at the end of the stream.  Subclasses need only
           override this if it might have something to do."""
        pass

class NoCheck(ConstructChecker):
    """Generic checker class which doesn't do anything.  Substitute this
       class for a real checker when a particular check should be skipped
       for some file."""

    def examine(self, tok):
        pass

#
# Check for obsolete type names.
#

# The obsolete type names we're looking for:
OBSOLETE_TYPE_RE_ = re.compile(r"""\A
  (__)?
  (   quad_t
    | u(?: short | int | long
         | _(?: char | short | int(?:[0-9]+_t)? | long | quad_t )))
\Z""", re.VERBOSE)

class ObsoleteNotAllowed(ConstructChecker):
    """Don't allow any use of the obsolete typedefs."""
    def examine(self, tok):
        if OBSOLETE_TYPE_RE_.match(tok.text):
            self.reporter.error(tok, "use of {!r}")

class ObsoletePrivateDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the private versions of the
       obsolete typedefs; that is, 'typedef [anything] __obsolete;'
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        self.in_typedef = False
        self.prev_token = None

    def examine(self, tok):
        # bits/types.h hides 'typedef' in a macro sometimes.
        if (tok.kind == "IDENT"
            and tok.text in ("typedef", "__STD_TYPE")
            and tok.context is None):
            self.in_typedef = True
        elif tok.kind == "PUNCTUATOR" and tok.text == ";" and self.in_typedef:
            self.in_typedef = False
            if self.prev_token.kind == "IDENT":
                m = OBSOLETE_TYPE_RE_.match(self.prev_token.text)
                if m and m.group(1) != "__":
                    self.reporter.error(self.prev_token, "use of {!r}")
            self.prev_token = None
        else:
            self._check_prev()

        self.prev_token = tok

    def eof(self):
        self._check_prev()

    def _check_prev(self):
        if (self.prev_token is not None
            and self.prev_token.kind == "IDENT"
            and OBSOLETE_TYPE_RE_.match(self.prev_token.text)):
            self.reporter.error(self.prev_token, "use of {!r}")

class ObsoletePublicDefinitionsAllowed(ConstructChecker):
    """Allow definitions of the public versions of the obsolete
       typedefs.  Only specific forms of definition are allowed:

           typedef __obsolete obsolete;  // identifiers must agree
           typedef __uintN_t u_intN_t;   // N must agree
           typedef unsigned long int ulong;
           typedef unsigned short int ushort;
           typedef unsigned int uint;
    """
    def __init__(self, reporter):
        super().__init__(reporter)
        self.typedef_tokens = []

    def examine(self, tok):
        if tok.kind in ("WHITESPACE", "BLOCK_COMMENT",
                        "LINE_COMMENT", "NL", "ESCNL"):
            pass

        elif (tok.kind == "IDENT" and tok.text == "typedef"
              and tok.context is None):
            if self.typedef_tokens:
                self.reporter.error(tok, "typedef inside typedef")
                self._reset()
            self.typedef_tokens.append(tok)

        elif tok.kind == "PUNCTUATOR" and tok.text == ";":
            self._finish()

        elif self.typedef_tokens:
            self.typedef_tokens.append(tok)

    def eof(self):
        self._reset()

    def _reset(self):
        while self.typedef_tokens:
            tok = self.typedef_tokens.pop(0)
            if tok.kind == "IDENT" and OBSOLETE_TYPE_RE_.match(tok.text):
                self.reporter.error(tok, "use of {!r}")

    def _finish(self):
        if not self.typedef_tokens: return
        if self.typedef_tokens[-1].kind == "IDENT":
            m = OBSOLETE_TYPE_RE_.match(self.typedef_tokens[-1].text)
            if m:
                if self._permissible_public_definition(m):
                    self.typedef_tokens.clear()
        self._reset()

    def _permissible_public_definition(self, m):
        if m.group(1) == "__": return False
        name = m.group(2)
        toks = self.typedef_tokens
        ntok = len(toks)
        if ntok == 3 and toks[1].kind == "IDENT":
            defn = toks[1].text
            n = OBSOLETE_TYPE_RE_.match(defn)
            if n and n.group(1) == "__" and n.group(2) == name:
                return True

            if (name[:5] == "u_int" and name[-2:] == "_t"
                and defn[:6] == "__uint" and defn[-2:] == "_t"
                and name[5:-2] == defn[6:-2]):
                return True

            return False

        if (name == "ulong" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "long"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "ushort" and ntok == 5
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "short"
            and toks[3].kind == "IDENT" and toks[3].text == "int"):
            return True

        if (name == "uint" and ntok == 4
            and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
            and toks[2].kind == "IDENT" and toks[2].text == "int"):
            return True

        return False

def ObsoleteTypedefChecker(reporter, fname):
    """Factory: produce an instance of the appropriate
       obsolete-typedef checker for FNAME."""

    # The obsolete rpc/ and rpcsvc/ headers are allowed to use the
    # obsolete types, because it would be more trouble than it's
    # worth to remove them from headers that we intend to stop
    # installing eventually anyway.
    if (fname.startswith("rpc/")
        or fname.startswith("rpcsvc/")
        or "/rpc/" in fname
        or "/rpcsvc/" in fname):
        return NoCheck(reporter)

    # bits/types.h is allowed to define the __-versions of the
    # obsolete types.
    if (fname == "bits/types.h"
        or fname.endswith("/bits/types.h")):
        return ObsoletePrivateDefinitionsAllowed(reporter)

    # sys/types.h is allowed to use the __-versions of the
    # obsolete types, but only to define the unprefixed versions.
    if (fname == "sys/types.h"
        or fname.endswith("/sys/types.h")):
        return ObsoletePublicDefinitionsAllowed(reporter)

    return ObsoleteNotAllowed(reporter)

#
# Master control
#

class HeaderChecker:
    """Perform all of the checks on each header.  This is also the
       "reporter" object expected by tokenize_c and ConstructChecker.
    """
    def __init__(self):
        self.fname = None
        self.status = 0

    def error(self, tok, message):
        self.status = 1
        if '{!r}' in message:
            message = message.format(tok.text)
        sys.stderr.write("{}:{}:{}: error: {}\n".format(
            self.fname, tok.line, tok.column, message))

    def check(self, fname):
        self.fname = fname
        try:
            with open(fname, "rt", encoding="utf-8") as fp:
                contents = fp.read()
        except OSError as e:
            sys.stderr.write("{}: {}\n".format(fname, e.strerror))
            self.status = 1
            return

        typedef_checker = ObsoleteTypedefChecker(self, self.fname)

        for tok in tokenize_c(contents, self):
            typedef_checker.examine(tok)

def main():
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("headers", metavar="header", nargs="+",
                    help="one or more headers to scan for obsolete constructs")
    args = ap.parse_args()

    checker = HeaderChecker()
    for fname in args.headers:
        # Headers whose installed name begins with "finclude/" contain
        # Fortran, not C, and this program should completely ignore them.
        if not (fname.startswith("finclude/") or "/finclude/" in fname):
            checker.check(fname)
    sys.exit(checker.status)

main()
Commit	Line	Data
711a322a	1	#! /usr/bin/python3
581c785b	2	# Copyright (C) 2019-2022 Free Software Foundation, Inc.
711a322a ZW	3	# This file is part of the GNU C Library.
	4	#
	5	# The GNU C Library is free software; you can redistribute it and/or
	6	# modify it under the terms of the GNU Lesser General Public
	7	# License as published by the Free Software Foundation; either
	8	# version 2.1 of the License, or (at your option) any later version.
	9	#
	10	# The GNU C Library is distributed in the hope that it will be useful,
	11	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	13	# Lesser General Public License for more details.
	14	#
	15	# You should have received a copy of the GNU Lesser General Public
	16	# License along with the GNU C Library; if not, see
5a82c748	17	# <https://www.gnu.org/licenses/>.
711a322a ZW	18
	19	"""Verifies that installed headers do not use any obsolete constructs:
	20	* legacy BSD typedefs superseded by <stdint.h>:
	21	ushort uint ulong u_char u_short u_int u_long u_intNN_t quad_t u_quad_t
	22	(sys/types.h is allowed to _define_ these types, but not to use them
	23	to define anything else).
	24	"""
	25
	26	import argparse
	27	import collections
	28	import re
	29	import sys
	30
	31	# Simplified lexical analyzer for C preprocessing tokens.
	32	# Does not implement trigraphs.
	33	# Does not implement backslash-newline in the middle of any lexical
	34	# item other than a string literal.
	35	# Does not implement universal-character-names in identifiers.
	36	# Treats prefixed strings (e.g. L"...") as two tokens (L and "...")
	37	# Accepts non-ASCII characters only within comments and strings.
	38
	39	# Caution: The order of the outermost alternation matters.
	40	# STRING must be before BAD_STRING, CHARCONST before BAD_CHARCONST,
	41	# BLOCK_COMMENT before BAD_BLOCK_COM before PUNCTUATOR, and OTHER must
	42	# be last.
	43	# Caution: There should be no capturing groups other than the named
	44	# captures in the outermost alternation.
	45
	46	# For reference, these are all of the C punctuators as of C11:
	47	# [ ] ( ) { } , ; ? ~
	48	# ! != * *= / /= ^ ^= = ==
	49	# # ##
	50	# % %= %> %: %:%:
	51	# & &= &&
	52	# \| \|= \|\|
	53	# + += ++
	54	# - -= -- ->
	55	# . ...
	56	# : :>
	57	# < <% <: << <<= <=
	58	# > >= >> >>=
	59
	60	# The BAD_* tokens are not part of the official definition of pp-tokens;
	61	# they match unclosed strings, character constants, and block comments,
	62	# so that the regex engine doesn't have to backtrack all the way to the
	63	# beginning of a broken construct and then emit dozens of junk tokens.
	64
	65	PP_TOKEN_RE_ = re.compile(r"""
	66	(?P<STRING> \"(?:[^\"\\\r\n]\|\\(?:[\r\n -~]\|\r\n))*\")
	67	\|(?P<BAD_STRING> \"(?:[^\"\\\r\n]\|\\[ -~])*)
	68	\|(?P<CHARCONST> \'(?:[^\'\\\r\n]\|\\(?:[\r\n -~]\|\r\n))*\')
	69	\|(?P<BAD_CHARCONST> \'(?:[^\'\\\r\n]\|\\[ -~])*)
	70	\|(?P<BLOCK_COMMENT> /\(?:\(?!/)\|[^])\*/)
	71	\|(?P<BAD_BLOCK_COM> /\(?:\(?!/)\|[^])\*?)
	72	\|(?P<LINE_COMMENT> //[^\r\n]*)
	73	\|(?P<IDENT> [_a-zA-Z][_a-zA-Z0-9]*)
	74	\|(?P<PP_NUMBER> \.?[0-9](?:[0-9a-df-oq-zA-DF-OQ-Z_.]\|[eEpP][+-]?)*)
	75	\|(?P<PUNCTUATOR>
	76	[,;?~(){}\[\]]
	77	\| [!*/^=]=?
	78	\| \#\#?
	79	\| %(?:[=>]\|:(?:%:)?)?
	80	\| &[=&]?
	81	\|\\|[=\|]?
82	\|\+[=+]?
83	\| -[=->]?
84	\|\.(?:\.\.)?
85	\| :>?
86	\| <(?:[%:]\|<(?:=\|<=?)?)?
87	\| >(?:=\|>=?)?)
88	\|(?P<ESCNL> \\(?:\r\|\n\|\r\n))
89	\|(?P<WHITESPACE> [ \t\n\r\v\f]+)
90	\|(?P<OTHER> .)
91	""", re.DOTALL \| re.VERBOSE)
92
93	HEADER_NAME_RE_ = re.compile(r"""
94	< [^>\r\n]+ >
95	\| " [^"\r\n]+ "
96	""", re.DOTALL \| re.VERBOSE)
97
98	ENDLINE_RE_ = re.compile(r"""\r\|\n\|\r\n""")
99
100	# based on the sample code in the Python re documentation
101	Token_ = collections.namedtuple("Token", (
102	"kind", "text", "line", "column", "context"))
103	Token_.__doc__ = """
104	One C preprocessing token, comment, or chunk of whitespace.
105	'kind' identifies the token type, which will be one of:
106	STRING, CHARCONST, BLOCK_COMMENT, LINE_COMMENT, IDENT,
107	PP_NUMBER, PUNCTUATOR, ESCNL, WHITESPACE, HEADER_NAME,
108	or OTHER. The BAD_* alternatives in PP_TOKEN_RE_ are
109	handled within tokenize_c, below.
110
111	'text' is the sequence of source characters making up the token;
112	no decoding whatsoever is performed.
113
114	'line' and 'column' give the position of the first character of the
115	token within the source file. They are both 1-based.
116
117	'context' indicates whether or not this token occurred within a
118	preprocessing directive; it will be None for running text,
119	'<null>' for the leading '#' of a directive line (because '#'
120	all by itself on a line is a "null directive"), or the name of
121	the directive for tokens within a directive line, starting with
122	the IDENT for the name itself.
123	"""
124
125	def tokenize_c(file_contents, reporter):
126	"""Yield a series of Token objects, one for each preprocessing
127	token, comment, or chunk of whitespace within FILE_CONTENTS.
128	The REPORTER object is expected to have one method,
129	reporter.error(token, message), which will be called to
130	indicate a lexical error at the position of TOKEN.
131	If MESSAGE contains the four-character sequence '{!r}', that
132	is expected to be replaced by repr(token.text).
133	"""
134
135	Token = Token_
136	PP_TOKEN_RE = PP_TOKEN_RE_
137	ENDLINE_RE = ENDLINE_RE_
138	HEADER_NAME_RE = HEADER_NAME_RE_
139
140	line_num = 1
141	line_start = 0
142	pos = 0
143	limit = len(file_contents)
144	directive = None
145	at_bol = True
146	while pos < limit:
147	if directive == "include":
148	mo = HEADER_NAME_RE.match(file_contents, pos)
149	if mo:
150	kind = "HEADER_NAME"
151	directive = "after_include"
152	else:
153	mo = PP_TOKEN_RE.match(file_contents, pos)
154	kind = mo.lastgroup
155	if kind != "WHITESPACE":
156	directive = "after_include"
157	else:
158	mo = PP_TOKEN_RE.match(file_contents, pos)
159	kind = mo.lastgroup
160
161	text = mo.group()
162	line = line_num
163	column = mo.start() - line_start
164	adj_line_start = 0
165	# only these kinds can contain a newline
166	if kind in ("WHITESPACE", "BLOCK_COMMENT", "LINE_COMMENT",
167	"STRING", "CHARCONST", "BAD_BLOCK_COM", "ESCNL"):
168	for tmo in ENDLINE_RE.finditer(text):
169	line_num += 1
170	adj_line_start = tmo.end()
171	if adj_line_start:
172	line_start = mo.start() + adj_line_start
173
174	# Track whether or not we are scanning a preprocessing directive.
175	if kind == "LINE_COMMENT" or (kind == "WHITESPACE" and adj_line_start):
176	at_bol = True
177	directive = None
178	else:
179	if kind == "PUNCTUATOR" and text == "#" and at_bol:
180	directive = "<null>"
181	elif kind == "IDENT" and directive == "<null>":
182	directive = text
183	at_bol = False
184
185	# Report ill-formed tokens and rewrite them as their well-formed
186	# equivalents, so downstream processing doesn't have to know about them.
187	# (Rewriting instead of discarding provides better error recovery.)
188	if kind == "BAD_BLOCK_COM":
189	reporter.error(Token("BAD_BLOCK_COM", "", line, column+1, ""),
190	"unclosed block comment")
191	text += "*/"
192	kind = "BLOCK_COMMENT"
193	elif kind == "BAD_STRING":
194	reporter.error(Token("BAD_STRING", "", line, column+1, ""),
195	"unclosed string")
196	text += "\""
197	kind = "STRING"
198	elif kind == "BAD_CHARCONST":
199	reporter.error(Token("BAD_CHARCONST", "", line, column+1, ""),
200	"unclosed char constant")
201	text += "'"
202	kind = "CHARCONST"
203
204	tok = Token(kind, text, line, column+1,
205	"include" if directive == "after_include" else directive)
206	# Do not complain about OTHER tokens inside macro definitions.
207	# $ and @ appear in macros defined by headers intended to be
208	# included from assembly language, e.g. sysdeps/mips/sys/asm.h.
209	if kind == "OTHER" and directive != "define":
210	self.error(tok, "stray {!r} in program")
211
212	yield tok
213	pos = mo.end()
214
215	#
216	# Base and generic classes for individual checks.
217	#
218
219	class ConstructChecker:
220	"""Scan a stream of C preprocessing tokens and possibly report
221	problems with them. The REPORTER object passed to __init__ has
222	one method, reporter.error(token, message), which should be
223	called to indicate a problem detected at the position of TOKEN.
224	If MESSAGE contains the four-character sequence '{!r}' then that
225	will be replaced with a textual representation of TOKEN.
226	"""
227	def __init__(self, reporter):
228	self.reporter = reporter
229
230	def examine(self, tok):
231	"""Called once for each token in a header file.
232	Call self.reporter.error if a problem is detected.
233	"""
234	raise NotImplementedError
235
236	def eof(self):
237	"""Called once at the end of the stream. Subclasses need only
238	override this if it might have something to do."""
239	pass
240
241	class NoCheck(ConstructChecker):
242	"""Generic checker class which doesn't do anything. Substitute this
243	class for a real checker when a particular check should be skipped
244	for some file."""
245
246	def examine(self, tok):
247	pass
248
249	#
250	# Check for obsolete type names.
251	#
252
253	# The obsolete type names we're looking for:
254	OBSOLETE_TYPE_RE_ = re.compile(r"""\A
255	(__)?
256	( quad_t
257	\| u(?: short \| int \| long
258	\| _(?: char \| short \| int(?:[0-9]+_t)? \| long \| quad_t )))
259	\Z""", re.VERBOSE)
260
261	class ObsoleteNotAllowed(ConstructChecker):
262	"""Don't allow any use of the obsolete typedefs."""
263	def examine(self, tok):
264	if OBSOLETE_TYPE_RE_.match(tok.text):
265	self.reporter.error(tok, "use of {!r}")
266
267	class ObsoletePrivateDefinitionsAllowed(ConstructChecker):
268	"""Allow definitions of the private versions of the
269	obsolete typedefs; that is, 'typedef [anything] __obsolete;'
270	"""
271	def __init__(self, reporter):
272	super().__init__(reporter)
273	self.in_typedef = False
274	self.prev_token = None
275
276	def examine(self, tok):
277	# bits/types.h hides 'typedef' in a macro sometimes.
278	if (tok.kind == "IDENT"
279	and tok.text in ("typedef", "__STD_TYPE")
280	and tok.context is None):
281	self.in_typedef = True
282	elif tok.kind == "PUNCTUATOR" and tok.text == ";" and self.in_typedef:
283	self.in_typedef = False
284	if self.prev_token.kind == "IDENT":
285	m = OBSOLETE_TYPE_RE_.match(self.prev_token.text)
286	if m and m.group(1) != "__":
287	self.reporter.error(self.prev_token, "use of {!r}")
288	self.prev_token = None
289	else:
290	self._check_prev()
291
292	self.prev_token = tok
293
294	def eof(self):
295	self._check_prev()
296
297	def _check_prev(self):
298	if (self.prev_token is not None
299	and self.prev_token.kind == "IDENT"
300	and OBSOLETE_TYPE_RE_.match(self.prev_token.text)):
301	self.reporter.error(self.prev_token, "use of {!r}")
302
303	class ObsoletePublicDefinitionsAllowed(ConstructChecker):
304	"""Allow definitions of the public versions of the obsolete
305	typedefs. Only specific forms of definition are allowed:
306
307	typedef __obsolete obsolete; // identifiers must agree
308	typedef __uintN_t u_intN_t; // N must agree
309	typedef unsigned long int ulong;
310	typedef unsigned short int ushort;
311	typedef unsigned int uint;
312	"""
313	def __init__(self, reporter):
314	super().__init__(reporter)
315	self.typedef_tokens = []
316
317	def examine(self, tok):
318	if tok.kind in ("WHITESPACE", "BLOCK_COMMENT",
319	"LINE_COMMENT", "NL", "ESCNL"):
320	pass
321
322	elif (tok.kind == "IDENT" and tok.text == "typedef"
323	and tok.context is None):
324	if self.typedef_tokens:
325	self.reporter.error(tok, "typedef inside typedef")
326	self._reset()
327	self.typedef_tokens.append(tok)
328
329	elif tok.kind == "PUNCTUATOR" and tok.text == ";":
330	self._finish()
331
332	elif self.typedef_tokens:
333	self.typedef_tokens.append(tok)
334
335	def eof(self):
336	self._reset()
337
338	def _reset(self):
339	while self.typedef_tokens:
340	tok = self.typedef_tokens.pop(0)
341	if tok.kind == "IDENT" and OBSOLETE_TYPE_RE_.match(tok.text):
342	self.reporter.error(tok, "use of {!r}")
343
344	def _finish(self):
345	if not self.typedef_tokens: return
346	if self.typedef_tokens[-1].kind == "IDENT":
347	m = OBSOLETE_TYPE_RE_.match(self.typedef_tokens[-1].text)
348	if m:
349	if self._permissible_public_definition(m):
350	self.typedef_tokens.clear()
351	self._reset()
352
353	def _permissible_public_definition(self, m):
354	if m.group(1) == "__": return False
355	name = m.group(2)
356	toks = self.typedef_tokens
357	ntok = len(toks)
358	if ntok == 3 and toks[1].kind == "IDENT":
359	defn = toks[1].text
360	n = OBSOLETE_TYPE_RE_.match(defn)
361	if n and n.group(1) == "__" and n.group(2) == name:
362	return True
363
364	if (name[:5] == "u_int" and name[-2:] == "_t"
365	and defn[:6] == "__uint" and defn[-2:] == "_t"
366	and name[5:-2] == defn[6:-2]):
367	return True
368
369	return False
370
371	if (name == "ulong" and ntok == 5
372	and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
373	and toks[2].kind == "IDENT" and toks[2].text == "long"
374	and toks[3].kind == "IDENT" and toks[3].text == "int"):
375	return True
376
377	if (name == "ushort" and ntok == 5
378	and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
379	and toks[2].kind == "IDENT" and toks[2].text == "short"
380	and toks[3].kind == "IDENT" and toks[3].text == "int"):
381	return True
382
383	if (name == "uint" and ntok == 4
384	and toks[1].kind == "IDENT" and toks[1].text == "unsigned"
385	and toks[2].kind == "IDENT" and toks[2].text == "int"):
386	return True
387
388	return False
389
390	def ObsoleteTypedefChecker(reporter, fname):
391	"""Factory: produce an instance of the appropriate
392	obsolete-typedef checker for FNAME."""
393
394	# The obsolete rpc/ and rpcsvc/ headers are allowed to use the
395	# obsolete types, because it would be more trouble than it's
396	# worth to remove them from headers that we intend to stop
397	# installing eventually anyway.
398	if (fname.startswith("rpc/")
399	or fname.startswith("rpcsvc/")
400	or "/rpc/" in fname
401	or "/rpcsvc/" in fname):
402	return NoCheck(reporter)
403
404	# bits/types.h is allowed to define the __-versions of the
405	# obsolete types.
406	if (fname == "bits/types.h"
407	or fname.endswith("/bits/types.h")):
408	return ObsoletePrivateDefinitionsAllowed(reporter)
409
410	# sys/types.h is allowed to use the __-versions of the
411	# obsolete types, but only to define the unprefixed versions.
412	if (fname == "sys/types.h"
413	or fname.endswith("/sys/types.h")):
414	return ObsoletePublicDefinitionsAllowed(reporter)
415
416	return ObsoleteNotAllowed(reporter)
417
418	#
419	# Master control
420	#
421
422	class HeaderChecker:
423	"""Perform all of the checks on each header. This is also the
424	"reporter" object expected by tokenize_c and ConstructChecker.
425	"""
426	def __init__(self):
427	self.fname = None
428	self.status = 0
429
430	def error(self, tok, message):
431	self.status = 1
432	if '{!r}' in message:
433	message = message.format(tok.text)
434	sys.stderr.write("{}:{}:{}: error: {}\n".format(
435	self.fname, tok.line, tok.column, message))
436
437	def check(self, fname):
438	self.fname = fname
439	try:
97f8225d	440	with open(fname, "rt", encoding="utf-8") as fp:
711a322a ZW	441	contents = fp.read()
	442	except OSError as e:
	443	sys.stderr.write("{}: {}\n".format(fname, e.strerror))
	444	self.status = 1
	445	return
	446
	447	typedef_checker = ObsoleteTypedefChecker(self, self.fname)
	448
	449	for tok in tokenize_c(contents, self):
	450	typedef_checker.examine(tok)
	451
	452	def main():
	453	ap = argparse.ArgumentParser(description=__doc__)
	454	ap.add_argument("headers", metavar="header", nargs="+",
	455	help="one or more headers to scan for obsolete constructs")
	456	args = ap.parse_args()
	457
	458	checker = HeaderChecker()
	459	for fname in args.headers:
	460	# Headers whose installed name begins with "finclude/" contain
	461	# Fortran, not C, and this program should completely ignore them.
	462	if not (fname.startswith("finclude/") or "/finclude/" in fname):
	463	checker.check(fname)
	464	sys.exit(checker.status)
	465
	466	main()