.. Auto-generated by Tools/build/generate_token.py
-.. data:: ENDMARKER
-.. data:: NAME
-
-.. data:: NUMBER
-
-.. data:: STRING
-
-.. data:: NEWLINE
-
-.. data:: INDENT
-
-.. data:: DEDENT
-
-.. data:: LPAR
-
- Token value for ``"("``.
-
-.. data:: RPAR
-
- Token value for ``")"``.
-
-.. data:: LSQB
-
- Token value for ``"["``.
-
-.. data:: RSQB
-
- Token value for ``"]"``.
-
-.. data:: COLON
-
- Token value for ``":"``.
-
-.. data:: COMMA
-
- Token value for ``","``.
-
-.. data:: SEMI
-
- Token value for ``";"``.
-
-.. data:: PLUS
-
- Token value for ``"+"``.
-
-.. data:: MINUS
-
- Token value for ``"-"``.
-
-.. data:: STAR
-
- Token value for ``"*"``.
-
-.. data:: SLASH
-
- Token value for ``"/"``.
-
-.. data:: VBAR
-
- Token value for ``"|"``.
-
-.. data:: AMPER
-
- Token value for ``"&"``.
-
-.. data:: LESS
-
- Token value for ``"<"``.
-
-.. data:: GREATER
-
- Token value for ``">"``.
-
-.. data:: EQUAL
-
- Token value for ``"="``.
-
-.. data:: DOT
-
- Token value for ``"."``.
-
-.. data:: PERCENT
-
- Token value for ``"%"``.
-
-.. data:: LBRACE
-
- Token value for ``"{"``.
-
-.. data:: RBRACE
-
- Token value for ``"}"``.
-
-.. data:: EQEQUAL
-
- Token value for ``"=="``.
-
-.. data:: NOTEQUAL
-
- Token value for ``"!="``.
-
-.. data:: LESSEQUAL
-
- Token value for ``"<="``.
-
-.. data:: GREATEREQUAL
-
- Token value for ``">="``.
-
-.. data:: TILDE
-
- Token value for ``"~"``.
-
-.. data:: CIRCUMFLEX
-
- Token value for ``"^"``.
-
-.. data:: LEFTSHIFT
-
- Token value for ``"<<"``.
-
-.. data:: RIGHTSHIFT
-
- Token value for ``">>"``.
-
-.. data:: DOUBLESTAR
-
- Token value for ``"**"``.
-
-.. data:: PLUSEQUAL
-
- Token value for ``"+="``.
-
-.. data:: MINEQUAL
-
- Token value for ``"-="``.
-
-.. data:: STAREQUAL
-
- Token value for ``"*="``.
-
-.. data:: SLASHEQUAL
-
- Token value for ``"/="``.
-
-.. data:: PERCENTEQUAL
-
- Token value for ``"%="``.
-
-.. data:: AMPEREQUAL
-
- Token value for ``"&="``.
-
-.. data:: VBAREQUAL
-
- Token value for ``"|="``.
-
-.. data:: CIRCUMFLEXEQUAL
-
- Token value for ``"^="``.
-
-.. data:: LEFTSHIFTEQUAL
-
- Token value for ``"<<="``.
-
-.. data:: RIGHTSHIFTEQUAL
-
- Token value for ``">>="``.
-
-.. data:: DOUBLESTAREQUAL
-
- Token value for ``"**="``.
-
-.. data:: DOUBLESLASH
-
- Token value for ``"//"``.
-
-.. data:: DOUBLESLASHEQUAL
-
- Token value for ``"//="``.
-
-.. data:: AT
-
- Token value for ``"@"``.
-
-.. data:: ATEQUAL
-
- Token value for ``"@="``.
-
-.. data:: RARROW
-
- Token value for ``"->"``.
-
-.. data:: ELLIPSIS
-
- Token value for ``"..."``.
-
-.. data:: COLONEQUAL
-
- Token value for ``":="``.
-
-.. data:: EXCLAMATION
-
- Token value for ``"!"``.
-
-.. data:: OP
-
-.. data:: TYPE_IGNORE
-
-.. data:: TYPE_COMMENT
-
-.. data:: SOFT_KEYWORD
-
-.. data:: FSTRING_START
-
-.. data:: FSTRING_MIDDLE
-
-.. data:: FSTRING_END
-
-.. data:: COMMENT
-
-.. data:: NL
-
-.. data:: ERRORTOKEN
-
-.. data:: N_TOKENS
-
-.. data:: NT_OFFSET
+.. list-table::
+ :align: left
+ :header-rows: 1
+
+ * - Token
+ - Value
+ * - .. data:: LPAR
+ - ``"("``
+ * - .. data:: RPAR
+ - ``")"``
+ * - .. data:: LSQB
+ - ``"["``
+ * - .. data:: RSQB
+ - ``"]"``
+ * - .. data:: COLON
+ - ``":"``
+ * - .. data:: COMMA
+ - ``","``
+ * - .. data:: SEMI
+ - ``";"``
+ * - .. data:: PLUS
+ - ``"+"``
+ * - .. data:: MINUS
+ - ``"-"``
+ * - .. data:: STAR
+ - ``"*"``
+ * - .. data:: SLASH
+ - ``"/"``
+ * - .. data:: VBAR
+ - ``"|"``
+ * - .. data:: AMPER
+ - ``"&"``
+ * - .. data:: LESS
+ - ``"<"``
+ * - .. data:: GREATER
+ - ``">"``
+ * - .. data:: EQUAL
+ - ``"="``
+ * - .. data:: DOT
+ - ``"."``
+ * - .. data:: PERCENT
+ - ``"%"``
+ * - .. data:: LBRACE
+ - ``"{"``
+ * - .. data:: RBRACE
+ - ``"}"``
+ * - .. data:: EQEQUAL
+ - ``"=="``
+ * - .. data:: NOTEQUAL
+ - ``"!="``
+ * - .. data:: LESSEQUAL
+ - ``"<="``
+ * - .. data:: GREATEREQUAL
+ - ``">="``
+ * - .. data:: TILDE
+ - ``"~"``
+ * - .. data:: CIRCUMFLEX
+ - ``"^"``
+ * - .. data:: LEFTSHIFT
+ - ``"<<"``
+ * - .. data:: RIGHTSHIFT
+ - ``">>"``
+ * - .. data:: DOUBLESTAR
+ - ``"**"``
+ * - .. data:: PLUSEQUAL
+ - ``"+="``
+ * - .. data:: MINEQUAL
+ - ``"-="``
+ * - .. data:: STAREQUAL
+ - ``"*="``
+ * - .. data:: SLASHEQUAL
+ - ``"/="``
+ * - .. data:: PERCENTEQUAL
+ - ``"%="``
+ * - .. data:: AMPEREQUAL
+ - ``"&="``
+ * - .. data:: VBAREQUAL
+ - ``"|="``
+ * - .. data:: CIRCUMFLEXEQUAL
+ - ``"^="``
+ * - .. data:: LEFTSHIFTEQUAL
+ - ``"<<="``
+ * - .. data:: RIGHTSHIFTEQUAL
+ - ``">>="``
+ * - .. data:: DOUBLESTAREQUAL
+ - ``"**="``
+ * - .. data:: DOUBLESLASH
+ - ``"//"``
+ * - .. data:: DOUBLESLASHEQUAL
+ - ``"//="``
+ * - .. data:: AT
+ - ``"@"``
+ * - .. data:: ATEQUAL
+ - ``"@="``
+ * - .. data:: RARROW
+ - ``"->"``
+ * - .. data:: ELLIPSIS
+ - ``"..."``
+ * - .. data:: COLONEQUAL
+ - ``":="``
+ * - .. data:: EXCLAMATION
+ - ``"!"``
The module also provides a mapping from numeric codes to names and some
functions. The functions mirror definitions in the Python C header files.
+Note that a token's value may depend on tokenizer options. For example, a
+``"+"`` token may be reported as either :data:`PLUS` or :data:`OP`, or
+a ``"match"`` token may be either :data:`NAME` or :data:`SOFT_KEYWORD`.
+
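+For example, the :mod:`tokenize` module reports a ``"+"`` as a generic
+:data:`OP` token, while its ``exact_type`` attribute identifies the exact
+operator (a minimal illustrative sketch)::
+
+ >>> import io, tokenize
+ >>> plus = next(tok for tok in tokenize.tokenize(io.BytesIO(b"1 + 2").readline)
+ ... if tok.string == "+")
+ >>> tokenize.tok_name[plus.type], tokenize.tok_name[plus.exact_type]
+ ('OP', 'PLUS')
+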
.. data:: tok_name
The token constants are:
-.. include:: token-list.inc
+.. data:: NAME
+
+ Token value that indicates an :ref:`identifier <identifiers>`.
+ Note that keywords are also initially tokenized as ``NAME`` tokens.
+
+.. data:: NUMBER
+
+ Token value that indicates a :ref:`numeric literal <numbers>`.
+
+.. data:: STRING
+
+ Token value that indicates a :ref:`string or bytes literal <strings>`,
+ excluding :ref:`formatted string literals <f-strings>`.
+ The token string is not interpreted:
+ it includes the surrounding quotation marks and the prefix (if given);
+ backslashes are included literally, without processing escape sequences.
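+
+ For example, the token string keeps the prefix and the quotes
+ (a minimal illustrative sketch)::
+
+ >>> import io, tokenize
+ >>> [tok.string for tok in tokenize.tokenize(io.BytesIO(b"b'hi'").readline)
+ ... if tok.type == tokenize.STRING]
+ ["b'hi'"]
+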
-The following token type values aren't used by the C tokenizer but are needed for
-the :mod:`tokenize` module.
+.. data:: OP
+
+ A generic token value that indicates an
+ :ref:`operator <operators>` or :ref:`delimiter <delimiters>`.
+
+ .. impl-detail::
+
+ This value is only reported by the :mod:`tokenize` module.
+ Internally, the tokenizer uses
+ :ref:`exact token types <token_operators_delimiters>` instead.
.. data:: COMMENT
- :noindex:
Token value used to indicate a comment.
+ The parser ignores :data:`!COMMENT` tokens.
+
+.. data:: NEWLINE
+
+ Token value that indicates the end of a :ref:`logical line <logical-lines>`.
.. data:: NL
- :noindex:
- Token value used to indicate a non-terminating newline. The
- :data:`NEWLINE` token indicates the end of a logical line of Python code;
- ``NL`` tokens are generated when a logical line of code is continued over
- multiple physical lines.
+ Token value used to indicate a non-terminating newline.
+ :data:`!NL` tokens are generated when a logical line of code is continued
+ over multiple physical lines. The parser ignores :data:`!NL` tokens.
+
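+ For example, continuing a logical line inside parentheses produces an
+ :data:`!NL` token for the embedded line break and a single :data:`NEWLINE`
+ token at the end (a brief illustrative sketch)::
+
+ >>> import io, tokenize
+ >>> src = b"x = (1,\n 2)\n"
+ >>> [tokenize.tok_name[tok.type]
+ ... for tok in tokenize.tokenize(io.BytesIO(src).readline)
+ ... if tok.type in (tokenize.NL, tokenize.NEWLINE)]
+ ['NL', 'NEWLINE']
+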
+.. data:: INDENT
+
+ Token value used at the beginning of a :ref:`logical line <logical-lines>`
+ to indicate the start of an :ref:`indented block <indentation>`.
+
+.. data:: DEDENT
+
+ Token value used at the beginning of a :ref:`logical line <logical-lines>`
+ to indicate the end of an :ref:`indented block <indentation>`.
+
+.. data:: FSTRING_START
+
+ Token value used to indicate the beginning of an
+ :ref:`f-string literal <f-strings>`.
+
+ .. impl-detail::
+
+ The token string includes the prefix and the opening quote(s), but none
+ of the contents of the literal.
+
+.. data:: FSTRING_MIDDLE
+
+ Token value used for literal text inside an :ref:`f-string literal <f-strings>`,
+ including format specifications.
+
+ .. impl-detail::
+
+ Replacement fields (that is, the non-literal parts of f-strings) use
+ the same tokens as other expressions, and are delimited by
+ :data:`LBRACE`, :data:`RBRACE`, :data:`EXCLAMATION` and :data:`COLON`
+ tokens.
+
+.. data:: FSTRING_END
+
+ Token value used to indicate the end of an :ref:`f-string <f-strings>`.
+
+ .. impl-detail::
+
+ The token string contains the closing quote(s).
+
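+For example, under Python 3.12 and later the f-string ``f"x={1}"`` is
+tokenized as :data:`FSTRING_START`, :data:`FSTRING_MIDDLE`, the tokens of the
+replacement field, and :data:`FSTRING_END` (an illustrative sketch)::
+
+ >>> import io, tokenize
+ >>> [(tokenize.tok_name[tok.type], tok.string)
+ ... for tok in tokenize.tokenize(io.BytesIO(b'f"x={1}"').readline)
+ ... if tokenize.tok_name[tok.type].startswith('FSTRING')]
+ [('FSTRING_START', 'f"'), ('FSTRING_MIDDLE', 'x='), ('FSTRING_END', '"')]
+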
+.. data:: ENDMARKER
+
+ Token value that indicates the end of input.
+ Used in :ref:`top-level grammar rules <top-level>`.
.. data:: ENCODING
into text. The first token returned by :func:`tokenize.tokenize` will
always be an ``ENCODING`` token.
+ .. impl-detail::
+
+ This token type isn't used by the C tokenizer but is needed for
+ the :mod:`tokenize` module.
+
+
+The following token types are not produced by the :mod:`tokenize` module,
+and are defined for special uses in the tokenizer or parser:
+
+.. data:: TYPE_IGNORE
+
+ Token value indicating that a ``type: ignore`` comment was recognized.
+ Such tokens are produced instead of regular :data:`COMMENT` tokens only
+ with the :data:`~ast.PyCF_TYPE_COMMENTS` flag.
.. data:: TYPE_COMMENT
- :noindex:
- Token value indicating that a type comment was recognized. Such
- tokens are only produced when :func:`ast.parse` is invoked with
- ``type_comments=True``.
+ Token value indicating that a type comment was recognized.
+ Such tokens are produced instead of regular :data:`COMMENT` tokens only
+ with the :data:`~ast.PyCF_TYPE_COMMENTS` flag.
+
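+ For example, :func:`ast.parse` sets the flag when called with
+ ``type_comments=True`` (a small illustrative sketch)::
+
+ >>> import ast
+ >>> module = ast.parse("x = 1 # type: int", type_comments=True)
+ >>> module.body[0].type_comment
+ 'int'
+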
+.. data:: SOFT_KEYWORD
+
+ Token value indicating a :ref:`soft keyword <soft-keywords>`.
+
+ The tokenizer never produces this value.
+ To check for a soft keyword, pass a :data:`NAME` token's string to
+ :func:`keyword.issoftkeyword`.
+
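+ For example (a minimal sketch)::
+
+ >>> import keyword
+ >>> keyword.issoftkeyword("match")
+ True
+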
+.. data:: ERRORTOKEN
+
+ Token value used to indicate wrong input.
+
+ The :mod:`tokenize` module generally indicates errors by
+ raising exceptions instead of emitting this token.
+ It can also emit tokens such as :data:`OP` or :data:`NAME` with strings that
+ are later rejected by the parser.
+
+
+.. _token_operators_delimiters:
+
+The remaining tokens represent specific :ref:`operators <operators>` and
+:ref:`delimiters <delimiters>`.
+(The :mod:`tokenize` module reports these as :data:`OP`; see ``exact_type``
+in the :mod:`tokenize` documentation for details.)
+
+.. include:: token-list.inc
+
+
+The following non-token constants are provided:
+
+.. data:: N_TOKENS
+
+ The number of token types defined in this module.
+
+.. NT_OFFSET is deliberately undocumented; if you need it you should be
+ reading the source
.. data:: EXACT_TOKEN_TYPES
to support parsing older Python versions for :func:`ast.parse` with
``feature_version`` set to 6 or lower).
+.. versionchanged:: 3.12
+ Added :data:`EXCLAMATION`.
+
.. versionchanged:: 3.13
Removed :data:`!AWAIT` and :data:`!ASYNC` tokens again.
.. grammar-snippet::
:group: python-grammar
- file_input: (NEWLINE | `statement`)*
+ file_input: (NEWLINE | `statement`)* ENDMARKER
This syntax is used in the following situations:
.. grammar-snippet::
:group: python-grammar
- interactive_input: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE
+ interactive_input: [`stmt_list`] NEWLINE | `compound_stmt` NEWLINE | ENDMARKER
Note that a (top-level) compound statement must be followed by a blank line in
interactive mode; this is needed to help the parser detect the end of the input.
:func:`eval` is used for expression input. It ignores leading whitespace. The
string argument to :func:`eval` must have the following form:
-.. productionlist:: python-grammar
- eval_input: `expression_list` NEWLINE*
+.. grammar-snippet::
+ :group: python-grammar
+
+ eval_input: `expression_list` NEWLINE* ENDMARKER
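+
+For example, leading whitespace and trailing newlines are accepted
+(a brief illustration)::
+
+ >>> eval("  1 + 1\n\n")
+ 2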
#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
-# Doc/library/token-list.inc
-# Include/token.h
-# Parser/token.c
-# Lib/token.py
+# make_rst:
+# Doc/library/token-list.inc
+# Doc/library/token.rst (checked, not generated)
+# make_h:
+# Include/token.h
+# make_c:
+# Parser/token.c
+# make_py:
+# Lib/token.py
+
+import re
SCRIPT_NAME = 'Tools/build/generate_token.py'
token_inc_template = f"""\
.. {AUTO_GENERATED_BY_SCRIPT}
-%s
-.. data:: N_TOKENS
-.. data:: NT_OFFSET
+.. list-table::
+ :align: left
+ :header-rows: 1
+
+ * - Token
+ - Value
+%s
"""
-def make_rst(infile, outfile='Doc/library/token-list.inc'):
+def make_rst(infile, outfile='Doc/library/token-list.inc',
+ rstfile='Doc/library/token.rst'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
tok_to_string = {value: s for s, value in string_to_tok.items()}
+ needs_handwritten_doc = set()
+
names = []
- for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
- names.append('.. data:: %s' % (name,))
+ for value, name in enumerate(tok_names):
if value in tok_to_string:
- names.append('')
- names.append(' Token value for ``"%s"``.' % tok_to_string[value])
- names.append('')
+ assert name.isupper()
+ names.append(f' * - .. data:: {name}')
+ names.append(f' - ``"{tok_to_string[value]}"``')
+ else:
+ needs_handwritten_doc.add(name)
+
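+ # Cross-check: every token without a generated table entry must have a
+ # hand-written ``.. data::`` entry in token.rst, and vice versa.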
+ has_handwritten_doc = set()
+ with open(rstfile) as fileobj:
+ tokendef_re = re.compile(r'\.\. data:: ([0-9A-Z_]+)\s*')
+ for line in fileobj:
+ if match := tokendef_re.fullmatch(line):
+ has_handwritten_doc.add(match[1])
+
+ # Exclude non-token constants in token.py
+ has_handwritten_doc -= {'N_TOKENS', 'NT_OFFSET', 'EXACT_TOKEN_TYPES'}
+
+ if needs_handwritten_doc != has_handwritten_doc:
+ message_parts = [f'ERROR: {rstfile} does not document all tokens!']
+ undocumented = needs_handwritten_doc - has_handwritten_doc
+ extra = has_handwritten_doc - needs_handwritten_doc
+ if undocumented:
+ message_parts.append(f'Undocumented tokens: {undocumented}')
+ if extra:
+ message_parts.append(f'Documented nonexistent tokens: {extra}')
+ exit('\n'.join(message_parts))
if update_file(outfile, token_inc_template % '\n'.join(names)):
print("%s regenerated from %s" % (outfile, infile))