**undefined** when providing invalid Python code and it can change at any
point.
-Tokenizing Input
+Tokenizing input
----------------
The primary entry point is a :term:`generator`:
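
A minimal sketch of consuming that generator, assuming a small in-memory source
wrapped in :class:`io.BytesIO` so a ``readline`` callable can be passed::

    import io
    import tokenize

    source = b"x = 1 + 2\n"
    # tokenize() expects a readline callable that returns bytes;
    # the first token yielded is always an ENCODING token.
    for tok in tokenize.tokenize(io.BytesIO(source).readline):
        print(tok.type, tok.string)
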
.. _tokenize-cli:
-Command-Line Usage
+Command-line usage
------------------
.. versionadded:: 3.3
If :file:`filename.py` is specified, its contents are tokenized to stdout.
Otherwise, tokenization is performed on stdin.
+.. versionadded:: next
+ Output is in color by default and can be
+ :ref:`controlled using environment variables <using-on-controlling-color>`.
+
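For example, assuming a file named :file:`hello.py` exists, colorized output
can be suppressed with the standard environment variables:

.. code-block:: shell-session

    $ python -m tokenize hello.py
    $ PYTHON_COLORS=0 python -m tokenize hello.py
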
Examples
-------------------
+--------
Example of a script rewriter that transforms float literals into Decimal
objects::
will be tokenized to the following output where the first column is the range
of the line/column coordinates where the token is found, the second column is
-the name of the token, and the final column is the value of the token (if any)
+the name of the token, and the final column is the value of the token (if any):
.. code-block:: shell-session
reset: str = ANSIColors.RESET
+@dataclass(frozen=True, kw_only=True)
+class Tokenize(ThemeSection):
+ whitespace: str = ANSIColors.GREY
+ error: str = ANSIColors.BOLD_RED
+ position: str = ANSIColors.GREY
+ delimiter: str = ANSIColors.RESET
+
+
@dataclass(frozen=True, kw_only=True)
class Traceback(ThemeSection):
type: str = ANSIColors.BOLD_MAGENTA
live_profiler: LiveProfiler = field(default_factory=LiveProfiler)
syntax: Syntax = field(default_factory=Syntax)
timeit: Timeit = field(default_factory=Timeit)
+ tokenize: Tokenize = field(default_factory=Tokenize)
traceback: Traceback = field(default_factory=Traceback)
unittest: Unittest = field(default_factory=Unittest)
live_profiler: LiveProfiler | None = None,
syntax: Syntax | None = None,
timeit: Timeit | None = None,
+ tokenize: Tokenize | None = None,
traceback: Traceback | None = None,
unittest: Unittest | None = None,
) -> Self:
live_profiler=live_profiler or self.live_profiler,
syntax=syntax or self.syntax,
timeit=timeit or self.timeit,
+ tokenize=tokenize or self.tokenize,
traceback=traceback or self.traceback,
unittest=unittest or self.unittest,
)
live_profiler=LiveProfiler.no_colors(),
syntax=Syntax.no_colors(),
timeit=Timeit.no_colors(),
+ tokenize=Tokenize.no_colors(),
traceback=Traceback.no_colors(),
unittest=Unittest.no_colors(),
)
from token import *
from token import EXACT_TOKEN_TYPES
import _tokenize
+lazy import _colorize
cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
"""
return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
+
+def _get_token_colors(syntax, tokenize):
+ """Map token type numbers to theme colors."""
+    return {
+ COMMENT: syntax.comment,
+ DEDENT: tokenize.whitespace,
+ ENCODING: tokenize.whitespace,
+ ENDMARKER: tokenize.whitespace,
+ ERRORTOKEN: tokenize.error,
+ FSTRING_START: syntax.string,
+ FSTRING_MIDDLE: syntax.string,
+ FSTRING_END: syntax.string,
+ INDENT: tokenize.whitespace,
+ NAME: syntax.reset,
+ NEWLINE: tokenize.whitespace,
+ NL: tokenize.whitespace,
+ NUMBER: syntax.number,
+ OP: syntax.op,
+ SOFT_KEYWORD: syntax.soft_keyword,
+ STRING: syntax.string,
+ TSTRING_START: syntax.string,
+ TSTRING_MIDDLE: syntax.string,
+ TSTRING_END: syntax.string,
+    }
+
+
+def _format_tokens(tokens, *, color=False, exact=False):
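+    """Yield one formatted output line per token, optionally colorized."""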
+ theme = _colorize.get_theme(force_no_color=not color)
+ s = theme.syntax
+ t = theme.tokenize
+ token_colors = _get_token_colors(s, t)
+ for token in tokens:
+ token_range = (
+ f"{t.position}{token.start[0]}"
+ f"{t.delimiter},{t.position}{token.start[1]}"
+ f"{t.delimiter}-"
+ f"{t.position}{token.end[0]}"
+ f"{t.delimiter},{t.position}{token.end[1]}"
+ f"{t.delimiter}:"
+ )
+ token_color = token_colors.get(token.type, s.reset)
+ token_name = tok_name[token.exact_type if exact else token.type]
+ visible_range = f"{token.start[0]},{token.start[1]}-{token.end[0]},{token.end[1]}:"
+ yield (
+ f"{token_range}{' ' * (20 - len(visible_range))}"
+ f"{token_color}{token_name:<15}"
+ f"{s.reset}{token.string!r:<15}"
+ )
+
+
def _main(args=None):
import argparse
sys.exit(1)
# Parse the arguments and options
- parser = argparse.ArgumentParser(color=True)
+ parser = argparse.ArgumentParser()
parser.add_argument(dest='filename', nargs='?',
metavar='filename.py',
help='the file to tokenize; defaults to stdin')
# Output the tokenization
- for token in tokens:
- token_type = token.type
- if args.exact:
- token_type = token.exact_type
- token_range = "%d,%d-%d,%d:" % (token.start + token.end)
- print("%-20s%-15s%-15r" %
- (token_range, tok_name[token_type], token.string))
+ for line in _format_tokens(tokens, color=True, exact=args.exact):
+ print(line)
except IndentationError as err:
line, column = err.args[1][1:3]
error(err.args[0], filename, (line, column))