From 21c4b7359d91b3d78acb04afbe339cbea92bffae Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Wed, 24 Jun 2026 13:09:50 +0200 Subject: [PATCH] gh-152056: Compile single-category character sets to a bare CATEGORY opcode (GH-152057) A character set containing exactly one category, e.g. [\d] or [^\s], now compiles to a single CATEGORY opcode (like \d or \S) instead of an IN block. The negated form maps to the complementary category. This speeds up matching and reduces the size of the compiled byte code. Co-authored-by: Claude Opus 4.8 (1M context) --- Doc/whatsnew/3.16.rst | 10 ++++++---- Lib/re/_parser.py | 6 ++++++ .../2026-06-24-10-30-00.gh-issue-152056.Qk7mZ2.rst | 5 +++++ 3 files changed, 17 insertions(+), 4 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2026-06-24-10-30-00.gh-issue-152056.Qk7mZ2.rst diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst index e3f04739e3b4..f9e54cde10af 100644 --- a/Doc/whatsnew/3.16.rst +++ b/Doc/whatsnew/3.16.rst @@ -288,10 +288,12 @@ re -- * Character class escapes (``\d``, ``\D``, ``\s``, ``\S``, ``\w`` and ``\W``) - outside a character set are now compiled to a single ``CATEGORY`` opcode - instead of being wrapped in an ``IN`` block. This speeds up matching of - patterns such as ``\d+`` and reduces the size of the compiled byte code. - (Contributed by Serhiy Storchaka in :gh:`152033`.) + outside a character set, and character sets containing a single such escape + (such as ``[\d]`` or ``[^\s]``), are now compiled to a single ``CATEGORY`` + opcode instead of being wrapped in an ``IN`` block. This speeds up matching + of patterns such as ``\d+`` and reduces the size of the compiled byte code. + (Contributed by Serhiy Storchaka in :gh:`152033` and Pieter Eendebak in + :gh:`152056`.) module_name ----------- diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py index 3c41c4340953..b8c19cd3070c 100644 --- a/Lib/re/_parser.py +++ b/Lib/re/_parser.py @@ -625,6 +625,12 @@ def _parse(source, state, verbose, nested, first=False): subpatternappend((NOT_LITERAL, set[0][1])) else: subpatternappend(set[0]) + elif _len(set) == 1 and set[0][0] is CATEGORY: + # optimization: a lone category like [\d] or [^\d] + if negate: + subpatternappend((CATEGORY, CH_NEGATE[set[0][1]])) + else: + subpatternappend(set[0]) else: if negate: set.insert(0, (NEGATE, None)) diff --git a/Misc/NEWS.d/next/Library/2026-06-24-10-30-00.gh-issue-152056.Qk7mZ2.rst b/Misc/NEWS.d/next/Library/2026-06-24-10-30-00.gh-issue-152056.Qk7mZ2.rst new file mode 100644 index 000000000000..6e71d720cd19 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-06-24-10-30-00.gh-issue-152056.Qk7mZ2.rst @@ -0,0 +1,5 @@ +Optimize matching of a character set that contains a single character +category, such as ``[\d]`` or ``[^\s]``: it is now compiled to a single +``CATEGORY`` opcode, the same as the corresponding ``\d`` or ``\S`` escape, +instead of being wrapped in an ``IN`` block. This speeds up matching and +reduces the size of the compiled byte code. Patch by Pieter Eendebak. -- 2.47.3