git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-152033: Optimize category escapes outside character sets (GH-152035)
author Serhiy Storchaka <storchaka@gmail.com>
Wed, 24 Jun 2026 05:49:14 +0000 (08:49 +0300)
committer GitHub <noreply@github.com>
Wed, 24 Jun 2026 05:49:14 +0000 (08:49 +0300)
Character class escapes (``\d``, ``\D``, ``\s``, ``\S``, ``\w`` and
``\W``) that occur outside a character set are now compiled directly to a
single CATEGORY opcode instead of being wrapped in an IN block.  This
removes the IN wrapper (three code words) and an indirect charset() call,
and makes such an escape a simple repeatable unit so that, for example,
``\d+`` uses the REPEAT_ONE fast path; a CATEGORY case is added to
SRE(count).

The transformation preserves behaviour exactly.  For category-heavy
patterns the compiled byte code is about 20% smaller and matching is up
to ~2x faster, with no effect on patterns that do not use bare category
escapes.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Doc/whatsnew/3.16.rst
Lib/re/_compiler.py
Lib/re/_parser.py
Misc/NEWS.d/next/Library/2026-06-23-22-15-00.gh-issue-152033.Ct1Egy.rst [new file with mode: 0644]
Modules/_sre/sre.c
Modules/_sre/sre_lib.h

diff --git a/Doc/whatsnew/3.16.rst b/Doc/whatsnew/3.16.rst
index 64a986f2487d5cbbe338214d6c298ef2da6a81ac..9242a1ae3ea6ba1f00bdf518f62dc12b9ddcafab 100644 (file)
@@ -265,6 +265,15 @@ zipfile
 Optimizations
 =============
 
+re
+--
+
+* Character class escapes (``\d``, ``\D``, ``\s``, ``\S``, ``\w`` and ``\W``)
+  outside a character set are now compiled to a single ``CATEGORY`` opcode
+  instead of being wrapped in an ``IN`` block.  This speeds up matching of
+  patterns such as ``\d+`` and reduces the size of the compiled byte code.
+  (Contributed by Serhiy Storchaka in :gh:`152033`.)
+
 module_name
 -----------
 
diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
index c2ca8e25abe34d3d0c4ef714e7e8e090a341dc35..304f875ea7fe7aa8e11b3babbaa912f5b55be310 100644 (file)
@@ -20,7 +20,7 @@ assert _sre.MAGIC == MAGIC, "SRE module mismatch"
 _LITERAL_CODES = {LITERAL, NOT_LITERAL}
 _SUCCESS_CODES = {SUCCESS, FAILURE}
 _ASSERT_CODES = {ASSERT, ASSERT_NOT}
-_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
+_UNIT_CODES = _LITERAL_CODES | {ANY, IN, CATEGORY}
 
 _REPEATING_CODES = {
     MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
@@ -495,6 +495,8 @@ def _get_charset_prefix(pattern, flags):
         if iscased and iscased(av):
             return None
         return [(op, av)]
+    elif op is CATEGORY:
+        return [(op, av)]
     elif op is BRANCH:
         charset = []
         charsetappend = charset.append
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py
index bd189fe0695f801a35de2e8093cf96a1e9ea273e..3c41c43409534b800301ffee1f38dce5ca79aad4 100644 (file)
@@ -27,6 +27,7 @@ WHITESPACE = frozenset(" \t\n\r\v\f")
 
 _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT, POSSESSIVE_REPEAT})
 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
+_SETITEMCODES = frozenset({LITERAL, CATEGORY})
 
 ESCAPES = {
     r"\a": (LITERAL, ord("\a")),
@@ -43,12 +44,12 @@ CATEGORIES = {
     r"\A": (AT, AT_BEGINNING_STRING), # start of string
     r"\b": (AT, AT_BOUNDARY),
     r"\B": (AT, AT_NON_BOUNDARY),
-    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
-    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
-    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
-    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
-    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
-    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
+    r"\d": (CATEGORY, CATEGORY_DIGIT),
+    r"\D": (CATEGORY, CATEGORY_NOT_DIGIT),
+    r"\s": (CATEGORY, CATEGORY_SPACE),
+    r"\S": (CATEGORY, CATEGORY_NOT_SPACE),
+    r"\w": (CATEGORY, CATEGORY_WORD),
+    r"\W": (CATEGORY, CATEGORY_NOT_WORD),
     r"\z": (AT, AT_END_STRING), # end of string
     r"\Z": (AT, AT_END_STRING), # end of string (obsolete)
 }
@@ -315,7 +316,7 @@ def _class_escape(source, escape):
     if code:
         return code
     code = CATEGORIES.get(escape)
-    if code and code[0] is IN:
+    if code and code[0] is CATEGORY:
         return code
     try:
         c = escape[1:2]
@@ -493,7 +494,7 @@ def _parse_sub(source, state, verbose, nested):
         if len(item) != 1:
             break
         op, av = item[0]
-        if op is LITERAL:
+        if op in _SETITEMCODES:
             set.append((op, av))
         elif op is IN and av[0][0] is not NEGATE:
             set.extend(av)
@@ -590,8 +591,6 @@ def _parse(source, state, verbose, nested, first=False):
                         raise source.error("unterminated character set",
                                            source.tell() - here)
                     if that == "]":
-                        if code1[0] is IN:
-                            code1 = code1[1][0]
                         setappend(code1)
                         setappend((LITERAL, _ord("-")))
                         break
@@ -616,8 +615,6 @@ def _parse(source, state, verbose, nested, first=False):
                         raise source.error(msg, len(this) + 1 + len(that))
                     setappend((RANGE, (lo, hi)))
                 else:
-                    if code1[0] is IN:
-                        code1 = code1[1][0]
                     setappend(code1)
 
             set = _uniq(set)
diff --git a/Misc/NEWS.d/next/Library/2026-06-23-22-15-00.gh-issue-152033.Ct1Egy.rst b/Misc/NEWS.d/next/Library/2026-06-23-22-15-00.gh-issue-152033.Ct1Egy.rst
new file mode 100644 (file)
index 0000000..7c54a6c
--- /dev/null
@@ -0,0 +1,5 @@
+Optimize matching of character class escapes (``\d``, ``\D``, ``\s``,
+``\S``, ``\w`` and ``\W``) that occur outside a character set: they are now
+compiled to a single ``CATEGORY`` opcode instead of being wrapped in an
+``IN`` block.  This speeds up patterns such as ``\d+`` and reduces the size
+of the compiled byte code.
diff --git a/Modules/_sre/sre.c b/Modules/_sre/sre.c
index 32aa06bed4a409cbbf30ed2b6446541fc6d84406..9964532a7f401c12e8f00b4555873abfa3f53023 100644 (file)
@@ -1842,6 +1842,34 @@ bad_template:
     } while (0)
 #define GET_SKIP GET_SKIP_ADJ(0)
 
+static int
+_validate_category(SRE_CODE arg)
+{
+    switch (arg) {
+    case SRE_CATEGORY_DIGIT:
+    case SRE_CATEGORY_NOT_DIGIT:
+    case SRE_CATEGORY_SPACE:
+    case SRE_CATEGORY_NOT_SPACE:
+    case SRE_CATEGORY_WORD:
+    case SRE_CATEGORY_NOT_WORD:
+    case SRE_CATEGORY_LINEBREAK:
+    case SRE_CATEGORY_NOT_LINEBREAK:
+    case SRE_CATEGORY_LOC_WORD:
+    case SRE_CATEGORY_LOC_NOT_WORD:
+    case SRE_CATEGORY_UNI_DIGIT:
+    case SRE_CATEGORY_UNI_NOT_DIGIT:
+    case SRE_CATEGORY_UNI_SPACE:
+    case SRE_CATEGORY_UNI_NOT_SPACE:
+    case SRE_CATEGORY_UNI_WORD:
+    case SRE_CATEGORY_UNI_NOT_WORD:
+    case SRE_CATEGORY_UNI_LINEBREAK:
+    case SRE_CATEGORY_UNI_NOT_LINEBREAK:
+        return 1;
+    default:
+        return 0;
+    }
+}
+
 static int
 _validate_charset(SRE_CODE *code, SRE_CODE *end)
 {
@@ -1894,27 +1922,7 @@ _validate_charset(SRE_CODE *code, SRE_CODE *end)
 
         case SRE_OP_CATEGORY:
             GET_ARG;
-            switch (arg) {
-            case SRE_CATEGORY_DIGIT:
-            case SRE_CATEGORY_NOT_DIGIT:
-            case SRE_CATEGORY_SPACE:
-            case SRE_CATEGORY_NOT_SPACE:
-            case SRE_CATEGORY_WORD:
-            case SRE_CATEGORY_NOT_WORD:
-            case SRE_CATEGORY_LINEBREAK:
-            case SRE_CATEGORY_NOT_LINEBREAK:
-            case SRE_CATEGORY_LOC_WORD:
-            case SRE_CATEGORY_LOC_NOT_WORD:
-            case SRE_CATEGORY_UNI_DIGIT:
-            case SRE_CATEGORY_UNI_NOT_DIGIT:
-            case SRE_CATEGORY_UNI_SPACE:
-            case SRE_CATEGORY_UNI_NOT_SPACE:
-            case SRE_CATEGORY_UNI_WORD:
-            case SRE_CATEGORY_UNI_NOT_WORD:
-            case SRE_CATEGORY_UNI_LINEBREAK:
-            case SRE_CATEGORY_UNI_NOT_LINEBREAK:
-                break;
-            default:
+            if (!_validate_category(arg)) {
                 FAIL;
             }
             break;
@@ -1995,6 +2003,13 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
             }
             break;
 
+        case SRE_OP_CATEGORY:
+            GET_ARG;
+            if (!_validate_category(arg)) {
+                FAIL;
+            }
+            break;
+
         case SRE_OP_ANY:
         case SRE_OP_ANY_ALL:
             /* These have no operands */
diff --git a/Modules/_sre/sre_lib.h b/Modules/_sre/sre_lib.h
index df377905bfae0d09023499da864b3502853ac267..6e6ae46f05a50f08397b35a75527b7b0db1aa9b7 100644 (file)
@@ -193,6 +193,7 @@ LOCAL(Py_ssize_t)
 SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount)
 {
     SRE_CODE chr;
+    SRE_CODE arg;
     SRE_CHAR c;
     const SRE_CHAR* ptr = (const SRE_CHAR *)state->ptr;
     const SRE_CHAR* end = (const SRE_CHAR *)state->end;
@@ -302,6 +303,13 @@ SRE(count)(SRE_STATE* state, const SRE_CODE* pattern, Py_ssize_t maxcount)
             ptr++;
         break;
 
+    case SRE_OP_CATEGORY:
+        arg = pattern[1];
+        TRACE(("|%p|%p|COUNT CATEGORY %d\n", pattern, ptr, arg));
+        while (ptr < end && sre_category(arg, *ptr))
+            ptr++;
+        break;
+
     default:
         /* repeated single character pattern */
         TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));