git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-152100: Move re compiler optimizations to Lib/re/_optimizer.py (GH-152154)
author: Serhiy Storchaka <storchaka@gmail.com>
Thu, 25 Jun 2026 08:09:56 +0000 (11:09 +0300)
committer: GitHub <noreply@github.com>
Thu, 25 Jun 2026 08:09:56 +0000 (08:09 +0000)
Move the compile-time optimizations (_optimize_charset, _compile_charset,
_simple, _compile_info and the literal/charset prefix helpers) out of
_compiler.py into a new Lib/re/_optimizer.py.  _compiler.py keeps only the
bytecode emitter and imports them.  This is groundwork for a follow-up
optimization; there is no behavior change.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
Lib/re/_compiler.py
Lib/re/_optimizer.py [new file with mode: 0644]
Lib/test/test_re.py

diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
index 304f875ea7fe7aa8e11b3babbaa912f5b55be310..fb0c8d35f6f89a3d862ab662031a52d412ff945d 100644 (file)
@@ -14,13 +14,16 @@ import _sre
 from . import _parser
 from ._constants import *
 from ._casefix import _EXTRA_CASES
+from ._optimizer import (
+    _combine_flags, _compile_charset, _optimize_charset, _compile_info,
+    _simple, _CHARSET_ALL, _CODEBITS, MAXCODE,
+)
 
 assert _sre.MAGIC == MAGIC, "SRE module mismatch"
 
 _LITERAL_CODES = {LITERAL, NOT_LITERAL}
 _SUCCESS_CODES = {SUCCESS, FAILURE}
 _ASSERT_CODES = {ASSERT, ASSERT_NOT}
-_UNIT_CODES = _LITERAL_CODES | {ANY, IN, CATEGORY}
 
 _REPEATING_CODES = {
     MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
@@ -28,14 +31,6 @@ _REPEATING_CODES = {
     POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
 }
 
-_CHARSET_ALL = [(NEGATE, None)]
-
-def _combine_flags(flags, add_flags, del_flags,
-                   TYPE_FLAGS=_parser.TYPE_FLAGS):
-    if add_flags & TYPE_FLAGS:
-        flags &= ~TYPE_FLAGS
-    return (flags | add_flags) & ~del_flags
-
 def _compile(code, pattern, flags):
     # internal: compile a (sub)pattern
     emit = code.append
@@ -218,374 +213,6 @@ def _compile(code, pattern, flags):
         else:
             raise PatternError(f"internal: unsupported operand type {op!r}")
 
-def _compile_charset(charset, flags, code):
-    # compile charset subprogram
-    emit = code.append
-    for op, av in charset:
-        emit(op)
-        if op is NEGATE:
-            pass
-        elif op is LITERAL:
-            emit(av)
-        elif op is RANGE or op is RANGE_UNI_IGNORE:
-            emit(av[0])
-            emit(av[1])
-        elif op is CHARSET:
-            code.extend(av)
-        elif op is BIGCHARSET:
-            code.extend(av)
-        elif op is CATEGORY:
-            if flags & SRE_FLAG_LOCALE:
-                emit(CH_LOCALE[av])
-            elif flags & SRE_FLAG_UNICODE:
-                emit(CH_UNICODE[av])
-            else:
-                emit(av)
-        else:
-            raise PatternError(f"internal: unsupported set operator {op!r}")
-    emit(FAILURE)
-
-def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
-    # internal: optimize character set
-    out = []
-    tail = []
-    charmap = bytearray(256)
-    hascased = False
-    for op, av in charset:
-        while True:
-            try:
-                if op is LITERAL:
-                    if fixup: # IGNORECASE and not LOCALE
-                        av = fixup(av)
-                        charmap[av] = 1
-                        if fixes and av in fixes:
-                            for k in fixes[av]:
-                                charmap[k] = 1
-                        if not hascased and iscased(av):
-                            hascased = True
-                    else:
-                        charmap[av] = 1
-                elif op is RANGE:
-                    r = range(av[0], av[1]+1)
-                    if fixup: # IGNORECASE and not LOCALE
-                        if fixes:
-                            for i in map(fixup, r):
-                                charmap[i] = 1
-                                if i in fixes:
-                                    for k in fixes[i]:
-                                        charmap[k] = 1
-                        else:
-                            for i in map(fixup, r):
-                                charmap[i] = 1
-                        if not hascased:
-                            hascased = any(map(iscased, r))
-                    else:
-                        for i in r:
-                            charmap[i] = 1
-                elif op is NEGATE:
-                    out.append((op, av))
-                elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
-                    # Optimize [\s\S] etc.
-                    out = [] if out else _CHARSET_ALL
-                    return out, False
-                else:
-                    tail.append((op, av))
-            except IndexError:
-                if len(charmap) == 256:
-                    # character set contains non-UCS1 character codes
-                    charmap += b'\0' * 0xff00
-                    continue
-                # Character set contains non-BMP character codes.
-                # For range, all BMP characters in the range are already
-                # proceeded.
-                if fixup: # IGNORECASE and not LOCALE
-                    # For now, IN_UNI_IGNORE+LITERAL and
-                    # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
-                    # characters, because two characters (at least one of
-                    # which is not in the BMP) match case-insensitively
-                    # if and only if:
-                    # 1) c1.lower() == c2.lower()
-                    # 2) c1.lower() == c2 or c1.lower().upper() == c2
-                    # Also, both c.lower() and c.lower().upper() are single
-                    # characters for every non-BMP character.
-                    if op is RANGE:
-                        if fixes: # not ASCII
-                            op = RANGE_UNI_IGNORE
-                        hascased = True
-                    else:
-                        assert op is LITERAL
-                        if not hascased and iscased(av):
-                            hascased = True
-                tail.append((op, av))
-            break
-
-    # compress character map
-    runs = []
-    q = 0
-    while True:
-        p = charmap.find(1, q)
-        if p < 0:
-            break
-        if len(runs) >= 2:
-            runs = None
-            break
-        q = charmap.find(0, p)
-        if q < 0:
-            runs.append((p, len(charmap)))
-            break
-        runs.append((p, q))
-    if runs is not None:
-        # use literal/range
-        for p, q in runs:
-            if q - p == 1:
-                out.append((LITERAL, p))
-            else:
-                out.append((RANGE, (p, q - 1)))
-        out += tail
-        # if the case was changed or new representation is more compact
-        if hascased or len(out) < len(charset):
-            return out, hascased
-        # else original character set is good enough
-        return charset, hascased
-
-    # use bitmap
-    if len(charmap) == 256:
-        data = _mk_bitmap(charmap)
-        out.append((CHARSET, data))
-        out += tail
-        return out, hascased
-
-    # To represent a big charset, first a bitmap of all characters in the
-    # set is constructed. Then, this bitmap is sliced into chunks of 256
-    # characters, duplicate chunks are eliminated, and each chunk is
-    # given a number. In the compiled expression, the charset is
-    # represented by a 32-bit word sequence, consisting of one word for
-    # the number of different chunks, a sequence of 256 bytes (64 words)
-    # of chunk numbers indexed by their original chunk position, and a
-    # sequence of 256-bit chunks (8 words each).
-
-    # Compression is normally good: in a typical charset, large ranges of
-    # Unicode will be either completely excluded (e.g. if only cyrillic
-    # letters are to be matched), or completely included (e.g. if large
-    # subranges of Kanji match). These ranges will be represented by
-    # chunks of all one-bits or all zero-bits.
-
-    # Matching can be also done efficiently: the more significant byte of
-    # the Unicode character is an index into the chunk number, and the
-    # less significant byte is a bit index in the chunk (just like the
-    # CHARSET matching).
-
-    charmap = charmap.take_bytes() # should be hashable
-    comps = {}
-    mapping = bytearray(256)
-    block = 0
-    data = bytearray()
-    for i in range(0, 65536, 256):
-        chunk = charmap[i: i + 256]
-        if chunk in comps:
-            mapping[i // 256] = comps[chunk]
-        else:
-            mapping[i // 256] = comps[chunk] = block
-            block += 1
-            data += chunk
-    data = _mk_bitmap(data)
-    data[0:0] = [block] + _bytes_to_codes(mapping)
-    out.append((BIGCHARSET, data))
-    out += tail
-    return out, hascased
-
-_CODEBITS = _sre.CODESIZE * 8
-MAXCODE = (1 << _CODEBITS) - 1
-_BITS_TRANS = b'0' + b'1' * 255
-def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
-    s = bits.translate(_BITS_TRANS)[::-1]
-    return [_int(s[i - _CODEBITS: i], 2)
-            for i in range(len(s), 0, -_CODEBITS)]
-
-def _bytes_to_codes(b):
-    # Convert block indices to word array
-    a = memoryview(b).cast('I')
-    assert a.itemsize == _sre.CODESIZE
-    assert len(a) * a.itemsize == len(b)
-    return a.tolist()
-
-def _simple(p):
-    # check if this subpattern is a "simple" operator
-    if len(p) != 1:
-        return False
-    op, av = p[0]
-    if op is SUBPATTERN:
-        return av[0] is None and _simple(av[-1])
-    return op in _UNIT_CODES
-
-def _generate_overlap_table(prefix):
-    """
-    Generate an overlap table for the following prefix.
-    An overlap table is a table of the same size as the prefix which
-    informs about the potential self-overlap for each index in the prefix:
-    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
-    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
-      prefix[0:k]
-    """
-    table = [0] * len(prefix)
-    for i in range(1, len(prefix)):
-        idx = table[i - 1]
-        while prefix[i] != prefix[idx]:
-            if idx == 0:
-                table[i] = 0
-                break
-            idx = table[idx - 1]
-        else:
-            table[i] = idx + 1
-    return table
-
-def _get_iscased(flags):
-    if not flags & SRE_FLAG_IGNORECASE:
-        return None
-    elif flags & SRE_FLAG_UNICODE:
-        return _sre.unicode_iscased
-    else:
-        return _sre.ascii_iscased
-
-def _get_literal_prefix(pattern, flags):
-    # look for literal prefix
-    prefix = []
-    prefixappend = prefix.append
-    prefix_skip = None
-    iscased = _get_iscased(flags)
-    for op, av in pattern.data:
-        if op is LITERAL:
-            if iscased and iscased(av):
-                break
-            prefixappend(av)
-        elif op is SUBPATTERN:
-            group, add_flags, del_flags, p = av
-            flags1 = _combine_flags(flags, add_flags, del_flags)
-            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
-                break
-            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
-            if prefix_skip is None:
-                if group is not None:
-                    prefix_skip = len(prefix)
-                elif prefix_skip1 is not None:
-                    prefix_skip = len(prefix) + prefix_skip1
-            prefix.extend(prefix1)
-            if not got_all:
-                break
-        else:
-            break
-    else:
-        return prefix, prefix_skip, True
-    return prefix, prefix_skip, False
-
-def _get_charset_prefix(pattern, flags):
-    while True:
-        if not pattern.data:
-            return None
-        op, av = pattern.data[0]
-        if op is not SUBPATTERN:
-            break
-        group, add_flags, del_flags, pattern = av
-        flags = _combine_flags(flags, add_flags, del_flags)
-        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
-            return None
-
-    iscased = _get_iscased(flags)
-    if op is LITERAL:
-        if iscased and iscased(av):
-            return None
-        return [(op, av)]
-    elif op is CATEGORY:
-        return [(op, av)]
-    elif op is BRANCH:
-        charset = []
-        charsetappend = charset.append
-        for p in av[1]:
-            if not p:
-                return None
-            op, av = p[0]
-            if op is LITERAL and not (iscased and iscased(av)):
-                charsetappend((op, av))
-            else:
-                return None
-        return charset
-    elif op is IN:
-        charset = av
-        if iscased:
-            for op, av in charset:
-                if op is LITERAL:
-                    if iscased(av):
-                        return None
-                elif op is RANGE:
-                    if av[1] > 0xffff:
-                        return None
-                    if any(map(iscased, range(av[0], av[1]+1))):
-                        return None
-        return charset
-    return None
-
-def _compile_info(code, pattern, flags):
-    # internal: compile an info block.  in the current version,
-    # this contains min/max pattern width, and an optional literal
-    # prefix or a character map
-    lo, hi = pattern.getwidth()
-    if hi > MAXCODE:
-        hi = MAXCODE
-    if lo == 0:
-        code.extend([INFO, 4, 0, lo, hi])
-        return
-    # look for a literal prefix
-    prefix = []
-    prefix_skip = 0
-    charset = None # not used
-    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
-        # look for literal prefix
-        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
-        # if no prefix, look for charset prefix
-        if not prefix:
-            charset = _get_charset_prefix(pattern, flags)
-            if charset:
-                charset, hascased = _optimize_charset(charset)
-                assert not hascased
-                if charset == _CHARSET_ALL:
-                    charset = None
-##     if prefix:
-##         print("*** PREFIX", prefix, prefix_skip)
-##     if charset:
-##         print("*** CHARSET", charset)
-    # add an info block
-    emit = code.append
-    emit(INFO)
-    skip = len(code); emit(0)
-    # literal flag
-    mask = 0
-    if prefix:
-        mask = SRE_INFO_PREFIX
-        if prefix_skip is None and got_all:
-            mask = mask | SRE_INFO_LITERAL
-    elif charset:
-        mask = mask | SRE_INFO_CHARSET
-    emit(mask)
-    # pattern length
-    if lo < MAXCODE:
-        emit(lo)
-    else:
-        emit(MAXCODE)
-        prefix = prefix[:MAXCODE]
-    emit(hi)
-    # add literal prefix
-    if prefix:
-        emit(len(prefix)) # length
-        if prefix_skip is None:
-            prefix_skip =  len(prefix)
-        emit(prefix_skip) # skip
-        code.extend(prefix)
-        # generate overlap table
-        code.extend(_generate_overlap_table(prefix))
-    elif charset:
-        _compile_charset(charset, flags, code)
-    code[skip] = len(code) - skip
-
 def isstring(obj):
     return isinstance(obj, (str, bytes))
 
diff --git a/Lib/re/_optimizer.py b/Lib/re/_optimizer.py
new file mode 100644 (file)
index 0000000..5e38925
--- /dev/null
@@ -0,0 +1,397 @@
+#
+# Secret Labs' Regular Expression Engine
+#
+# optimizations for the compiler
+#
+# Copyright (c) 1997-2001 by Secret Labs AB.  All rights reserved.
+#
+# See the __init__.py file for information on usage and redistribution.
+#
+
+"""Internal support module for sre.
+
+Optimization passes used by the compiler: character-set optimization
+(:func:`_optimize_charset`), the "simple" repeat-body test (:func:`_simple`),
+and the literal/charset prefix info block (:func:`_compile_info`).
+"""
+
+import _sre
+from . import _parser
+from ._constants import *
+
+_CHARSET_ALL = [(NEGATE, None)]
+_UNIT_CODES = {LITERAL, NOT_LITERAL, ANY, IN, CATEGORY}
+
+def _combine_flags(flags, add_flags, del_flags,
+                   TYPE_FLAGS=_parser.TYPE_FLAGS):
+    if add_flags & TYPE_FLAGS:
+        flags &= ~TYPE_FLAGS
+    return (flags | add_flags) & ~del_flags
+
+def _compile_charset(charset, flags, code):
+    # compile charset subprogram
+    emit = code.append
+    for op, av in charset:
+        emit(op)
+        if op is NEGATE:
+            pass
+        elif op is LITERAL:
+            emit(av)
+        elif op is RANGE or op is RANGE_UNI_IGNORE:
+            emit(av[0])
+            emit(av[1])
+        elif op is CHARSET:
+            code.extend(av)
+        elif op is BIGCHARSET:
+            code.extend(av)
+        elif op is CATEGORY:
+            if flags & SRE_FLAG_LOCALE:
+                emit(CH_LOCALE[av])
+            elif flags & SRE_FLAG_UNICODE:
+                emit(CH_UNICODE[av])
+            else:
+                emit(av)
+        else:
+            raise PatternError(f"internal: unsupported set operator {op!r}")
+    emit(FAILURE)
+
+def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
+    # internal: optimize character set
+    out = []
+    tail = []
+    charmap = bytearray(256)
+    hascased = False
+    for op, av in charset:
+        while True:
+            try:
+                if op is LITERAL:
+                    if fixup: # IGNORECASE and not LOCALE
+                        av = fixup(av)
+                        charmap[av] = 1
+                        if fixes and av in fixes:
+                            for k in fixes[av]:
+                                charmap[k] = 1
+                        if not hascased and iscased(av):
+                            hascased = True
+                    else:
+                        charmap[av] = 1
+                elif op is RANGE:
+                    r = range(av[0], av[1]+1)
+                    if fixup: # IGNORECASE and not LOCALE
+                        if fixes:
+                            for i in map(fixup, r):
+                                charmap[i] = 1
+                                if i in fixes:
+                                    for k in fixes[i]:
+                                        charmap[k] = 1
+                        else:
+                            for i in map(fixup, r):
+                                charmap[i] = 1
+                        if not hascased:
+                            hascased = any(map(iscased, r))
+                    else:
+                        for i in r:
+                            charmap[i] = 1
+                elif op is NEGATE:
+                    out.append((op, av))
+                elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
+                    # Optimize [\s\S] etc.
+                    out = [] if out else _CHARSET_ALL
+                    return out, False
+                else:
+                    tail.append((op, av))
+            except IndexError:
+                if len(charmap) == 256:
+                    # character set contains non-UCS1 character codes
+                    charmap += b'\0' * 0xff00
+                    continue
+                # Character set contains non-BMP character codes.
+                # For range, all BMP characters in the range are already
+                # processed.
+                if fixup: # IGNORECASE and not LOCALE
+                    # For now, IN_UNI_IGNORE+LITERAL and
+                    # IN_UNI_IGNORE+RANGE_UNI_IGNORE work for all non-BMP
+                    # characters, because two characters (at least one of
+                    # which is not in the BMP) match case-insensitively
+                    # if and only if:
+                    # 1) c1.lower() == c2.lower()
+                    # 2) c1.lower() == c2 or c1.lower().upper() == c2
+                    # Also, both c.lower() and c.lower().upper() are single
+                    # characters for every non-BMP character.
+                    if op is RANGE:
+                        if fixes: # not ASCII
+                            op = RANGE_UNI_IGNORE
+                        hascased = True
+                    else:
+                        assert op is LITERAL
+                        if not hascased and iscased(av):
+                            hascased = True
+                tail.append((op, av))
+            break
+
+    # compress character map
+    runs = []
+    q = 0
+    while True:
+        p = charmap.find(1, q)
+        if p < 0:
+            break
+        if len(runs) >= 2:
+            runs = None
+            break
+        q = charmap.find(0, p)
+        if q < 0:
+            runs.append((p, len(charmap)))
+            break
+        runs.append((p, q))
+    if runs is not None:
+        # use literal/range
+        for p, q in runs:
+            if q - p == 1:
+                out.append((LITERAL, p))
+            else:
+                out.append((RANGE, (p, q - 1)))
+        out += tail
+        # if the case was changed or new representation is more compact
+        if hascased or len(out) < len(charset):
+            return out, hascased
+        # else original character set is good enough
+        return charset, hascased
+
+    # use bitmap
+    if len(charmap) == 256:
+        data = _mk_bitmap(charmap)
+        out.append((CHARSET, data))
+        out += tail
+        return out, hascased
+
+    # To represent a big charset, first a bitmap of all characters in the
+    # set is constructed. Then, this bitmap is sliced into chunks of 256
+    # characters, duplicate chunks are eliminated, and each chunk is
+    # given a number. In the compiled expression, the charset is
+    # represented by a 32-bit word sequence, consisting of one word for
+    # the number of different chunks, a sequence of 256 bytes (64 words)
+    # of chunk numbers indexed by their original chunk position, and a
+    # sequence of 256-bit chunks (8 words each).
+
+    # Compression is normally good: in a typical charset, large ranges of
+    # Unicode will be either completely excluded (e.g. if only cyrillic
+    # letters are to be matched), or completely included (e.g. if large
+    # subranges of Kanji match). These ranges will be represented by
+    # chunks of all one-bits or all zero-bits.
+
+    # Matching can be also done efficiently: the more significant byte of
+    # the Unicode character is an index into the chunk number, and the
+    # less significant byte is a bit index in the chunk (just like the
+    # CHARSET matching).
+
+    charmap = charmap.take_bytes() # should be hashable
+    comps = {}
+    mapping = bytearray(256)
+    block = 0
+    data = bytearray()
+    for i in range(0, 65536, 256):
+        chunk = charmap[i: i + 256]
+        if chunk in comps:
+            mapping[i // 256] = comps[chunk]
+        else:
+            mapping[i // 256] = comps[chunk] = block
+            block += 1
+            data += chunk
+    data = _mk_bitmap(data)
+    data[0:0] = [block] + _bytes_to_codes(mapping)
+    out.append((BIGCHARSET, data))
+    out += tail
+    return out, hascased
+
+_CODEBITS = _sre.CODESIZE * 8
+MAXCODE = (1 << _CODEBITS) - 1
+_BITS_TRANS = b'0' + b'1' * 255
+def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
+    s = bits.translate(_BITS_TRANS)[::-1]
+    return [_int(s[i - _CODEBITS: i], 2)
+            for i in range(len(s), 0, -_CODEBITS)]
+
+def _bytes_to_codes(b):
+    # Convert block indices to word array
+    a = memoryview(b).cast('I')
+    assert a.itemsize == _sre.CODESIZE
+    assert len(a) * a.itemsize == len(b)
+    return a.tolist()
+
+def _simple(p):
+    # check if this subpattern is a "simple" operator
+    if len(p) != 1:
+        return False
+    op, av = p[0]
+    if op is SUBPATTERN:
+        return av[0] is None and _simple(av[-1])
+    return op in _UNIT_CODES
+
+def _generate_overlap_table(prefix):
+    """
+    Generate an overlap table for the following prefix.
+    An overlap table is a table of the same size as the prefix which
+    informs about the potential self-overlap for each index in the prefix:
+    - if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
+    - if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
+      prefix[0:k]
+    """
+    table = [0] * len(prefix)
+    for i in range(1, len(prefix)):
+        idx = table[i - 1]
+        while prefix[i] != prefix[idx]:
+            if idx == 0:
+                table[i] = 0
+                break
+            idx = table[idx - 1]
+        else:
+            table[i] = idx + 1
+    return table
+
+def _get_iscased(flags):
+    if not flags & SRE_FLAG_IGNORECASE:
+        return None
+    elif flags & SRE_FLAG_UNICODE:
+        return _sre.unicode_iscased
+    else:
+        return _sre.ascii_iscased
+
+def _get_literal_prefix(pattern, flags):
+    # look for literal prefix
+    prefix = []
+    prefixappend = prefix.append
+    prefix_skip = None
+    iscased = _get_iscased(flags)
+    for op, av in pattern.data:
+        if op is LITERAL:
+            if iscased and iscased(av):
+                break
+            prefixappend(av)
+        elif op is SUBPATTERN:
+            group, add_flags, del_flags, p = av
+            flags1 = _combine_flags(flags, add_flags, del_flags)
+            if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
+                break
+            prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
+            if prefix_skip is None:
+                if group is not None:
+                    prefix_skip = len(prefix)
+                elif prefix_skip1 is not None:
+                    prefix_skip = len(prefix) + prefix_skip1
+            prefix.extend(prefix1)
+            if not got_all:
+                break
+        else:
+            break
+    else:
+        return prefix, prefix_skip, True
+    return prefix, prefix_skip, False
+
+def _get_charset_prefix(pattern, flags):
+    while True:
+        if not pattern.data:
+            return None
+        op, av = pattern.data[0]
+        if op is not SUBPATTERN:
+            break
+        group, add_flags, del_flags, pattern = av
+        flags = _combine_flags(flags, add_flags, del_flags)
+        if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
+            return None
+
+    iscased = _get_iscased(flags)
+    if op is LITERAL:
+        if iscased and iscased(av):
+            return None
+        return [(op, av)]
+    elif op is CATEGORY:
+        return [(op, av)]
+    elif op is BRANCH:
+        charset = []
+        charsetappend = charset.append
+        for p in av[1]:
+            if not p:
+                return None
+            op, av = p[0]
+            if op is LITERAL and not (iscased and iscased(av)):
+                charsetappend((op, av))
+            else:
+                return None
+        return charset
+    elif op is IN:
+        charset = av
+        if iscased:
+            for op, av in charset:
+                if op is LITERAL:
+                    if iscased(av):
+                        return None
+                elif op is RANGE:
+                    if av[1] > 0xffff:
+                        return None
+                    if any(map(iscased, range(av[0], av[1]+1))):
+                        return None
+        return charset
+    return None
+
+def _compile_info(code, pattern, flags):
+    # internal: compile an info block.  in the current version,
+    # this contains min/max pattern width, and an optional literal
+    # prefix or a character map
+    lo, hi = pattern.getwidth()
+    if hi > MAXCODE:
+        hi = MAXCODE
+    if lo == 0:
+        code.extend([INFO, 4, 0, lo, hi])
+        return
+    # look for a literal prefix
+    prefix = []
+    prefix_skip = 0
+    charset = None # not used
+    if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
+        # look for literal prefix
+        prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
+        # if no prefix, look for charset prefix
+        if not prefix:
+            charset = _get_charset_prefix(pattern, flags)
+            if charset:
+                charset, hascased = _optimize_charset(charset)
+                assert not hascased
+                if charset == _CHARSET_ALL:
+                    charset = None
+##     if prefix:
+##         print("*** PREFIX", prefix, prefix_skip)
+##     if charset:
+##         print("*** CHARSET", charset)
+    # add an info block
+    emit = code.append
+    emit(INFO)
+    skip = len(code); emit(0)
+    # literal flag
+    mask = 0
+    if prefix:
+        mask = SRE_INFO_PREFIX
+        if prefix_skip is None and got_all:
+            mask = mask | SRE_INFO_LITERAL
+    elif charset:
+        mask = mask | SRE_INFO_CHARSET
+    emit(mask)
+    # pattern length
+    if lo < MAXCODE:
+        emit(lo)
+    else:
+        emit(MAXCODE)
+        prefix = prefix[:MAXCODE]
+    emit(hi)
+    # add literal prefix
+    if prefix:
+        emit(len(prefix)) # length
+        if prefix_skip is None:
+            prefix_skip =  len(prefix)
+        emit(prefix_skip) # skip
+        code.extend(prefix)
+        # generate overlap table
+        code.extend(_generate_overlap_table(prefix))
+    elif charset:
+        _compile_charset(charset, flags, code)
+    code[skip] = len(code) - skip
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 2a57370a6fb6436fd0764951e73fe1d042937089..4ab615b150002c85d6afc5ce80a276dea3a56a5f 100644 (file)
@@ -2825,6 +2825,12 @@ ATOMIC_GROUP
 17: SUCCESS
 ''')
 
+    def test_debug_charset_bitmap(self):
+        # gh-152100: disassembling a charset that compiles to a CHARSET/
+        # BIGCHARSET bitmap must not fail (the disassembler needs _CODEBITS).
+        out = get_debug_out(r'[aeiou]')
+        self.assertIn('CHARSET', out)
+
     def test_possesive_repeat_one(self):
         self.assertEqual(get_debug_out(r'a?+'), '''\
 POSSESSIVE_REPEAT 0 1
@@ -2950,7 +2956,7 @@ class ImplementationTest(unittest.TestCase):
             tp.foo = 1
 
     def test_overlap_table(self):
-        f = re._compiler._generate_overlap_table
+        f = re._optimizer._generate_overlap_table
         self.assertEqual(f(""), [])
         self.assertEqual(f("a"), [0])
         self.assertEqual(f("abcd"), [0, 0, 0, 0])