git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-95555: Allow a negated property as a character set member (GH-152245)
author Serhiy Storchaka <storchaka@gmail.com>
Fri, 26 Jun 2026 11:15:12 +0000 (14:15 +0300)
committer GitHub <noreply@github.com>
Fri, 26 Jun 2026 11:15:12 +0000 (14:15 +0300)
A negated multi-range property such as \P{ASCII} or \P{Pattern_Syntax} was
rejected inside a character class.  Such members are now alternated in with
the other members: [\P{ASCII}abc] becomes [abc] | [^ASCII], and [\P{ASCII}]
alone is just the negated charset.

Co-authored-by: Claude Opus 4.8 <noreply@anthropic.com>
Lib/re/_parser.py
Lib/test/test_re.py

index 262286748fb25b2901ae4557bf7ddc08aa932abf..aab9b59168015c24db17a6df44b9e04006aba836 100644 (file)
@@ -310,7 +310,7 @@ class Tokenizer:
             msg = "bad character in group name %r" % name
             raise self.error(msg, len(name) + offset)
 
-def _property_escape(source, escape, in_set=False):
+def _property_escape(source, escape):
     # handle \p{...} and \P{...} (UTS #18 1.2.4, "Property Syntax")
     from . import _properties
     if not source.match('{'):
@@ -320,10 +320,6 @@ def _property_escape(source, escape, in_set=False):
     if code is None:
         raise source.error("unknown property name %r" % name,
                            len(name) + len(r'\p{}'))
-    if in_set and code[1][0] == (NEGATE, None):
-        # A negated multi-range property cannot be a member of a set.
-        raise source.error("bad escape %s in character class" % escape,
-                           len(name) + len(r'\p{}'))
     return code
 
 def _class_escape(source, escape):
@@ -369,7 +365,7 @@ def _class_escape(source, escape):
                                    len(charname) + len(r'\N{}')) from None
             return LITERAL, c
         elif c in "pP" and source.istext:
-            return _property_escape(source, escape, in_set=True)
+            return _property_escape(source, escape)
         elif c in OCTDIGITS:
             # octal escape (up to three digits)
             escape += source.getwhile(2, OCTDIGITS)
@@ -574,11 +570,15 @@ def _difference(left, right, state):
 # with the next operand.
 _SETOPS = {'||': _union, '&&': _intersect, '--': _difference}
 
-def _operand_elements(set, compound):
-    # The operand's elements: a standalone nested set, else the member union.
+def _operand_elements(set, compound, negated, state):
+    # The operand's elements: a standalone nested set, else the member union,
+    # with any negated-property members alternated in (see addmember).
     if compound is not None:
         return compound
-    return [_charset_node(_uniq(set))]
+    result = [_charset_node(_uniq(set))] if set or not negated else None
+    for neg in negated:
+        result = [neg] if result is None else _union(result, [neg], state)
+    return result
 
 def _parse_operand(source, state, nested, here, allow_nested):
     # Read one operand, stopping at a set operator or the closing ']'.  An
@@ -591,10 +591,15 @@ def _parse_operand(source, state, nested, here, allow_nested):
     sourcematch = source.match
     set = []
     setappend = set.append
+    negated = []        # \P{...} negated-range props, alternated in at the end
     def addmember(code):
-        # Flatten a \p{...} property's IN into the member set.
+        # Flatten a \p{...} property's IN into the member set; a negated one is a
+        # complemented charset, set aside to _union in (it can't join the union).
         if code[0] is IN:
-            set.extend(code[1])
+            if code[1][0][0] is NEGATE:
+                negated.append(code)
+            else:
+                set.extend(code[1])
         else:
             setappend(code)
     compound = None     # elements of a standalone nested-set operand
@@ -607,9 +612,9 @@ def _parse_operand(source, state, nested, here, allow_nested):
         if this is None:
             raise source.error("unterminated character set",
                                source.tell() - here)
-        if set or compound is not None:
+        if set or compound is not None or negated:
             if this == "]":
-                return _operand_elements(set, compound), None
+                return _operand_elements(set, compound, negated, state), None
             if this in '-&|~' and source.next == this:
                 if this == '~':
                     import warnings
@@ -621,7 +626,7 @@ def _parse_operand(source, state, nested, here, allow_nested):
                 else:
                     # '--', '&&' or '||' ends this operand and starts the next.
                     sourceget()  # consume the second operator character
-                    return _operand_elements(set, compound), this + this
+                    return _operand_elements(set, compound, negated, state), this + this
         if this[0] == "\\":
             code1 = _class_escape(source, this)
         else:
@@ -641,12 +646,12 @@ def _parse_operand(source, state, nested, here, allow_nested):
                 # A trailing '-' is a literal.
                 addmember(code1)
                 setappend((LITERAL, _ord("-")))
-                return [_charset_node(_uniq(set))], None
+                return _operand_elements(set, None, negated, state), None
             if that == "-":
                 # 'X--': difference, not a range.  '--' after a single member
                 # lands here because the range probe consumed the first '-'.
                 addmember(code1)
-                return [_charset_node(_uniq(set))], "--"
+                return _operand_elements(set, None, negated, state), "--"
             if that[0] == "\\":
                 code2 = _class_escape(source, that)
             else:
index 7e8ed0e02833e83de49d2953019cef88648ad4b2..af6e4612dcfaef5a3ab02335bc2054bcfd9dce37 100644 (file)
@@ -1061,6 +1061,19 @@ class ReTests(unittest.TestCase):
         self.assertIsNone(re.fullmatch(r'\p{ASCII_Hex_Digit}', '0'))
         self.assertIsNone(re.fullmatch(r'\p{Hex_Digit}', 'g'))
 
+        # A negated multi-range property (not backed by an engine category) can
+        # be a set member; it is alternated in with the other members.
+        self.assertIsNone(re.fullmatch(r'[\P{ASCII}]', 'a'))
+        self.assertTrue(re.fullmatch(r'[\P{ASCII}]', 'ä'))
+        self.assertTrue(re.fullmatch(r'[\P{ASCII}abc]+', 'abäc日'))
+        self.assertIsNone(re.fullmatch(r'[\P{ASCII}abc]', 'd'))
+        self.assertTrue(re.fullmatch(r'[abc\P{ASCII}]+', 'abäc日'))
+        self.assertTrue(re.fullmatch(r'[^\P{ASCII}]+', 'AZ09~'))   # = ASCII
+        self.assertIsNone(re.fullmatch(r'[^\P{ASCII}]', 'ä'))
+        # Composes with set operations.
+        self.assertTrue(re.fullmatch(r'[\w--\P{ASCII}]+', 'AZ09_'))  # \w and ASCII
+        self.assertIsNone(re.fullmatch(r'[\w--\P{ASCII}]', 'д'))
+
         # Errors.
         self.checkPatternError(r'\p', 'missing {, expected property name', 2)
         self.checkPatternError(r'[\p]', 'missing {, expected property name', 3)
@@ -1072,10 +1085,6 @@ class ReTests(unittest.TestCase):
         # \p is not special in bytes patterns.
         self.checkPatternError(br'\p{Lu}', r'bad escape \p', 0)
         self.checkPatternError(br'\P{Lu}', r'bad escape \P', 0)
-        # A negated multi-range property (one not backed by an engine
-        # category) cannot be a set member.
-        self.checkPatternError(r'[\P{ASCII}]',
-                               r'bad escape \P in character class', 1)
 
     def test_word_boundaries(self):
         # See http://bugs.python.org/issue10713