gh-95555: Allow a negated property as a character set member (GH-152245)

author Serhiy Storchaka <storchaka@gmail.com>

Fri, 26 Jun 2026 11:15:12 +0000 (14:15 +0300)

committer GitHub <noreply@github.com>

Fri, 26 Jun 2026 11:15:12 +0000 (14:15 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Fri, 26 Jun 2026 11:15:12 +0000 (14:15 +0300)
committer GitHub <noreply@github.com>
Fri, 26 Jun 2026 11:15:12 +0000 (14:15 +0300)
diff --git a/Lib/re/_parser.py b/Lib/re/_parser.py

index 262286748fb25b2901ae4557bf7ddc08aa932abf..aab9b59168015c24db17a6df44b9e04006aba836 100644 (file)
--- a/Lib/re/_parser.py
+++ b/Lib/re/_parser.py
@@ -310,7 +310,7 @@ class Tokenizer:
              msg = "bad character in group name %r" % name
              raise self.error(msg, len(name) + offset)
  
-def _property_escape(source, escape, in_set=False):
+def _property_escape(source, escape):
      # handle \p{...} and \P{...} (UTS #18 1.2.4, "Property Syntax")
      from . import _properties
      if not source.match('{'):
@@ -320,10 +320,6 @@ def _property_escape(source, escape, in_set=False):
      if code is None:
          raise source.error("unknown property name %r" % name,
                             len(name) + len(r'\p{}'))
-    if in_set and code[1][0] == (NEGATE, None):
-        # A negated multi-range property cannot be a member of a set.
-        raise source.error("bad escape %s in character class" % escape,
-                           len(name) + len(r'\p{}'))
      return code
  
  def _class_escape(source, escape):
@@ -369,7 +365,7 @@ def _class_escape(source, escape):
                                     len(charname) + len(r'\N{}')) from None
              return LITERAL, c
          elif c in "pP" and source.istext:
-            return _property_escape(source, escape, in_set=True)
+            return _property_escape(source, escape)
          elif c in OCTDIGITS:
              # octal escape (up to three digits)
              escape += source.getwhile(2, OCTDIGITS)
@@ -574,11 +570,15 @@ def _difference(left, right, state):
  # with the next operand.
  _SETOPS = {'||': _union, '&&': _intersect, '--': _difference}
  
-def _operand_elements(set, compound):
-    # The operand's elements: a standalone nested set, else the member union.
+def _operand_elements(set, compound, negated, state):
+    # The operand's elements: a standalone nested set, else the member union,
+    # with any negated-property members alternated in (see addmember).
      if compound is not None:
          return compound
-    return [_charset_node(_uniq(set))]
+    result = [_charset_node(_uniq(set))] if set or not negated else None
+    for neg in negated:
+        result = [neg] if result is None else _union(result, [neg], state)
+    return result
  
  def _parse_operand(source, state, nested, here, allow_nested):
      # Read one operand, stopping at a set operator or the closing ']'.  An
@@ -591,10 +591,15 @@ def _parse_operand(source, state, nested, here, allow_nested):
      sourcematch = source.match
      set = []
      setappend = set.append
+    negated = []        # \P{...} negated-range props, alternated in at the end
      def addmember(code):
-        # Flatten a \p{...} property's IN into the member set.
+        # Flatten a \p{...} property's IN into the member set; a negated one is a
+        # complemented charset, set aside to _union in (it can't join the union).
          if code[0] is IN:
-            set.extend(code[1])
+            if code[1][0][0] is NEGATE:
+                negated.append(code)
+            else:
+                set.extend(code[1])
          else:
              setappend(code)
      compound = None     # elements of a standalone nested-set operand
@@ -607,9 +612,9 @@ def _parse_operand(source, state, nested, here, allow_nested):
          if this is None:
              raise source.error("unterminated character set",
                                 source.tell() - here)
-        if set or compound is not None:
+        if set or compound is not None or negated:
              if this == "]":
-                return _operand_elements(set, compound), None
+                return _operand_elements(set, compound, negated, state), None
              if this in '-&|~' and source.next == this:
                  if this == '~':
                      import warnings
@@ -621,7 +626,7 @@ def _parse_operand(source, state, nested, here, allow_nested):
                  else:
                      # '--', '&&' or '||' ends this operand and starts the next.
                      sourceget()  # consume the second operator character
-                    return _operand_elements(set, compound), this + this
+                    return _operand_elements(set, compound, negated, state), this + this
          if this[0] == "\\":
              code1 = _class_escape(source, this)
          else:
@@ -641,12 +646,12 @@ def _parse_operand(source, state, nested, here, allow_nested):
                  # A trailing '-' is a literal.
                  addmember(code1)
                  setappend((LITERAL, _ord("-")))
-                return [_charset_node(_uniq(set))], None
+                return _operand_elements(set, None, negated, state), None
              if that == "-":
                  # 'X--': difference, not a range.  '--' after a single member
                  # lands here because the range probe consumed the first '-'.
                  addmember(code1)
-                return [_charset_node(_uniq(set))], "--"
+                return _operand_elements(set, None, negated, state), "--"
              if that[0] == "\\":
                  code2 = _class_escape(source, that)
              else:
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index 7e8ed0e02833e83de49d2953019cef88648ad4b2..af6e4612dcfaef5a3ab02335bc2054bcfd9dce37 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1061,6 +1061,19 @@ class ReTests(unittest.TestCase):
          self.assertIsNone(re.fullmatch(r'\p{ASCII_Hex_Digit}', '０'))
          self.assertIsNone(re.fullmatch(r'\p{Hex_Digit}', 'g'))
  
+        # A negated multi-range property (not backed by an engine category) can
+        # be a set member; it is alternated in with the other members.
+        self.assertIsNone(re.fullmatch(r'[\P{ASCII}]', 'a'))
+        self.assertTrue(re.fullmatch(r'[\P{ASCII}]', 'ä'))
+        self.assertTrue(re.fullmatch(r'[\P{ASCII}abc]+', 'abäc日'))
+        self.assertIsNone(re.fullmatch(r'[\P{ASCII}abc]', 'd'))
+        self.assertTrue(re.fullmatch(r'[abc\P{ASCII}]+', 'abäc日'))
+        self.assertTrue(re.fullmatch(r'[^\P{ASCII}]+', 'AZ09~'))   # = ASCII
+        self.assertIsNone(re.fullmatch(r'[^\P{ASCII}]', 'ä'))
+        # Composes with set operations.
+        self.assertTrue(re.fullmatch(r'[\w--\P{ASCII}]+', 'AZ09_'))  # \w and ASCII
+        self.assertIsNone(re.fullmatch(r'[\w--\P{ASCII}]', 'д'))
+
          # Errors.
          self.checkPatternError(r'\p', 'missing {, expected property name', 2)
          self.checkPatternError(r'[\p]', 'missing {, expected property name', 3)
@@ -1072,10 +1085,6 @@ class ReTests(unittest.TestCase):
          # \p is not special in bytes patterns.
          self.checkPatternError(br'\p{Lu}', r'bad escape \p', 0)
          self.checkPatternError(br'\P{Lu}', r'bad escape \P', 0)
-        # A negated multi-range property (one not backed by an engine
-        # category) cannot be a set member.
-        self.checkPatternError(r'[\P{ASCII}]',
-                               r'bad escape \P in character class', 1)
  
      def test_word_boundaries(self):
          # See http://bugs.python.org/issue10713
author	Serhiy Storchaka <storchaka@gmail.com>
	Fri, 26 Jun 2026 11:15:12 +0000 (14:15 +0300)
committer	GitHub <noreply@github.com>
	Fri, 26 Jun 2026 11:15:12 +0000 (14:15 +0300)
Lib/re/_parser.py		patch | blob | blame | history
Lib/test/test_re.py		patch | blob | blame | history