msg = "bad character in group name %r" % name
raise self.error(msg, len(name) + offset)
-def _property_escape(source, escape, in_set=False):
+def _property_escape(source, escape):
# handle \p{...} and \P{...} (UTS #18 1.2.4, "Property Syntax")
from . import _properties
if not source.match('{'):
if code is None:
raise source.error("unknown property name %r" % name,
len(name) + len(r'\p{}'))
- if in_set and code[1][0] == (NEGATE, None):
- # A negated multi-range property cannot be a member of a set.
- raise source.error("bad escape %s in character class" % escape,
- len(name) + len(r'\p{}'))
return code
def _class_escape(source, escape):
len(charname) + len(r'\N{}')) from None
return LITERAL, c
elif c in "pP" and source.istext:
- return _property_escape(source, escape, in_set=True)
+ return _property_escape(source, escape)
elif c in OCTDIGITS:
# octal escape (up to three digits)
escape += source.getwhile(2, OCTDIGITS)
# with the next operand.
_SETOPS = {'||': _union, '&&': _intersect, '--': _difference}
-def _operand_elements(set, compound):
- # The operand's elements: a standalone nested set, else the member union.
+def _operand_elements(set, compound, negated, state):
+ # The operand's elements: a standalone nested set, else the member union,
+ # with any negated-property members merged in via _union (see addmember).
if compound is not None:
return compound
- return [_charset_node(_uniq(set))]
+ result = [_charset_node(_uniq(set))] if set or not negated else None
+ for neg in negated:
+ result = [neg] if result is None else _union(result, [neg], state)
+ return result
def _parse_operand(source, state, nested, here, allow_nested):
# Read one operand, stopping at a set operator or the closing ']'. An
sourcematch = source.match
set = []
setappend = set.append
+ negated = [] # \P{...} negated-range props, alternated in at the end
def addmember(code):
- # Flatten a \p{...} property's IN into the member set.
+ # Flatten a \p{...} property's IN into the member set. A negated one is a
+ # complemented charset that is not flattened; it is set aside for _union.
if code[0] is IN:
- set.extend(code[1])
+ if code[1][0][0] is NEGATE:
+ negated.append(code)
+ else:
+ set.extend(code[1])
else:
setappend(code)
compound = None # elements of a standalone nested-set operand
if this is None:
raise source.error("unterminated character set",
source.tell() - here)
- if set or compound is not None:
+ if set or compound is not None or negated:
if this == "]":
- return _operand_elements(set, compound), None
+ return _operand_elements(set, compound, negated, state), None
if this in '-&|~' and source.next == this:
if this == '~':
import warnings
else:
# '--', '&&' or '||' ends this operand and starts the next.
sourceget() # consume the second operator character
- return _operand_elements(set, compound), this + this
+ return _operand_elements(set, compound, negated, state), this + this
if this[0] == "\\":
code1 = _class_escape(source, this)
else:
# A trailing '-' is a literal.
addmember(code1)
setappend((LITERAL, _ord("-")))
- return [_charset_node(_uniq(set))], None
+ return _operand_elements(set, None, negated, state), None
if that == "-":
# 'X--': difference, not a range. '--' after a single member
# lands here because the range probe consumed the first '-'.
addmember(code1)
- return [_charset_node(_uniq(set))], "--"
+ return _operand_elements(set, None, negated, state), "--"
if that[0] == "\\":
code2 = _class_escape(source, that)
else:
self.assertIsNone(re.fullmatch(r'\p{ASCII_Hex_Digit}', '0'))
self.assertIsNone(re.fullmatch(r'\p{Hex_Digit}', 'g'))
+ # A negated multi-range property (not backed by an engine category) can
+ # be a set member; it is alternated in with the other members.
+ self.assertIsNone(re.fullmatch(r'[\P{ASCII}]', 'a'))
+ self.assertTrue(re.fullmatch(r'[\P{ASCII}]', 'ä'))
+ self.assertTrue(re.fullmatch(r'[\P{ASCII}abc]+', 'abäc日'))
+ self.assertIsNone(re.fullmatch(r'[\P{ASCII}abc]', 'd'))
+ self.assertTrue(re.fullmatch(r'[abc\P{ASCII}]+', 'abäc日'))
+ self.assertTrue(re.fullmatch(r'[^\P{ASCII}]+', 'AZ09~')) # = ASCII
+ self.assertIsNone(re.fullmatch(r'[^\P{ASCII}]', 'ä'))
+ # Composes with set operations.
+ self.assertTrue(re.fullmatch(r'[\w--\P{ASCII}]+', 'AZ09_')) # \w and ASCII
+ self.assertIsNone(re.fullmatch(r'[\w--\P{ASCII}]', 'д'))
+
# Errors.
self.checkPatternError(r'\p', 'missing {, expected property name', 2)
self.checkPatternError(r'[\p]', 'missing {, expected property name', 3)
# \p is not special in bytes patterns.
self.checkPatternError(br'\p{Lu}', r'bad escape \p', 0)
self.checkPatternError(br'\P{Lu}', r'bad escape \P', 0)
- # A negated multi-range property (one not backed by an engine
- # category) cannot be a set member.
- self.checkPatternError(r'[\P{ASCII}]',
- r'bad escape \P in character class', 1)
def test_word_boundaries(self):
# See http://bugs.python.org/issue10713