# XXX: show string offset and offending character for all errors
from sre_constants import *
+from ast import literal_eval
# Presumably the characters that have special meaning in a pattern; the code
# that consults this string is outside this view - confirm against the file.
SPECIAL_CHARS = ".\\[{()*+?^$|"
# Presumably the characters that can start a repetition operator (usage is
# not visible here).
REPEAT_CHARS = "*+?{"
# Space, tab, newline, carriage return, vertical tab and form feed.
WHITESPACE = frozenset(" \t\n\r\v\f")
+UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -')  # chars read as the NAME in \N{NAME}; ASCIILETTERS/DIGITS are defined outside this view
+CLOSING_BRACE = frozenset("}")  # one-character set handed to source.getwhile() to pick up the "}" of \N{NAME}
+OPENING_BRACE = frozenset("{")  # one-character set handed to source.getwhile() to pick up the "{" of \N{NAME}
+
+
# Sets of opcode constants. The member names are not defined in this view;
# presumably they come from the sre_constants star-import - confirm.
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
+ elif c == "N" and source.istext:
+ # named unicode escape e.g. \N{EM DASH}
+ escape += source.getwhile(1, OPENING_BRACE)
+ escape += source.getwhile(100, UNICODE_NAME)
+ escape += source.getwhile(1, CLOSING_BRACE)
+ try:
+ c = ord(literal_eval('"%s"' % escape))
+ except SyntaxError:
+ charname = escape[2:].strip('{}')
+ raise source.error("unknown Unicode character name %s" % charname, len(escape))
+ return LITERAL, c
elif c in OCTDIGITS:
# octal escape (up to three digits)
escape += source.getwhile(2, OCTDIGITS)
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
+ elif c == "N" and source.istext:
+ # named unicode escape e.g. \N{EM DASH}
+ escape += source.getwhile(1, OPENING_BRACE)
+ escape += source.getwhile(100, UNICODE_NAME)
+ escape += source.getwhile(1, CLOSING_BRACE)
+ try:
+ c = ord(literal_eval('"%s"' % escape))
+ except SyntaxError:
+ charname = escape[2:].strip('{}')
+ raise source.error("unknown Unicode character name %s" % charname, len(escape))
+ return LITERAL, c
elif c == "0":
# octal escape
escape += source.getwhile(2, OCTDIGITS)
with self.subTest(c):
self.assertRaises(re.error, re.compile, '[\\%c]' % c)
+ def test_named_unicode_escapes(self):
+ # test individual Unicode named escapes
+ suites = [
+ [ # basic matches
+ ['\u2014', r'\u2014', '\N{EM DASH}',
+ r'\N{EM DASH}'], # pattern
+ ['\u2014', '\N{EM DASH}', '—', '—and more'], # matches
+ ['\u2015', '\N{EN DASH}'] # no match
+ ],
+ [ # character set matches
+ ['[\u2014-\u2020]', r'[\u2014-\u2020]',
+ '[\N{EM DASH}-\N{DAGGER}]', r'[\N{EM DASH}-\N{DAGGER}]',
+ '[\u2014-\N{DAGGER}]', '[\N{EM DASH}-\u2020]',], # pattern
+ ['\u2014', '\N{EM DASH}', '—', '—and more', '\u2020',
+ '\N{DAGGER}', '†', '\u2017', '\N{DOUBLE LOW LINE}'],
+ ['\u2011', '\N{EN DASH}', '\u2013', 'xyz', '\u2021']
+ ],
+ ]
+
+ for patterns, match_yes, match_no in suites:
+ for pat in patterns:
+ for target in match_yes:
+ self.assertTrue(re.match(pat, target))
+ for target in match_no:
+ self.assertIsNone(re.match(pat, target))
+
+ # test errors in \N{name} handling - only valid names should pass
+ badly_formed = [r'\N{BUBBA DASH}', r'\N{EM DASH',
+ r'\NEM DASH}', r'\NOGGIN']
+ for bad in badly_formed:
+ with self.assertRaises(re.error):
+ re.compile(bad)
+
def test_string_boundaries(self):
# See http://bugs.python.org/issue10713
self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),