# XXX: show string offset and offending character for all errors
from sre_constants import *
-from ast import literal_eval
+import unicodedata
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
WHITESPACE = frozenset(" \t\n\r\v\f")
-UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -')
-CLOSING_BRACE = frozenset("}")
-OPENING_BRACE = frozenset("{")
-
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
result += c
self.__next()
return result
- def getuntil(self, terminator):
+ def getuntil(self, terminator, name):
# Collect characters from the tokenizer up to `terminator` and return them
# as a string; the terminator itself is consumed but not included.
#
# terminator: the single character that ends the run (call sites in this
#             patch pass ">", ")" and '}').
# name:       human-readable label for what is being read; it is spliced
#             into the "missing ..." messages. Call sites in this patch
#             pass "group name" and 'character name'.
#
# Raises self.error(...) when the input ends before the terminator is
# seen, or when the terminator is found before any character was read.
result = ''
while True:
# Look at the current character, then advance past it.
c = self.next
self.__next()
if c is None:
# Input ran out before the terminator was found.
if not result:
- raise self.error("missing group name")
+ raise self.error("missing " + name)
# NOTE(review): the second argument is presumably an offset back from
# the current position -- confirm against error(), not visible here.
raise self.error("missing %s, unterminated name" % terminator,
len(result))
if c == terminator:
# Terminator reached; an empty result means nothing was between the
# opening delimiter and the terminator.
if not result:
- raise self.error("missing group name", 1)
+ raise self.error("missing " + name, 1)
break
result += c
return result
return LITERAL, c
elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH}
- escape += source.getwhile(1, OPENING_BRACE)
- escape += source.getwhile(100, UNICODE_NAME)
- escape += source.getwhile(1, CLOSING_BRACE)
+ if not source.match('{'):
+ raise source.error("missing {")
+ charname = source.getuntil('}', 'character name')
try:
- c = ord(literal_eval('"%s"' % escape))
- except SyntaxError:
- charname = escape[2:].strip('{}')
- raise source.error("unknown Unicode character name %s" % charname, len(escape))
+ c = ord(unicodedata.lookup(charname))
+ except KeyError:
+ raise source.error("undefined character name %r" % charname,
+ len(charname) + len(r'\N{}'))
return LITERAL, c
elif c in OCTDIGITS:
# octal escape (up to three digits)
return LITERAL, c
elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH}
- escape += source.getwhile(1, OPENING_BRACE)
- escape += source.getwhile(100, UNICODE_NAME)
- escape += source.getwhile(1, CLOSING_BRACE)
+ if not source.match('{'):
+ raise source.error("missing {")
+ charname = source.getuntil('}', 'character name')
try:
- c = ord(literal_eval('"%s"' % escape))
- except SyntaxError:
- charname = escape[2:].strip('{}')
- raise source.error("unknown Unicode character name %s" % charname, len(escape))
+ c = ord(unicodedata.lookup(charname))
+ except KeyError:
+ raise source.error("undefined character name %r" % charname,
+ len(charname) + len(r'\N{}'))
return LITERAL, c
elif c == "0":
# octal escape
# python extensions
if sourcematch("<"):
# named group: skip forward to end of name
- name = source.getuntil(">")
+ name = source.getuntil(">", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
elif sourcematch("="):
# named backreference
- name = source.getuntil(")")
+ name = source.getuntil(")", "group name")
if not name.isidentifier():
msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1)
elif char == "(":
# conditional backreference group
- condname = source.getuntil(")")
+ condname = source.getuntil(")", "group name")
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
name = ""
if not s.match("<"):
raise s.error("missing <")
- name = s.getuntil(">")
+ name = s.getuntil(">", "group name")
if name.isidentifier():
try:
index = groupindex[name]