From 2272cec13b53c405d86c45d404f035f201c0baef Mon Sep 17 00:00:00 2001
From: Serhiy Storchaka
Date: Thu, 8 Feb 2018 18:51:45 +0200
Subject: [PATCH] Use unicodedata instead of eval.

---
Notes (not part of the commit message):

* \N{...} is now parsed by matching '{', reading up to '}' with
  Tokenizer.getuntil() and resolving the name with unicodedata.lookup()
  instead of building a string literal and passing it to literal_eval().
* Tokenizer.getuntil() takes a second argument naming what is being read,
  so the "missing ..." errors can say "group name" or "character name".
* unicodedata.lookup() also resolves named sequences, which are longer
  than one character and make ord() raise TypeError.  Both KeyError and
  TypeError are therefore reported as "undefined character name".
* The post-image blob id on the "index" line predates the TypeError change.

 Lib/sre_parse.py | 48 ++++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index e7c76c2f4935..2dd9c8016897 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -13,7 +13,7 @@
 # XXX: show string offset and offending character for all errors
 
 from sre_constants import *
-from ast import literal_eval
+import unicodedata
 
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS = "*+?{"
@@ -26,10 +26,6 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 
 WHITESPACE = frozenset(" \t\n\r\v\f")
 
-UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -')
-CLOSING_BRACE = frozenset("}")
-OPENING_BRACE = frozenset("{")
-
 _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
 
@@ -270,19 +266,19 @@ class Tokenizer:
             result += c
             self.__next()
         return result
-    def getuntil(self, terminator):
+    def getuntil(self, terminator, name):
         result = ''
         while True:
             c = self.next
             self.__next()
             if c is None:
                 if not result:
-                    raise self.error("missing group name")
+                    raise self.error("missing " + name)
                 raise self.error("missing %s, unterminated name" % terminator,
                                  len(result))
             if c == terminator:
                 if not result:
-                    raise self.error("missing group name", 1)
+                    raise self.error("missing " + name, 1)
                 break
             result += c
         return result
@@ -330,14 +326,14 @@ def _class_escape(source, escape):
             return LITERAL, c
         elif c == "N" and source.istext:
             # named unicode escape e.g. \N{EM DASH}
-            escape += source.getwhile(1, OPENING_BRACE)
-            escape += source.getwhile(100, UNICODE_NAME)
-            escape += source.getwhile(1, CLOSING_BRACE)
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
             try:
-                c = ord(literal_eval('"%s"' % escape))
-            except SyntaxError:
-                charname = escape[2:].strip('{}')
-                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+                c = ord(unicodedata.lookup(charname))
+            except (KeyError, TypeError):
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
             return LITERAL, c
         elif c in OCTDIGITS:
             # octal escape (up to three digits)
@@ -389,14 +385,14 @@ def _escape(source, escape, state):
             return LITERAL, c
         elif c == "N" and source.istext:
             # named unicode escape e.g. \N{EM DASH}
-            escape += source.getwhile(1, OPENING_BRACE)
-            escape += source.getwhile(100, UNICODE_NAME)
-            escape += source.getwhile(1, CLOSING_BRACE)
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
             try:
-                c = ord(literal_eval('"%s"' % escape))
-            except SyntaxError:
-                charname = escape[2:].strip('{}')
-                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+                c = ord(unicodedata.lookup(charname))
+            except (KeyError, TypeError):
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
             return LITERAL, c
         elif c == "0":
             # octal escape
@@ -707,13 +703,13 @@ def _parse(source, state, verbose, nested, first=False):
                     # python extensions
                     if sourcematch("<"):
                         # named group: skip forward to end of name
-                        name = source.getuntil(">")
+                        name = source.getuntil(">", "group name")
                         if not name.isidentifier():
                             msg = "bad character in group name %r" % name
                             raise source.error(msg, len(name) + 1)
                     elif sourcematch("="):
                         # named backreference
-                        name = source.getuntil(")")
+                        name = source.getuntil(")", "group name")
                         if not name.isidentifier():
                             msg = "bad character in group name %r" % name
                             raise source.error(msg, len(name) + 1)
@@ -776,7 +772,7 @@ def _parse(source, state, verbose, nested, first=False):
 
                 elif char == "(":
                     # conditional backreference group
-                    condname = source.getuntil(")")
+                    condname = source.getuntil(")", "group name")
                     if condname.isidentifier():
                         condgroup = state.groupdict.get(condname)
                         if condgroup is None:
@@ -1005,7 +1001,7 @@ def parse_template(source, pattern):
                 name = ""
                 if not s.match("<"):
                     raise s.error("missing <")
-                name = s.getuntil(">")
+                name = s.getuntil(">", "group name")
                 if name.isidentifier():
                     try:
                         index = groupindex[name]
-- 
2.47.3