Use unicodedata.lookup() instead of ast.literal_eval() to resolve \N{...} named escapes in sre_parse.

author Serhiy Storchaka <storchaka@gmail.com>

Thu, 8 Feb 2018 16:51:45 +0000 (18:51 +0200)

committer Serhiy Storchaka <storchaka@gmail.com>

Thu, 8 Feb 2018 16:51:45 +0000 (18:51 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Thu, 8 Feb 2018 16:51:45 +0000 (18:51 +0200)
committer Serhiy Storchaka <storchaka@gmail.com>
Thu, 8 Feb 2018 16:51:45 +0000 (18:51 +0200)
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py

index e7c76c2f493550291b6f7b5603128cbb85a186fa..2dd9c8016897dc3a86da37dff6b108c07e5166f1 100644 (file)
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -13,7 +13,7 @@
  # XXX: show string offset and offending character for all errors
  
  from sre_constants import *
-from ast import literal_eval
+import unicodedata
  
  SPECIAL_CHARS = ".\\[{()*+?^$|"
  REPEAT_CHARS = "*+?{"
@@ -26,10 +26,6 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
  
  WHITESPACE = frozenset(" \t\n\r\v\f")
  
-UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -')
-CLOSING_BRACE = frozenset("}")
-OPENING_BRACE = frozenset("{")
-
  
  _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
  _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
@@ -270,19 +266,19 @@ class Tokenizer:
              result += c
              self.__next()
          return result
-    def getuntil(self, terminator):
+    def getuntil(self, terminator, name):
          result = ''
          while True:
              c = self.next
              self.__next()
              if c is None:
                  if not result:
-                    raise self.error("missing group name")
+                    raise self.error("missing " + name)
                  raise self.error("missing %s, unterminated name" % terminator,
                                   len(result))
              if c == terminator:
                  if not result:
-                    raise self.error("missing group name", 1)
+                    raise self.error("missing " + name, 1)
                  break
              result += c
          return result
@@ -330,14 +326,14 @@ def _class_escape(source, escape):
              return LITERAL, c
          elif c == "N" and source.istext:
              # named unicode escape e.g. \N{EM DASH}
-            escape += source.getwhile(1, OPENING_BRACE)
-            escape += source.getwhile(100, UNICODE_NAME)
-            escape += source.getwhile(1, CLOSING_BRACE)
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
              try:
-                c = ord(literal_eval('"%s"' % escape))
-            except SyntaxError:
-                charname = escape[2:].strip('{}')
-                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
              return LITERAL, c
          elif c in OCTDIGITS:
              # octal escape (up to three digits)
@@ -389,14 +385,14 @@ def _escape(source, escape, state):
              return LITERAL, c
          elif c == "N" and source.istext:
              # named unicode escape e.g. \N{EM DASH}
-            escape += source.getwhile(1, OPENING_BRACE)
-            escape += source.getwhile(100, UNICODE_NAME)
-            escape += source.getwhile(1, CLOSING_BRACE)
+            if not source.match('{'):
+                raise source.error("missing {")
+            charname = source.getuntil('}', 'character name')
              try:
-                c = ord(literal_eval('"%s"' % escape))
-            except SyntaxError:
-                charname = escape[2:].strip('{}')
-                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+                c = ord(unicodedata.lookup(charname))
+            except KeyError:
+                raise source.error("undefined character name %r" % charname,
+                                   len(charname) + len(r'\N{}'))
              return LITERAL, c
          elif c == "0":
              # octal escape
@@ -707,13 +703,13 @@ def _parse(source, state, verbose, nested, first=False):
                      # python extensions
                      if sourcematch("<"):
                          # named group: skip forward to end of name
-                        name = source.getuntil(">")
+                        name = source.getuntil(">", "group name")
                          if not name.isidentifier():
                              msg = "bad character in group name %r" % name
                              raise source.error(msg, len(name) + 1)
                      elif sourcematch("="):
                          # named backreference
-                        name = source.getuntil(")")
+                        name = source.getuntil(")", "group name")
                          if not name.isidentifier():
                              msg = "bad character in group name %r" % name
                              raise source.error(msg, len(name) + 1)
@@ -776,7 +772,7 @@ def _parse(source, state, verbose, nested, first=False):
  
                  elif char == "(":
                      # conditional backreference group
-                    condname = source.getuntil(")")
+                    condname = source.getuntil(")", "group name")
                      if condname.isidentifier():
                          condgroup = state.groupdict.get(condname)
                          if condgroup is None:
@@ -1005,7 +1001,7 @@ def parse_template(source, pattern):
                  name = ""
                  if not s.match("<"):
                      raise s.error("missing <")
-                name = s.getuntil(">")
+                name = s.getuntil(">", "group name")
                  if name.isidentifier():
                      try:
                          index = groupindex[name]
author	Serhiy Storchaka <storchaka@gmail.com>
	Thu, 8 Feb 2018 16:51:45 +0000 (18:51 +0200)
committer	Serhiy Storchaka <storchaka@gmail.com>
	Thu, 8 Feb 2018 16:51:45 +0000 (18:51 +0200)