added \N{name} escapes to re patterns

author Jonathan Eunice <jonathan.eunice@gmail.com>

Sat, 17 Jun 2017 07:28:17 +0000 (03:28 -0400)

committer Jonathan Eunice <jonathan.eunice@gmail.com>

Sat, 17 Jun 2017 07:48:03 +0000 (03:48 -0400)
author Jonathan Eunice <jonathan.eunice@gmail.com>
Sat, 17 Jun 2017 07:28:17 +0000 (03:28 -0400)
committer Jonathan Eunice <jonathan.eunice@gmail.com>
Sat, 17 Jun 2017 07:48:03 +0000 (03:48 -0400)
diff --git a/Doc/library/re.rst b/Doc/library/re.rst

index 0b9d9755f30ec658a068f4415edd24a376dc54a1..6a0bbd328541d41ff8b3c2d5b6926ce54e4604a0 100644 (file)
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -443,7 +443,7 @@ character ``'$'``.
  Most of the standard escapes supported by Python string literals are also
  accepted by the regular expression parser::
  
-   \a      \b      \f      \n
+   \a      \b      \f      \n      \N{name}
     \r      \t      \u      \U
     \v      \x      \\
  
@@ -464,6 +464,9 @@ three digits in length.
  .. versionchanged:: 3.6
     Unknown escapes consisting of ``'\'`` and an ASCII letter now are errors.
  
+.. versionchanged:: 3.7
+   The ``'\N{name}'`` escape sequence has been added for Unicode (str) patterns.
+   As in string literals, it expands to the named Unicode character (e.g. ``'\N{EM DASH}'``).
  
  .. seealso::
  
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py

index 545252074f63d1101ff53897b5d6789e7766f35e..a6f726aef9b28d713c092cad656d738d6981d5bf 100644 (file)
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -13,6 +13,7 @@
  # XXX: show string offset and offending character for all errors
  
  from sre_constants import *
+from ast import literal_eval
  
  SPECIAL_CHARS = ".\\[{()*+?^$|"
  REPEAT_CHARS = "*+?{"
@@ -25,6 +26,11 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
  
  WHITESPACE = frozenset(" \t\n\r\v\f")
  
+UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -')
+CLOSING_BRACE = frozenset("}")
+OPENING_BRACE = frozenset("{")
+
+
  _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
  _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
  
@@ -322,6 +328,17 @@ def _class_escape(source, escape):
              c = int(escape[2:], 16)
              chr(c) # raise ValueError for invalid code
              return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            escape += source.getwhile(1, OPENING_BRACE)
+            escape += source.getwhile(100, UNICODE_NAME)
+            escape += source.getwhile(1, CLOSING_BRACE)
+            try:
+                c = ord(literal_eval('"%s"' % escape))
+            except SyntaxError:
+                charname = escape[2:].strip('{}')
+                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+            return LITERAL, c
          elif c in OCTDIGITS:
              # octal escape (up to three digits)
              escape += source.getwhile(2, OCTDIGITS)
@@ -370,6 +387,17 @@ def _escape(source, escape, state):
              c = int(escape[2:], 16)
              chr(c) # raise ValueError for invalid code
              return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            escape += source.getwhile(1, OPENING_BRACE)
+            escape += source.getwhile(100, UNICODE_NAME)
+            escape += source.getwhile(1, CLOSING_BRACE)
+            try:
+                c = ord(literal_eval('"%s"' % escape))
+            except SyntaxError:
+                charname = escape[2:].strip('{}')
+                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+            return LITERAL, c
          elif c == "0":
              # octal escape
              escape += source.getwhile(2, OCTDIGITS)
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index 0ea5a20469646b181025b3a39d0967c01a9ae168..a05c35a2a4740e37308ac719d8db72181ccc8c3d 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -700,6 +700,39 @@ class ReTests(unittest.TestCase):
              with self.subTest(c):
                  self.assertRaises(re.error, re.compile, '[\\%c]' % c)
  
+    def test_named_unicode_escapes(self):
+        # test individual Unicode named escapes
+        suites = [
+            [   # basic matches
+                ['\u2014', r'\u2014', '\N{EM DASH}',
+                 r'\N{EM DASH}'],                               # pattern
+                ['\u2014', '\N{EM DASH}', '—', '—and more'],    # matches
+                ['\u2015', '\N{EN DASH}']                       # no match
+            ],
+            [   # character set matches
+                ['[\u2014-\u2020]', r'[\u2014-\u2020]',
+                 '[\N{EM DASH}-\N{DAGGER}]', r'[\N{EM DASH}-\N{DAGGER}]',
+                 '[\u2014-\N{DAGGER}]', '[\N{EM DASH}-\u2020]',],                               # pattern
+                ['\u2014', '\N{EM DASH}', '—', '—and more', '\u2020',
+                 '\N{DAGGER}', '†', '\u2017', '\N{DOUBLE LOW LINE}'],
+                ['\u2011', '\N{EN DASH}', '\u2013', 'xyz', '\u2021']
+            ],
+        ]
+
+        for patterns, match_yes, match_no in suites:
+            for pat in patterns:
+                for target in match_yes:
+                    self.assertTrue(re.match(pat, target))
+                for target in match_no:
+                    self.assertIsNone(re.match(pat, target))
+
+        # test errors in \N{name} handling - only valid names should pass
+        badly_formed = [r'\N{BUBBA DASH}', r'\N{EM DASH',
+                        r'\NEM DASH}', r'\NOGGIN']
+        for bad in badly_formed:
+            with self.assertRaises(re.error):
+                re.compile(bad)
+
      def test_string_boundaries(self):
          # See http://bugs.python.org/issue10713
          self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
author	Jonathan Eunice <jonathan.eunice@gmail.com>
	Sat, 17 Jun 2017 07:28:17 +0000 (03:28 -0400)
committer	Jonathan Eunice <jonathan.eunice@gmail.com>
	Sat, 17 Jun 2017 07:48:03 +0000 (03:48 -0400)
Doc/library/re.rst		patch \| blob \| blame \| history
Lib/sre_parse.py		patch \| blob \| blame \| history
Lib/test/test_re.py		patch \| blob \| blame \| history