git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
added \N{name} escapes to re patterns
author: Jonathan Eunice <jonathan.eunice@gmail.com>
Sat, 17 Jun 2017 07:28:17 +0000 (03:28 -0400)
committer: Jonathan Eunice <jonathan.eunice@gmail.com>
Sat, 17 Jun 2017 07:48:03 +0000 (03:48 -0400)
Doc/library/re.rst
Lib/sre_parse.py
Lib/test/test_re.py

index 0b9d9755f30ec658a068f4415edd24a376dc54a1..6a0bbd328541d41ff8b3c2d5b6926ce54e4604a0 100644 (file)
@@ -443,7 +443,7 @@ character ``'$'``.
 Most of the standard escapes supported by Python string literals are also
 accepted by the regular expression parser::
 
-   \a      \b      \f      \n
+   \a      \b      \f      \n      \N{name}
    \r      \t      \u      \U
    \v      \x      \\
 
@@ -464,6 +464,9 @@ three digits in length.
 .. versionchanged:: 3.6
    Unknown escapes consisting of ``'\'`` and an ASCII letter now are errors.
 
+.. versionchanged:: 3.7
+   The ``'\N{name}'`` escape sequence has been added. As in string literals,
+   it expands to the named Unicode character (e.g. ``'\N{EM DASH}'``).
 
 .. seealso::
 
index 545252074f63d1101ff53897b5d6789e7766f35e..a6f726aef9b28d713c092cad656d738d6981d5bf 100644 (file)
@@ -13,6 +13,7 @@
 # XXX: show string offset and offending character for all errors
 
 from sre_constants import *
+from ast import literal_eval
 
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS = "*+?{"
@@ -25,6 +26,11 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 
 WHITESPACE = frozenset(" \t\n\r\v\f")
 
+UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -')
+CLOSING_BRACE = frozenset("}")
+OPENING_BRACE = frozenset("{")
+
+
 _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
 _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
 
@@ -322,6 +328,17 @@ def _class_escape(source, escape):
             c = int(escape[2:], 16)
             chr(c) # raise ValueError for invalid code
             return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            escape += source.getwhile(1, OPENING_BRACE)
+            escape += source.getwhile(100, UNICODE_NAME)
+            escape += source.getwhile(1, CLOSING_BRACE)
+            try:
+                c = ord(literal_eval('"%s"' % escape))
+            except SyntaxError:
+                charname = escape[2:].strip('{}')
+                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+            return LITERAL, c
         elif c in OCTDIGITS:
             # octal escape (up to three digits)
             escape += source.getwhile(2, OCTDIGITS)
@@ -370,6 +387,17 @@ def _escape(source, escape, state):
             c = int(escape[2:], 16)
             chr(c) # raise ValueError for invalid code
             return LITERAL, c
+        elif c == "N" and source.istext:
+            # named unicode escape e.g. \N{EM DASH}
+            escape += source.getwhile(1, OPENING_BRACE)
+            escape += source.getwhile(100, UNICODE_NAME)
+            escape += source.getwhile(1, CLOSING_BRACE)
+            try:
+                c = ord(literal_eval('"%s"' % escape))
+            except SyntaxError:
+                charname = escape[2:].strip('{}')
+                raise source.error("unknown Unicode character name %s" % charname, len(escape))
+            return LITERAL, c
         elif c == "0":
             # octal escape
             escape += source.getwhile(2, OCTDIGITS)
index 0ea5a20469646b181025b3a39d0967c01a9ae168..a05c35a2a4740e37308ac719d8db72181ccc8c3d 100644 (file)
@@ -700,6 +700,39 @@ class ReTests(unittest.TestCase):
             with self.subTest(c):
                 self.assertRaises(re.error, re.compile, '[\\%c]' % c)
 
+    def test_named_unicode_escapes(self):
+        # test individual Unicode named escapes
+        suites = [
+            [   # basic matches
+                ['\u2014', r'\u2014', '\N{EM DASH}',
+                 r'\N{EM DASH}'],                               # pattern
+                ['\u2014', '\N{EM DASH}', '—', '—and more'],    # matches
+                ['\u2015', '\N{EN DASH}']                       # no match
+            ],
+            [   # character set matches
+                ['[\u2014-\u2020]', r'[\u2014-\u2020]',
+                 '[\N{EM DASH}-\N{DAGGER}]', r'[\N{EM DASH}-\N{DAGGER}]',
+                 '[\u2014-\N{DAGGER}]', '[\N{EM DASH}-\u2020]',],                               # pattern
+                ['\u2014', '\N{EM DASH}', '—', '—and more', '\u2020',
+                 '\N{DAGGER}', '†', '\u2017', '\N{DOUBLE LOW LINE}'],
+                ['\u2011', '\N{EN DASH}', '\u2013', 'xyz', '\u2021']
+            ],
+        ]
+
+        for patterns, match_yes, match_no in suites:
+            for pat in patterns:
+                for target in match_yes:
+                    self.assertTrue(re.match(pat, target))
+                for target in match_no:
+                    self.assertIsNone(re.match(pat, target))
+
+        # test errors in \N{name} handling - only valid names should pass
+        badly_formed = [r'\N{BUBBA DASH}', r'\N{EM DASH',
+                        r'\NEM DASH}', r'\NOGGIN']
+        for bad in badly_formed:
+            with self.assertRaises(re.error):
+                re.compile(bad)
+
     def test_string_boundaries(self):
         # See http://bugs.python.org/issue10713
         self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),