From: Jonathan Eunice Date: Sat, 17 Jun 2017 07:28:17 +0000 (-0400) Subject: added \N{name} escapes to re patterns X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5f72f7a79f79b3c0caf5b88ab9ed7f2aa2ebd225;p=thirdparty%2FPython%2Fcpython.git added \N{name} escapes to re patterns --- diff --git a/Doc/library/re.rst b/Doc/library/re.rst index 0b9d9755f30e..6a0bbd328541 100644 --- a/Doc/library/re.rst +++ b/Doc/library/re.rst @@ -443,7 +443,7 @@ character ``'$'``. Most of the standard escapes supported by Python string literals are also accepted by the regular expression parser:: - \a \b \f \n + \a \b \f \n \N{name} \r \t \u \U \v \x \\ @@ -464,6 +464,9 @@ three digits in length. .. versionchanged:: 3.6 Unknown escapes consisting of ``'\'`` and an ASCII letter now are errors. +.. versionchanged:: 3.7 + The ``'\N{name}'`` escape sequence has been added. As in string literals, + it expands to the named Unicode character (e.g. ``'\N{EM DASH}'``). .. seealso:: diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 545252074f63..a6f726aef9b2 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -13,6 +13,7 @@ # XXX: show string offset and offending character for all errors from sre_constants import * +from ast import literal_eval SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" @@ -25,6 +26,11 @@ ASCIILETTERS = frozenset("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") WHITESPACE = frozenset(" \t\n\r\v\f") +UNICODE_NAME = ASCIILETTERS | DIGITS | frozenset(' -') +CLOSING_BRACE = frozenset("}") +OPENING_BRACE = frozenset("{") + + _REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT}) _UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY}) @@ -322,6 +328,17 @@ def _class_escape(source, escape): c = int(escape[2:], 16) chr(c) # raise ValueError for invalid code return LITERAL, c + elif c == "N" and source.istext: + # named unicode escape e.g. \N{EM DASH} + escape += source.getwhile(1, OPENING_BRACE) + escape += source.getwhile(100, UNICODE_NAME) + escape += source.getwhile(1, CLOSING_BRACE) + try: + c = ord(literal_eval('"%s"' % escape)) + except SyntaxError: + charname = escape[2:].strip('{}') + raise source.error("unknown Unicode character name %s" % charname, len(escape)) + return LITERAL, c elif c in OCTDIGITS: # octal escape (up to three digits) escape += source.getwhile(2, OCTDIGITS) @@ -370,6 +387,17 @@ def _escape(source, escape, state): c = int(escape[2:], 16) chr(c) # raise ValueError for invalid code return LITERAL, c + elif c == "N" and source.istext: + # named unicode escape e.g. \N{EM DASH} + escape += source.getwhile(1, OPENING_BRACE) + escape += source.getwhile(100, UNICODE_NAME) + escape += source.getwhile(1, CLOSING_BRACE) + try: + c = ord(literal_eval('"%s"' % escape)) + except SyntaxError: + charname = escape[2:].strip('{}') + raise source.error("unknown Unicode character name %s" % charname, len(escape)) + return LITERAL, c elif c == "0": # octal escape escape += source.getwhile(2, OCTDIGITS) diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 0ea5a2046964..a05c35a2a474 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -700,6 +700,39 @@ class ReTests(unittest.TestCase): with self.subTest(c): self.assertRaises(re.error, re.compile, '[\\%c]' % c) + def test_named_unicode_escapes(self): + # test individual Unicode named escapes + suites = [ + [ # basic matches + ['\u2014', r'\u2014', '\N{EM DASH}', + r'\N{EM DASH}'], # pattern + ['\u2014', '\N{EM DASH}', '—', '—and more'], # matches + ['\u2015', '\N{EN DASH}'] # no match + ], + [ # character set matches + ['[\u2014-\u2020]', r'[\u2014-\u2020]', + '[\N{EM DASH}-\N{DAGGER}]', r'[\N{EM DASH}-\N{DAGGER}]', + '[\u2014-\N{DAGGER}]', '[\N{EM DASH}-\u2020]',], # pattern + ['\u2014', '\N{EM DASH}', '—', '—and more', '\u2020', + '\N{DAGGER}', '†', '\u2017', '\N{DOUBLE LOW LINE}'], + ['\u2011', '\N{EN DASH}', '\u2013', 'xyz', '\u2021'] + ], + ] + + for patterns, match_yes, match_no in suites: + for pat in patterns: + for target in match_yes: + self.assertTrue(re.match(pat, target)) + for target in match_no: + self.assertIsNone(re.match(pat, target)) + + # test errors in \N{name} handling - only valid names should pass + badly_formed = [r'\N{BUBBA DASH}', r'\N{EM DASH', + r'\NEM DASH}', r'\NOGGIN'] + for bad in badly_formed: + with self.assertRaises(re.error): + re.compile(bad) + def test_string_boundaries(self): # See http://bugs.python.org/issue10713 self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),