gh-111259: Optimize complementary character sets in RE (GH-120742)
author     Serhiy Storchaka <storchaka@gmail.com>
           Thu, 20 Jun 2024 07:19:32 +0000 (10:19 +0300)
committer  GitHub <noreply@github.com>
           Thu, 20 Jun 2024 07:19:32 +0000 (07:19 +0000)
Patterns like "[\s\S]" or "\s|\S" which match any character are now compiled
to the same effective code as a dot with the DOTALL modifier ("(?s:.)").
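A quick way to check the equivalence claimed above, independent of the commit itself, is to compare match behaviour directly. The snippet below is only an illustrative sketch (the sample text is arbitrary); it verifies that each pattern matches every character, including the newline, exactly like "(?s:.)":

    import re

    # Each pattern below matches any single character, including newlines,
    # just like a DOTALL dot.
    text = "1x\n\t\u00e9"
    baseline = re.findall(r"(?s:.)", text)
    for pattern in (r"[\s\S]", r"[\d\D]", r"[\w\W]", r"\s|\S"):
        assert re.findall(pattern, text) == baseline == list(text)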

Lib/re/_compiler.py
Lib/re/_constants.py
Lib/test/test_re.py
Misc/NEWS.d/next/Library/2024-06-19-13-20-01.gh-issue-111259.Wki5PV.rst [new file with mode: 0644]

diff --git a/Lib/re/_compiler.py b/Lib/re/_compiler.py
index 7b888f877eb3dce94ccf4974eb3078dc41eed626..29109f8812ee7be84520d7ac51bda5d10485344c 100644
--- a/Lib/re/_compiler.py
+++ b/Lib/re/_compiler.py
@@ -28,6 +28,8 @@ _REPEATING_CODES = {
     POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
 }
 
+_CHARSET_ALL = [(NEGATE, None)]
+
 def _combine_flags(flags, add_flags, del_flags,
                    TYPE_FLAGS=_parser.TYPE_FLAGS):
     if add_flags & TYPE_FLAGS:
@@ -84,17 +86,22 @@ def _compile(code, pattern, flags):
                     code[skip] = _len(code) - skip
         elif op is IN:
             charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
-            if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
-                emit(IN_LOC_IGNORE)
-            elif not hascased:
-                emit(IN)
-            elif not fixes:  # ascii
-                emit(IN_IGNORE)
+            if not charset:
+                emit(FAILURE)
+            elif charset == _CHARSET_ALL:
+                emit(ANY_ALL)
             else:
-                emit(IN_UNI_IGNORE)
-            skip = _len(code); emit(0)
-            _compile_charset(charset, flags, code)
-            code[skip] = _len(code) - skip
+                if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
+                    emit(IN_LOC_IGNORE)
+                elif not hascased:
+                    emit(IN)
+                elif not fixes:  # ascii
+                    emit(IN_IGNORE)
+                else:
+                    emit(IN_UNI_IGNORE)
+                skip = _len(code); emit(0)
+                _compile_charset(charset, flags, code)
+                code[skip] = _len(code) - skip
         elif op is ANY:
             if flags & SRE_FLAG_DOTALL:
                 emit(ANY_ALL)
@@ -277,6 +284,10 @@ def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
                             charmap[i] = 1
                 elif op is NEGATE:
                     out.append((op, av))
+                elif op is CATEGORY and tail and (CATEGORY, CH_NEGATE[av]) in tail:
+                    # Optimize [\s\S] etc.
+                    out = [] if out else _CHARSET_ALL
+                    return out, False
                 else:
                     tail.append((op, av))
             except IndexError:
@@ -519,13 +530,18 @@ def _compile_info(code, pattern, flags):
     # look for a literal prefix
     prefix = []
     prefix_skip = 0
-    charset = [] # not used
+    charset = None # not used
     if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
         # look for literal prefix
         prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
         # if no prefix, look for charset prefix
         if not prefix:
             charset = _get_charset_prefix(pattern, flags)
+            if charset:
+                charset, hascased = _optimize_charset(charset)
+                assert not hascased
+                if charset == _CHARSET_ALL:
+                    charset = None
 ##     if prefix:
 ##         print("*** PREFIX", prefix, prefix_skip)
 ##     if charset:
@@ -560,8 +576,6 @@ def _compile_info(code, pattern, flags):
         # generate overlap table
         code.extend(_generate_overlap_table(prefix))
     elif charset:
-        charset, hascased = _optimize_charset(charset)
-        assert not hascased
         _compile_charset(charset, flags, code)
     code[skip] = len(code) - skip
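One way to observe the new compiler paths is to dump the compiled program with re.DEBUG; this is an illustrative sketch assuming a recent CPython where the re.DEBUG dump also prints the compiled code (the exact formatting varies between versions):

    import re

    # A set that covers every character now compiles to a single ANY_ALL
    # opcode instead of an IN instruction carrying a charset.
    re.compile(r"[\s\S]", re.DEBUG)

    # A negated all-covering set can never match, so the compiler now
    # emits FAILURE for it.
    re.compile(r"[^\s\S]", re.DEBUG)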
 
diff --git a/Lib/re/_constants.py b/Lib/re/_constants.py
index 4cb88c96d927151c644566559400431c56005773..d6f32302d37b2db934a93a520717304fbb94a27e 100644
--- a/Lib/re/_constants.py
+++ b/Lib/re/_constants.py
@@ -206,6 +206,8 @@ CH_UNICODE = {
     CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
 }
 
+CH_NEGATE = dict(zip(CHCODES[::2] + CHCODES[1::2], CHCODES[1::2] + CHCODES[::2]))
+
 # flags
 SRE_FLAG_IGNORECASE = 2 # case insensitive
 SRE_FLAG_LOCALE = 4 # honour system locale
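The CH_NEGATE table added above maps each character-category code to its complement; the construction relies on CHCODES alternating a category with its negated counterpart. The standalone sketch below shows the same slicing trick with hypothetical stand-in names rather than the real opcode objects:

    # Stand-ins for CHCODES, which alternates CATEGORY_X / CATEGORY_NOT_X.
    codes = ["DIGIT", "NOT_DIGIT", "SPACE", "NOT_SPACE", "WORD", "NOT_WORD"]

    # Even-indexed entries map to their odd-indexed complements and back.
    negate = dict(zip(codes[::2] + codes[1::2], codes[1::2] + codes[::2]))

    assert negate["SPACE"] == "NOT_SPACE"
    assert negate["NOT_SPACE"] == "SPACE"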
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py
index 1f2ab6028b588e28dbf9e8383d8b977e42b21ac8..a93c2aef170fc84447c3c60d1419a2588c7b27c2 100644
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -2473,6 +2473,24 @@ class ReTests(unittest.TestCase):
     def test_fail(self):
         self.assertEqual(re.search(r'12(?!)|3', '123')[0], '3')
 
+    def test_character_set_any(self):
+        # The union of complementary character sets matches any character
+        # and is equivalent to "(?s:.)".
+        s = '1x\n'
+        for p in r'[\s\S]', r'[\d\D]', r'[\w\W]', r'[\S\s]', r'\s|\S':
+            with self.subTest(pattern=p):
+                self.assertEqual(re.findall(p, s), list(s))
+                self.assertEqual(re.fullmatch('(?:' + p + ')+', s).group(), s)
+
+    def test_character_set_none(self):
+        # Negation of the union of complementary character sets does not match
+        # any character.
+        s = '1x\n'
+        for p in r'[^\s\S]', r'[^\d\D]', r'[^\w\W]', r'[^\S\s]':
+            with self.subTest(pattern=p):
+                self.assertIsNone(re.search(p, s))
+                self.assertIsNone(re.search('(?s:.)' + p, s))
+
 
 def get_debug_out(pat):
     with captured_stdout() as out:
diff --git a/Misc/NEWS.d/next/Library/2024-06-19-13-20-01.gh-issue-111259.Wki5PV.rst b/Misc/NEWS.d/next/Library/2024-06-19-13-20-01.gh-issue-111259.Wki5PV.rst
new file mode 100644
index 0000000..91ed5f5
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2024-06-19-13-20-01.gh-issue-111259.Wki5PV.rst
@@ -0,0 +1,3 @@
+:mod:`re` now handles patterns like ``"[\s\S]"`` or ``"\s|\S"`` which match
+any character as effectively as a dot with the ``DOTALL`` modifier
+(``"(?s:.)"``).
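A rough way to check the "as effectively as" wording on a given interpreter is to time the equivalent patterns side by side. The numbers are machine- and version-dependent; this is only an illustrative sketch:

    import re
    import timeit

    data = "spam eggs\n" * 1000
    for pattern in (r"(?s:.+)", r"[\s\S]+", r"(?:\s|\S)+"):
        compiled = re.compile(pattern)
        t = timeit.timeit(lambda: compiled.fullmatch(data), number=2000)
        print(f"{pattern:12} {t:.3f}s")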