gh-140797: Forbid capturing groups in re.Scanner lexicon patterns (GH-140944)

author Abhishek Tiwari <Abhi210@users.noreply.github.com>

Tue, 4 Nov 2025 10:54:28 +0000 (16:24 +0530)

committer GitHub <noreply@github.com>

Tue, 4 Nov 2025 10:54:28 +0000 (12:54 +0200)
author Abhishek Tiwari <Abhi210@users.noreply.github.com>
Tue, 4 Nov 2025 10:54:28 +0000 (16:24 +0530)
committer GitHub <noreply@github.com>
Tue, 4 Nov 2025 10:54:28 +0000 (12:54 +0200)
diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py

index a5316391297f4c95cdd9341c9606b42f983bc8eb..ecec16e9005f3b7b4f81f25bed988a599811c7e0 100644 (file)
--- a/Lib/re/__init__.py
+++ b/Lib/re/__init__.py
@@ -397,9 +397,12 @@ class Scanner:
          s = _parser.State()
          s.flags = flags
          for phrase, action in lexicon:
+            sub_pattern = _parser.parse(phrase, flags)
+            if sub_pattern.state.groups != 1:
+                raise ValueError("Cannot use capturing groups in re.Scanner")
              gid = s.opengroup()
              p.append(_parser.SubPattern(s, [
-                (SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))),
+                (SUBPATTERN, (gid, 0, 0, sub_pattern)),
                  ]))
              s.closegroup(gid, p[-1])
          p = _parser.SubPattern(s, [(BRANCH, (None, p))])
diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py

index 5fc95087f2b6addde948b525d5f526bd5caea1fc..9f6f04bf6b83474587b93f66e8bd0b601d1ab4f9 100644 (file)
--- a/Lib/test/test_re.py
+++ b/Lib/test/test_re.py
@@ -1639,6 +1639,24 @@ class ReTests(unittest.TestCase):
                           (['sum', 'op=', 3, 'op*', 'foo', 'op+', 312.5,
                             'op+', 'bar'], ''))
  
+    def test_bug_gh140797(self):
+        # gh-140797: re.Scanner rejects patterns with capturing groups.
+
+        msg = r"Cannot use capturing groups in re\.Scanner"
+        # A plain (numbered) capturing group must raise ValueError.
+        with self.assertRaisesRegex(ValueError, msg):
+            Scanner([("(a)b", None)])
+
+        # A named group must raise the same ValueError.
+        with self.assertRaisesRegex(ValueError, msg):
+            Scanner([("(?P<name>a)", None)])
+
+        # Non-capturing groups are accepted and scanning works as usual.
+        s = Scanner([("(?:a)b", lambda scanner, token: token)])
+        result, rem = s.scan("ab")
+        self.assertEqual(result,['ab'])
+        self.assertEqual(rem,'')
+
      def test_bug_448951(self):
          # bug 448951 (similar to 429357, but with single char match)
          # (Also test greedy matches.)
diff --git a/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst b/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst

new file mode 100644 (file)

index 0000000..493b740
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst
@@ -0,0 +1,2 @@
+The undocumented :class:`!re.Scanner` class now forbids regular expressions containing capturing groups in its lexicon patterns.
+Patterns using capturing groups could previously lead to crashes with a segmentation fault. Use non-capturing groups ``(?:...)`` instead.
author	Abhishek Tiwari <Abhi210@users.noreply.github.com>
	Tue, 4 Nov 2025 10:54:28 +0000 (16:24 +0530)
committer	GitHub <noreply@github.com>
	Tue, 4 Nov 2025 10:54:28 +0000 (12:54 +0200)
Lib/re/__init__.py		patch \| blob \| blame \| history
Lib/test/test_re.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2025-11-03-16-23-54.gh-issue-140797.DuFEeR.rst	[new file with mode: 0644]	patch \| blob