]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-109747: Improve errors for unsupported look-behind patterns (GH-109859)
authorSerhiy Storchaka <storchaka@gmail.com>
Sat, 14 Oct 2023 06:13:02 +0000 (09:13 +0300)
committerGitHub <noreply@github.com>
Sat, 14 Oct 2023 06:13:02 +0000 (09:13 +0300)
Now re.error is raised instead of OverflowError or RuntimeError for
too large width of look-behind pattern.

The limit is increased to 2**32-1 (was 2**31-1).

Lib/re/_compiler.py
Lib/re/_parser.py
Lib/test/test_re.py
Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst [new file with mode: 0644]
Modules/_sre/sre.c
Modules/_sre/sre_lib.h

index d0a4c55caf6e41ce83d6124cbb5bedff723bb915..f87712d6d6f9f896054299e3ea60c13e829bd15d 100644 (file)
@@ -147,6 +147,8 @@ def _compile(code, pattern, flags):
                 emit(0) # look ahead
             else:
                 lo, hi = av[1].getwidth()
+                if lo > MAXCODE:
+                    raise error("looks too much behind")
                 if lo != hi:
                     raise error("look-behind requires fixed-width pattern")
                 emit(lo) # look behind
@@ -547,7 +549,7 @@ def _compile_info(code, pattern, flags):
     else:
         emit(MAXCODE)
         prefix = prefix[:MAXCODE]
-    emit(min(hi, MAXCODE))
+    emit(hi)
     # add literal prefix
     if prefix:
         emit(len(prefix)) # length
index d00b7e67d55958201175bf2d3598336032ba2806..f3c779340fe23035a618fd89d6cca2ff8181bc92 100644 (file)
@@ -67,6 +67,10 @@ FLAGS = {
 TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
 GLOBAL_FLAGS = SRE_FLAG_DEBUG
 
+# Maximal value returned by SubPattern.getwidth().
+# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
+MAXWIDTH = 1 << 64
+
 class State:
     # keeps track of state for parsing
     def __init__(self):
@@ -177,7 +181,7 @@ class SubPattern:
         lo = hi = 0
         for op, av in self.data:
             if op is BRANCH:
-                i = MAXREPEAT - 1
+                i = MAXWIDTH
                 j = 0
                 for av in av[1]:
                     l, h = av.getwidth()
@@ -196,7 +200,10 @@ class SubPattern:
             elif op in _REPEATCODES:
                 i, j = av[2].getwidth()
                 lo = lo + i * av[0]
-                hi = hi + j * av[1]
+                if av[1] == MAXREPEAT and j:
+                    hi = MAXWIDTH
+                else:
+                    hi = hi + j * av[1]
             elif op in _UNITCODES:
                 lo = lo + 1
                 hi = hi + 1
@@ -216,7 +223,7 @@ class SubPattern:
                 hi = hi + j
             elif op is SUCCESS:
                 break
-        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
+        self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
         return self.width
 
 class Tokenizer:
index 301d4a516568b21a5c8190760c3d5d3b289fcedf..1eca22f45378df1b3ecb58c18b1b9c0c3a588bf8 100644 (file)
@@ -1861,6 +1861,29 @@ class ReTests(unittest.TestCase):
         self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
         self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
 
+    def test_look_behind_overflow(self):
+        string = "x" * 2_500_000
+        p1 = r"(?<=((.{%d}){%d}){%d})"
+        p2 = r"(?<!((.{%d}){%d}){%d})"
+        # Test that the templates are valid and look-behind with width 2**21
+        # (larger than sys.maxunicode) are supported.
+        self.assertEqual(re.search(p1 % (2**7, 2**7, 2**7), string).span(),
+                         (2**21, 2**21))
+        self.assertEqual(re.search(p2 % (2**7, 2**7, 2**7), string).span(),
+                         (0, 0))
+        # Test that 2**22 is accepted as a repetition number and look-behind
+        # width.
+        re.compile(p1 % (2**22, 1, 1))
+        re.compile(p1 % (1, 2**22, 1))
+        re.compile(p1 % (1, 1, 2**22))
+        re.compile(p2 % (2**22, 1, 1))
+        re.compile(p2 % (1, 2**22, 1))
+        re.compile(p2 % (1, 1, 2**22))
+        # But 2**66 is too large for look-behind width.
+        errmsg = "looks too much behind"
+        self.assertRaisesRegex(re.error, errmsg, re.compile, p1 % (2**22, 2**22, 2**22))
+        self.assertRaisesRegex(re.error, errmsg, re.compile, p2 % (2**22, 2**22, 2**22))
+
     def test_backref_group_name_in_exception(self):
         # Issue 17341: Poor error message when compiling invalid regex
         self.checkPatternError('(?P=<foo>)',
diff --git a/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst b/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst
new file mode 100644 (file)
index 0000000..b64ba62
--- /dev/null
@@ -0,0 +1,3 @@
+Improve errors for unsupported look-behind patterns. Now re.error is raised
+instead of OverflowError or RuntimeError for too large width of look-behind
+pattern.
index 798732ccddc99aa45e6d9fa868a19f6dcd3b9a81..0f134b194de6f67bb832a4c1ba7f7eb5ced6edd4 100644 (file)
@@ -2070,8 +2070,6 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
             GET_SKIP;
             GET_ARG; /* 0 for lookahead, width for lookbehind */
             code--; /* Back up over arg to simplify math below */
-            if (arg & 0x80000000)
-                FAIL; /* Width too large */
             /* Stop 1 before the end; we check the SUCCESS below */
             if (_validate_inner(code+1, code+skip-2, groups))
                 FAIL;
index 3c805aeeca09743ab41b7e4fc50360fbbb914e76..92dd725c70fd3891d08f2db3ff8902255200cfbd 100644 (file)
@@ -591,8 +591,8 @@ entrance:
         /* optimization info block */
         /* <INFO> <1=skip> <2=flags> <3=min> ... */
         if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
-            TRACE(("reject (got %zd chars, need %zd)\n",
-                   end - ptr, (Py_ssize_t) pattern[3]));
+            TRACE(("reject (got %tu chars, need %zu)\n",
+                   end - ptr, (size_t) pattern[3]));
             RETURN_FAILURE;
         }
         pattern += pattern[1] + 1;
@@ -1509,7 +1509,7 @@ dispatch:
             /* <ASSERT> <skip> <back> <pattern> */
             TRACE(("|%p|%p|ASSERT %d\n", pattern,
                    ptr, pattern[1]));
-            if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
+            if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1])
                 RETURN_FAILURE;
             state->ptr = ptr - pattern[1];
             DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
@@ -1522,7 +1522,7 @@ dispatch:
             /* <ASSERT_NOT> <skip> <back> <pattern> */
             TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
                    ptr, pattern[1]));
-            if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
+            if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) {
                 state->ptr = ptr - pattern[1];
                 LASTMARK_SAVE();
                 if (state->repeat)
@@ -1658,9 +1658,9 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
 
         flags = pattern[2];
 
-        if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
-            TRACE(("reject (got %u chars, need %u)\n",
-                   (unsigned int)(end - ptr), pattern[3]));
+        if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
+            TRACE(("reject (got %tu chars, need %zu)\n",
+                   end - ptr, (size_t) pattern[3]));
             return 0;
         }
         if (pattern[3] > 1) {