]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
[3.11] gh-109747: Improve errors for unsupported look-behind patterns (GH-109859...
authorMiss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Sat, 14 Oct 2023 06:51:24 +0000 (08:51 +0200)
committerGitHub <noreply@github.com>
Sat, 14 Oct 2023 06:51:24 +0000 (06:51 +0000)
Now re.error is raised instead of OverflowError or RuntimeError for
too large width of look-behind pattern.

The limit is increased to 2**32-1 (was 2**31-1).
(cherry picked from commit e2b3d831fd2824d8a5713e3ed2a64aad0fb6b62d)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
Lib/re/_compiler.py
Lib/re/_parser.py
Lib/test/test_re.py
Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst [new file with mode: 0644]
Modules/_sre/sre.c
Modules/_sre/sre_lib.h

index d8e0d2fdefdccad7150fe369a673733a60754ca1..285c21936f2cfab00548b6196fbd7f9209b361a4 100644 (file)
@@ -149,6 +149,8 @@ def _compile(code, pattern, flags):
                 emit(0) # look ahead
             else:
                 lo, hi = av[1].getwidth()
+                if lo > MAXCODE:
+                    raise error("looks too much behind")
                 if lo != hi:
                     raise error("look-behind requires fixed-width pattern")
                 emit(lo) # look behind
@@ -549,7 +551,7 @@ def _compile_info(code, pattern, flags):
     else:
         emit(MAXCODE)
         prefix = prefix[:MAXCODE]
-    emit(min(hi, MAXCODE))
+    emit(hi)
     # add literal prefix
     if prefix:
         emit(len(prefix)) # length
index 6b715f54032f916c7dc80866aec9240bf5c80200..c795a7fb00fc97aa3d474d52071df6ff86cb886f 100644 (file)
@@ -68,6 +68,10 @@ FLAGS = {
 TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
 GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
 
+# Maximal value returned by SubPattern.getwidth().
+# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
+MAXWIDTH = 1 << 64
+
 class State:
     # keeps track of state for parsing
     def __init__(self):
@@ -178,7 +182,7 @@ class SubPattern:
         lo = hi = 0
         for op, av in self.data:
             if op is BRANCH:
-                i = MAXREPEAT - 1
+                i = MAXWIDTH
                 j = 0
                 for av in av[1]:
                     l, h = av.getwidth()
@@ -197,7 +201,10 @@ class SubPattern:
             elif op in _REPEATCODES:
                 i, j = av[2].getwidth()
                 lo = lo + i * av[0]
-                hi = hi + j * av[1]
+                if av[1] == MAXREPEAT and j:
+                    hi = MAXWIDTH
+                else:
+                    hi = hi + j * av[1]
             elif op in _UNITCODES:
                 lo = lo + 1
                 hi = hi + 1
@@ -217,7 +224,7 @@ class SubPattern:
                 hi = hi + j
             elif op is SUCCESS:
                 break
-        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
+        self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
         return self.width
 
 class Tokenizer:
index b41bcd49385453624d366186124c36e6e8b29d03..3c02d005b49a2abce2efad8d39e5ccf5b4658c31 100644 (file)
@@ -1830,6 +1830,29 @@ class ReTests(unittest.TestCase):
         self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
         self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
 
+    def test_look_behind_overflow(self):
+        string = "x" * 2_500_000
+        p1 = r"(?<=((.{%d}){%d}){%d})"
+        p2 = r"(?<!((.{%d}){%d}){%d})"
+        # Test that the templates are valid and look-behind with width 2**21
+        # (larger than sys.maxunicode) are supported.
+        self.assertEqual(re.search(p1 % (2**7, 2**7, 2**7), string).span(),
+                         (2**21, 2**21))
+        self.assertEqual(re.search(p2 % (2**7, 2**7, 2**7), string).span(),
+                         (0, 0))
+        # Test that 2**22 is accepted as a repetition number and look-behind
+        # width.
+        re.compile(p1 % (2**22, 1, 1))
+        re.compile(p1 % (1, 2**22, 1))
+        re.compile(p1 % (1, 1, 2**22))
+        re.compile(p2 % (2**22, 1, 1))
+        re.compile(p2 % (1, 2**22, 1))
+        re.compile(p2 % (1, 1, 2**22))
+        # But 2**66 is too large for look-behind width.
+        errmsg = "looks too much behind"
+        self.assertRaisesRegex(re.error, errmsg, re.compile, p1 % (2**22, 2**22, 2**22))
+        self.assertRaisesRegex(re.error, errmsg, re.compile, p2 % (2**22, 2**22, 2**22))
+
     def test_backref_group_name_in_exception(self):
         # Issue 17341: Poor error message when compiling invalid regex
         self.checkPatternError('(?P=<foo>)',
diff --git a/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst b/Misc/NEWS.d/next/Library/2023-09-25-20-05-41.gh-issue-109747._cRJH8.rst
new file mode 100644 (file)
index 0000000..b64ba62
--- /dev/null
@@ -0,0 +1,3 @@
+Improve errors for unsupported look-behind patterns. Now re.error is raised
+instead of OverflowError or RuntimeError for too large width of look-behind
+pattern.
index 81629f0007e6a15f3bd3508c08a4012ec902c659..3bad9d8d0ffa2884c0a3b50a89362e8b1e38ac7f 100644 (file)
@@ -1939,8 +1939,6 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
             GET_SKIP;
             GET_ARG; /* 0 for lookahead, width for lookbehind */
             code--; /* Back up over arg to simplify math below */
-            if (arg & 0x80000000)
-                FAIL; /* Width too large */
             /* Stop 1 before the end; we check the SUCCESS below */
             if (_validate_inner(code+1, code+skip-2, groups))
                 FAIL;
index f8d556b2db85c0576aafc968668bd73691a6054d..95c1ada908d222e3e2dff07efe7744e4146ceadc 100644 (file)
@@ -589,8 +589,8 @@ entrance:
         /* optimization info block */
         /* <INFO> <1=skip> <2=flags> <3=min> ... */
         if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
-            TRACE(("reject (got %zd chars, need %zd)\n",
-                   end - ptr, (Py_ssize_t) pattern[3]));
+            TRACE(("reject (got %tu chars, need %zu)\n",
+                   end - ptr, (size_t) pattern[3]));
             RETURN_FAILURE;
         }
         pattern += pattern[1] + 1;
@@ -1507,7 +1507,7 @@ dispatch:
             /* <ASSERT> <skip> <back> <pattern> */
             TRACE(("|%p|%p|ASSERT %d\n", pattern,
                    ptr, pattern[1]));
-            if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
+            if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1])
                 RETURN_FAILURE;
             state->ptr = ptr - pattern[1];
             DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
@@ -1520,7 +1520,7 @@ dispatch:
             /* <ASSERT_NOT> <skip> <back> <pattern> */
             TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
                    ptr, pattern[1]));
-            if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
+            if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) {
                 state->ptr = ptr - pattern[1];
                 LASTMARK_SAVE();
                 if (state->repeat)
@@ -1655,9 +1655,9 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
 
         flags = pattern[2];
 
-        if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
-            TRACE(("reject (got %u chars, need %u)\n",
-                   (unsigned int)(end - ptr), pattern[3]));
+        if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
+            TRACE(("reject (got %tu chars, need %zu)\n",
+                   end - ptr, (size_t) pattern[3]));
             return 0;
         }
         if (pattern[3] > 1) {