emit(0) # look ahead
else:
lo, hi = av[1].getwidth()
+ if lo > MAXCODE:
+ raise error("looks too much behind")
if lo != hi:
raise error("look-behind requires fixed-width pattern")
emit(lo) # look behind
else:
emit(MAXCODE)
prefix = prefix[:MAXCODE]
- emit(min(hi, MAXCODE))
+ emit(hi)
# add literal prefix
if prefix:
emit(len(prefix)) # length
TYPE_FLAGS = SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE
GLOBAL_FLAGS = SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE
+# Maximal value returned by SubPattern.getwidth().
+# Must be larger than MAXREPEAT, MAXCODE and sys.maxsize.
+MAXWIDTH = 1 << 64
+
class State:
# keeps track of state for parsing
def __init__(self):
lo = hi = 0
for op, av in self.data:
if op is BRANCH:
- i = MAXREPEAT - 1
+ i = MAXWIDTH
j = 0
for av in av[1]:
l, h = av.getwidth()
elif op in _REPEATCODES:
i, j = av[2].getwidth()
lo = lo + i * av[0]
- hi = hi + j * av[1]
+ if av[1] == MAXREPEAT and j:
+ hi = MAXWIDTH
+ else:
+ hi = hi + j * av[1]
elif op in _UNITCODES:
lo = lo + 1
hi = hi + 1
hi = hi + j
elif op is SUCCESS:
break
- self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
+ self.width = min(lo, MAXWIDTH), min(hi, MAXWIDTH)
return self.width
class Tokenizer:
self.assertRaises(OverflowError, re.compile, r".{%d,}?" % 2**128)
self.assertRaises(OverflowError, re.compile, r".{%d,%d}" % (2**129, 2**128))
+ def test_look_behind_overflow(self):
+ string = "x" * 2_500_000
+ p1 = r"(?<=((.{%d}){%d}){%d})"
+ p2 = r"(?<!((.{%d}){%d}){%d})"
+ # Test that the templates are valid and that look-behinds of width 2**21
+ # (larger than sys.maxunicode) are supported.
+ self.assertEqual(re.search(p1 % (2**7, 2**7, 2**7), string).span(),
+ (2**21, 2**21))
+ self.assertEqual(re.search(p2 % (2**7, 2**7, 2**7), string).span(),
+ (0, 0))
+ # Test that 2**22 is accepted as a repetition number and look-behind
+ # width.
+ re.compile(p1 % (2**22, 1, 1))
+ re.compile(p1 % (1, 2**22, 1))
+ re.compile(p1 % (1, 1, 2**22))
+ re.compile(p2 % (2**22, 1, 1))
+ re.compile(p2 % (1, 2**22, 1))
+ re.compile(p2 % (1, 1, 2**22))
+ # But 2**66 is too large for look-behind width.
+ errmsg = "looks too much behind"
+ self.assertRaisesRegex(re.error, errmsg, re.compile, p1 % (2**22, 2**22, 2**22))
+ self.assertRaisesRegex(re.error, errmsg, re.compile, p2 % (2**22, 2**22, 2**22))
+
def test_backref_group_name_in_exception(self):
# Issue 17341: Poor error message when compiling invalid regex
self.checkPatternError('(?P=<foo>)',
--- /dev/null
+Improve errors for unsupported look-behind patterns. re.error is now raised
+instead of OverflowError or RuntimeError when the width of a look-behind
+pattern is too large.
GET_SKIP;
GET_ARG; /* 0 for lookahead, width for lookbehind */
code--; /* Back up over arg to simplify math below */
- if (arg & 0x80000000)
- FAIL; /* Width too large */
/* Stop 1 before the end; we check the SUCCESS below */
if (_validate_inner(code+1, code+skip-2, groups))
FAIL;
/* optimization info block */
/* <INFO> <1=skip> <2=flags> <3=min> ... */
if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
- TRACE(("reject (got %zd chars, need %zd)\n",
- end - ptr, (Py_ssize_t) pattern[3]));
+ TRACE(("reject (got %tu chars, need %zu)\n",
+ end - ptr, (size_t) pattern[3]));
RETURN_FAILURE;
}
pattern += pattern[1] + 1;
/* <ASSERT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT %d\n", pattern,
ptr, pattern[1]));
- if (ptr - (SRE_CHAR *)state->beginning < (Py_ssize_t)pattern[1])
+ if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) < pattern[1])
RETURN_FAILURE;
state->ptr = ptr - pattern[1];
DO_JUMP0(JUMP_ASSERT, jump_assert, pattern+2);
/* <ASSERT_NOT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern,
ptr, pattern[1]));
- if (ptr - (SRE_CHAR *)state->beginning >= (Py_ssize_t)pattern[1]) {
+ if ((uintptr_t)(ptr - (SRE_CHAR *)state->beginning) >= pattern[1]) {
state->ptr = ptr - pattern[1];
LASTMARK_SAVE();
if (state->repeat)
flags = pattern[2];
- if (pattern[3] && end - ptr < (Py_ssize_t)pattern[3]) {
- TRACE(("reject (got %u chars, need %u)\n",
- (unsigned int)(end - ptr), pattern[3]));
+ if (pattern[3] && (uintptr_t)(end - ptr) < pattern[3]) {
+ TRACE(("reject (got %tu chars, need %zu)\n",
+ end - ptr, (size_t) pattern[3]));
return 0;
}
if (pattern[3] > 1) {