From: Jim Meyering Date: Sun, 12 Apr 2026 19:36:45 +0000 (-0700) Subject: regex: fix missed short match with backrefs X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;ds=sidebyside;p=thirdparty%2Fgnulib.git regex: fix missed short match with backrefs With a backref pattern like ^(.?)(.?).?\2\1 (no $), the engine could miss valid short matches. For example, "ab" should match via all-empty groups, yet regexec returned no-match because set_regs failed at the longest structural match (match_last=2) and never retried at a shorter match_last. * lib/regexec.c (re_search_internal): When set_regs fails for a backref pattern, retry prune_impossible_nodes and set_regs at progressively shorter match lengths. Save a copy of state_log before pruning so shorter retries can re-sift from the original states. * m4/regex.m4: Also reject system regex with this bug. * tests/test-regex.c (main): Add a test for this bug. Reported by Ed Morton in https://bugs.gnu.org/68725 Co-authored-by: Claude --- diff --git a/ChangeLog b/ChangeLog index c2446f5f16..b1bd9d66eb 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +2026-04-12 Jim Meyering + + regex: fix missed short match with backrefs + With a backref pattern like ^(.?)(.?).?\2\1 (no $), the engine + could miss valid short matches. For example, "ab" should match + via all-empty groups, yet regexec returned no-match because + set_regs failed at the longest structural match (match_last=2) + and never retried at a shorter match_last. + * lib/regexec.c (re_search_internal): When set_regs fails for a + backref pattern, retry prune_impossible_nodes and set_regs at + progressively shorter match lengths. Save a copy of state_log + before pruning so shorter retries can re-sift from the original + states. + * m4/regex.m4: Also reject system regex with this bug. + * tests/test-regex.c (main): Add a test for this bug. + Reported by Ed Morton in https://bugs.gnu.org/68725 + 2026-04-12 Collin Funk doc: update documentation about fopen with the 'e' mode character diff --git a/lib/regexec.c b/lib/regexec.c index e09fc7698e..76ef80ab07 100644 --- a/lib/regexec.c +++ b/lib/regexec.c @@ -678,6 +678,8 @@ re_search_internal (const regex_t *preg, const char *string, Idx length, | (t != NULL ? 1 : 0)) : 8); + re_dfastate_t **save_state_log = NULL; + for (;; match_first += incr) { err = REG_NOMATCH; @@ -802,11 +804,32 @@ re_search_internal (const regex_t *preg, const char *string, Idx length, if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match) || dfa->nbackref) { + /* Save state_log before pruning, in case set_regs + later fails and we need to retry with a shorter + match. */ + re_free (save_state_log); + save_state_log = NULL; + if (!preg->no_sub && nmatch > 1 && dfa->nbackref) + { + save_state_log + = re_malloc (re_dfastate_t *, + mctx.match_last + 1); + if (__glibc_unlikely (save_state_log == NULL)) + { + err = REG_ESPACE; + goto free_return; + } + memcpy (save_state_log, mctx.state_log, + sizeof (re_dfastate_t *) + * (mctx.match_last + 1)); + } err = prune_impossible_nodes (&mctx); if (err == REG_NOERROR) break; if (__glibc_unlikely (err != REG_NOMATCH)) goto free_return; + re_free (save_state_log); + save_state_log = NULL; match_last = -1; } else @@ -825,24 +848,87 @@ re_search_internal (const regex_t *preg, const char *string, Idx length, { Idx reg_idx; - /* Initialize registers. */ - for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) - pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; - - /* Set the points where matching start/end. */ - pmatch[0].rm_so = 0; - pmatch[0].rm_eo = mctx.match_last; - /* FIXME: This function should fail if mctx.match_last exceeds - the maximum possible regoff_t value. We need a new error - code REG_OVERFLOW. */ - if (!preg->no_sub && nmatch > 1) { - err = set_regs (preg, &mctx, nmatch, pmatch, - dfa->has_plural_match && dfa->nbackref > 0); + /* When set_regs fails for a backref pattern, the structural + match at match_last has no valid register assignment. Try + shorter match lengths, since a valid shorter match may + exist (e.g., all groups matching empty). */ + for (;;) + { + /* Initialize registers. */ + for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) + pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; + pmatch[0].rm_so = 0; + pmatch[0].rm_eo = mctx.match_last; + + err = set_regs (preg, &mctx, nmatch, pmatch, + dfa->has_plural_match && dfa->nbackref > 0); + if (__glibc_likely (err == REG_NOERROR) + || save_state_log == NULL + || err != REG_NOMATCH) + break; + + /* set_regs failed; try a shorter match_last. */ + Idx ml = mctx.match_last; + re_free (mctx.state_log); + do + { + --ml; + if (ml < 0) + break; + } + while (save_state_log[ml] == NULL + || !save_state_log[ml]->halt + || !check_halt_state_context + (&mctx, save_state_log[ml], ml)); + if (ml < 0) + { + err = REG_NOMATCH; + mctx.state_log = save_state_log; + save_state_log = NULL; + break; + } + mctx.state_log + = re_malloc (re_dfastate_t *, ml + 1); + if (__glibc_unlikely (mctx.state_log == NULL)) + { + mctx.state_log = save_state_log; + save_state_log = NULL; + err = REG_ESPACE; + break; + } + memcpy (mctx.state_log, save_state_log, + sizeof (re_dfastate_t *) * (ml + 1)); + mctx.match_last = ml; + mctx.last_node + = check_halt_state_context + (&mctx, save_state_log[ml], ml); + err = prune_impossible_nodes (&mctx); + if (__glibc_unlikely (err != REG_NOERROR)) + { + if (err == REG_NOMATCH) + { + re_free (mctx.state_log); + mctx.state_log = save_state_log; + save_state_log = NULL; + } + break; + } + } + re_free (save_state_log); + save_state_log = NULL; if (__glibc_unlikely (err != REG_NOERROR)) goto free_return; } + else + { + /* Initialize registers. */ + for (reg_idx = 1; reg_idx < nmatch; ++reg_idx) + pmatch[reg_idx].rm_so = pmatch[reg_idx].rm_eo = -1; + pmatch[0].rm_so = 0; + pmatch[0].rm_eo = mctx.match_last; + } /* At last, add the offset to each register, since we slid the buffers so that we could assume that the matching starts @@ -882,6 +968,7 @@ re_search_internal (const regex_t *preg, const char *string, Idx length, } free_return: + re_free (save_state_log); re_free (mctx.state_log); if (dfa->nbackref) match_ctx_free (&mctx); diff --git a/m4/regex.m4 b/m4/regex.m4 index c36de81011..4a7257d892 100644 --- a/m4/regex.m4 +++ b/m4/regex.m4 @@ -318,9 +318,9 @@ AC_DEFUN([gl_REGEX], free (regs.end); } - /* This test is derived from bug#68725, reported by Ed Morton. - The regex uses backrefs to detect palindromes and "ab" - is not a palindrome, so this should not match. */ + /* These tests are derived from bug#68725, reported by + Ed Morton. The regex uses backrefs with optional groups + to detect palindromes. */ { regex_t re68725; i = regcomp (&re68725, @@ -330,8 +330,22 @@ AC_DEFUN([gl_REGEX], result |= 64; else { - regmatch_t pm; - if (regexec (&re68725, "ab", 1, &pm, 0) == 0) + regmatch_t pm[3]; + /* "ab" is not a palindrome, so must not match + with $. */ + if (regexec (&re68725, "ab", 1, pm, 0) == 0) + result |= 64; + /* Without $, a shorter match (e.g., empty or "a") + is valid at position 0. Ensure set_regs retries + with a shorter match_last when the longest + structural match fails content validation. */ + regfree (&re68725); + i = regcomp (&re68725, + "^(.?)(.?).?\\\\2\\\\1", + REG_EXTENDED); + if (i) + result |= 64; + else if (regexec (&re68725, "ab", 3, pm, 0) != 0) result |= 64; regfree (&re68725); } diff --git a/tests/test-regex.c b/tests/test-regex.c index 87c03834f5..d747eefdc7 100644 --- a/tests/test-regex.c +++ b/tests/test-regex.c @@ -473,8 +473,7 @@ main (void) report_error ("%s: %s", pat_badback, s); /* bug#68725, reported by Ed Morton. - The regex uses backrefs to detect palindromes and "ab" - is not a palindrome, so this should not match. */ + The regex uses backrefs with optional groups to detect palindromes. */ { regex_t re68725; int ret = regcomp (&re68725, "^(.?)(.?).?\\2\\1$", REG_EXTENDED); @@ -483,8 +482,25 @@ main (void) else { regmatch_t pm; + /* "ab" is not a palindrome, so must not match with $. */ if (regexec (&re68725, "ab", 1, &pm, 0) == 0) - report_error ("regexec bug#68725: \"ab\" matched, should not"); + report_error ("regexec bug#68725: \"ab\" matched with $," + " should not"); + regfree (&re68725); + } + + /* Without $, "ab" should match: the engine must retry with a + shorter match_last when set_regs fails at the longest + structural match. */ + ret = regcomp (&re68725, "^(.?)(.?).?\\2\\1", REG_EXTENDED); + if (ret) + report_error ("regcomp bug#68725 (no $) failed (%d)", ret); + else + { + regmatch_t pm[3]; + if (regexec (&re68725, "ab", 3, pm, 0) != 0) + report_error ("regexec bug#68725: \"ab\" should match" + " without $"); regfree (&re68725); } }