From: Guillaume Outters Date: Sat, 11 Apr 2026 06:01:57 +0000 (+0200) Subject: [Fix] regexp: do not discard all capture groups that follow an empty one (#5974) X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;ds=sidebyside;p=thirdparty%2Frspamd.git [Fix] regexp: do not discard all capture groups that follow an empty one (#5974) * [Test] Test empty capture groups in the middle of the match Tests #5973 * [Fix] Empty capture groups in the middle of a match don't discard all remaining groups (PCRE2) Do not stop on first empty capture group, rather lookup from the end for the last non-empty capture group, and truncate there. By the way the results array thus gets correctly dimensioned from the start, instead of being truncated afterwards. Fixes #5973 * [Fix] Empty capture groups in the middle of a match don't discard all remaining groups (PCRE1) Do not stop on first empty capture group, rather lookup from the end for the last non-empty capture group, and truncate there. By the way the results array thus gets correctly dimensioned from the start, instead of being truncated afterwards. Fixes #5973 --- diff --git a/src/libutil/regexp.c b/src/libutil/regexp.c index 86a6b1688c..800b284c42 100644 --- a/src/libutil/regexp.c +++ b/src/libutil/regexp.c @@ -748,18 +748,21 @@ rspamd_regexp_search(const rspamd_regexp_t *re, const char *text, gsize len, g_assert(g_array_get_element_size(captures) == sizeof(struct rspamd_re_capture)); + /* Remove empty captures at the end */ + while (--rc >= 0 && (ovec[rc * 2] == junk || ovec[rc * 2] < 0)) { + } + rc++; g_array_set_size(captures, rc); for (i = 0; i < rc; i++) { + elt = &g_array_index(captures, struct rspamd_re_capture, i); if (ovec[i * 2] != junk && ovec[i * 2] >= 0) { - elt = &g_array_index(captures, struct rspamd_re_capture, i); elt->p = mt + ovec[i * 2]; elt->len = (mt + ovec[i * 2 + 1]) - elt->p; } else { - /* Runtime match returned fewer captures than expected */ - g_array_set_size(captures, i); - break; + elt->p = ""; + elt->len = 0; } } } @@ -889,17 +892,21 @@ rspamd_regexp_search(const rspamd_regexp_t *re, const char *text, gsize len, g_assert(g_array_get_element_size(captures) == sizeof(struct rspamd_re_capture)); + /* Remove empty captures at the end */ + while (--novec >= 0 && (ovec[novec * 2] == junk || ovec[novec * 2] == PCRE2_UNSET)) { + } + novec++; g_array_set_size(captures, novec); for (i = 0; i < novec; i++) { + elt = &g_array_index(captures, struct rspamd_re_capture, i); if (ovec[i * 2] != junk && ovec[i * 2] != PCRE2_UNSET) { - elt = &g_array_index(captures, struct rspamd_re_capture, i); elt->p = mt + ovec[i * 2]; elt->len = (mt + ovec[i * 2 + 1]) - elt->p; } else { - g_array_set_size(captures, i); - break; + elt->p = ""; + elt->len = 0; } } } diff --git a/test/lua/unit/regxep.lua b/test/lua/unit/regxep.lua index 25bf09c42b..cfd50d44c8 100644 --- a/test/lua/unit/regxep.lua +++ b/test/lua/unit/regxep.lua @@ -42,6 +42,14 @@ context("Regexp unit tests", function() {'Body=(\\S+)(?: Fuz1=(\\S+))?(?: Fuz2=(\\S+))?', 'mc-filter4 1120; Body=1 Fuz1=2 mc-filter4 1120; Body=1 Fuz1=2 Fuz2=3', {'Body=1 Fuz1=2', '1', '2'}, {'Body=1 Fuz1=2 Fuz2=3', '1', '2', '3'}}, + -- discard empty captures at the end: we don't want an empty capture corresponding to (et)? + {'(\\S*atch)(et)?', 'Match the hatch', {'Match', 'Match'}, {'hatch', 'hatch'}}, + -- there once was a bug where an empty capture interrupted storing captures, + -- thus the second match was only {'capetians do halt captaining', 'capetians'} + {'(cap\\S*) do *(not)? halt (cap\\S*)', + 'empty captures do not halt capturing but dead capetians do halt captaining', + {'captures do not halt capturing', 'captures', 'not', 'capturing'}, + {'capetians do halt captaining', 'capetians', '', 'captaining'}}, } for _,c in ipairs(cases) do local r = re.create_cached(c[1])