]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] regexp: do not discard all capture groups that follow an empty one (#5974)
authorGuillaume Outters <guillaume-github@outters.eu>
Sat, 11 Apr 2026 06:01:57 +0000 (08:01 +0200)
committerGitHub <noreply@github.com>
Sat, 11 Apr 2026 06:01:57 +0000 (07:01 +0100)
* [Test] Test empty capture groups in the middle of the match

Tests #5973

* [Fix] Empty capture groups in the middle of a match don't discard all remaining groups (PCRE2)

Do not stop on first empty capture group, rather lookup from the end for the last non-empty capture group, and truncate there.
By the way the results array thus gets correctly dimensioned from the start, instead of being truncated afterwards.
Fixes #5973

* [Fix] Empty capture groups in the middle of a match don't discard all remaining groups (PCRE1)

Do not stop on first empty capture group, rather lookup from the end for the last non-empty capture group, and truncate there.
By the way the results array thus gets correctly dimensioned from the start, instead of being truncated afterwards.
Fixes #5973

src/libutil/regexp.c
test/lua/unit/regxep.lua

index 86a6b1688cefe22315faf6e9a1d997a82276498d..800b284c4232172dbf4512ed8aa8cae91090e703 100644 (file)
@@ -748,18 +748,21 @@ rspamd_regexp_search(const rspamd_regexp_t *re, const char *text, gsize len,
 
                        g_assert(g_array_get_element_size(captures) ==
                                         sizeof(struct rspamd_re_capture));
+                       /* Remove empty captures at the end */
+                       while (--rc >= 0 && (ovec[rc * 2] == junk || ovec[rc * 2] < 0)) {
+                       }
+                       rc++;
                        g_array_set_size(captures, rc);
 
                        for (i = 0; i < rc; i++) {
+                               elt = &g_array_index(captures, struct rspamd_re_capture, i);
                                if (ovec[i * 2] != junk && ovec[i * 2] >= 0) {
-                                       elt = &g_array_index(captures, struct rspamd_re_capture, i);
                                        elt->p = mt + ovec[i * 2];
                                        elt->len = (mt + ovec[i * 2 + 1]) - elt->p;
                                }
                                else {
-                                       /* Runtime match returned fewer captures than expected */
-                                       g_array_set_size(captures, i);
-                                       break;
+                                       elt->p = "";
+                                       elt->len = 0;
                                }
                        }
                }
@@ -889,17 +892,21 @@ rspamd_regexp_search(const rspamd_regexp_t *re, const char *text, gsize len,
 
                        g_assert(g_array_get_element_size(captures) ==
                                         sizeof(struct rspamd_re_capture));
+                       /* Remove empty captures at the end */
+                       while (--novec >= 0 && (ovec[novec * 2] == junk || ovec[novec * 2] == PCRE2_UNSET)) {
+                       }
+                       novec++;
                        g_array_set_size(captures, novec);
 
                        for (i = 0; i < novec; i++) {
+                               elt = &g_array_index(captures, struct rspamd_re_capture, i);
                                if (ovec[i * 2] != junk && ovec[i * 2] != PCRE2_UNSET) {
-                                       elt = &g_array_index(captures, struct rspamd_re_capture, i);
                                        elt->p = mt + ovec[i * 2];
                                        elt->len = (mt + ovec[i * 2 + 1]) - elt->p;
                                }
                                else {
-                                       g_array_set_size(captures, i);
-                                       break;
+                                       elt->p = "";
+                                       elt->len = 0;
                                }
                        }
                }
index 25bf09c42ba6158e314cb67c3d9a63490546a4f1..cfd50d44c826c2a12a1bdb47e58e11bba6c42fe7 100644 (file)
@@ -42,6 +42,14 @@ context("Regexp unit tests", function()
       {'Body=(\\S+)(?: Fuz1=(\\S+))?(?: Fuz2=(\\S+))?',
       'mc-filter4 1120; Body=1 Fuz1=2 mc-filter4 1120; Body=1 Fuz1=2 Fuz2=3',
       {'Body=1 Fuz1=2', '1', '2'}, {'Body=1 Fuz1=2 Fuz2=3', '1', '2', '3'}},
+      -- discard empty captures at the end: we don't want an empty capture corresponding to (et)?
+      {'(\\S*atch)(et)?', 'Match the hatch', {'Match', 'Match'}, {'hatch', 'hatch'}},
+      -- there once was a bug where an empty capture interrupted storing captures,
+      -- thus the second match was only {'capetians do halt captaining', 'capetians'}
+      {'(cap\\S*) do *(not)? halt (cap\\S*)',
+      'empty captures do not halt capturing but dead capetians do halt captaining',
+      {'captures do not halt capturing', 'captures', 'not', 'capturing'},
+      {'capetians do halt captaining', 'capetians', '', 'captaining'}},
     }
     for _,c in ipairs(cases) do
       local r = re.create_cached(c[1])