From: Vsevolod Stakhov Date: Thu, 18 Jun 2026 13:22:25 +0000 (+0100) Subject: [Feature] multipattern: explicit SOM flag and offset docs X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=4a0deb476a79a1ae1973f75d45f2c2ef436275cc;p=thirdparty%2Frspamd.git [Feature] multipattern: explicit SOM flag and offset docs SOM (start-of-match) reporting already exists on master as the default (hyperscan compiles every pattern with HS_FLAG_SOM_LEFTMOST), but there was no explicit way to request it and the offset convention was undocumented. - Add RSPAMD_MULTIPATTERN_SOM (rspamd_trie.flags.som): an explicit opt-in for start offsets that also overrides no_start/single_match (forces SOM and drops the incompatible SINGLEMATCH). - Document the offset convention: the pattern id is 1-based in the Lua API (rspamd_trie) and 0-based in the C callback (strnum); match start and end are byte offsets, 0-based, start inclusive and end exclusive (one past the last matched byte), so end - start is the match length. - Fix the regex (flags.re) fallback used when hyperscan is unavailable: it discarded the real PCRE start and reported end - strlen(pattern), which is bogus for variable-length matches. It now reports the true start/end from rspamd_regexp_search. Add C++ (rspamd_cxx_unit_multipattern.hxx) and Lua (trie.lua) unit tests asserting (id, start, end) against hand-computed positions: multiple/overlapping occurrences, icase, literal vs regex, no-match, SOM-overrides-single_match and a large buffer. Existing rspamd_trie behaviour and its callers (url.c, lang_detection, lua plugins) are unchanged. 
--- diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c index 1aea68476c..b301753926 100644 --- a/src/libutil/multipattern.c +++ b/src/libutil/multipattern.c @@ -420,6 +420,15 @@ void rspamd_multipattern_add_pattern_len(struct rspamd_multipattern *mp, if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) { fl &= ~HS_FLAG_SOM_LEFTMOST; } + if (adjusted_flags & RSPAMD_MULTIPATTERN_SOM) { + /* + * Explicit start-of-match request wins over the cost-saving + * opt-outs above. SINGLEMATCH is incompatible with SOM in + * hyperscan, so drop it here. + */ + fl |= HS_FLAG_SOM_LEFTMOST; + fl &= ~HS_FLAG_SINGLEMATCH; + } g_array_append_val(mp->hs_flags, fl); np = rspamd_multipattern_pattern_filter(pattern, patlen, flags, &dlen); @@ -933,6 +942,31 @@ rspamd_multipattern_acism_cb(int strnum, int textpos, void *context) return ret; } +/* + * Report a regex match using the real start/end offsets obtained from + * rspamd_regexp_search(). Unlike the literal ACISM path, a regex match length + * is not equal to the pattern string length, so the start MUST come from the + * regex engine rather than being derived as end - pattern_len (which would be + * bogus). Both offsets are byte offsets into cbd->in, 0-based, with match_pos + * exclusive (see rspamd_multipattern_cb_t). 
+ */ +static int +rspamd_multipattern_regex_cb(struct rspamd_multipattern_cbdata *cbd, + unsigned int strnum, + int match_start, + int match_pos) +{ + int ret; + + ret = cbd->cb(cbd->mp, strnum, match_start, match_pos, + cbd->in, cbd->len, cbd->ud); + + cbd->nfound++; + cbd->ret = ret; + + return ret; +} + int rspamd_multipattern_lookup(struct rspamd_multipattern *mp, const char *in, gsize len, rspamd_multipattern_cb_t cb, gpointer ud, unsigned int *pnfound) @@ -1049,7 +1083,9 @@ int rspamd_multipattern_lookup(struct rspamd_multipattern *mp, if (start >= end) { break; } - if (rspamd_multipattern_acism_cb(i, end - in, &cbd)) { + if (rspamd_multipattern_regex_cb(&cbd, i, + (int) (start - in), + (int) (end - in))) { goto hs_fallback_out; } } @@ -1084,7 +1120,9 @@ int rspamd_multipattern_lookup(struct rspamd_multipattern *mp, if (start >= end) { break; } - if (rspamd_multipattern_acism_cb(i, end - in, &cbd)) { + if (rspamd_multipattern_regex_cb(&cbd, i, + (int) (start - in), + (int) (end - in))) { goto out; } } diff --git a/src/libutil/multipattern.h b/src/libutil/multipattern.h index 648d99e678..be7c020a5b 100644 --- a/src/libutil/multipattern.h +++ b/src/libutil/multipattern.h @@ -54,6 +54,15 @@ enum rspamd_multipattern_flags { RSPAMD_MULTIPATTERN_DOTALL = (1 << 5), RSPAMD_MULTIPATTERN_SINGLEMATCH = (1 << 6), RSPAMD_MULTIPATTERN_NO_START = (1 << 7), + /* + * Explicitly request start-of-match offsets for every occurrence. This is + * the default behaviour today (hyperscan is compiled with + * HS_FLAG_SOM_LEFTMOST and the ACISM/regex fallbacks derive the start), so + * the flag mostly documents intent and makes the request explicit and + * future-proof. When set it also wins over RSPAMD_MULTIPATTERN_NO_START and + * RSPAMD_MULTIPATTERN_SINGLEMATCH (which would otherwise drop SOM). + */ + RSPAMD_MULTIPATTERN_SOM = (1 << 8), }; /** @@ -81,10 +90,21 @@ struct rspamd_cryptobox_library_ctx; struct ev_loop; /** - * Called on pattern match + * Called on pattern match. 
+ * + * Offset convention: both offsets are byte offsets into @text and are 0-based. + * @match_start is inclusive (the first matched byte) and @match_pos is + * exclusive (one past the last matched byte), so the matched span is + * text[match_start .. match_pos) and its length is match_pos - match_start. + * Note that numerically @match_pos equals the 1-based position of the last + * matched byte, which is what historical callers treated as "the end position". + * * @param mp multipattern structure - * @param strnum number of pattern matched - * @param textpos position in the text + * @param strnum number of pattern matched (0-based pattern id) + * @param match_start start byte offset of the match (0-based, inclusive). Only + * meaningful when start-of-match reporting is active (the default, or + * when RSPAMD_MULTIPATTERN_SOM is set); 0 otherwise. + * @param match_pos end byte offset of the match (0-based, exclusive) * @param text input text * @param len length of input text * @param context userdata diff --git a/src/lua/lua_trie.c b/src/lua/lua_trie.c index 210b089bb9..446ea49f81 100644 --- a/src/lua/lua_trie.c +++ b/src/lua/lua_trie.c @@ -264,11 +264,22 @@ lua_trie_search_str(lua_State *L, struct rspamd_multipattern *trie, /*** * @method trie:match(input, [cb][, report_start]) - * Search for patterns in `input` invoking `cb` optionally ignoring case + * Search for patterns in `input` invoking `cb` optionally ignoring case. + * + * Offset convention: the pattern index `idx` is 1-based (Lua style). Match + * offsets are byte offsets and are 0-based: when `report_start` is set the + * `start` is the inclusive offset of the first matched byte and the `end` is + * the exclusive offset one past the last matched byte (so `end - start` is the + * match length). When `report_start` is not set only the (exclusive) end + * offset is reported, matching the historical behaviour. 
Start offsets are + * available for every occurrence by default; pass `rspamd_trie.flags.som` at + * creation time to request them explicitly (and to keep them even when + * combined with `single_match`/`no_start`). + * * @param {table or string} input one or several (if `input` is an array) strings of input text - * @param {function} cb callback called on each pattern match in form `function (idx, pos)` where `idx` is a numeric index of pattern (starting from 1) and `pos` is a numeric offset where the pattern ends + * @param {function} cb callback called on each pattern match in form `function (idx, pos)` where `idx` is the 1-based pattern index and `pos` is the match end offset; when `report_start` is set `pos` is instead a table `{start, end}` * @param {boolean} report_start report both start and end offset when matching patterns - * @return {boolean} `true` if any pattern has been found (`cb` might be called multiple times however). If `cb` is not defined then it returns a table of match positions indexed by pattern number + * @return {boolean} `true` if any pattern has been found (`cb` might be called multiple times however). 
If `cb` is not defined then it returns a table indexed by pattern number, each entry being a list of every occurrence (either the end offset, or `{start, end}` when `report_start` is set) */ static int lua_trie_match(lua_State *L) @@ -484,6 +495,8 @@ lua_load_trie(lua_State *L) lua_setfield(L, -2, "single_match"); lua_pushinteger(L, RSPAMD_MULTIPATTERN_NO_START); lua_setfield(L, -2, "no_start"); + lua_pushinteger(L, RSPAMD_MULTIPATTERN_SOM); + lua_setfield(L, -2, "som"); lua_settable(L, -3); /* Main content */ diff --git a/test/lua/unit/trie.lua b/test/lua/unit/trie.lua index 9532eae529..6f8b13bc21 100644 --- a/test/lua/unit/trie.lua +++ b/test/lua/unit/trie.lua @@ -79,3 +79,84 @@ context("Trie search functions", function() end end) + +context("Trie start-of-match (SOM) offsets", function() + local t = require "rspamd_trie" + local bit = require "bit" + + -- Offsets are byte offsets: start is 0-based inclusive, end is 0-based + -- exclusive (one past the last matched byte), so end - start == match length. + -- This is exactly what the YARA-style helpers need for #s / @s[i] / "$s at X". 
+ + -- {start, end, pattern_idx} ordering helper + local function cmp(a, b) + if a[3] ~= b[3] then return a[3] < b[3] end + if a[1] ~= b[1] then return a[1] < b[1] end + return a[2] < b[2] + end + + test("flags.som is exposed", function() + assert_not_nil(t.flags.som, "rspamd_trie.flags.som must exist") + end) + + test("literal patterns report every occurrence as {start, end}", function() + -- "abcab": 'ab' (id1) at [0,2) and [3,5); 'bc' (id2) at [1,3) + local trie = t.create({'ab', 'bc'}, t.flags.som) + local m = trie:match('abcab', true) + + -- Count semantics (#s): two occurrences of 'ab', one of 'bc' + assert_equal(2, #m[1]) + assert_equal(1, #m[2]) + + -- @s[i]: first 'ab' starts at offset 0, second at offset 3 + assert_equal(0, m[1][1][1]) + assert_equal(3, m[1][2][1]) + + local res = {} + for idx, hits in pairs(m) do + for _, se in ipairs(hits) do + table.insert(res, {se[1], se[2], idx}) + end + end + local expect = {{0, 2, 1}, {3, 5, 1}, {1, 3, 2}} + table.sort(res, cmp) + table.sort(expect, cmp) + assert_rspamd_table_eq({expect = expect, actual = res}) + end) + + test("callback form reports {start, end} when report_start is set", function() + local trie = t.create({'ab', 'bc'}, t.flags.som) + local res = {} + trie:match('abcab', function(idx, se) + table.insert(res, {se[1], se[2], idx}) + return 0 + end, true) + + local expect = {{0, 2, 1}, {3, 5, 1}, {1, 3, 2}} + table.sort(res, cmp) + table.sort(expect, cmp) + assert_rspamd_table_eq({expect = expect, actual = res}) + end) + + test("regex patterns report real start offsets", function() + -- fixed-length regex 'a.c' at [0,3) and [4,7) in "axc-ayc" + local trie = t.create({'a.c'}, bit.bor(t.flags.re, t.flags.som)) + local m = trie:match('axc-ayc', true) + + assert_equal(2, #m[1]) + local res = {} + for _, se in ipairs(m[1]) do + table.insert(res, {se[1], se[2], 1}) + end + local expect = {{0, 3, 1}, {4, 7, 1}} + table.sort(res, cmp) + table.sort(expect, cmp) + assert_rspamd_table_eq({expect = expect, actual 
= res}) + end) + + test("no match yields an empty result table", function() + local trie = t.create({'zzz'}, t.flags.som) + local m = trie:match('abcdef', true) + assert_equal(0, #m) + end) +end) diff --git a/test/rspamd_cxx_unit.cxx b/test/rspamd_cxx_unit.cxx index af5a205374..6162d9cdfe 100644 --- a/test/rspamd_cxx_unit.cxx +++ b/test/rspamd_cxx_unit.cxx @@ -40,6 +40,7 @@ #include "rspamd_cxx_unit_settings_merge.hxx" #include "rspamd_cxx_unit_fpconv.hxx" #include "rspamd_cxx_unit_symcache_timeout.hxx" +#include "rspamd_cxx_unit_multipattern.hxx" static gboolean verbose = false; static const GOptionEntry entries[] = diff --git a/test/rspamd_cxx_unit_multipattern.hxx b/test/rspamd_cxx_unit_multipattern.hxx new file mode 100644 index 0000000000..a4ee72f7a7 --- /dev/null +++ b/test/rspamd_cxx_unit_multipattern.hxx @@ -0,0 +1,186 @@ +/* + * Copyright 2026 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_RSPAMD_CXX_UNIT_MULTIPATTERN_HXX +#define RSPAMD_RSPAMD_CXX_UNIT_MULTIPATTERN_HXX + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +extern "C" { +#include "libutil/multipattern.h" +} + +#include <algorithm> +#include <string> +#include <tuple> +#include <vector> + +/* + * Start-of-match (SOM) tests for the multipattern engine. These assert the + * (pattern_id, start, end) contract against hand-computed positions. 
Offsets + * are byte offsets: start is 0-based inclusive, end is 0-based exclusive (one + * past the last matched byte), so end - start is the match length. The + * assertions are backend-agnostic and must hold whether the lookup is served + * by hyperscan or the ACISM/regex fallback. + */ +TEST_SUITE("multipattern som") +{ + /* One reported occurrence: (pattern id, start offset, end offset) */ + using mp_match = std::tuple<unsigned int, int, int>; + + static int mp_collect_cb(struct rspamd_multipattern * mp, unsigned int strnum, + int match_start, int match_pos, const char *text, + gsize len, void *context) + { + auto *acc = static_cast<std::vector<mp_match> *>(context); + acc->emplace_back(strnum, match_start, match_pos); + + return 0; + } + + static std::vector<mp_match> mp_scan(const std::vector<std::string> &pats, + int flags, const std::string &input) + { + struct rspamd_multipattern *mp = + rspamd_multipattern_create((enum rspamd_multipattern_flags) flags); + + for (const auto &p: pats) { + rspamd_multipattern_add_pattern_len(mp, p.data(), p.size(), flags); + } + + GError *err = nullptr; + bool ok = rspamd_multipattern_compile(mp, RSPAMD_MULTIPATTERN_COMPILE_NO_FS, &err); + REQUIRE(ok); + + std::vector<mp_match> res; + unsigned int nfound = 0; + rspamd_multipattern_lookup(mp, input.data(), input.size(), + mp_collect_cb, &res, &nfound); + rspamd_multipattern_destroy(mp); + + CHECK(nfound == res.size()); + + /* Deterministic ordering: by end, then start, then id */ + std::sort(res.begin(), res.end(), + [](const mp_match &a, const mp_match &b) { + if (std::get<2>(a) != std::get<2>(b)) { + return std::get<2>(a) < std::get<2>(b); + } + if (std::get<1>(a) != std::get<1>(b)) { + return std::get<1>(a) < std::get<1>(b); + } + return std::get<0>(a) < std::get<0>(b); + }); + + return res; + } + + TEST_CASE("literal: multiple occurrences with ordered start offsets") + { + /* "ab" at byte offsets 0, 3, 6 in "abXabYab" */ + auto res = mp_scan({"ab"}, RSPAMD_MULTIPATTERN_SOM, "abXabYab"); + + REQUIRE(res.size() == 3); + CHECK(res[0] == mp_match{0, 0, 2}); + CHECK(res[1] == mp_match{0, 3, 5}); + CHECK(res[2] == mp_match{0, 6, 
2}); + CHECK(res[1] == mp_match{0, 3, 5}); + CHECK(res[2] == mp_match{0, 6, 8}); + } + + TEST_CASE("literal: overlapping occurrences") + { + /* "aa" in "aaaa" matches ending at 2,3,4 with leftmost starts 0,1,2 */ + auto res = mp_scan({"aa"}, RSPAMD_MULTIPATTERN_SOM, "aaaa"); + + REQUIRE(res.size() == 3); + CHECK(res[0] == mp_match{0, 0, 2}); + CHECK(res[1] == mp_match{0, 1, 3}); + CHECK(res[2] == mp_match{0, 2, 4}); + } + + TEST_CASE("literal: case-insensitive start offsets") + { + /* "ABC" matched caselessly at offset 1 in "xABCy" */ + auto res = mp_scan({"abc"}, + RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_SOM, + "xABCy"); + + REQUIRE(res.size() == 1); + CHECK(res[0] == mp_match{0, 1, 4}); + } + + TEST_CASE("multiple distinct patterns keep their own ids") + { + /* id0="foo" at 0; id1="bar" at 4 in "foo bar" */ + auto res = mp_scan({"foo", "bar"}, RSPAMD_MULTIPATTERN_SOM, "foo bar"); + + REQUIRE(res.size() == 2); + CHECK(res[0] == mp_match{0, 0, 3}); + CHECK(res[1] == mp_match{1, 4, 7}); + } + + TEST_CASE("regex pattern reports real start offsets") + { + /* fixed-length regex "a.c" at offsets 0 and 4 in "axc-ayc" */ + auto res = mp_scan({"a.c"}, + RSPAMD_MULTIPATTERN_RE | RSPAMD_MULTIPATTERN_SOM, + "axc-ayc"); + + REQUIRE(res.size() == 2); + CHECK(res[0] == mp_match{0, 0, 3}); + CHECK(res[1] == mp_match{0, 4, 7}); + } + + TEST_CASE("no match yields no occurrences") + { + auto res = mp_scan({"zzz"}, RSPAMD_MULTIPATTERN_SOM, "abcdef"); + + CHECK(res.empty()); + } + + TEST_CASE("SOM overrides single_match") + { + /* + * single_match alone would collapse to one occurrence; the explicit + * SOM flag must drop single_match and report every occurrence with a + * start offset. 
+ */ + auto res = mp_scan({"ab"}, + RSPAMD_MULTIPATTERN_SINGLEMATCH | RSPAMD_MULTIPATTERN_SOM, + "abab"); + + REQUIRE(res.size() == 2); + CHECK(res[0] == mp_match{0, 0, 2}); + CHECK(res[1] == mp_match{0, 2, 4}); + } + + TEST_CASE("large buffer: start offsets at known positions") + { + const std::string needle = "needle"; + std::string buf(100000, 'x'); + /* Plant the needle at two known offsets */ + buf.replace(1000, needle.size(), needle); + buf.replace(50000, needle.size(), needle); + + auto res = mp_scan({needle}, RSPAMD_MULTIPATTERN_SOM, buf); + + REQUIRE(res.size() == 2); + CHECK(res[0] == mp_match{0, 1000, 1000 + (int) needle.size()}); + CHECK(res[1] == mp_match{0, 50000, 50000 + (int) needle.size()}); + } +} + +#endif