if (adjusted_flags & RSPAMD_MULTIPATTERN_NO_START) {
fl &= ~HS_FLAG_SOM_LEFTMOST;
}
+ if (adjusted_flags & RSPAMD_MULTIPATTERN_SOM) {
+ /*
+ * Explicit start-of-match request wins over the cost-saving
+ * opt-outs above. SINGLEMATCH is incompatible with SOM in
+ * hyperscan, so drop it here.
+ */
+ fl |= HS_FLAG_SOM_LEFTMOST;
+ fl &= ~HS_FLAG_SINGLEMATCH;
+ }
g_array_append_val(mp->hs_flags, fl);
np = rspamd_multipattern_pattern_filter(pattern, patlen, flags, &dlen);
return ret;
}
+/*
+ * Regex-path match reporter: forwards one match to the user callback.
+ *
+ * The caller supplies both offsets as obtained from rspamd_regexp_search();
+ * the start is never reconstructed as end - pattern_len, because a regex
+ * match need not be as long as its pattern string (unlike the literal ACISM
+ * path). Offsets are 0-based byte offsets into cbd->in and match_pos is
+ * exclusive (see rspamd_multipattern_cb_t).
+ *
+ * Besides invoking cbd->cb, this bumps cbd->nfound and records the callback
+ * result in cbd->ret. The callback result is returned unchanged; the lookup
+ * loops stop scanning when it is non-zero.
+ */
+static int
+rspamd_multipattern_regex_cb(struct rspamd_multipattern_cbdata *cbd,
+							 unsigned int strnum,
+							 int match_start,
+							 int match_pos)
+{
+	const int cb_rc = cbd->cb(cbd->mp, strnum, match_start, match_pos,
+							  cbd->in, cbd->len, cbd->ud);
+
+	/* Bookkeeping happens after the callback, whatever it returned */
+	cbd->ret = cb_rc;
+	cbd->nfound++;
+
+	return cb_rc;
+}
+
int rspamd_multipattern_lookup(struct rspamd_multipattern *mp,
const char *in, gsize len, rspamd_multipattern_cb_t cb,
gpointer ud, unsigned int *pnfound)
if (start >= end) {
break;
}
- if (rspamd_multipattern_acism_cb(i, end - in, &cbd)) {
+ if (rspamd_multipattern_regex_cb(&cbd, i,
+ (int) (start - in),
+ (int) (end - in))) {
goto hs_fallback_out;
}
}
if (start >= end) {
break;
}
- if (rspamd_multipattern_acism_cb(i, end - in, &cbd)) {
+ if (rspamd_multipattern_regex_cb(&cbd, i,
+ (int) (start - in),
+ (int) (end - in))) {
goto out;
}
}
RSPAMD_MULTIPATTERN_DOTALL = (1 << 5),
RSPAMD_MULTIPATTERN_SINGLEMATCH = (1 << 6),
RSPAMD_MULTIPATTERN_NO_START = (1 << 7),
+	/*
+	 * Explicitly request start-of-match offsets for every occurrence. This is
+	 * the default behaviour today (hyperscan is compiled with
+	 * HS_FLAG_SOM_LEFTMOST, the ACISM fallback derives the start from the
+	 * pattern length and the regex fallback takes it from the regex engine),
+	 * so the flag mostly documents intent and makes the request explicit and
+	 * future-proof. When set it also wins over RSPAMD_MULTIPATTERN_NO_START and
+	 * RSPAMD_MULTIPATTERN_SINGLEMATCH (which would otherwise drop SOM).
+	 */
+ RSPAMD_MULTIPATTERN_SOM = (1 << 8),
};
/**
struct ev_loop;
/**
- * Called on pattern match
+ * Called on pattern match.
+ *
+ * Offset convention: both offsets are byte offsets into @text and are 0-based.
+ * @match_start is inclusive (the first matched byte) and @match_pos is
+ * exclusive (one past the last matched byte), so the matched span is
+ * text[match_start .. match_pos) and its length is match_pos - match_start.
+ * Note that numerically @match_pos equals the 1-based position of the last
+ * matched byte, which is what historical callers treated as "the end position".
+ *
* @param mp multipattern structure
- * @param strnum number of pattern matched
- * @param textpos position in the text
+ * @param strnum number of pattern matched (0-based pattern id)
+ * @param match_start start byte offset of the match (0-based, inclusive). Only
+ * meaningful when start-of-match reporting is active (the default, or
+ * when RSPAMD_MULTIPATTERN_SOM is set); 0 otherwise.
+ * @param match_pos end byte offset of the match (0-based, exclusive)
* @param text input text
* @param len length of input text
* @param context userdata
/***
* @method trie:match(input, [cb][, report_start])
- * Search for patterns in `input` invoking `cb` optionally ignoring case
+ * Search for patterns in `input` invoking `cb` optionally ignoring case.
+ *
+ * Offset convention: the pattern index `idx` is 1-based (Lua style). Match
+ * offsets are byte offsets and are 0-based: when `report_start` is set the
+ * `start` is the inclusive offset of the first matched byte and the `end` is
+ * the exclusive offset one past the last matched byte (so `end - start` is the
+ * match length). When `report_start` is not set only the (exclusive) end
+ * offset is reported, matching the historical behaviour. Start offsets are
+ * available for every occurrence by default; pass `rspamd_trie.flags.som` at
+ * creation time to request them explicitly (and to keep them even when
+ * combined with `single_match`/`no_start`).
+ *
* @param {table or string} input one or several (if `input` is an array) strings of input text
- * @param {function} cb callback called on each pattern match in form `function (idx, pos)` where `idx` is a numeric index of pattern (starting from 1) and `pos` is a numeric offset where the pattern ends
+ * @param {function} cb callback called on each pattern match in form `function (idx, pos)` where `idx` is the 1-based pattern index and `pos` is the match end offset; when `report_start` is set `pos` is instead a table `{start, end}`
* @param {boolean} report_start report both start and end offset when matching patterns
- * @return {boolean} `true` if any pattern has been found (`cb` might be called multiple times however). If `cb` is not defined then it returns a table of match positions indexed by pattern number
+ * @return {boolean} `true` if any pattern has been found (`cb` might be called multiple times however). If `cb` is not defined then it returns a table indexed by pattern number, each entry being a list of every occurrence (either the end offset, or `{start, end}` when `report_start` is set)
*/
static int
lua_trie_match(lua_State *L)
lua_setfield(L, -2, "single_match");
lua_pushinteger(L, RSPAMD_MULTIPATTERN_NO_START);
lua_setfield(L, -2, "no_start");
+ lua_pushinteger(L, RSPAMD_MULTIPATTERN_SOM);
+ lua_setfield(L, -2, "som");
lua_settable(L, -3);
/* Main content */
end
end)
+
+context("Trie start-of-match (SOM) offsets", function()
+ local t = require "rspamd_trie"
+ local bit = require "bit"
+
+ -- Offsets are byte offsets: start is 0-based inclusive, end is 0-based
+ -- exclusive (one past the last matched byte), so end - start == match length.
+ -- This is exactly what the YARA-style helpers need for #s / @s[i] / "$s at X".
+
+  -- Sort predicate for {start, end, pattern_idx} triples: orders by pattern
+  -- index first, then by start offset, and finally by end offset.
+  local function cmp(lhs, rhs)
+    for _, field in ipairs({3, 1}) do
+      if lhs[field] ~= rhs[field] then
+        return lhs[field] < rhs[field]
+      end
+    end
+    return lhs[2] < rhs[2]
+  end
+
+  test("flags.som is exposed", function()
+    -- The flag must be reachable from Lua through the module's flags table
+    local som_flag = t.flags.som
+    assert_not_nil(som_flag, "rspamd_trie.flags.som must exist")
+  end)
+
+  test("literal patterns report every occurrence as {start, end}", function()
+    -- Input "abcab": pattern 1 ('ab') spans [0,2) and [3,5); pattern 2 ('bc')
+    -- spans [1,3)
+    local matches = t.create({'ab', 'bc'}, t.flags.som):match('abcab', true)
+    local ab_hits, bc_hits = matches[1], matches[2]
+
+    -- Count semantics (#s): 'ab' occurs twice, 'bc' once
+    assert_equal(2, #ab_hits)
+    assert_equal(1, #bc_hits)
+
+    -- @s[i]: start offsets of the first and second 'ab'
+    assert_equal(0, ab_hits[1][1])
+    assert_equal(3, ab_hits[2][1])
+
+    -- Flatten to {start, end, idx} triples so the comparison does not depend
+    -- on the order in which occurrences were reported
+    local actual = {}
+    for pat_idx, occurrences in pairs(matches) do
+      for _, span in ipairs(occurrences) do
+        actual[#actual + 1] = {span[1], span[2], pat_idx}
+      end
+    end
+
+    local expected = {{0, 2, 1}, {3, 5, 1}, {1, 3, 2}}
+    table.sort(expected, cmp)
+    table.sort(actual, cmp)
+    assert_rspamd_table_eq({expect = expected, actual = actual})
+  end)
+
+  test("callback form reports {start, end} when report_start is set", function()
+    local actual = {}
+
+    -- Collects each reported occurrence as a {start, end, idx} triple;
+    -- the callback returns 0 after recording it
+    local function on_match(pat_idx, span)
+      actual[#actual + 1] = {span[1], span[2], pat_idx}
+      return 0
+    end
+
+    local trie = t.create({'ab', 'bc'}, t.flags.som)
+    trie:match('abcab', on_match, true)
+
+    local expected = {{0, 2, 1}, {3, 5, 1}, {1, 3, 2}}
+    table.sort(expected, cmp)
+    table.sort(actual, cmp)
+    assert_rspamd_table_eq({expect = expected, actual = actual})
+  end)
+
+  test("regex patterns report real start offsets", function()
+    -- The fixed-length regex 'a.c' matches "axc-ayc" at [0,3) and [4,7)
+    local re_som = bit.bor(t.flags.re, t.flags.som)
+    local matches = t.create({'a.c'}, re_som):match('axc-ayc', true)
+    local occurrences = matches[1]
+
+    assert_equal(2, #occurrences)
+
+    local actual = {}
+    for _, span in ipairs(occurrences) do
+      actual[#actual + 1] = {span[1], span[2], 1}
+    end
+
+    local expected = {{0, 3, 1}, {4, 7, 1}}
+    table.sort(expected, cmp)
+    table.sort(actual, cmp)
+    assert_rspamd_table_eq({expect = expected, actual = actual})
+  end)
+
+  test("no match yields an empty result table", function()
+    -- 'zzz' does not occur in the input, so nothing may be reported
+    local matches = t.create({'zzz'}, t.flags.som):match('abcdef', true)
+    assert_equal(0, #matches)
+  end)
+end)
#include "rspamd_cxx_unit_settings_merge.hxx"
#include "rspamd_cxx_unit_fpconv.hxx"
#include "rspamd_cxx_unit_symcache_timeout.hxx"
+#include "rspamd_cxx_unit_multipattern.hxx"
static gboolean verbose = false;
static const GOptionEntry entries[] =
--- /dev/null
+/*
+ * Copyright 2026 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_RSPAMD_CXX_UNIT_MULTIPATTERN_HXX
+#define RSPAMD_RSPAMD_CXX_UNIT_MULTIPATTERN_HXX
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+extern "C" {
+#include "libutil/multipattern.h"
+}
+
+#include <string>
+#include <vector>
+#include <tuple>
+#include <algorithm>
+
+/*
+ * Start-of-match (SOM) tests for the multipattern engine. These assert the
+ * (pattern_id, start, end) contract against hand-computed positions. Offsets
+ * are byte offsets: start is 0-based inclusive, end is 0-based exclusive (one
+ * past the last matched byte), so end - start is the match length. The
+ * assertions are backend-agnostic and must hold whether the lookup is served
+ * by hyperscan or the ACISM/regex fallback.
+ */
+TEST_SUITE("multipattern som")
+{
+	/* A single reported occurrence: (pattern id, start offset, end offset) */
+	using mp_match = std::tuple<unsigned int, int, int>;
+
+	/*
+	 * Lookup callback: appends (strnum, match_start, match_pos) to the
+	 * std::vector<mp_match> passed as `context`. Always returns 0; the
+	 * multipattern handle and the scanned text are not needed here.
+	 */
+	static int mp_collect_cb(struct rspamd_multipattern * /* mp */,
+							 unsigned int strnum,
+							 int match_start, int match_pos,
+							 const char * /* text */,
+							 gsize /* len */, void *context)
+	{
+		auto &sink = *static_cast<std::vector<mp_match> *>(context);
+
+		sink.emplace_back(strnum, match_start, match_pos);
+
+		return 0;
+	}
+
+	/*
+	 * Test helper: builds a multipattern from `pats` (the multipattern and
+	 * every pattern use the same `flags`), compiles it with
+	 * RSPAMD_MULTIPATTERN_COMPILE_NO_FS, scans `input` and returns every
+	 * reported occurrence sorted by (end, start, id) so that assertions do
+	 * not depend on the order in which a backend reports matches.
+	 *
+	 * A compile failure fails the current test case with the GError message.
+	 * The multipattern and the error are released before failing, because
+	 * the failing assertion aborts the test case and nothing after it runs.
+	 */
+	static std::vector<mp_match> mp_scan(const std::vector<std::string> &pats,
+										 int flags, const std::string &input)
+	{
+		struct rspamd_multipattern *mp =
+			rspamd_multipattern_create((enum rspamd_multipattern_flags) flags);
+
+		for (const auto &p: pats) {
+			rspamd_multipattern_add_pattern_len(mp, p.data(), p.size(), flags);
+		}
+
+		GError *err = nullptr;
+		bool ok = rspamd_multipattern_compile(mp, RSPAMD_MULTIPATTERN_COMPILE_NO_FS, &err);
+
+		if (!ok) {
+			/* Copy the message first: it is owned by the GError freed below */
+			const std::string reason = std::string("multipattern compile failed: ") +
+				(err != nullptr && err->message != nullptr ? err->message : "unknown error");
+
+			g_clear_error(&err);
+			rspamd_multipattern_destroy(mp);
+			FAIL(reason.c_str());
+
+			/* Not reached when FAIL aborts; never touch the destroyed mp */
+			return {};
+		}
+
+		std::vector<mp_match> res;
+		unsigned int nfound = 0;
+		rspamd_multipattern_lookup(mp, input.data(), input.size(),
+								   mp_collect_cb, &res, &nfound);
+		rspamd_multipattern_destroy(mp);
+
+		/* The reported counter must agree with the number of callbacks */
+		CHECK(nfound == res.size());
+
+		/* Deterministic ordering: by end, then start, then id */
+		std::sort(res.begin(), res.end(),
+				  [](const mp_match &a, const mp_match &b) {
+					  return std::tie(std::get<2>(a), std::get<1>(a), std::get<0>(a)) <
+							 std::tie(std::get<2>(b), std::get<1>(b), std::get<0>(b));
+				  });
+
+		return res;
+	}
+
+	TEST_CASE("literal: multiple occurrences with ordered start offsets")
+	{
+		/* "abXabYab" contains "ab" at byte offsets 0, 3 and 6 */
+		const std::vector<mp_match> expected{
+			mp_match{0, 0, 2},
+			mp_match{0, 3, 5},
+			mp_match{0, 6, 8},
+		};
+		const auto hits = mp_scan({"ab"}, RSPAMD_MULTIPATTERN_SOM, "abXabYab");
+
+		REQUIRE(hits.size() == expected.size());
+		for (std::size_t i = 0; i < expected.size(); i++) {
+			CHECK(hits[i] == expected[i]);
+		}
+	}
+
+	TEST_CASE("literal: overlapping occurrences")
+	{
+		/*
+		 * "aa" overlaps itself in "aaaa": the matches end at 2, 3 and 4 and
+		 * their leftmost starts are 0, 1 and 2.
+		 */
+		const std::vector<mp_match> expected{
+			mp_match{0, 0, 2},
+			mp_match{0, 1, 3},
+			mp_match{0, 2, 4},
+		};
+		const auto hits = mp_scan({"aa"}, RSPAMD_MULTIPATTERN_SOM, "aaaa");
+
+		REQUIRE(hits.size() == expected.size());
+		for (std::size_t i = 0; i < expected.size(); i++) {
+			CHECK(hits[i] == expected[i]);
+		}
+	}
+
+	TEST_CASE("literal: case-insensitive start offsets")
+	{
+		/* The lower-case pattern must match "ABC" at offset 1 of "xABCy" */
+		const int scan_flags = RSPAMD_MULTIPATTERN_ICASE | RSPAMD_MULTIPATTERN_SOM;
+		const auto hits = mp_scan({"abc"}, scan_flags, "xABCy");
+
+		REQUIRE(hits.size() == 1);
+		CHECK(hits.front() == mp_match{0, 1, 4});
+	}
+
+	TEST_CASE("multiple distinct patterns keep their own ids")
+	{
+		/* In "foo bar": pattern 0 ("foo") starts at 0, pattern 1 ("bar") at 4 */
+		const std::vector<mp_match> expected{
+			mp_match{0, 0, 3},
+			mp_match{1, 4, 7},
+		};
+		const auto hits = mp_scan({"foo", "bar"}, RSPAMD_MULTIPATTERN_SOM, "foo bar");
+
+		REQUIRE(hits.size() == expected.size());
+		for (std::size_t i = 0; i < expected.size(); i++) {
+			CHECK(hits[i] == expected[i]);
+		}
+	}
+
+	TEST_CASE("regex pattern reports real start offsets")
+	{
+		/* The fixed-length regex "a.c" matches "axc-ayc" at offsets 0 and 4 */
+		const int scan_flags = RSPAMD_MULTIPATTERN_RE | RSPAMD_MULTIPATTERN_SOM;
+		const std::vector<mp_match> expected{
+			mp_match{0, 0, 3},
+			mp_match{0, 4, 7},
+		};
+		const auto hits = mp_scan({"a.c"}, scan_flags, "axc-ayc");
+
+		REQUIRE(hits.size() == expected.size());
+		for (std::size_t i = 0; i < expected.size(); i++) {
+			CHECK(hits[i] == expected[i]);
+		}
+	}
+
+	TEST_CASE("no match yields no occurrences")
+	{
+		/* "zzz" is absent from the input, so nothing may be reported */
+		const auto hits = mp_scan({"zzz"}, RSPAMD_MULTIPATTERN_SOM, "abcdef");
+
+		CHECK(hits.size() == 0);
+	}
+
+	TEST_CASE("SOM overrides single_match")
+	{
+		/*
+		 * On its own, single_match would collapse the result to a single
+		 * occurrence. The explicit SOM flag has to drop single_match, so
+		 * every occurrence is reported together with its start offset.
+		 */
+		const int scan_flags = RSPAMD_MULTIPATTERN_SINGLEMATCH | RSPAMD_MULTIPATTERN_SOM;
+		const std::vector<mp_match> expected{
+			mp_match{0, 0, 2},
+			mp_match{0, 2, 4},
+		};
+		const auto hits = mp_scan({"ab"}, scan_flags, "abab");
+
+		REQUIRE(hits.size() == expected.size());
+		for (std::size_t i = 0; i < expected.size(); i++) {
+			CHECK(hits[i] == expected[i]);
+		}
+	}
+
+	TEST_CASE("large buffer: start offsets at known positions")
+	{
+		const std::string needle = "needle";
+		const int needle_len = (int) needle.size();
+
+		/* 100000 filler bytes with the needle planted at two known offsets */
+		std::string haystack(100000, 'x');
+		haystack.replace(1000, needle.size(), needle);
+		haystack.replace(50000, needle.size(), needle);
+
+		const std::vector<mp_match> expected{
+			mp_match{0, 1000, 1000 + needle_len},
+			mp_match{0, 50000, 50000 + needle_len},
+		};
+		const auto hits = mp_scan({needle}, RSPAMD_MULTIPATTERN_SOM, haystack);
+
+		REQUIRE(hits.size() == expected.size());
+		for (std::size_t i = 0; i < expected.size(); i++) {
+			CHECK(hits[i] == expected[i]);
+		}
+	}
+}
+
+#endif