From: Vsevolod Stakhov Date: Thu, 4 Jun 2026 08:55:22 +0000 (+0100) Subject: [Fix] multipattern: bound URL query scan reentrancy X-Git-Tag: 4.1.0~6 X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=f068a1156;p=thirdparty%2Frspamd.git [Fix] multipattern: bound URL query scan reentrancy A URL whose query embeds a percent-escaped URL is unwrapped recursively (PR #6066): rspamd_url_find_in_query re-enters rspamd_multipattern_lookup on the URL trie while the enclosing scan is still on the stack. Each scan borrows one of MAX_SCRATCH hyperscan scratch contexts; once the recursion nests deeper than the pool, the slot loop leaves scr == NULL and g_assert(scr != NULL) aborts the worker. A crafted message with a few levels of nested query URLs thus crashes a normal worker (DoS). The peak number of simultaneously-held scratch contexts on the deepest chain is RSPAMD_URL_QUERY_MAX_NESTING + 2: one for the enclosing text/subject scan and one for the per-URL TLD lookup that rspamd_url_parse runs on each freshly extracted leaf. The old pool of 4 with a nesting cap of 5 needed 7 -> assertion. - Introduce RSPAMD_MULTIPATTERN_MAX_REENTRANCY (10) and size the scratch stack from it; a scratch context is ~2.5-4 KiB, so the deeper stack costs only tens of KiB per multipattern. - Tie RSPAMD_URL_QUERY_MAX_NESTING to that budget (minus the two implicit levels) so normal nesting stays on the fast path. - Make scratch exhaustion non-fatal: allocate a one-off scratch for the scan instead of aborting the worker on attacker input. - Guard the unsigned-int scratch bitmask with a static assert. Add functional regression test 170_url_query_nesting. --- diff --git a/src/libserver/url.h b/src/libserver/url.h index 91b7a75b37..7a67547cb4 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -23,6 +23,7 @@ #include "khash.h" #include "fstring.h" #include "libutil/cxx/utf8_util.h" +#include "libutil/multipattern.h" /* for RSPAMD_MULTIPATTERN_MAX_REENTRANCY */ #ifdef __cplusplus extern "C" { @@ -277,8 +278,16 @@ void rspamd_url_find_single(rspamd_mempool_t *pool, /** * How deep to follow URLs nested inside the query of an already query-extracted * URL (a properly escaped wrapper carries one target per encoding layer). - */ -#define RSPAMD_URL_QUERY_MAX_NESTING 5 + * + * Each level re-enters the URL multipattern scan while the enclosing scan is + * still on the stack. The peak number of simultaneously-held scratch contexts + * on the deepest chain is therefore this depth plus two: one for the enclosing + * text/subject scan, and one for the per-URL TLD lookup that rspamd_url_parse + * runs on the freshly extracted leaf URL. Keep that within the multipattern + * scratch budget (RSPAMD_MULTIPATTERN_MAX_REENTRANCY) so normal nesting stays + * on the fast static-scratch path. + */ +#define RSPAMD_URL_QUERY_MAX_NESTING (RSPAMD_MULTIPATTERN_MAX_REENTRANCY - 2) /** * Find URLs embedded in the query parameters of `url`. Unlike diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c index 3605f7c849..1aea68476c 100644 --- a/src/libutil/multipattern.c +++ b/src/libutil/multipattern.c @@ -30,7 +30,16 @@ #include "libutil/regexp.h" #include -#define MAX_SCRATCH 4 +/* + * Depth of the per-multipattern scratch stack. A hyperscan scratch context is + * ~2.5-4 KiB, so a stack of RSPAMD_MULTIPATTERN_MAX_REENTRANCY costs only a few + * tens of KiB per multipattern while letting lookups be safely re-entered from + * within a match callback (see RSPAMD_MULTIPATTERN_MAX_REENTRANCY). + */ +#define MAX_SCRATCH RSPAMD_MULTIPATTERN_MAX_REENTRANCY + +/* scratch_used is an unsigned int bitmask, so the stack cannot exceed its width */ +G_STATIC_ASSERT(MAX_SCRATCH <= sizeof(unsigned int) * 8); /* * Threshold for "small" multipatterns that are compiled in memory @@ -74,7 +83,7 @@ struct RSPAMD_ALIGNED(64) rspamd_multipattern { GArray *hs_pats; GArray *hs_ids; GArray *hs_flags; - unsigned int scratch_used; + unsigned int scratch_used; /* bitmask of busy scratch[] slots */ #endif ac_trie_t *t; GArray *pats; @@ -949,6 +958,7 @@ int rspamd_multipattern_lookup(struct rspamd_multipattern *mp, /* Use hyperscan if it's compiled and ready */ if (mp->state == RSPAMD_MP_STATE_COMPILED && mp->hs_db != NULL) { hs_scratch_t *scr = NULL; + gboolean scr_temporary = FALSE; unsigned int i; for (i = 0; i < MAX_SCRATCH; i++) { @@ -959,12 +969,39 @@ int rspamd_multipattern_lookup(struct rspamd_multipattern *mp, } } - g_assert(scr != NULL); + if (scr == NULL) { + /* + * The static scratch stack (MAX_SCRATCH deep) is exhausted by an + * unusually deep reentrant lookup - a lookup re-entered from within + * a match callback more than MAX_SCRATCH levels deep (e.g. a + * pathologically nested chain of query-embedded URLs). Callers are + * expected to bound their recursion below MAX_SCRATCH + * (see RSPAMD_MULTIPATTERN_MAX_REENTRANCY), so this is a cold safety + * net: allocate a one-off scratch for this scan rather than abort + * the whole worker on attacker-controlled input. + */ + int rc = hs_alloc_scratch(rspamd_hyperscan_get_database(mp->hs_db), + &scr); + + if (rc != HS_SUCCESS || scr == NULL) { + msg_err("cannot allocate temporary hyperscan scratch " + "(error code %d) at reentrancy depth > %d; skipping lookup", + rc, (int) MAX_SCRATCH); + return 0; + } + + scr_temporary = TRUE; + } ret = hs_scan(rspamd_hyperscan_get_database(mp->hs_db), in, len, 0, scr, rspamd_multipattern_hs_cb, &cbd); - mp->scratch_used &= ~(1 << i); + if (scr_temporary) { + hs_free_scratch(scr); + } + else { + mp->scratch_used &= ~(1 << i); + } if (ret == HS_SUCCESS) { ret = 0; diff --git a/src/libutil/multipattern.h b/src/libutil/multipattern.h index 7ca89161cf..648d99e678 100644 --- a/src/libutil/multipattern.h +++ b/src/libutil/multipattern.h @@ -32,6 +32,17 @@ extern "C" { struct ev_loop; +/** + * Maximum reentrancy depth of rspamd_multipattern_lookup() on a single + * multipattern. Each multipattern keeps a small stack of hyperscan scratch + * contexts of this depth, so a lookup may be re-entered (from within a match + * callback) up to this many levels deep before the scratch pool is exhausted. + * Callers that recurse into the same multipattern from their callback (e.g. URL + * query unwrapping, which re-scans an embedded URL while the enclosing scan is + * still on the stack) MUST keep their total nesting at or below this bound. + */ +#define RSPAMD_MULTIPATTERN_MAX_REENTRANCY 10 + enum rspamd_multipattern_flags { RSPAMD_MULTIPATTERN_DEFAULT = 0, RSPAMD_MULTIPATTERN_ICASE = (1 << 0), diff --git a/test/functional/cases/170_url_query_nesting.robot b/test/functional/cases/170_url_query_nesting.robot new file mode 100644 index 0000000000..bab2e599d0 --- /dev/null +++ b/test/functional/cases/170_url_query_nesting.robot @@ -0,0 +1,28 @@ +*** Settings *** +Suite Setup Rspamd Setup +Suite Teardown Rspamd Teardown +Library ${RSPAMD_TESTDIR}/lib/rspamd.py +Resource ${RSPAMD_TESTDIR}/lib/rspamd.robot +Variables ${RSPAMD_TESTDIR}/lib/vars.py + +*** Variables *** +${CONFIG} ${RSPAMD_TESTDIR}/configs/url_query_nesting.conf +${MESSAGE} ${RSPAMD_TESTDIR}/messages/url_query_nesting.eml +${RSPAMD_SCOPE} Suite +${RSPAMD_URL_TLD} ${RSPAMD_TESTDIR}/../lua/unit/test_tld.dat + +*** Test Cases *** +DEEPLY NESTED QUERY URLS DO NOT CRASH THE WORKER + [Documentation] A URL whose query embeds another URL, repeated many times, + ... makes the URL multipattern scan re-enter itself once per + ... nesting level. The per-multipattern hyperscan scratch stack + ... must absorb that reentrancy (bounded by + ... RSPAMD_URL_QUERY_MAX_NESTING) instead of aborting the worker + ... on a "scr != NULL" assertion. Regression for the scratch-pool + ... exhaustion crash in rspamd_multipattern_lookup. + Scan File ${MESSAGE} + # The scan completing at all proves the worker survived (a crash would make + # Scan File fail). The outermost URL is always extracted; deeper hops are + # followed up to the nesting cap. + Expect URL h0.example.org + Expect URL h1.example.org diff --git a/test/functional/configs/url_query_nesting.conf b/test/functional/configs/url_query_nesting.conf new file mode 100644 index 0000000000..eb6a307896 --- /dev/null +++ b/test/functional/configs/url_query_nesting.conf @@ -0,0 +1,45 @@ +options = { + filters = ["regexp"] + url_tld = "{= env.TESTDIR =}/../lua/unit/test_tld.dat" + pidfile = "{= env.TMPDIR =}/rspamd.pid"; + lua_path = "{= env.INSTALLROOT =}/share/rspamd/lib/?.lua"; + dns { + retransmits = 2; + } +} +logging = { + type = "file", + level = "debug" + filename = "{= env.TMPDIR =}/rspamd.log"; +} +metric = { + name = "default", + actions = { + reject = 100500, + } + unknown_weight = 1 +} + +worker { + type = normal + bind_socket = "{= env.LOCAL_ADDR =}:{= env.PORT_NORMAL =}" + count = 1 + keypair { + pubkey = "{= env.KEY_PUB1 =}"; + privkey = "{= env.KEY_PVT1 =}"; + } + task_timeout = 10s; +} + +worker { + type = controller + bind_socket = "{= env.LOCAL_ADDR =}:{= env.PORT_CONTROLLER =}" + count = 1 + secure_ip = ["127.0.0.1", "::1"]; + stats_path = "{= env.TMPDIR =}/stats.ucl" +} + +modules { + path = "{= env.TESTDIR =}/../../src/plugins/lua/" +} +lua = "{= env.INSTALLROOT =}/share/rspamd/rules/rspamd.lua" diff --git a/test/functional/messages/url_query_nesting.eml b/test/functional/messages/url_query_nesting.eml new file mode 100644 index 0000000000..307bc11083 --- /dev/null +++ b/test/functional/messages/url_query_nesting.eml @@ -0,0 +1,8 @@ +From: attacker@example.org +To: victim@example.com +Subject: nested query urls regression +Content-Type: text/plain +Date: Mon, 01 Jun 2026 00:00:00 +0000 +Message-ID: + +Please click http://h0.example.org/?u=http://h1.example.org/?u=http://h2.example.org/?u=http://h3.example.org/?u=http://h4.example.org/?u=http://h5.example.org/?u=http://h6.example.org/?u=http://h7.example.org/?u=http://h8.example.org/?u=http://h9.example.org/?u=http://h10.example.org/?u=http://h11.example.org/?u=http://h12.example.org/ now.