From 9ebad076064c4639294e0e912f11c7f19ddf8aa2 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 14 Nov 2025 18:04:08 +0000 Subject: [PATCH] [Feature] Wire C->Lua URL filter consultation through parser - Add lua_State parameter to rspamd_url_parse() and rspamd_web_parse() - Pass lua_State through entire parsing chain - Call rspamd_url_lua_consult() at two critical points: * Oversized user field (>max_email_user) - line 1205 * Multiple @ signs detected - line 1227 - Lua filter can now REJECT (abort), SUSPICIOUS (mark obscured), or ACCEPT - Update all callers: pass task->cfg->lua_state when available, NULL otherwise - HTML parser calls: pass NULL (no task context) - URL extraction: pass NULL (callback data doesn't have task) - Query URL parsing: pass task->cfg->lua_state (has task context) - Completes two-level architecture: C consults Lua on ambiguous patterns --- src/libserver/html/html_url.cxx | 4 ++-- src/libserver/url.c | 42 ++++++++++++++++++++++----------- src/libserver/url.h | 4 +++- 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx index 35d6c3f78b..3ef51f2b6b 100644 --- a/src/libserver/html/html_url.cxx +++ b/src/libserver/html/html_url.cxx @@ -159,7 +159,7 @@ auto html_url_is_phished(rspamd_mempool_t *pool, text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url); auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool, - RSPAMD_URL_PARSE_TEXT); + RSPAMD_URL_PARSE_TEXT, NULL); if (rc == URI_ERRNO_OK) { text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED; @@ -455,7 +455,7 @@ auto html_process_url(rspamd_mempool_t *pool, std::string_view &input) url = rspamd_mempool_alloc0_type(pool, struct rspamd_url); rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags); - rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF); + rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF, NULL); /* Filter some completely damaged urls */ if (rc == URI_ERRNO_OK && url->hostlen > 0 && diff --git a/src/libserver/url.c b/src/libserver/url.c index 5a6a9f5509..0c3ceb8fa4 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -1020,7 +1020,8 @@ static int rspamd_web_parse(struct http_parser_url *u, const char *str, gsize len, char const **end, enum rspamd_url_parse_flags parse_flags, - unsigned int *flags) + unsigned int *flags, + lua_State *L) { const char *p = str, *c = str, *last = str + len, *slash = NULL, *password_start = NULL, *user_start = NULL; @@ -1200,13 +1201,18 @@ rspamd_web_parse(struct http_parser_url *u, const char *str, gsize len, goto out; } else if (p - c > max_email_user) { - /* Allow oversized user fields but mark them - fixes #5731 */ - /* TODO: Call rspamd_url_lua_consult(pool, c, p-c, *flags, L) here - * to ask Lua if we should continue parsing this URL. - * Returns: 0=continue, 1=mark obscured, 2=abort (goto out) - * Challenge: need lua_State *L passed through call chain */ + /* Oversized user field - consult Lua filter (fixes #5731) */ + int lua_decision = rspamd_url_lua_consult(NULL, c, p - c, *flags, L); + if (lua_decision == 2) { + /* REJECT: Lua says this is garbage, abort parsing */ + goto out; + } + else if (lua_decision == 1) { + /* SUSPICIOUS: Mark as obscured for plugin analysis */ + *flags |= RSPAMD_URL_FLAG_OBSCURED; + } + /* ACCEPT or SUSPICIOUS: continue parsing */ *flags |= RSPAMD_URL_FLAG_HAS_USER; - /* Continue parsing - the Lua plugin will handle scoring */ } p++; @@ -1217,7 +1223,13 @@ rspamd_web_parse(struct http_parser_url *u, const char *str, gsize len, goto out; } - /* For now, we ignore all that stuff as it is bogus */ + /* Multiple @ signs detected - consult Lua */ + int lua_decision = rspamd_url_lua_consult(NULL, c, p - c, *flags, L); + if (lua_decision == 2) { + /* REJECT: Too suspicious, abort */ + goto out; + } + /* ACCEPT or SUSPICIOUS: Continue but mark as obscured */ /* Off by one */ p--; SET_U(u, UF_USERINFO); @@ -2255,7 +2267,8 @@ enum uri_errno rspamd_url_parse(struct rspamd_url *uri, char *uristring, gsize len, rspamd_mempool_t *pool, - enum rspamd_url_parse_flags parse_flags) + enum rspamd_url_parse_flags parse_flags, + lua_State *L) { struct http_parser_url u; char *p; @@ -2296,11 +2309,11 @@ rspamd_url_parse(struct rspamd_url *uri, } else { ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, - &flags); + &flags, L); } } else { - ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, &flags); + ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, &flags, L); } if (ret != 0) { @@ -2972,7 +2985,7 @@ url_web_end(struct url_callback_data *cb, } if (rspamd_web_parse(NULL, pos, len, &last, - RSPAMD_URL_PARSE_CHECK, &flags) != 0) { + RSPAMD_URL_PARSE_CHECK, &flags, NULL) != 0) { return FALSE; } @@ -3455,7 +3468,7 @@ rspamd_url_trie_generic_callback_common(struct rspamd_multipattern *mp, g_strstrip(cb->url_str); rc = rspamd_url_parse(url, cb->url_str, strlen(cb->url_str), pool, - RSPAMD_URL_PARSE_TEXT); + RSPAMD_URL_PARSE_TEXT, NULL); if (rc == URI_ERRNO_OK && url->hostlen > 0) { if (cb->prefix_added) { @@ -3797,7 +3810,8 @@ rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset, url_str, strlen(url_str), task->task_pool, - RSPAMD_URL_PARSE_TEXT); + RSPAMD_URL_PARSE_TEXT, + task->cfg ? task->cfg->lua_state : NULL); if (rc == URI_ERRNO_OK && url->hostlen > 0) { diff --git a/src/libserver/url.h b/src/libserver/url.h index 671d25b27b..a3a2ac941a 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -185,12 +185,14 @@ void rspamd_url_text_extract(rspamd_mempool_t *pool, * @param pool memory pool * @param uristring text form of url * @param uri url object, must be pre allocated + * @param L Lua state for consultation (may be NULL) */ enum uri_errno rspamd_url_parse(struct rspamd_url *uri, char *uristring, gsize len, rspamd_mempool_t *pool, - enum rspamd_url_parse_flags flags); + enum rspamd_url_parse_flags flags, + lua_State *L); /* * Try to extract url from a text -- 2.47.3