git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Wire C->Lua URL filter consultation through parser
author Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 14 Nov 2025 18:04:08 +0000 (18:04 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 14 Nov 2025 18:04:08 +0000 (18:04 +0000)
- Add lua_State parameter to rspamd_url_parse() and rspamd_web_parse()
- Pass lua_State through entire parsing chain
- Call rspamd_url_lua_consult() at two critical points:
  * Oversized user field (>max_email_user) - line 1205
  * Multiple @ signs detected - line 1227
- Lua filter can now return REJECT (abort parsing), SUSPICIOUS (mark the URL as obscured), or ACCEPT (continue parsing)
- Update all callers: pass task->cfg->lua_state when available, NULL otherwise
- HTML parser calls: pass NULL (no task context)
- URL extraction: pass NULL (callback data doesn't have task)
- Query URL parsing: pass task->cfg->lua_state (has task context)
- Completes two-level architecture: C consults Lua on ambiguous patterns

src/libserver/html/html_url.cxx
src/libserver/url.c
src/libserver/url.h

index 35d6c3f78bf8b0d61b81edbd65bb9e5ebfe34493..3ef51f2b6bc484ecdd112a5c40fec746357594f9 100644 (file)
@@ -159,7 +159,7 @@ auto html_url_is_phished(rspamd_mempool_t *pool,
 
                text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
                auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
-                                                                  RSPAMD_URL_PARSE_TEXT);
+                                                                  RSPAMD_URL_PARSE_TEXT, NULL);
 
                if (rc == URI_ERRNO_OK) {
                        text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
@@ -455,7 +455,7 @@ auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
 
        url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
        rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags);
-       rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF);
+       rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF, NULL);
 
        /* Filter some completely damaged urls */
        if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
index 5a6a9f55096ff6758d97da8dc7c5104c144c0560..0c3ceb8fa487b98b8ed503bf8be7522acec836b7 100644 (file)
@@ -1020,7 +1020,8 @@ static int
 rspamd_web_parse(struct http_parser_url *u, const char *str, gsize len,
                                 char const **end,
                                 enum rspamd_url_parse_flags parse_flags,
-                                unsigned int *flags)
+                                unsigned int *flags,
+                                lua_State *L)
 {
        const char *p = str, *c = str, *last = str + len, *slash = NULL,
                           *password_start = NULL, *user_start = NULL;
@@ -1200,13 +1201,18 @@ rspamd_web_parse(struct http_parser_url *u, const char *str, gsize len,
                                goto out;
                        }
                        else if (p - c > max_email_user) {
-                               /* Allow oversized user fields but mark them - fixes #5731 */
-                               /* TODO: Call rspamd_url_lua_consult(pool, c, p-c, *flags, L) here
-                                * to ask Lua if we should continue parsing this URL.
-                                * Returns: 0=continue, 1=mark obscured, 2=abort (goto out)
-                                * Challenge: need lua_State *L passed through call chain */
+                               /* Oversized user field - consult Lua filter (fixes #5731) */
+                               int lua_decision = rspamd_url_lua_consult(NULL, c, p - c, *flags, L);
+                               if (lua_decision == 2) {
+                                       /* REJECT: Lua says this is garbage, abort parsing */
+                                       goto out;
+                               }
+                               else if (lua_decision == 1) {
+                                       /* SUSPICIOUS: Mark as obscured for plugin analysis */
+                                       *flags |= RSPAMD_URL_FLAG_OBSCURED;
+                               }
+                               /* ACCEPT or SUSPICIOUS: continue parsing */
                                *flags |= RSPAMD_URL_FLAG_HAS_USER;
-                               /* Continue parsing - the Lua plugin will handle scoring */
                        }
 
                        p++;
@@ -1217,7 +1223,13 @@ rspamd_web_parse(struct http_parser_url *u, const char *str, gsize len,
                                        goto out;
                                }
 
-                               /* For now, we ignore all that stuff as it is bogus */
+                               /* Multiple @ signs detected - consult Lua */
+                               int lua_decision = rspamd_url_lua_consult(NULL, c, p - c, *flags, L);
+                               if (lua_decision == 2) {
+                                       /* REJECT: Too suspicious, abort */
+                                       goto out;
+                               }
+                               /* ACCEPT or SUSPICIOUS: Continue but mark as obscured */
                                /* Off by one */
                                p--;
                                SET_U(u, UF_USERINFO);
@@ -2255,7 +2267,8 @@ enum uri_errno
 rspamd_url_parse(struct rspamd_url *uri,
                                 char *uristring, gsize len,
                                 rspamd_mempool_t *pool,
-                                enum rspamd_url_parse_flags parse_flags)
+                                enum rspamd_url_parse_flags parse_flags,
+                                lua_State *L)
 {
        struct http_parser_url u;
        char *p;
@@ -2296,11 +2309,11 @@ rspamd_url_parse(struct rspamd_url *uri,
                }
                else {
                        ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags,
-                                                                  &flags);
+                                                                  &flags, L);
                }
        }
        else {
-               ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, &flags);
+               ret = rspamd_web_parse(&u, uristring, len, &end, parse_flags, &flags, L);
        }
 
        if (ret != 0) {
@@ -2972,7 +2985,7 @@ url_web_end(struct url_callback_data *cb,
        }
 
        if (rspamd_web_parse(NULL, pos, len, &last,
-                                                RSPAMD_URL_PARSE_CHECK, &flags) != 0) {
+                                                RSPAMD_URL_PARSE_CHECK, &flags, NULL) != 0) {
                return FALSE;
        }
 
@@ -3455,7 +3468,7 @@ rspamd_url_trie_generic_callback_common(struct rspamd_multipattern *mp,
                g_strstrip(cb->url_str);
                rc = rspamd_url_parse(url, cb->url_str,
                                                          strlen(cb->url_str), pool,
-                                                         RSPAMD_URL_PARSE_TEXT);
+                                                         RSPAMD_URL_PARSE_TEXT, NULL);
 
                if (rc == URI_ERRNO_OK && url->hostlen > 0) {
                        if (cb->prefix_added) {
@@ -3797,7 +3810,8 @@ rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset,
                                                                  url_str,
                                                                  strlen(url_str),
                                                                  task->task_pool,
-                                                                 RSPAMD_URL_PARSE_TEXT);
+                                                                 RSPAMD_URL_PARSE_TEXT,
+                                                                 task->cfg ? task->cfg->lua_state : NULL);
 
                        if (rc == URI_ERRNO_OK &&
                                url->hostlen > 0) {
index 671d25b27b0d4b1b3d35b2334fba62848494ad13..a3a2ac941a77b079dce0924a0e53560821f73f60 100644 (file)
@@ -185,12 +185,14 @@ void rspamd_url_text_extract(rspamd_mempool_t *pool,
  * @param pool memory pool
  * @param uristring text form of url
  * @param uri url object, must be pre allocated
+ * @param L Lua state for consultation (may be NULL)
  */
 enum uri_errno rspamd_url_parse(struct rspamd_url *uri,
                                                                char *uristring,
                                                                gsize len,
                                                                rspamd_mempool_t *pool,
-                                                               enum rspamd_url_parse_flags flags);
+                                                               enum rspamd_url_parse_flags flags,
+                                                               lua_State *L);
 
 /*
  * Try to extract url from a text