]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Pass lua_State through HTML URL processing
author Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 14 Nov 2025 18:18:32 +0000 (18:18 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 14 Nov 2025 18:18:32 +0000 (18:18 +0000)
- Add lua_State parameter to html_process_url() and html_process_url_tag()
- Add lua_State parameter to html_check_displayed_url() and html_process_displayed_href_tag()
- Add lua_State parameter to html_process_query_url()
- Pass task->cfg->lua_state from html_process_input() to all URL processing functions
- All rspamd_url_parse() calls in the HTML code now receive a proper lua_State
- HTML URL processing now benefits from Lua filter consultation
- Completes the lua_State plumbing: the Lua state is now available throughout the URL processing chain

src/libserver/html/html.cxx
src/libserver/html/html_url.cxx
src/libserver/html/html_url.hxx

index 556eca66c7bf2d3c5ed5de62d1e99f662c3d8ed3..a55f95533453a4c49584c3e9f44136a199f758fb 100644 (file)
@@ -1313,7 +1313,8 @@ html_is_absolute_url(std::string_view st) -> bool
 static auto
 html_process_url_tag(rspamd_mempool_t *pool,
                                         struct html_tag *tag,
-                                        struct html_content *hc) -> std::optional<struct rspamd_url *>
+                                        struct html_content *hc,
+                                        lua_State *L) -> std::optional<struct rspamd_url *>
 {
        auto found_href_maybe = tag->find_href();
 
@@ -1370,7 +1371,7 @@ html_process_url_tag(rspamd_mempool_t *pool,
                        }
                }
 
-               auto url = html_process_url(pool, href_value).value_or(nullptr);
+               auto url = html_process_url(pool, href_value, L).value_or(nullptr);
 
                if (url) {
                        if (tag->id != Tag_A) {
@@ -1431,7 +1432,8 @@ html_url_query_callback(struct rspamd_url *url, gsize start_offset,
 static void
 html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
                                           khash_t(rspamd_url_hash) * url_set,
-                                          GPtrArray *part_urls)
+                                          GPtrArray *part_urls,
+                                          lua_State *L)
 {
        if (url->querylen > 0) {
                struct rspamd_html_url_query_cbd qcbd;
@@ -1444,7 +1446,7 @@ html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
                rspamd_url_find_multiple(pool,
                                                                 rspamd_url_query_unsafe(url), url->querylen,
                                                                 RSPAMD_URL_FIND_ALL, NULL,
-                                                                html_url_query_callback, &qcbd, NULL);
+                                                                html_url_query_callback, &qcbd, L);
        }
 
        if (part_urls) {
@@ -1549,7 +1551,7 @@ html_process_img_tag(rspamd_mempool_t *pool,
                                if (img->src) {
 
                                        std::string_view cpy{*href_value};
-                                       auto maybe_url = html_process_url(pool, cpy);
+                                       auto maybe_url = html_process_url(pool, cpy, L);
 
                                        if (maybe_url) {
                                                img->url = maybe_url.value();
@@ -1852,7 +1854,8 @@ html_process_displayed_href_tag(rspamd_mempool_t *pool,
                                                                const struct html_tag *cur_tag,
                                                                GList **exceptions,
                                                                khash_t(rspamd_url_hash) * url_set,
-                                                               goffset dest_offset) -> void
+                                                               goffset dest_offset,
+                                                               lua_State *L) -> void
 {
 
        if (std::holds_alternative<rspamd_url *>(cur_tag->extra)) {
@@ -1862,7 +1865,7 @@ html_process_displayed_href_tag(rspamd_mempool_t *pool,
                                                                 exceptions, url_set,
                                                                 data,
                                                                 dest_offset,
-                                                                url);
+                                                                url, L);
        }
 }
 
@@ -2040,7 +2043,8 @@ html_append_tag_content(rspamd_mempool_t *pool,
                        html_process_displayed_href_tag(pool, hc,
                                                                                        {hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
                                                                                        tag, exceptions,
-                                                                                       url_set, initial_parsed_offset);
+                                                                                       url_set, initial_parsed_offset,
+                                                                                       task->cfg ? task->cfg->lua_state : NULL);
                        /* Count display URL mismatches when URL is present */
                        if (std::holds_alternative<rspamd_url *>(tag->extra)) {
                                auto *u = std::get<rspamd_url *>(tag->extra);
@@ -2218,7 +2222,8 @@ auto html_process_input(struct rspamd_task *task,
                        /* If action present and absolute, compare eTLD+1 with first-party */
                        if (auto href = cur_tag->find_href()) {
                                if (html_is_absolute_url(*href)) {
-                                       auto maybe_url = html_process_url(pool, *href);
+                                       auto maybe_url = html_process_url(pool, *href,
+                                                                                                         task->cfg ? task->cfg->lua_state : NULL);
                                        if (maybe_url) {
                                                struct rspamd_url *u = maybe_url.value();
                                                if (u->hostlen > 0) {
@@ -2268,7 +2273,8 @@ auto html_process_input(struct rspamd_task *task,
 
                                                                if (!urlv.empty()) {
                                                                        /* validate and count; do not add to urls set */
-                                                                       auto maybe_url = html_process_url(pool, urlv);
+                                                                       auto maybe_url = html_process_url(pool, urlv,
+                                                                                                                                         task->cfg ? task->cfg->lua_state : NULL);
                                                                        if (maybe_url) {
                                                                                hc->features.meta_refresh_urls++;
                                                                        }
@@ -2338,7 +2344,8 @@ auto html_process_input(struct rspamd_task *task,
                }
 
                if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) {
-                       auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+                       auto maybe_url = html_process_url_tag(pool, cur_tag, hc,
+                                                                                                 task->cfg ? task->cfg->lua_state : NULL);
 
                        if (maybe_url.has_value()) {
                                url = maybe_url.value();
@@ -2352,7 +2359,8 @@ auto html_process_input(struct rspamd_task *task,
                                                }
                                                url->part_order = cur_url_part_order++;
                                                html_process_query_url(pool, url, url_set,
-                                                                                          part_urls);
+                                                                                          part_urls,
+                                                                                          task->cfg ? task->cfg->lua_state : NULL);
                                        }
                                        else {
                                                url = maybe_existing;
@@ -2438,7 +2446,8 @@ auto html_process_input(struct rspamd_task *task,
                        /*
                         * Base is allowed only within head tag but HTML is retarded
                         */
-                       auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+                       auto maybe_url = html_process_url_tag(pool, cur_tag, hc,
+                                                                                                 task->cfg ? task->cfg->lua_state : NULL);
 
                        if (maybe_url) {
                                msg_debug_html("got valid base tag");
index 3ef51f2b6bc484ecdd112a5c40fec746357594f9..2e4cefdeb657ac4c2c2621bf3f98ac3a3cdf861b 100644 (file)
@@ -159,7 +159,7 @@ auto html_url_is_phished(rspamd_mempool_t *pool,
 
                text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
                auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
-                                                                  RSPAMD_URL_PARSE_TEXT, NULL);
+                                                                  RSPAMD_URL_PARSE_TEXT, L);
 
                if (rc == URI_ERRNO_OK) {
                        text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
@@ -229,7 +229,8 @@ void html_check_displayed_url(rspamd_mempool_t *pool,
                                                          void *url_set,
                                                          std::string_view visible_part,
                                                          goffset href_offset,
-                                                         struct rspamd_url *url)
+                                                         struct rspamd_url *url,
+                                                         lua_State *L)
 {
        struct rspamd_url *displayed_url = nullptr;
        struct rspamd_url *turl;
@@ -307,7 +308,7 @@ void html_check_displayed_url(rspamd_mempool_t *pool,
        rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen);
 }
 
-auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+auto html_process_url(rspamd_mempool_t *pool, std::string_view &input, lua_State *L)
        -> std::optional<struct rspamd_url *>
 {
        struct rspamd_url *url;
@@ -455,7 +456,7 @@ auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
 
        url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
        rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags);
-       rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF, NULL);
+       rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF, L);
 
        /* Filter some completely damaged urls */
        if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
index 46dde6d382eaeb894ee4837c2213e1cf1908a637..20275b6b3e4012518457c72f96313dcaf3798f93 100644 (file)
@@ -53,15 +53,17 @@ auto html_check_displayed_url(rspamd_mempool_t *pool,
                                                          void *url_set,
                                                          std::string_view visible_part,
                                                          goffset href_offset,
-                                                         struct rspamd_url *url) -> void;
+                                                         struct rspamd_url *url,
+                                                         lua_State *L) -> void;
 
 /**
  * Process HTML url (e.g. for href component)
  * @param pool
  * @param input may be modified during the process
+ * @param L Lua state for consultation (may be NULL)
  * @return
  */
-auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+auto html_process_url(rspamd_mempool_t *pool, std::string_view &input, lua_State *L)
        -> std::optional<struct rspamd_url *>;
 }// namespace rspamd::html