[Feature] Pass lua_State through HTML URL processing

author Vsevolod Stakhov <vsevolod@rspamd.com>

Fri, 14 Nov 2025 18:18:32 +0000 (18:18 +0000)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Fri, 14 Nov 2025 18:18:32 +0000 (18:18 +0000)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 14 Nov 2025 18:18:32 +0000 (18:18 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 14 Nov 2025 18:18:32 +0000 (18:18 +0000)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index 556eca66c7bf2d3c5ed5de62d1e99f662c3d8ed3..a55f95533453a4c49584c3e9f44136a199f758fb 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1313,7 +1313,8 @@ html_is_absolute_url(std::string_view st) -> bool
  static auto
  html_process_url_tag(rspamd_mempool_t *pool,
                                          struct html_tag *tag,
-                                        struct html_content *hc) -> std::optional<struct rspamd_url *>
+                                        struct html_content *hc,
+                                        lua_State *L) -> std::optional<struct rspamd_url *>
  {
         auto found_href_maybe = tag->find_href();
  
@@ -1370,7 +1371,7 @@ html_process_url_tag(rspamd_mempool_t *pool,
                         }
                 }
  
-               auto url = html_process_url(pool, href_value).value_or(nullptr);
+               auto url = html_process_url(pool, href_value, L).value_or(nullptr);
  
                 if (url) {
                         if (tag->id != Tag_A) {
@@ -1431,7 +1432,8 @@ html_url_query_callback(struct rspamd_url *url, gsize start_offset,
  static void
  html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
                                            khash_t(rspamd_url_hash) * url_set,
-                                          GPtrArray *part_urls)
+                                          GPtrArray *part_urls,
+                                          lua_State *L)
  {
         if (url->querylen > 0) {
                 struct rspamd_html_url_query_cbd qcbd;
@@ -1444,7 +1446,7 @@ html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
                 rspamd_url_find_multiple(pool,
                                                                  rspamd_url_query_unsafe(url), url->querylen,
                                                                  RSPAMD_URL_FIND_ALL, NULL,
-                                                                html_url_query_callback, &qcbd, NULL);
+                                                                html_url_query_callback, &qcbd, L);
         }
  
         if (part_urls) {
@@ -1549,7 +1551,7 @@ html_process_img_tag(rspamd_mempool_t *pool,
                                 if (img->src) {
  
                                         std::string_view cpy{*href_value};
-                                       auto maybe_url = html_process_url(pool, cpy);
+                                       auto maybe_url = html_process_url(pool, cpy, L);
  
                                         if (maybe_url) {
                                                 img->url = maybe_url.value();
@@ -1852,7 +1854,8 @@ html_process_displayed_href_tag(rspamd_mempool_t *pool,
                                                                 const struct html_tag *cur_tag,
                                                                 GList **exceptions,
                                                                 khash_t(rspamd_url_hash) * url_set,
-                                                               goffset dest_offset) -> void
+                                                               goffset dest_offset,
+                                                               lua_State *L) -> void
  {
  
         if (std::holds_alternative<rspamd_url *>(cur_tag->extra)) {
@@ -1862,7 +1865,7 @@ html_process_displayed_href_tag(rspamd_mempool_t *pool,
                                                                  exceptions, url_set,
                                                                  data,
                                                                  dest_offset,
-                                                                url);
+                                                                url, L);
         }
  }
  
@@ -2040,7 +2043,8 @@ html_append_tag_content(rspamd_mempool_t *pool,
                         html_process_displayed_href_tag(pool, hc,
                                                                                         {hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
                                                                                         tag, exceptions,
-                                                                                       url_set, initial_parsed_offset);
+                                                                                       url_set, initial_parsed_offset,
+                                                                                       task->cfg ? task->cfg->lua_state : NULL);
                         /* Count display URL mismatches when URL is present */
                         if (std::holds_alternative<rspamd_url *>(tag->extra)) {
                                 auto *u = std::get<rspamd_url *>(tag->extra);
@@ -2218,7 +2222,8 @@ auto html_process_input(struct rspamd_task *task,
                         /* If action present and absolute, compare eTLD+1 with first-party */
                         if (auto href = cur_tag->find_href()) {
                                 if (html_is_absolute_url(*href)) {
-                                       auto maybe_url = html_process_url(pool, *href);
+                                       auto maybe_url = html_process_url(pool, *href,
+                                                                                                         task->cfg ? task->cfg->lua_state : NULL);
                                         if (maybe_url) {
                                                 struct rspamd_url *u = maybe_url.value();
                                                 if (u->hostlen > 0) {
@@ -2268,7 +2273,8 @@ auto html_process_input(struct rspamd_task *task,
  
                                                                 if (!urlv.empty()) {
                                                                         /* validate and count; do not add to urls set */
-                                                                       auto maybe_url = html_process_url(pool, urlv);
+                                                                       auto maybe_url = html_process_url(pool, urlv,
+                                                                                                                                         task->cfg ? task->cfg->lua_state : NULL);
                                                                         if (maybe_url) {
                                                                                 hc->features.meta_refresh_urls++;
                                                                         }
@@ -2338,7 +2344,8 @@ auto html_process_input(struct rspamd_task *task,
                 }
  
                 if (cur_tag->flags & FL_HREF && html_document_state == html_document_state::body) {
-                       auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+                       auto maybe_url = html_process_url_tag(pool, cur_tag, hc,
+                                                                                                 task->cfg ? task->cfg->lua_state : NULL);
  
                         if (maybe_url.has_value()) {
                                 url = maybe_url.value();
@@ -2352,7 +2359,8 @@ auto html_process_input(struct rspamd_task *task,
                                                 }
                                                 url->part_order = cur_url_part_order++;
                                                 html_process_query_url(pool, url, url_set,
-                                                                                          part_urls);
+                                                                                          part_urls,
+                                                                                          task->cfg ? task->cfg->lua_state : NULL);
                                         }
                                         else {
                                                 url = maybe_existing;
@@ -2438,7 +2446,8 @@ auto html_process_input(struct rspamd_task *task,
                         /*
                          * Base is allowed only within head tag but HTML is retarded
                          */
-                       auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+                       auto maybe_url = html_process_url_tag(pool, cur_tag, hc,
+                                                                                                 task->cfg ? task->cfg->lua_state : NULL);
  
                         if (maybe_url) {
                                 msg_debug_html("got valid base tag");
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx

index 3ef51f2b6bc484ecdd112a5c40fec746357594f9..2e4cefdeb657ac4c2c2621bf3f98ac3a3cdf861b 100644 (file)
--- a/src/libserver/html/html_url.cxx
+++ b/src/libserver/html/html_url.cxx
@@ -159,7 +159,7 @@ auto html_url_is_phished(rspamd_mempool_t *pool,
  
                 text_url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
                 auto rc = rspamd_url_parse(text_url, url_str, strlen(url_str), pool,
-                                                                  RSPAMD_URL_PARSE_TEXT, NULL);
+                                                                  RSPAMD_URL_PARSE_TEXT, L);
  
                 if (rc == URI_ERRNO_OK) {
                         text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
@@ -229,7 +229,8 @@ void html_check_displayed_url(rspamd_mempool_t *pool,
                                                           void *url_set,
                                                           std::string_view visible_part,
                                                           goffset href_offset,
-                                                         struct rspamd_url *url)
+                                                         struct rspamd_url *url,
+                                                         lua_State *L)
  {
         struct rspamd_url *displayed_url = nullptr;
         struct rspamd_url *turl;
@@ -307,7 +308,7 @@ void html_check_displayed_url(rspamd_mempool_t *pool,
         rspamd_normalise_unicode_inplace(url->ext->visible_part, &dlen);
  }
  
-auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+auto html_process_url(rspamd_mempool_t *pool, std::string_view &input, lua_State *L)
         -> std::optional<struct rspamd_url *>
  {
         struct rspamd_url *url;
@@ -455,7 +456,7 @@ auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
  
         url = rspamd_mempool_alloc0_type(pool, struct rspamd_url);
         rspamd_url_normalise_propagate_flags(pool, decoded, &dlen, saved_flags);
-       rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF, NULL);
+       rc = rspamd_url_parse(url, decoded, dlen, pool, RSPAMD_URL_PARSE_HREF, L);
  
         /* Filter some completely damaged urls */
         if (rc == URI_ERRNO_OK && url->hostlen > 0 &&
diff --git a/src/libserver/html/html_url.hxx b/src/libserver/html/html_url.hxx

index 46dde6d382eaeb894ee4837c2213e1cf1908a637..20275b6b3e4012518457c72f96313dcaf3798f93 100644 (file)
--- a/src/libserver/html/html_url.hxx
+++ b/src/libserver/html/html_url.hxx
@@ -53,15 +53,17 @@ auto html_check_displayed_url(rspamd_mempool_t *pool,
                                                           void *url_set,
                                                           std::string_view visible_part,
                                                           goffset href_offset,
-                                                         struct rspamd_url *url) -> void;
+                                                         struct rspamd_url *url,
+                                                         lua_State *L) -> void;
  
  /**
   * Process HTML url (e.g. for href component)
   * @param pool
   * @param input may be modified during the process
+ * @param L Lua state for consultation (may be NULL)
   * @return
   */
-auto html_process_url(rspamd_mempool_t *pool, std::string_view &input)
+auto html_process_url(rspamd_mempool_t *pool, std::string_view &input, lua_State *L)
         -> std::optional<struct rspamd_url *>;
  }// namespace rspamd::html
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Fri, 14 Nov 2025 18:18:32 +0000 (18:18 +0000)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Fri, 14 Nov 2025 18:18:32 +0000 (18:18 +0000)
src/libserver/html/html.cxx		patch | blob | blame | history
src/libserver/html/html_url.cxx		patch | blob | blame | history
src/libserver/html/html_url.hxx		patch | blob | blame | history