git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Refactor] Direct C++ Lua binding for get_html_urls()
author Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 14 Oct 2025 08:02:46 +0000 (09:02 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 14 Oct 2025 08:46:34 +0000 (09:46 +0100)
Replace the C wrapper layer (rspamd_html_enumerate_urls) with a direct
C++ Lua binding to eliminate unnecessary data copying. Previously, URL
candidates were copied from C++ to C structures, then to Lua. Now they
are pushed directly from C++ to Lua using lua_pushlstring.

Changes:
- Add lua_html_url_rewrite.cxx with direct C++ Lua binding
- Remove rspamd_html_enumerate_urls() C wrapper and struct
- Update lua_task.c to use extern declaration for C++ function
- Add lua_html_url_rewrite.cxx to CMakeLists.txt
- Use lua_createtable() to preallocate tables with known sizes

This improves performance by avoiding intermediate allocations, string
copies, and table reallocations while maintaining the same Lua API.

src/libserver/html/html_url_rewrite_c.cxx
src/libserver/html/html_url_rewrite_c.h
src/lua/CMakeLists.txt
src/lua/lua_html_url_rewrite.cxx [new file with mode: 0644]
src/lua/lua_html_url_rewrite.h [new file with mode: 0644]
src/lua/lua_task.c

index 5f06a459f1fa557b67976cc8d86c1efa357555d9..f3e66672fe8ce4036b01b62cd469e5e5f5ede0a0 100644 (file)
 
 extern "C" {
 
-int rspamd_html_enumerate_urls(struct rspamd_task *task,
-                                                          void *html_content,
-                                                          int part_id,
-                                                          struct rspamd_html_url_candidate **candidates,
-                                                          gsize *n_candidates)
-{
-       if (!task || !html_content || !candidates || !n_candidates) {
-               return -1;
-       }
-
-       auto *hc = static_cast<const rspamd::html::html_content *>(html_content);
-
-       // Enumerate candidates using C++ function
-       auto cpp_candidates = rspamd::html::enumerate_rewrite_candidates(hc, task, part_id);
-
-       if (cpp_candidates.empty()) {
-               *candidates = nullptr;
-               *n_candidates = 0;
-               return 0;
-       }
-
-       // Allocate C-style array from task pool
-       *n_candidates = cpp_candidates.size();
-       *candidates = (struct rspamd_html_url_candidate *) rspamd_mempool_alloc(
-               task->task_pool,
-               sizeof(struct rspamd_html_url_candidate) * cpp_candidates.size());
-
-       // Convert C++ candidates to C candidates
-       for (size_t i = 0; i < cpp_candidates.size(); i++) {
-               const auto &cpp_cand = cpp_candidates[i];
-
-               // Allocate strings from task pool
-               char *url_str = (char *) rspamd_mempool_alloc(
-                       task->task_pool,
-                       cpp_cand.absolute_url.size() + 1);
-               memcpy(url_str, cpp_cand.absolute_url.data(), cpp_cand.absolute_url.size());
-               url_str[cpp_cand.absolute_url.size()] = '\0';
-
-               char *attr_str = (char *) rspamd_mempool_alloc(
-                       task->task_pool,
-                       cpp_cand.attr_name.size() + 1);
-               memcpy(attr_str, cpp_cand.attr_name.data(), cpp_cand.attr_name.size());
-               attr_str[cpp_cand.attr_name.size()] = '\0';
-
-               // Get tag name
-               const char *tag_name = "unknown";
-               gsize tag_len = 7;
-               if (cpp_cand.tag) {
-                       // Use rspamd_html_tag_by_id which returns const char*
-                       extern const char *rspamd_html_tag_by_id(int id);
-                       tag_name = rspamd_html_tag_by_id(cpp_cand.tag->id);
-                       if (tag_name) {
-                               tag_len = strlen(tag_name);
-                       }
-                       else {
-                               tag_name = "unknown";
-                               tag_len = 7;
-                       }
-               }
-
-               (*candidates)[i].url = url_str;
-               (*candidates)[i].url_len = cpp_cand.absolute_url.size();
-               (*candidates)[i].attr = attr_str;
-               (*candidates)[i].attr_len = cpp_cand.attr_name.size();
-               (*candidates)[i].tag = tag_name;
-               (*candidates)[i].tag_len = tag_len;
-       }
-
-       return 0;
-}
-
 int rspamd_html_url_rewrite(struct rspamd_task *task,
                                                        struct lua_State *L,
                                                        void *html_content,
index c0906e00a7d15f435d045dfc83d4fbc405c00116..798c8b39872f2f43517daa26e8c615a4051d49a1 100644 (file)
@@ -27,33 +27,6 @@ struct rspamd_task;
 
 struct lua_State;
 
-/**
- * URL candidate info for C interface
- */
-struct rspamd_html_url_candidate {
-       const char *url; // Absolute URL string (NUL-terminated)
-       const char *attr;// Attribute name: "href" or "src" (NUL-terminated)
-       const char *tag; // Tag name (NUL-terminated)
-       gsize url_len;   // Length of URL string
-       gsize attr_len;  // Length of attr string
-       gsize tag_len;   // Length of tag string
-};
-
-/**
- * C wrapper for enumerating HTML URL rewrite candidates
- * @param task Rspamd task
- * @param html_content HTML content pointer (void* cast of html_content*)
- * @param part_id MIME part ID
- * @param candidates Output array of candidates (allocated from task pool if successful)
- * @param n_candidates Output count of candidates
- * @return 0 on success, -1 on error
- */
-int rspamd_html_enumerate_urls(struct rspamd_task *task,
-                                                          void *html_content,
-                                                          int part_id,
-                                                          struct rspamd_html_url_candidate **candidates,
-                                                          gsize *n_candidates);
-
 /**
  * C wrapper for HTML URL rewriting
  * @param task Rspamd task
index 8fb976ebdf974215158dbc35b28af3c3935a672f..64b6ff918717e73c5b4835c7f28b8a5ea7af94ef 100644 (file)
@@ -22,6 +22,7 @@ SET(LUASRC                      ${CMAKE_CURRENT_SOURCE_DIR}/lua_common.c
                                          ${CMAKE_CURRENT_SOURCE_DIR}/lua_util.c
                                          ${CMAKE_CURRENT_SOURCE_DIR}/lua_tcp.c
                                          ${CMAKE_CURRENT_SOURCE_DIR}/lua_html.cxx
+                                         ${CMAKE_CURRENT_SOURCE_DIR}/lua_html_url_rewrite.cxx
                                          ${CMAKE_CURRENT_SOURCE_DIR}/lua_sqlite3.c
                                          ${CMAKE_CURRENT_SOURCE_DIR}/lua_cryptobox.c
                                          ${CMAKE_CURRENT_SOURCE_DIR}/lua_map.c
diff --git a/src/lua/lua_html_url_rewrite.cxx b/src/lua/lua_html_url_rewrite.cxx
new file mode 100644 (file)
index 0000000..15fa47c
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lua_html_url_rewrite.h"
+#include "libserver/html/html_url_rewrite.hxx"
+#include "libserver/html/html.h"
+#include "libserver/html/html.hxx"
+#include "libserver/task.h"
+#include "message.h"
+#include "lua_common.h"
+
+extern "C" {
+
+int lua_task_get_html_urls(lua_State *L)
+{
+       // Get task from Lua stack
+       auto *task = lua_check_task(L, 1);
+
+       if (!task || !MESSAGE_FIELD_CHECK(task, text_parts)) {
+               lua_pushnil(L);
+               return 1;
+       }
+
+       // Create result table
+       lua_newtable(L);
+       int results = 0;
+
+       // Iterate through text parts
+       unsigned int i;
+       void *part;
+       PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
+       {
+               auto *text_part = static_cast<rspamd_mime_text_part *>(part);
+
+               // Only process HTML parts
+               if (!IS_TEXT_PART_HTML(text_part) || !text_part->html) {
+                       continue;
+               }
+
+               // Skip if no UTF-8 content available
+               if (!text_part->utf_raw_content || text_part->utf_raw_content->len == 0) {
+                       continue;
+               }
+
+               // Enumerate URLs directly using C++ function - no copying!
+               auto candidates = rspamd::html::enumerate_rewrite_candidates(
+                       static_cast<const rspamd::html::html_content *>(text_part->html),
+                       task,
+                       text_part->mime_part->part_number);
+
+               if (candidates.empty()) {
+                       continue;
+               }
+
+               // Create array for this part: table[part_number] = {url_info_1, url_info_2, ...}
+               lua_pushinteger(L, text_part->mime_part->part_number);
+               lua_createtable(L, candidates.size(), 0);// URLs array for this part
+
+               for (size_t j = 0; j < candidates.size(); j++) {
+                       const auto &cand = candidates[j];
+
+                       lua_pushinteger(L, j + 1);// 1-indexed array
+                       lua_createtable(L, 0, 3); // URL info table with 3 fields: url, attr, tag
+
+                       // url field - push string without copying
+                       lua_pushstring(L, "url");
+                       lua_pushlstring(L, cand.absolute_url.data(), cand.absolute_url.size());
+                       lua_settable(L, -3);
+
+                       // attr field - push string_view without copying
+                       lua_pushstring(L, "attr");
+                       lua_pushlstring(L, cand.attr_name.data(), cand.attr_name.size());
+                       lua_settable(L, -3);
+
+                       // tag field - get tag name
+                       lua_pushstring(L, "tag");
+                       if (cand.tag) {
+                               const char *tag_name = rspamd_html_tag_by_id(cand.tag->id);
+                               if (tag_name) {
+                                       lua_pushstring(L, tag_name);
+                               }
+                               else {
+                                       lua_pushstring(L, "unknown");
+                               }
+                       }
+                       else {
+                               lua_pushstring(L, "unknown");
+                       }
+                       lua_settable(L, -3);
+
+                       lua_settable(L, -3);// Add url info to URLs array
+               }
+
+               lua_settable(L, -3);// Add part to main table
+               results++;
+       }
+
+       if (results == 0) {
+               lua_pop(L, 1);
+               lua_pushnil(L);
+       }
+
+       return 1;
+}
+
+}// extern "C"
diff --git a/src/lua/lua_html_url_rewrite.h b/src/lua/lua_html_url_rewrite.h
new file mode 100644 (file)
index 0000000..44bf4ee
--- /dev/null
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_LUA_HTML_URL_REWRITE_H
+#define RSPAMD_LUA_HTML_URL_REWRITE_H
+
+#include "config.h"
+
+struct rspamd_task;
+struct lua_State;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Lua binding for task:get_html_urls(), implemented in C++ with C linkage.
+ * Pushes nil, or a table keyed by part number of {url, attr, tag} arrays.
+ * @param L Lua state
+ * @return number of return values on Lua stack
+ */
+int lua_task_get_html_urls(lua_State *L);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//RSPAMD_LUA_HTML_URL_REWRITE_H
index 5f0295268cbf74ddc9725b2661b7ce18a23afc8b..e10c7e089ba6c8a2c696f6d8707e8ff658da74e5 100644 (file)
@@ -33,6 +33,7 @@
 #include <math.h>
 #include "libmime/received.h"
 #include "libserver/html/html_url_rewrite_c.h"
+#include "lua_html_url_rewrite.h"
 
 /***
  * @module rspamd_task
@@ -1291,7 +1292,8 @@ LUA_FUNCTION_DEF(task, rewrite_html_urls);
  * Useful for async URL checking workflows where URLs need to be batched.
  * @return {table|nil} table indexed by part number, each containing an array of URL info tables with keys: url, attr, tag
  */
-LUA_FUNCTION_DEF(task, get_html_urls);
+/* Implemented in lua_html_url_rewrite.cxx as C++ binding */
+extern int lua_task_get_html_urls(lua_State *L);
 
 static const struct luaL_reg tasklib_f[] = {
        LUA_INTERFACE_DEF(task, create),
@@ -7884,87 +7886,7 @@ lua_task_rewrite_html_urls(lua_State *L)
        return 1;
 }
 
-static int
-lua_task_get_html_urls(lua_State *L)
-{
-       struct rspamd_task *task = lua_check_task(L, 1);
-
-       if (!task || !MESSAGE_FIELD_CHECK(task, text_parts)) {
-               lua_pushnil(L);
-               return 1;
-       }
-
-       /* Create result table */
-       lua_newtable(L);
-       int results = 0;
-       unsigned int i;
-       void *part;
-
-       /* Iterate through text parts */
-       PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
-       {
-               struct rspamd_mime_text_part *text_part = (struct rspamd_mime_text_part *) part;
-
-               /* Only process HTML parts */
-               if (!IS_TEXT_PART_HTML(text_part) || !text_part->html) {
-                       continue;
-               }
-
-               /* Skip if no UTF-8 content available */
-               if (!text_part->utf_raw_content || text_part->utf_raw_content->len == 0) {
-                       continue;
-               }
-
-               struct rspamd_html_url_candidate *candidates = NULL;
-               gsize n_candidates = 0;
-
-               /* Enumerate URLs using C wrapper */
-               int ret = rspamd_html_enumerate_urls(
-                       task,
-                       text_part->html,
-                       text_part->mime_part->part_number,
-                       &candidates,
-                       &n_candidates);
-
-               if (ret == 0 && candidates && n_candidates > 0) {
-                       /* Create array for this part: table[part_number] = {url_info_1, url_info_2, ...} */
-                       lua_pushinteger(L, text_part->mime_part->part_number);
-                       lua_newtable(L); /* URLs array for this part */
-
-                       for (gsize j = 0; j < n_candidates; j++) {
-                               lua_pushinteger(L, j + 1); /* 1-indexed array */
-                               lua_newtable(L);           /* URL info table */
-
-                               /* url field */
-                               lua_pushstring(L, "url");
-                               lua_pushstring(L, candidates[j].url);
-                               lua_settable(L, -3);
-
-                               /* attr field */
-                               lua_pushstring(L, "attr");
-                               lua_pushstring(L, candidates[j].attr);
-                               lua_settable(L, -3);
-
-                               /* tag field */
-                               lua_pushstring(L, "tag");
-                               lua_pushstring(L, candidates[j].tag);
-                               lua_settable(L, -3);
-
-                               lua_settable(L, -3); /* Add url info to URLs array */
-                       }
-
-                       lua_settable(L, -3); /* Add part to main table */
-                       results++;
-               }
-       }
-
-       if (results == 0) {
-               lua_pop(L, 1);
-               lua_pushnil(L);
-       }
-
-       return 1;
-}
+/* lua_task_get_html_urls is implemented in lua_html_url_rewrite.cxx */
 
 /* Init part */