From: Vsevolod Stakhov Date: Tue, 14 Oct 2025 08:02:46 +0000 (+0100) Subject: [Refactor] Direct C++ Lua binding for get_html_urls() X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=843aa6bf57155c6ce3cf965cb8c9fa545a985326;p=thirdparty%2Frspamd.git [Refactor] Direct C++ Lua binding for get_html_urls() Replace the C wrapper layer (rspamd_html_enumerate_urls) with a direct C++ Lua binding to eliminate unnecessary data copying. Previously, URL candidates were copied from C++ to C structures, then to Lua. Now they are pushed directly from C++ to Lua using lua_pushlstring. Changes: - Add lua_html_url_rewrite.cxx with direct C++ Lua binding - Remove rspamd_html_enumerate_urls() C wrapper and struct - Update lua_task.c to use extern declaration for C++ function - Add lua_html_url_rewrite.cxx to CMakeLists.txt - Use lua_createtable() to preallocate tables with known sizes This improves performance by avoiding intermediate allocations, string copies, and table reallocations while maintaining the same Lua API. 
--- diff --git a/src/libserver/html/html_url_rewrite_c.cxx b/src/libserver/html/html_url_rewrite_c.cxx index 5f06a459f1..f3e66672fe 100644 --- a/src/libserver/html/html_url_rewrite_c.cxx +++ b/src/libserver/html/html_url_rewrite_c.cxx @@ -21,77 +21,6 @@ extern "C" { -int rspamd_html_enumerate_urls(struct rspamd_task *task, - void *html_content, - int part_id, - struct rspamd_html_url_candidate **candidates, - gsize *n_candidates) -{ - if (!task || !html_content || !candidates || !n_candidates) { - return -1; - } - - auto *hc = static_cast(html_content); - - // Enumerate candidates using C++ function - auto cpp_candidates = rspamd::html::enumerate_rewrite_candidates(hc, task, part_id); - - if (cpp_candidates.empty()) { - *candidates = nullptr; - *n_candidates = 0; - return 0; - } - - // Allocate C-style array from task pool - *n_candidates = cpp_candidates.size(); - *candidates = (struct rspamd_html_url_candidate *) rspamd_mempool_alloc( - task->task_pool, - sizeof(struct rspamd_html_url_candidate) * cpp_candidates.size()); - - // Convert C++ candidates to C candidates - for (size_t i = 0; i < cpp_candidates.size(); i++) { - const auto &cpp_cand = cpp_candidates[i]; - - // Allocate strings from task pool - char *url_str = (char *) rspamd_mempool_alloc( - task->task_pool, - cpp_cand.absolute_url.size() + 1); - memcpy(url_str, cpp_cand.absolute_url.data(), cpp_cand.absolute_url.size()); - url_str[cpp_cand.absolute_url.size()] = '\0'; - - char *attr_str = (char *) rspamd_mempool_alloc( - task->task_pool, - cpp_cand.attr_name.size() + 1); - memcpy(attr_str, cpp_cand.attr_name.data(), cpp_cand.attr_name.size()); - attr_str[cpp_cand.attr_name.size()] = '\0'; - - // Get tag name - const char *tag_name = "unknown"; - gsize tag_len = 7; - if (cpp_cand.tag) { - // Use rspamd_html_tag_by_id which returns const char* - extern const char *rspamd_html_tag_by_id(int id); - tag_name = rspamd_html_tag_by_id(cpp_cand.tag->id); - if (tag_name) { - tag_len = strlen(tag_name); - } - 
else { - tag_name = "unknown"; - tag_len = 7; - } - } - - (*candidates)[i].url = url_str; - (*candidates)[i].url_len = cpp_cand.absolute_url.size(); - (*candidates)[i].attr = attr_str; - (*candidates)[i].attr_len = cpp_cand.attr_name.size(); - (*candidates)[i].tag = tag_name; - (*candidates)[i].tag_len = tag_len; - } - - return 0; -} - int rspamd_html_url_rewrite(struct rspamd_task *task, struct lua_State *L, void *html_content, diff --git a/src/libserver/html/html_url_rewrite_c.h b/src/libserver/html/html_url_rewrite_c.h index c0906e00a7..798c8b3987 100644 --- a/src/libserver/html/html_url_rewrite_c.h +++ b/src/libserver/html/html_url_rewrite_c.h @@ -27,33 +27,6 @@ struct rspamd_task; struct lua_State; -/** - * URL candidate info for C interface - */ -struct rspamd_html_url_candidate { - const char *url; // Absolute URL string (NUL-terminated) - const char *attr;// Attribute name: "href" or "src" (NUL-terminated) - const char *tag; // Tag name (NUL-terminated) - gsize url_len; // Length of URL string - gsize attr_len; // Length of attr string - gsize tag_len; // Length of tag string -}; - -/** - * C wrapper for enumerating HTML URL rewrite candidates - * @param task Rspamd task - * @param html_content HTML content pointer (void* cast of html_content*) - * @param part_id MIME part ID - * @param candidates Output array of candidates (allocated from task pool if successful) - * @param n_candidates Output count of candidates - * @return 0 on success, -1 on error - */ -int rspamd_html_enumerate_urls(struct rspamd_task *task, - void *html_content, - int part_id, - struct rspamd_html_url_candidate **candidates, - gsize *n_candidates); - /** * C wrapper for HTML URL rewriting * @param task Rspamd task diff --git a/src/lua/CMakeLists.txt b/src/lua/CMakeLists.txt index 8fb976ebdf..64b6ff9187 100644 --- a/src/lua/CMakeLists.txt +++ b/src/lua/CMakeLists.txt @@ -22,6 +22,7 @@ SET(LUASRC ${CMAKE_CURRENT_SOURCE_DIR}/lua_common.c ${CMAKE_CURRENT_SOURCE_DIR}/lua_util.c 
${CMAKE_CURRENT_SOURCE_DIR}/lua_tcp.c ${CMAKE_CURRENT_SOURCE_DIR}/lua_html.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/lua_html_url_rewrite.cxx ${CMAKE_CURRENT_SOURCE_DIR}/lua_sqlite3.c ${CMAKE_CURRENT_SOURCE_DIR}/lua_cryptobox.c ${CMAKE_CURRENT_SOURCE_DIR}/lua_map.c diff --git a/src/lua/lua_html_url_rewrite.cxx b/src/lua/lua_html_url_rewrite.cxx new file mode 100644 index 0000000000..15fa47c0a1 --- /dev/null +++ b/src/lua/lua_html_url_rewrite.cxx @@ -0,0 +1,119 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "lua_html_url_rewrite.h" +#include "libserver/html/html_url_rewrite.hxx" +#include "libserver/html/html.h" +#include "libserver/html/html.hxx" +#include "libserver/task.h" +#include "message.h" +#include "lua_common.h" + +extern "C" { + +int lua_task_get_html_urls(lua_State *L) +{ + // Get task from Lua stack + auto *task = lua_check_task(L, 1); + + if (!task || !MESSAGE_FIELD_CHECK(task, text_parts)) { + lua_pushnil(L); + return 1; + } + + // Create result table (replaced by nil at the end if no part yields candidates) + lua_newtable(L); + int results = 0; + + // Iterate through text parts + unsigned int i; + void *part; + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part) + { + auto *text_part = static_cast(part); + + // Only process HTML parts + if (!IS_TEXT_PART_HTML(text_part) || !text_part->html) { + continue; + } + + // Skip if no UTF-8 content available + if (!text_part->utf_raw_content || text_part->utf_raw_content->len == 0) { + continue; + } + + // Enumerate URLs directly using C++ function - no intermediate C struct copies + auto candidates = rspamd::html::enumerate_rewrite_candidates( + static_cast(text_part->html), + task, + text_part->mime_part->part_number); + + if (candidates.empty()) { + continue; + } + + // Create array for this part: table[part_number] = {url_info_1, url_info_2, ...} + lua_pushinteger(L, text_part->mime_part->part_number); + lua_createtable(L, candidates.size(), 0);// URLs array for this part + + for (size_t j = 0; j < candidates.size(); j++) { + const auto &cand = candidates[j]; + + lua_pushinteger(L, j + 1);// 1-indexed array + lua_createtable(L, 0, 3); // URL info table with 3 fields: url, attr, tag + + // url field - pushed by pointer and length, no NUL-terminated temporary (Lua keeps its own copy) + lua_pushstring(L, "url"); + lua_pushlstring(L, cand.absolute_url.data(), cand.absolute_url.size()); + lua_settable(L, -3); + + // attr field - pushed by pointer and length, no NUL-terminated temporary (Lua keeps its own copy) + lua_pushstring(L, "attr"); + lua_pushlstring(L, cand.attr_name.data(), cand.attr_name.size()); + lua_settable(L, -3); + + // tag field - get tag name + 
lua_pushstring(L, "tag"); + if (cand.tag) { + const char *tag_name = rspamd_html_tag_by_id(cand.tag->id); + if (tag_name) { + lua_pushstring(L, tag_name); + } + else { + lua_pushstring(L, "unknown"); + } + } + else { + lua_pushstring(L, "unknown"); + } + lua_settable(L, -3); + + lua_settable(L, -3);// Add url info to URLs array + } + + lua_settable(L, -3);// Add part to main table + results++; + } + + if (results == 0) { + lua_pop(L, 1); + lua_pushnil(L); + } + + return 1; +} + +}// extern "C" diff --git a/src/lua/lua_html_url_rewrite.h b/src/lua/lua_html_url_rewrite.h new file mode 100644 index 0000000000..44bf4ee56d --- /dev/null +++ b/src/lua/lua_html_url_rewrite.h @@ -0,0 +1,41 @@ +/* + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef RSPAMD_LUA_HTML_URL_REWRITE_H +#define RSPAMD_LUA_HTML_URL_REWRITE_H + +#include "config.h" + +struct rspamd_task; +struct lua_State; + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * C++ Lua binding for task:get_html_urls() + * Extracts URLs from HTML parts without intermediate C copying + * @param L Lua state + * @return number of return values on Lua stack + */ +int lua_task_get_html_urls(lua_State *L); + +#ifdef __cplusplus +} +#endif + +#endif//RSPAMD_LUA_HTML_URL_REWRITE_H diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 5f0295268c..e10c7e089b 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -33,6 +33,7 @@ #include #include "libmime/received.h" #include "libserver/html/html_url_rewrite_c.h" +#include "lua_html_url_rewrite.h" /*** * @module rspamd_task @@ -1291,7 +1292,8 @@ LUA_FUNCTION_DEF(task, rewrite_html_urls); * Useful for async URL checking workflows where URLs need to be batched. * @return {table|nil} table indexed by part number, each containing an array of URL info tables with keys: url, attr, tag */ -LUA_FUNCTION_DEF(task, get_html_urls); +/* Implemented in lua_html_url_rewrite.cxx as C++ binding */ +extern int lua_task_get_html_urls(lua_State *L); static const struct luaL_reg tasklib_f[] = { LUA_INTERFACE_DEF(task, create), @@ -7884,87 +7886,7 @@ lua_task_rewrite_html_urls(lua_State *L) return 1; } -static int -lua_task_get_html_urls(lua_State *L) -{ - struct rspamd_task *task = lua_check_task(L, 1); - - if (!task || !MESSAGE_FIELD_CHECK(task, text_parts)) { - lua_pushnil(L); - return 1; - } - - /* Create result table */ - lua_newtable(L); - int results = 0; - unsigned int i; - void *part; - - /* Iterate through text parts */ - PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part) - { - struct rspamd_mime_text_part *text_part = (struct rspamd_mime_text_part *) part; - - /* Only process HTML parts */ - if (!IS_TEXT_PART_HTML(text_part) || !text_part->html) { - continue; - } - - /* Skip if no UTF-8 
content available */ - if (!text_part->utf_raw_content || text_part->utf_raw_content->len == 0) { - continue; - } - - struct rspamd_html_url_candidate *candidates = NULL; - gsize n_candidates = 0; - - /* Enumerate URLs using C wrapper */ - int ret = rspamd_html_enumerate_urls( - task, - text_part->html, - text_part->mime_part->part_number, - &candidates, - &n_candidates); - - if (ret == 0 && candidates && n_candidates > 0) { - /* Create array for this part: table[part_number] = {url_info_1, url_info_2, ...} */ - lua_pushinteger(L, text_part->mime_part->part_number); - lua_newtable(L); /* URLs array for this part */ - - for (gsize j = 0; j < n_candidates; j++) { - lua_pushinteger(L, j + 1); /* 1-indexed array */ - lua_newtable(L); /* URL info table */ - - /* url field */ - lua_pushstring(L, "url"); - lua_pushstring(L, candidates[j].url); - lua_settable(L, -3); - - /* attr field */ - lua_pushstring(L, "attr"); - lua_pushstring(L, candidates[j].attr); - lua_settable(L, -3); - - /* tag field */ - lua_pushstring(L, "tag"); - lua_pushstring(L, candidates[j].tag); - lua_settable(L, -3); - - lua_settable(L, -3); /* Add url info to URLs array */ - } - - lua_settable(L, -3); /* Add part to main table */ - results++; - } - } - - if (results == 0) { - lua_pop(L, 1); - lua_pushnil(L); - } - - return 1; -} +/* lua_task_get_html_urls is implemented in lua_html_url_rewrite.cxx */ /* Init part */