From: Vsevolod Stakhov Date: Mon, 13 Oct 2025 09:22:52 +0000 (+0100) Subject: [Fix] Use UTF-8 buffer for HTML URL rewriting X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=220cfd85b12e9d94d95ceeabfa9b22e8ec4ec1e1;p=thirdparty%2Frspamd.git [Fix] Use UTF-8 buffer for HTML URL rewriting The HTML parser calculates attribute value offsets from the UTF-8 buffer (utf_raw_content), but URL rewriting was incorrectly applying patches to the MIME-decoded buffer (parsed). When charset conversion occurs (e.g., from ISO-8859-1 to UTF-8), the same character can have different byte lengths, causing incorrect patch positions. This commit ensures all URL rewriting operations use the UTF-8 buffer consistently, preventing corruption with non-ASCII characters. --- diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 3daa89edcf..6df1955d4a 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -156,8 +156,8 @@ struct html_component_name : html_component_base { struct html_component_href : html_component_base { std::string_view value; - std::size_t offset = 0;// offset in decoded HTML buffer - std::size_t len = 0; // length of raw attribute value + std::size_t offset = 0;// offset in UTF-8 HTML buffer (utf_raw_content) + std::size_t len = 0; // length in UTF-8 HTML buffer (utf_raw_content) explicit constexpr html_component_href(std::string_view v, std::size_t off = 0, std::size_t l = 0) : value(v), offset(off), len(l) { @@ -992,8 +992,8 @@ struct html_component_title : html_component_base { struct html_component_src : html_component_base { std::string_view value; - std::size_t offset = 0;// offset in decoded HTML buffer - std::size_t len = 0; // length of raw attribute value + std::size_t offset = 0;// offset in UTF-8 HTML buffer (utf_raw_content) + std::size_t len = 0; // length in UTF-8 HTML buffer (utf_raw_content) explicit html_component_src(std::string_view v, std::size_t off = 0, std::size_t l = 0) : 
value(v), offset(off), len(l) { diff --git a/src/libserver/html/html_url_rewrite.hxx b/src/libserver/html/html_url_rewrite.hxx index 7de79acc26..53ea0738fb 100644 --- a/src/libserver/html/html_url_rewrite.hxx +++ b/src/libserver/html/html_url_rewrite.hxx @@ -40,19 +40,19 @@ struct rewrite_candidate { const html_tag *tag; // Tag containing the attribute std::string_view attr_name;// "href" or "src" std::string absolute_url; // Absolute/canonicalized URL for Lua policy - std::size_t offset; // Offset of attribute value in decoded HTML buffer - std::size_t len; // Length of attribute value in decoded HTML buffer + std::size_t offset; // Offset of attribute value in UTF-8 HTML buffer (utf_raw_content) + std::size_t len; // Length of attribute value in UTF-8 HTML buffer (utf_raw_content) int part_id; // MIME part ID (for multi-part messages) }; /** - * Patch to apply to the decoded HTML buffer + * Patch to apply to the UTF-8 HTML buffer * Represents a single replacement operation */ struct rewrite_patch { int part_id; // MIME part ID - std::size_t offset; // Offset in decoded buffer - std::size_t len; // Length to replace + std::size_t offset; // Offset in UTF-8 HTML buffer (utf_raw_content) + std::size_t len; // Length to replace in UTF-8 HTML buffer (utf_raw_content) std::string replacement;// Replacement string // For sorting patches by offset @@ -83,8 +83,8 @@ auto enumerate_rewrite_candidates(const html_content *hc, struct rspamd_task *ta auto validate_patches(std::vector<rewrite_patch> &patches) -> bool; /** - * Apply patches to a decoded HTML buffer - * @param original original decoded buffer + * Apply patches to a UTF-8 HTML buffer + * @param original original UTF-8 HTML buffer (utf_raw_content) * @param patches sorted, non-overlapping patches * @return rewritten buffer */ @@ -99,7 +99,7 @@ auto apply_patches(std::string_view original, const std::vector<rewrite_patch> & * @param hc HTML content * @param func_ref Lua function reference from luaL_ref * @param part_id MIME part ID - * @param 
original_html Original HTML content (decoded) + * @param original_html Original HTML content (UTF-8, from utf_raw_content) * @return Rewritten HTML or nullopt if no changes */ auto process_html_url_rewrite(struct rspamd_task *task, diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index ca72dc7f70..a111ef9729 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -7828,18 +7828,23 @@ lua_task_rewrite_html_urls(lua_State *L) continue; } + /* Skip if no UTF-8 content available */ + if (!text_part->utf_raw_content || text_part->utf_raw_content->len == 0) { + continue; + } + char *output_html = NULL; gsize output_len = 0; - /* Process URL rewriting using C wrapper */ + /* Process URL rewriting using C wrapper on UTF-8 buffer */ int ret = rspamd_html_url_rewrite( task, L, text_part->html, func_ref, text_part->mime_part->part_number, - (const char *) text_part->parsed.begin, - text_part->parsed.len, + (const char *) text_part->utf_raw_content->data, + text_part->utf_raw_content->len, &output_html, &output_len);