git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Use UTF-8 buffer for HTML URL rewriting
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 13 Oct 2025 09:22:52 +0000 (10:22 +0100)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 13 Oct 2025 09:23:18 +0000 (10:23 +0100)
The HTML parser calculates attribute value offsets from the UTF-8
buffer (utf_raw_content), but URL rewriting was incorrectly applying
patches to the MIME-decoded buffer (parsed). When charset conversion
occurs (e.g., from ISO-8859-1 to UTF-8), the same character can occupy
a different number of bytes in each buffer, so offsets computed against
the UTF-8 buffer point at the wrong positions in the MIME-decoded one.

This commit ensures all URL rewriting operations use the UTF-8 buffer
consistently, preventing corruption with non-ASCII characters.

src/libserver/html/html_tag.hxx
src/libserver/html/html_url_rewrite.hxx
src/lua/lua_task.c

index 3daa89edcfa6415f3caee5962cb5f5b617d214d3..6df1955d4aae225726614c54d86117ca3856d0e2 100644 (file)
@@ -156,8 +156,8 @@ struct html_component_name : html_component_base {
 
 struct html_component_href : html_component_base {
        std::string_view value;
-       std::size_t offset = 0;// offset in decoded HTML buffer
-       std::size_t len = 0;   // length of raw attribute value
+       std::size_t offset = 0;// offset in UTF-8 HTML buffer (utf_raw_content)
+       std::size_t len = 0;   // length in UTF-8 HTML buffer (utf_raw_content)
        explicit constexpr html_component_href(std::string_view v, std::size_t off = 0, std::size_t l = 0)
                : value(v), offset(off), len(l)
        {
@@ -992,8 +992,8 @@ struct html_component_title : html_component_base {
 
 struct html_component_src : html_component_base {
        std::string_view value;
-       std::size_t offset = 0;// offset in decoded HTML buffer
-       std::size_t len = 0;   // length of raw attribute value
+       std::size_t offset = 0;// offset in UTF-8 HTML buffer (utf_raw_content)
+       std::size_t len = 0;   // length in UTF-8 HTML buffer (utf_raw_content)
        explicit html_component_src(std::string_view v, std::size_t off = 0, std::size_t l = 0)
                : value(v), offset(off), len(l)
        {
index 7de79acc26161753550887813268c7a4cc173fa6..53ea0738fbb31cd518eafbf88a2e94d5aff5a2c3 100644 (file)
@@ -40,19 +40,19 @@ struct rewrite_candidate {
        const html_tag *tag;       // Tag containing the attribute
        std::string_view attr_name;// "href" or "src"
        std::string absolute_url;  // Absolute/canonicalized URL for Lua policy
-       std::size_t offset;        // Offset of attribute value in decoded HTML buffer
-       std::size_t len;           // Length of attribute value in decoded HTML buffer
+       std::size_t offset;        // Offset of attribute value in UTF-8 HTML buffer (utf_raw_content)
+       std::size_t len;           // Length of attribute value in UTF-8 HTML buffer (utf_raw_content)
        int part_id;               // MIME part ID (for multi-part messages)
 };
 
 /**
- * Patch to apply to the decoded HTML buffer
+ * Patch to apply to the UTF-8 HTML buffer
  * Represents a single replacement operation
  */
 struct rewrite_patch {
        int part_id;            // MIME part ID
-       std::size_t offset;     // Offset in decoded buffer
-       std::size_t len;        // Length to replace
+       std::size_t offset;     // Offset in UTF-8 HTML buffer (utf_raw_content)
+       std::size_t len;        // Length to replace in UTF-8 HTML buffer (utf_raw_content)
        std::string replacement;// Replacement string
 
        // For sorting patches by offset
@@ -83,8 +83,8 @@ auto enumerate_rewrite_candidates(const html_content *hc, struct rspamd_task *ta
 auto validate_patches(std::vector<rewrite_patch> &patches) -> bool;
 
 /**
- * Apply patches to a decoded HTML buffer
- * @param original original decoded buffer
+ * Apply patches to a UTF-8 HTML buffer
+ * @param original original UTF-8 HTML buffer (utf_raw_content)
  * @param patches sorted, non-overlapping patches
  * @return rewritten buffer
  */
@@ -99,7 +99,7 @@ auto apply_patches(std::string_view original, const std::vector<rewrite_patch> &
  * @param hc HTML content
  * @param func_ref Lua function reference from luaL_ref
  * @param part_id MIME part ID
- * @param original_html Original HTML content (decoded)
+ * @param original_html Original HTML content (UTF-8, from utf_raw_content)
  * @return Rewritten HTML or nullopt if no changes
  */
 auto process_html_url_rewrite(struct rspamd_task *task,
index ca72dc7f702001dc1928ec90717078bbdf2ca956..a111ef97299273bb708f1b199853fb9f4bf21a94 100644 (file)
@@ -7828,18 +7828,23 @@ lua_task_rewrite_html_urls(lua_State *L)
                        continue;
                }
 
+               /* Skip if no UTF-8 content available */
+               if (!text_part->utf_raw_content || text_part->utf_raw_content->len == 0) {
+                       continue;
+               }
+
                char *output_html = NULL;
                gsize output_len = 0;
 
-               /* Process URL rewriting using C wrapper */
+               /* Process URL rewriting using C wrapper on UTF-8 buffer */
                int ret = rspamd_html_url_rewrite(
                        task,
                        L,
                        text_part->html,
                        func_ref,
                        text_part->mime_part->part_number,
-                       (const char *) text_part->parsed.begin,
-                       text_part->parsed.len,
+                       (const char *) text_part->utf_raw_content->data,
+                       text_part->utf_raw_content->len,
                        &output_html,
                        &output_len);