]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] html: preserve verbatim href as url->raw (issue #5986)
authorVsevolod Stakhov <vsevolod@rspamd.com>
Fri, 17 Apr 2026 04:55:34 +0000 (05:55 +0100)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Fri, 17 Apr 2026 04:55:34 +0000 (05:55 +0100)
When the same URL appears in both the HTML and plain-text parts of a
multipart/alternative message, the HTML variant wins deduplication (HTML
parts are processed first).  html_process_url builds a partially
percent-decoded scratch buffer — decoding %40, %3A, %7C, %2F, %3F, %5C
to defeat URL obfuscation — and rspamd_url_parse sets url->raw to that
buffer.  Consumers of url:get_raw() then saw a partially decoded form
instead of the bytes that actually appeared in the message.

Re-point url->raw to a mempool-owned copy of the verbatim (trimmed)
href after a successful parse.  Length is clamped to G_MAXUINT16 / 2
with RSPAMD_URL_FLAG_TRUNCATED on overflow, matching rspamd_url_parse's
own truncation policy.

Adds test/lua/unit/url_raw_preserve.lua covering plain-text-only,
HTML-only, same-URL-in-both-parts (the issue reproduction), and a
GC-stability check for the mempool-backed raw buffer.

src/libserver/html/html_url.cxx
test/lua/unit/url_raw_preserve.lua [new file with mode: 0644]

index 2c800688019c26df5c4d86c04d0f0e7d5caa1498..22db7f66f69748d93de8a4e6b7b2945cc958f717 100644 (file)
@@ -20,6 +20,7 @@
 #include "libserver/logger.h"
 #include "rspamd.h"
 
+#include <algorithm>
 #include <unicode/idna.h>
 
 namespace rspamd::html {
@@ -477,6 +478,25 @@ auto html_process_url(rspamd_mempool_t *pool, std::string_view &input, lua_State
                        }
                }
 
+               /*
+                * Preserve the verbatim (trimmed) href as url->raw so that get_raw()
+                * returns the URL text as it appeared in the message rather than the
+                * partially percent-decoded scratch buffer used for parsing.  Without
+                * this, when the same URL appears in both the HTML and plain-text
+                * parts, the HTML variant wins deduplication (HTML parts are processed
+                * first) and get_raw() would return the decoded form (issue #5986).
+                * The copy lives in the task mempool, matching the URL's lifetime.
+                */
+               const auto raw_cap = std::min<gsize>(sz, G_MAXUINT16 / 2);
+               auto *original_raw = rspamd_mempool_alloc_buffer(pool, raw_cap + 1);
+               memcpy(original_raw, trimmed, raw_cap);
+               original_raw[raw_cap] = '\0';
+               url->raw = original_raw;
+               url->rawlen = (uint16_t) raw_cap;
+               if (raw_cap < sz) {
+                       url->flags |= RSPAMD_URL_FLAG_TRUNCATED;
+               }
+
                decoded = url->string;
 
                input = {decoded, url->urllen};
diff --git a/test/lua/unit/url_raw_preserve.lua b/test/lua/unit/url_raw_preserve.lua
new file mode 100644 (file)
index 0000000..c68a7db
--- /dev/null
@@ -0,0 +1,142 @@
+-- Regression test for issue #5986: url:get_raw() must return the URL text
+-- as it appeared in the original message (preserving percent-encoding),
+-- even when the same URL is present in both the HTML part (where href
+-- values are partially percent-decoded during parsing) and the plain-text
+-- part.  HTML parts are processed before plain-text parts, so before the
+-- fix the HTML-sourced URL (with a partially-decoded raw buffer) would
+-- win deduplication and url:get_raw() would return a mangled form.
+
+context("URL get_raw preservation across multipart/alternative", function()
+  local rspamd_task = require("rspamd_task")
+
+  -- A long percent-encoded URL that exercises the html_process_url
+  -- decoder (which decodes %40, %3A, %7C, %2F, %3F, %5C back to
+  -- their literal characters).  Using characters the decoder touches
+  -- so we can distinguish "raw as seen in message" from "decoded".
+  local encoded_url =
+    "https://safelinks.example.com/?url=https%3A%2F%2Flink.example.com%2Fpath" ..
+    "&data=05%7C02%7Cmail%40example.com%7C0%7C%7C%7C&reserved=0"
+  local decoded_url =
+    "https://safelinks.example.com/?url=https://link.example.com/path" ..
+    "&data=05|02|mail@example.com|0|||&reserved=0"
+
+  local function find_safelinks(urls)
+    for _, u in ipairs(urls) do
+      if u:get_host() == "safelinks.example.com" then
+        return u
+      end
+    end
+    return nil
+  end
+
+  test("plain-text-only URL: raw is verbatim", function()
+    local msg = table.concat({
+      "From: test@example.com",
+      "To: nobody@example.com",
+      "Subject: test",
+      "Content-Type: text/plain",
+      "",
+      "Visit " .. encoded_url,
+      "",
+    }, "\r\n")
+
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+    task:process_message()
+
+    local u = find_safelinks(task:get_urls({ "https", "http" }))
+    assert_not_nil(u, "safelinks URL should be extracted")
+    assert_equal(u:get_raw(), encoded_url,
+      "plain-text URL raw must equal the verbatim text")
+    task:destroy()
+  end)
+
+  test("HTML-only URL: raw is verbatim href", function()
+    local msg = table.concat({
+      "From: test@example.com",
+      "To: nobody@example.com",
+      "Subject: test",
+      "Content-Type: text/html",
+      "",
+      '<html><body><a href="' .. encoded_url .. '">Click</a></body></html>',
+      "",
+    }, "\r\n")
+
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+    task:process_message()
+
+    local u = find_safelinks(task:get_urls({ "https", "http" }))
+    assert_not_nil(u, "safelinks URL should be extracted")
+    assert_equal(u:get_raw(), encoded_url,
+      "HTML href raw must equal the verbatim href bytes, not the partially decoded form")
+    task:destroy()
+  end)
+
+  test("same URL in HTML and plain text: raw stays verbatim (issue #5986)", function()
+    local boundary = "boundary-string"
+    local msg = table.concat({
+      "From: test@example.com",
+      "To: nobody@example.com",
+      "Subject: test",
+      "MIME-Version: 1.0",
+      'Content-Type: multipart/alternative; boundary="' .. boundary .. '"',
+      "",
+      "--" .. boundary,
+      "Content-Type: text/plain; charset=utf-8",
+      "",
+      "Visit " .. encoded_url,
+      "",
+      "--" .. boundary,
+      "Content-Type: text/html; charset=utf-8",
+      "",
+      '<html><body><a href="' .. encoded_url .. '">Click</a></body></html>',
+      "",
+      "--" .. boundary .. "--",
+      "",
+    }, "\r\n")
+
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+    task:process_message()
+
+    local u = find_safelinks(task:get_urls({ "https", "http" }))
+    assert_not_nil(u, "safelinks URL should be extracted")
+    -- Before the fix this returned the decoded form.
+    assert_equal(u:get_raw(), encoded_url,
+      "dedup winner's raw must preserve the verbatim percent-encoded text")
+    assert_not_equal(u:get_raw(), decoded_url,
+      "raw must not be the partially-decoded form used internally for parsing")
+    task:destroy()
+  end)
+
+  test("raw buffer outlives parse scratch (memory ownership)", function()
+    -- Ensure that url:get_raw() continues to return stable bytes after
+    -- processing finishes and the scratch buffers used during parsing
+    -- go out of scope.  Both strings must survive a GC pass since the
+    -- raw pointer lives in the task mempool.
+    local msg = table.concat({
+      "From: test@example.com",
+      "To: nobody@example.com",
+      "Subject: test",
+      "Content-Type: text/html",
+      "",
+      '<html><body><a href="' .. encoded_url .. '">X</a></body></html>',
+      "",
+    }, "\r\n")
+
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+    task:process_message()
+
+    local u = find_safelinks(task:get_urls({ "https", "http" }))
+    assert_not_nil(u)
+    local first = u:get_raw()
+    collectgarbage("collect")
+    collectgarbage("collect")
+    local second = u:get_raw()
+    assert_equal(first, second, "raw must remain stable across GC (mempool-backed)")
+    assert_equal(first, encoded_url)
+    task:destroy()
+  end)
+end)