From b0a4a0039857890b98f6f06fb9860db182d6e0c3 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 13 Oct 2025 11:46:09 +0100
Subject: [PATCH] [Feature] Add task:get_html_urls() for async URL rewriting

Introduce a two-phase API for HTML URL rewriting that separates URL
extraction from the rewriting step. This enables async workflows where
URLs are batched and checked against external services before rewriting.

Changes:
- Add rspamd_html_enumerate_urls() C wrapper to extract URL candidates
- Add task:get_html_urls() Lua method returning URL info per HTML part
- Include comprehensive unit tests covering edge cases
- Provide async usage examples (HTTP, Redis, simple patterns)

The new API complements the existing task:rewrite_html_urls() method,
allowing users to extract URLs, perform async operations, then apply
rewrites using a lookup table callback.
---
 src/libserver/html/html_url_rewrite.cxx       |   6 +-
 src/libserver/html/html_url_rewrite_c.cxx     |  71 ++++
 src/libserver/html/html_url_rewrite_c.h       |  27 ++
 src/lua/lua_task.c                            |  91 +++++
 test/lua/unit/get_html_urls.lua               | 338 ++++++++++++++++++
 test/lua/unit/get_html_urls_async_example.lua | 273 ++++++++++++++
 test/lua/unit/url_rewrite.lua                 |  37 ++
 7 files changed, 839 insertions(+), 4 deletions(-)
 create mode 100644 test/lua/unit/get_html_urls.lua
 create mode 100644 test/lua/unit/get_html_urls_async_example.lua

diff --git a/src/libserver/html/html_url_rewrite.cxx b/src/libserver/html/html_url_rewrite.cxx
index 387490206e..5375f92964 100644
--- a/src/libserver/html/html_url_rewrite.cxx
+++ b/src/libserver/html/html_url_rewrite.cxx
@@ -123,10 +123,8 @@ auto enumerate_rewrite_candidates(const html_content *hc, struct rspamd_task *ta
 	}
 
 	// Skip data: and cid: schemes by default
-	if (url_value.size() >= 5) {
-		if (url_value.substr(0, 5) == "data:" || url_value.substr(0, 4) == "cid:") {
-			return true;// Continue to next
-		}
+	if (url_value.starts_with("data:") || url_value.starts_with("cid:")) {
+		return true;// Continue to next
 	}
 
 	// Build absolute URL (already done by parser, but we have it in url_value)
diff --git a/src/libserver/html/html_url_rewrite_c.cxx b/src/libserver/html/html_url_rewrite_c.cxx
index f3e66672fe..5f06a459f1 100644
--- a/src/libserver/html/html_url_rewrite_c.cxx
+++ b/src/libserver/html/html_url_rewrite_c.cxx
@@ -21,6 +21,77 @@
 
 extern "C" {
 
+int rspamd_html_enumerate_urls(struct rspamd_task *task,
+							   void *html_content,
+							   int part_id,
+							   struct rspamd_html_url_candidate **candidates,
+							   gsize *n_candidates)
+{
+	if (!task || !html_content || !candidates || !n_candidates) {
+		return -1;
+	}
+
+	auto *hc = static_cast<rspamd::html::html_content *>(html_content);
+
+	// Enumerate candidates using C++ function
+	auto cpp_candidates = rspamd::html::enumerate_rewrite_candidates(hc, task, part_id);
+
+	if (cpp_candidates.empty()) {
+		*candidates = nullptr;
+		*n_candidates = 0;
+		return 0;
+	}
+
+	// Allocate C-style array from task pool
+	*n_candidates = cpp_candidates.size();
+	*candidates = (struct rspamd_html_url_candidate *) rspamd_mempool_alloc(
+		task->task_pool,
+		sizeof(struct rspamd_html_url_candidate) * cpp_candidates.size());
+
+	// Convert C++ candidates to C candidates
+	for (size_t i = 0; i < cpp_candidates.size(); i++) {
+		const auto &cpp_cand = cpp_candidates[i];
+
+		// Allocate strings from task pool
+		char *url_str = (char *) rspamd_mempool_alloc(
+			task->task_pool,
+			cpp_cand.absolute_url.size() + 1);
+		memcpy(url_str, cpp_cand.absolute_url.data(), cpp_cand.absolute_url.size());
+		url_str[cpp_cand.absolute_url.size()] = '\0';
+
+		char *attr_str =
(char *) rspamd_mempool_alloc( + task->task_pool, + cpp_cand.attr_name.size() + 1); + memcpy(attr_str, cpp_cand.attr_name.data(), cpp_cand.attr_name.size()); + attr_str[cpp_cand.attr_name.size()] = '\0'; + + // Get tag name + const char *tag_name = "unknown"; + gsize tag_len = 7; + if (cpp_cand.tag) { + // Use rspamd_html_tag_by_id which returns const char* + extern const char *rspamd_html_tag_by_id(int id); + tag_name = rspamd_html_tag_by_id(cpp_cand.tag->id); + if (tag_name) { + tag_len = strlen(tag_name); + } + else { + tag_name = "unknown"; + tag_len = 7; + } + } + + (*candidates)[i].url = url_str; + (*candidates)[i].url_len = cpp_cand.absolute_url.size(); + (*candidates)[i].attr = attr_str; + (*candidates)[i].attr_len = cpp_cand.attr_name.size(); + (*candidates)[i].tag = tag_name; + (*candidates)[i].tag_len = tag_len; + } + + return 0; +} + int rspamd_html_url_rewrite(struct rspamd_task *task, struct lua_State *L, void *html_content, diff --git a/src/libserver/html/html_url_rewrite_c.h b/src/libserver/html/html_url_rewrite_c.h index 798c8b3987..c0906e00a7 100644 --- a/src/libserver/html/html_url_rewrite_c.h +++ b/src/libserver/html/html_url_rewrite_c.h @@ -27,6 +27,33 @@ struct rspamd_task; struct lua_State; +/** + * URL candidate info for C interface + */ +struct rspamd_html_url_candidate { + const char *url; // Absolute URL string (NUL-terminated) + const char *attr;// Attribute name: "href" or "src" (NUL-terminated) + const char *tag; // Tag name (NUL-terminated) + gsize url_len; // Length of URL string + gsize attr_len; // Length of attr string + gsize tag_len; // Length of tag string +}; + +/** + * C wrapper for enumerating HTML URL rewrite candidates + * @param task Rspamd task + * @param html_content HTML content pointer (void* cast of html_content*) + * @param part_id MIME part ID + * @param candidates Output array of candidates (allocated from task pool if successful) + * @param n_candidates Output count of candidates + * @return 0 on success, -1 on error + */ +int rspamd_html_enumerate_urls(struct rspamd_task *task, + void *html_content, + int part_id, + struct rspamd_html_url_candidate **candidates, + gsize *n_candidates); + /** * C wrapper for HTML URL rewriting * @param task Rspamd task diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index a111ef9729..5f0295268c 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -1285,6 +1285,14 @@ LUA_FUNCTION_DEF(task, add_timer); */ LUA_FUNCTION_DEF(task, rewrite_html_urls); +/*** + * @method task:get_html_urls() + * Extracts all URLs from HTML parts without rewriting. + * Useful for async URL checking workflows where URLs need to be batched. 
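+ * An illustrative sketch of the intended extract -> check -> rewrite flow
+ * (`replacements` here stands in for whatever an async check produces):
+ * @example
+ * local urls_by_part = task:get_html_urls()
+ * if urls_by_part then
+ *   -- ...batch-check the URLs asynchronously, fill `replacements`...
+ *   task:rewrite_html_urls(function(task, url)
+ *     return replacements[url] -- nil leaves the URL unchanged
+ *   end)
+ * end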
+ * @return {table|nil} table indexed by part number, each containing an array of URL info tables with keys: url, attr, tag
+ */
+LUA_FUNCTION_DEF(task, get_html_urls);
+
 static const struct luaL_reg tasklib_f[] = {
 	LUA_INTERFACE_DEF(task, create),
 	LUA_INTERFACE_DEF(task, load_from_file),
@@ -1416,6 +1424,7 @@ static const struct luaL_reg tasklib_m[] = {
 	LUA_INTERFACE_DEF(task, topointer),
 	LUA_INTERFACE_DEF(task, add_timer),
 	LUA_INTERFACE_DEF(task, rewrite_html_urls),
+	LUA_INTERFACE_DEF(task, get_html_urls),
 	{"__tostring", rspamd_lua_class_tostring},
 	{NULL, NULL}};
 
@@ -7875,6 +7884,88 @@ lua_task_rewrite_html_urls(lua_State *L)
 	return 1;
 }
 
+static int
+lua_task_get_html_urls(lua_State *L)
+{
+	struct rspamd_task *task = lua_check_task(L, 1);
+
+	if (!task || !MESSAGE_FIELD_CHECK(task, text_parts)) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Create result table */
+	lua_newtable(L);
+	int results = 0;
+	unsigned int i;
+	void *part;
+
+	/* Iterate through text parts */
+	PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
+	{
+		struct rspamd_mime_text_part *text_part = (struct rspamd_mime_text_part *) part;
+
+		/* Only process HTML parts */
+		if (!IS_TEXT_PART_HTML(text_part) || !text_part->html) {
+			continue;
+		}
+
+		/* Skip if no UTF-8 content available */
+		if (!text_part->utf_raw_content || text_part->utf_raw_content->len == 0) {
+			continue;
+		}
+
+		struct rspamd_html_url_candidate *candidates = NULL;
+		gsize n_candidates = 0;
+
+		/* Enumerate URLs using C wrapper */
+		int ret = rspamd_html_enumerate_urls(
+			task,
+			text_part->html,
+			text_part->mime_part->part_number,
+			&candidates,
+			&n_candidates);
+
+		if (ret == 0 && candidates && n_candidates > 0) {
+			/* Create array for this part: table[part_number] = {url_info_1, url_info_2, ...} */
+			lua_pushinteger(L, text_part->mime_part->part_number);
+			lua_newtable(L); /* URLs array for this part */
+
+			for (gsize j = 0; j < n_candidates; j++) {
+				lua_pushinteger(L, j + 1); /* 1-indexed array */
+				lua_newtable(L); /* URL info table */
+
+				/* url field */
+				lua_pushstring(L, "url");
+				lua_pushstring(L, candidates[j].url);
+				lua_settable(L, -3);
+
+				/* attr field */
+				lua_pushstring(L, "attr");
+				lua_pushstring(L, candidates[j].attr);
+				lua_settable(L, -3);
+
+				/* tag field */
+				lua_pushstring(L, "tag");
+				lua_pushstring(L, candidates[j].tag);
+				lua_settable(L, -3);
+
+				lua_settable(L, -3); /* Add url info to URLs array */
+			}
+
+			lua_settable(L, -3); /* Add part to main table */
+			results++;
+		}
+	}
+
+	if (results == 0) {
+		lua_pop(L, 1);
+		lua_pushnil(L);
+	}
+
+	return 1;
+}
+
 /* Init part */
 
 static int
diff --git a/test/lua/unit/get_html_urls.lua b/test/lua/unit/get_html_urls.lua
new file mode 100644
index 0000000000..862fea4fcb
--- /dev/null
+++ b/test/lua/unit/get_html_urls.lua
@@ -0,0 +1,338 @@
+context("HTML URL extraction", function()
+  local rspamd_task = require("rspamd_task")
+  local logger = require("rspamd_logger")
+
+  test("Basic URL extraction from simple HTML", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html>
+<body>
+<a href="http://example.com/test">Click here</a>
+</body>
+</html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls = task:get_html_urls()
+
+    assert_not_nil(urls, "should extract URLs")
+
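+    -- Shape note: the returned table maps part_number -> array of URL info
+    -- tables, each with the keys url, attr ("href"/"src") and tag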
+    -- Check structure
+    local found_url = false
+    for part_id, url_list in pairs(urls) do
+      assert_true(type(url_list) == "table", "URL list should be a table")
+      for i, url_info in ipairs(url_list) do
+        assert_not_nil(url_info.url, "should have url field")
+        assert_not_nil(url_info.attr, "should have attr field")
+        assert_not_nil(url_info.tag, "should have tag field")
+
+        if url_info.url == "http://example.com/test" then
+          assert_equal(url_info.attr, "href", "should be href attribute")
+          assert_equal(url_info.tag, "a", "should be <a> tag")
+          found_url = true
+        end
+      end
+    end
+
+    assert_true(found_url, "should find the expected URL")
+
+    task:destroy()
+  end)
+
+  test("Multiple URLs in same HTML part", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html>
+<body>
+<a href="http://example.com/link1">Link 1</a>
+<a href="http://example.com/link2">Link 2</a>
+<img src="http://example.com/image.jpg">
+</body>
+</html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls = task:get_html_urls()
+
+    assert_not_nil(urls, "should extract URLs")
+
+    -- Count URLs
+    local url_count = 0
+    local found_urls = {}
+    for part_id, url_list in pairs(urls) do
+      for i, url_info in ipairs(url_list) do
+        url_count = url_count + 1
+        found_urls[url_info.url] = url_info
+      end
+    end
+
+    assert_equal(url_count, 3, "should have found 3 URLs")
+    assert_not_nil(found_urls["http://example.com/link1"], "should find link1")
+    assert_not_nil(found_urls["http://example.com/link2"], "should find link2")
+    assert_not_nil(found_urls["http://example.com/image.jpg"], "should find image")
+
+    -- Check attributes
+    assert_equal(found_urls["http://example.com/link1"].attr, "href")
+    assert_equal(found_urls["http://example.com/link1"].tag, "a")
+    assert_equal(found_urls["http://example.com/image.jpg"].attr, "src")
+    assert_equal(found_urls["http://example.com/image.jpg"].tag, "img")
+
+    task:destroy()
+  end)
+
+  test("Non-HTML parts return nil", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/plain
+
+This is plain text with http://example.com/test
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls = task:get_html_urls()
+
+    -- Should return nil for plain text
+    assert_nil(urls, "should return nil for non-HTML parts")
+
+    task:destroy()
+  end)
+
+  test("Empty HTML returns nil", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls = task:get_html_urls()
+
+    -- Should return nil for empty HTML
+    assert_nil(urls, "should return nil for empty HTML")
+
+    task:destroy()
+  end)
+
+  test("HTML without URLs returns nil", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html>
+<body>
+<p>
+Just some text without any links
+</p>
+</body>
+</html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls = task:get_html_urls()
+
+    -- Should return nil when no URLs found
+    assert_nil(urls, "should return nil when no URLs found")
+
+    task:destroy()
+  end)
+
+  test("Data URI scheme is skipped", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html>
+<body>
+<img src="data:image/png;base64,iVBORw0KGgo=">
+<a href="http://example.com/test">Real link</a>
+</body>
+</html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls = task:get_html_urls()
+
+    assert_not_nil(urls, "should extract non-data URLs")
+
+    -- Check that data: URIs are skipped
+    local found_data_uri = false
+    local found_http_url = false
+    for part_id, url_list in pairs(urls) do
+      for i, url_info in ipairs(url_list) do
+        if url_info.url:find("^data:", 1, false) then
+          found_data_uri = true
+        end
+        if url_info.url == "http://example.com/test" then
+          found_http_url = true
+        end
+      end
+    end
+
+    assert_false(found_data_uri, "data: URIs should be skipped")
+    assert_true(found_http_url, "should have found the http URL")
+
+    task:destroy()
+  end)
+
+  test("CID scheme is skipped", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html>
+<body>
+<img src="cid:image001@example.com">
+<a href="http://example.com/test">Real link</a>
+</body>
+</html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls = task:get_html_urls()
+
+    assert_not_nil(urls, "should extract non-cid URLs")
+
+    -- Check that cid: URIs are skipped
+    local found_cid_uri = false
+    local found_http_url = false
+    for part_id, url_list in pairs(urls) do
+      for i, url_info in ipairs(url_list) do
+        if url_info.url:find("^cid:", 1, false) then
+          found_cid_uri = true
+        end
+        if url_info.url == "http://example.com/test" then
+          found_http_url = true
+        end
+      end
+    end
+
+    assert_false(found_cid_uri, "cid: URIs should be skipped")
+    assert_true(found_http_url, "should have found the http URL")
+
+    task:destroy()
+  end)
+
+  test("Multipart message with multiple HTML parts", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: multipart/alternative; boundary="boundary123"
+
+--boundary123
+Content-Type: text/plain
+
+Plain text part
+
+--boundary123
+Content-Type: text/html
+
+<html><body><a href="http://example.com/part1">Part 1</a></body></html>
+
+--boundary123
+Content-Type: text/html
+
+<html><body><a href="http://example.com/part2">Part 2</a></body></html>
+
+--boundary123--
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls = task:get_html_urls()
+
+    assert_not_nil(urls, "should extract URLs from multipart HTML")
+
+    -- Should have processed at least one HTML part
+    local part_count = 0
+    local total_urls = 0
+    for part_id, url_list in pairs(urls) do
+      part_count = part_count + 1
+      total_urls = total_urls + #url_list
+    end
+
+    assert_true(part_count >= 1, "should have URLs from at least one HTML part")
+    assert_true(total_urls >= 1, "should have found at least one URL")
+
+    task:destroy()
+  end)
+
+  test("URL with special characters", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html>
+<body>
+<a href="http://example.com/path?param=value&other=test">Link</a>
+</body>
+</html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls = task:get_html_urls()
+
+    assert_not_nil(urls, "should handle URLs with special chars")
+
+    local found_url = false
+    for part_id, url_list in pairs(urls) do
+      for i, url_info in ipairs(url_list) do
+        if url_info.url:find("example.com/path", 1, true) then
+          found_url = true
+          -- URL should contain the query parameters
+          assert_true(url_info.url:find("param=value", 1, true) ~= nil,
+            "should preserve query parameters")
+        end
+      end
+    end
+
+    assert_true(found_url, "should have found the URL with special chars")
+
+    task:destroy()
+  end)
+
+end)
diff --git a/test/lua/unit/get_html_urls_async_example.lua b/test/lua/unit/get_html_urls_async_example.lua
new file mode 100644
index 0000000000..90d44b6191
--- /dev/null
+++ b/test/lua/unit/get_html_urls_async_example.lua
@@ -0,0 +1,273 @@
+--[[
+  Async HTML URL Rewriting Example
+
+  This example demonstrates how to use task:get_html_urls() with async
+  operations to batch-check URLs against an external service before
+  rewriting them.
+
+  Usage pattern:
+    1. Extract all URLs from HTML parts using task:get_html_urls()
+    2. Send all URLs to an external service via async HTTP/Redis/etc.
+    3. Receive URL replacements from the service
+    4. Apply rewrites using task:rewrite_html_urls() with a lookup table
+]]
+
+local rspamd_logger = require "rspamd_logger"
+
+-- Helper functions, defined before the closures below so they are captured
+-- as upvalues (a local defined later would not be in scope)
+local function table_keys(t)
+  local keys = {}
+  for k, _ in pairs(t) do
+    table.insert(keys, tostring(k))
+  end
+  return keys
+end
+
+local function count_rewrites(replacements)
+  local count = 0
+  for _, _ in pairs(replacements) do
+    count = count + 1
+  end
+  return count
+end
+
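+--[[
+  Minimal sketch first (illustrative only): the same two-phase contract with
+  a hard-coded `static_map` table standing in for whatever an external
+  service would return, showing extract -> decide -> rewrite in isolation.
+]]
+local function rewrite_from_static_map(task, static_map)
+  local urls_by_part = task:get_html_urls()
+
+  if not urls_by_part then
+    return nil -- no HTML URLs at all
+  end
+
+  -- Phase 1: collect extracted URLs and decide on replacements
+  local replacements = {}
+  for _, url_list in pairs(urls_by_part) do
+    for _, url_info in ipairs(url_list) do
+      if static_map[url_info.url] then
+        replacements[url_info.url] = static_map[url_info.url]
+      end
+    end
+  end
+
+  -- Phase 2: apply rewrites; returning nil leaves a URL unchanged
+  return task:rewrite_html_urls(function(_, url)
+    return replacements[url]
+  end)
+end
+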
+-- Example rule implementation
+local function register_async_url_rewriter(rspamd_config)
+  rspamd_config:register_symbol({
+    name = 'ASYNC_URL_REWRITER',
+    type = 'postfilter',
+    callback = function(task)
+      -- Step 1: Extract all URLs from HTML parts
+      local urls_by_part = task:get_html_urls()
+
+      if not urls_by_part then
+        return -- No HTML URLs to process
+      end
+
+      -- Flatten URLs for a batched API request
+      local all_urls = {}
+      local url_to_info = {}
+
+      for part_id, url_list in pairs(urls_by_part) do
+        for _, url_info in ipairs(url_list) do
+          table.insert(all_urls, url_info.url)
+          url_to_info[url_info.url] = url_info
+        end
+      end
+
+      if #all_urls == 0 then
+        return
+      end
+
+      rspamd_logger.infox(task, "Found %s HTML URLs to check", #all_urls)
+
+      -- Step 2: Make async request to URL checking service
+      local http = require "rspamd_http"
+      local ucl = require "ucl"
+
+      http.request({
+        task = task,
+        url = 'http://url-checker.example.com/api/check-batch',
+        callback = function(err, code, body)
+          if err then
+            rspamd_logger.errx(task, 'URL check failed: %s', err)
+            return
+          end
+
+          if code ~= 200 then
+            rspamd_logger.errx(task, 'URL check service returned HTTP %s', code)
+            return
+          end
+
+          -- Step 3: Parse response containing URL replacements
+          local parser = ucl.parser()
+          local ok, parse_err = parser:parse_string(body)
+
+          if not ok then
+            rspamd_logger.errx(task, 'Failed to parse response: %s', parse_err)
+            return
+          end
+
+          local response = parser:get_object()
+
+          -- Build replacement map: original_url -> new_url
+          local replacements = {}
+
+          for original_url, result in pairs(response.urls or {}) do
+            if result.action == 'rewrite' and result.new_url then
+              replacements[original_url] = result.new_url
+              rspamd_logger.infox(task, "Will rewrite %s -> %s",
+                original_url, result.new_url)
+            elseif result.action == 'block' then
+              -- Redirect blocked URLs to a warning page
+              replacements[original_url] = 'https://warning.example.com/blocked'
+              rspamd_logger.infox(task, "Blocking URL %s", original_url)
+
+              -- Optionally set a symbol
+              task:insert_result('BLOCKED_URL', 1.0, original_url)
+            end
+          end
+
+          -- Step 4: Apply rewrites using a lookup table callback
+          if next(replacements) then
+            local rewritten = task:rewrite_html_urls(function(task, url)
+              -- Simple lookup - returns nil if the URL shouldn't be rewritten
+              return replacements[url]
+            end)
+
+            if rewritten then
+              rspamd_logger.infox(task, 'Rewritten URLs in parts: %s',
+                table.concat(table_keys(rewritten), ', '))
+
+              -- Optionally set a symbol to track rewrites
+              task:insert_result('URL_REWRITTEN', 1.0,
+                string.format('%d URLs', count_rewrites(replacements)))
+            end
+          end
+        end,
+
+        -- Request configuration
+        headers = {
+          ['Content-Type'] = 'application/json',
+          ['Authorization'] = 'Bearer YOUR_API_TOKEN'
+        },
+        body = ucl.to_format({
+          urls = all_urls,
+          -- Include additional context if needed
+          message_id = task:get_message_id(),
+          from = (task:get_from('smtp') or {})[1]
+        }, 'json'),
+        timeout = 5.0
+      })
+    end,
+    priority = 10 -- Postfilter priority
+  })
+end
+
+--[[
+  Alternative: Using Redis for caching URL check results
+]]
+
+local function register_redis_cached_url_rewriter(rspamd_config)
+  rspamd_config:register_symbol({
+    name = 'REDIS_CACHED_URL_REWRITER',
+    type = 'postfilter',
+    callback = function(task)
+      local redis = require "rspamd_redis"
+      local urls_by_part = task:get_html_urls()
+
+      if not urls_by_part then
+        return
+      end
+
+      -- Collect all URLs
+      local all_urls = {}
+      for part_id, url_list in pairs(urls_by_part) do
+        for _, url_info in ipairs(url_list) do
+          table.insert(all_urls, url_info.url)
+        end
+      end
+
+      if #all_urls == 0 then
+        return
+      end
+
+      -- Build Redis MGET command to check all URLs at once
+      local redis_keys = {}
+      for _, url in ipairs(all_urls) do
+        table.insert(redis_keys, 'url:rewrite:' .. url)
+      end
+
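+      -- Assumed key schema (illustrative): some external process has stored
+      -- replacement URLs under 'url:rewrite:<original-url>'. MGET returns one
+      -- entry per requested key, in order; the callback below treats empty or
+      -- missing entries as "no rewrite".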
+      redis.make_request({
+        task = task,
+        cmd = 'MGET',
+        args = redis_keys,
+        callback = function(err, data)
+          if err then
+            rspamd_logger.errx(task, 'Redis error: %s', err)
+            return
+          end
+
+          -- Build replacement map from Redis results
+          local replacements = {}
+          for i, url in ipairs(all_urls) do
+            if data[i] and data[i] ~= '' then
+              replacements[url] = data[i]
+            end
+          end
+
+          -- Apply rewrites
+          if next(replacements) then
+            local rewritten = task:rewrite_html_urls(function(task, url)
+              return replacements[url]
+            end)
+
+            if rewritten then
+              rspamd_logger.infox(task, 'Applied %d URL rewrites from Redis',
+                count_rewrites(replacements))
+            end
+          end
+        end
+      })
+    end
+  })
+end
+
+--[[
+  Simpler example: Rewrite specific domains without an external service
+]]
+
+local function register_simple_domain_rewriter(rspamd_config)
+  -- Mapping of domains to redirect targets
+  local domain_redirects = {
+    ['evil.com'] = 'https://warning.example.com/blocked?domain=evil.com',
+    ['phishing.net'] = 'https://warning.example.com/blocked?domain=phishing.net',
+  }
+
+  rspamd_config:register_symbol({
+    name = 'SIMPLE_DOMAIN_REWRITER',
+    type = 'postfilter',
+    callback = function(task)
+      local urls_by_part = task:get_html_urls()
+
+      if not urls_by_part then
+        return
+      end
+
+      -- Check if any URLs match blocked domains
+      local needs_rewrite = false
+      for part_id, url_list in pairs(urls_by_part) do
+        for _, url_info in ipairs(url_list) do
+          for blocked_domain, _ in pairs(domain_redirects) do
+            if url_info.url:find(blocked_domain, 1, true) then
+              needs_rewrite = true
+              break
+            end
+          end
+        end
+      end
+
+      if not needs_rewrite then
+        return
+      end
+
+      -- Apply rewrites
+      local rewritten = task:rewrite_html_urls(function(task, url)
+        for blocked_domain, redirect_url in pairs(domain_redirects) do
+          if url:find(blocked_domain, 1, true) then
+            return redirect_url
+          end
+        end
+        return nil -- Don't rewrite
+      end)
+
+      if rewritten then
+        task:insert_result('DOMAIN_REWRITTEN', 1.0)
+      end
+    end
+  })
+end
+
+return {
+  register_async_url_rewriter = register_async_url_rewriter,
+  register_redis_cached_url_rewriter = register_redis_cached_url_rewriter,
+  register_simple_domain_rewriter = register_simple_domain_rewriter,
+}
diff --git a/test/lua/unit/url_rewrite.lua b/test/lua/unit/url_rewrite.lua
index bada63a92d..099f9d41ea 100644
--- a/test/lua/unit/url_rewrite.lua
+++ b/test/lua/unit/url_rewrite.lua
@@ -460,4 +460,41 @@ Content-Type: text/html
 
     task:destroy()
   end)
 
+  test("Edge case: bare cid: and data: schemes", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html>
+<body>
+<img src="cid:">
+<img src="data:">
+<a href="http://example.com/test">Real link</a>
+</body>
+</html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local urls_seen = {}
+    local function rewrite_callback(task, url)
+      table.insert(urls_seen, url)
+      return "http://safe.com/redirect"
+    end
+
+    local result = task:rewrite_html_urls(rewrite_callback)
+
+    assert_not_nil(result, "should rewrite non-special scheme URLs")
+
+    -- Should only see the http URL, not bare cid: or data:
+    assert_equal(#urls_seen, 1, "should see exactly 1 URL (the http one)")
+    assert_equal(urls_seen[1], "http://example.com/test", "should see the http URL")
+
+    task:destroy()
+  end)
+
 end)
-- 
2.47.3