From: Vsevolod Stakhov Date: Thu, 6 Nov 2025 18:22:08 +0000 (+0000) Subject: [Rework] Prioritize CTA URLs in redirector and Lua helpers X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=d7d214dc2ad92af850a41f832b330e176e4ada7f;p=thirdparty%2Frspamd.git [Rework] Prioritize CTA URLs in redirector and Lua helpers --- diff --git a/.overcommit.yml b/.overcommit.yml index 9212c33b32..0c62a7d277 100644 --- a/.overcommit.yml +++ b/.overcommit.yml @@ -15,6 +15,8 @@ # # Uncomment the following lines to make the configuration take effect. +concurrency: 1 + PreCommit: TrailingWhitespace: enabled: true diff --git a/lualib/lua_util.lua b/lualib/lua_util.lua index 88127f7e01..4851e566fb 100644 --- a/lualib/lua_util.lua +++ b/lualib/lua_util.lua @@ -883,6 +883,44 @@ exports.filter_specific_urls = function(urls, params) local res = {} local nres = 0 + local cta_priority_map + + if params.task and params.task.get_text_parts then + local text_parts = params.task:get_text_parts() + if text_parts then + cta_priority_map = {} + for _, part in ipairs(text_parts) do + if part.is_html and part:is_html() and part.get_cta_urls then + local entries = part:get_cta_urls({original = true, with_weights = true}) + if type(entries) == 'table' then + for _, entry in ipairs(entries) do + if entry and entry.url then + local url = entry.url + local str = tostring(url) + local weight = entry.weight or 0 + local score = 6 + math.floor(weight * 10 + 0.5) + if not cta_priority_map[str] or score > cta_priority_map[str] then + cta_priority_map[str] = score + end + local redir = url:get_redirected() + if redir then + local rstr = tostring(redir) + if not cta_priority_map[rstr] or score > cta_priority_map[rstr] then + cta_priority_map[rstr] = score + end + end + end + end + end + end + end + + if next(cta_priority_map) == nil then + cta_priority_map = nil + end + end + end + local function insert_url(str, u) if not res[str] then res[str] = u @@ -927,6 +965,20 @@ exports.filter_specific_urls = function(urls, params) local esld = u:get_tld() local str_hash = tostring(u) + if cta_priority_map then + local cta_pr = cta_priority_map[str_hash] + if not cta_pr and flags.redirected then + local redir_url = u:get_redirected() + if redir_url then + cta_pr = cta_priority_map[tostring(redir_url)] + end + end + + if cta_pr then + priority = math.max(priority, cta_pr) + end + end + if esld then -- Special cases if (u:get_protocol() ~= 'mailto') and (not flags.html_displayed) then diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index 21b3f6bbe7..97b1349d0c 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -1441,14 +1441,51 @@ lua_textpart_get_cta_urls(lua_State *L) struct rspamd_mime_text_part *part = lua_check_textpart(L); unsigned int max_urls = 0; unsigned int nret = 0; + gboolean return_original = FALSE; + gboolean include_weights = FALSE; if (part == NULL) { return luaL_error(L, "invalid arguments"); } - /* Get optional max_urls parameter */ - if (lua_gettop(L) >= 2 && lua_isnumber(L, 2)) { - max_urls = lua_tointeger(L, 2); + int top = lua_gettop(L); + + if (top >= 2) { + if (lua_istable(L, 2)) { + lua_getfield(L, 2, "max"); + if (lua_isnumber(L, -1)) { + max_urls = lua_tointeger(L, -1); + } + lua_pop(L, 1); + lua_getfield(L, 2, "original"); + if (lua_isboolean(L, -1)) { + return_original = lua_toboolean(L, -1); + } + lua_pop(L, 1); + lua_getfield(L, 2, "with_weights"); + if (lua_isboolean(L, -1)) { + include_weights = lua_toboolean(L, -1); + } + lua_pop(L, 1); + } + else if (lua_isnumber(L, 2)) { + max_urls = lua_tointeger(L, 2); + if (top >= 3 && lua_isboolean(L, 3)) { + return_original = lua_toboolean(L, 3); + } + if (top >= 4 && lua_isboolean(L, 4)) { + include_weights = lua_toboolean(L, 4); + } + } + else if (lua_isboolean(L, 2)) { + return_original = lua_toboolean(L, 2); + if (top >= 3 && lua_isnumber(L, 3)) { + max_urls = lua_tointeger(L, 3); + } + if (top >= 4 && lua_isboolean(L, 4)) { + include_weights = lua_toboolean(L, 4); + } + } } /* Check if this HTML part has CTA URLs */ @@ -1462,21 +1499,48 @@ lua_textpart_get_cta_urls(lua_State *L) /* Heap is already top-K, but in min-heap order - need to reverse for descending */ unsigned int result_size = max_urls > 0 ? MIN(max_urls, heap->n) : heap->n; - lua_createtable(L, result_size, 0); + lua_createtable(L, result_size, include_weights ? 0 : 0); + + GHashTable *seen = g_hash_table_new(g_direct_hash, g_direct_equal); /* Iterate heap from end to start for descending order */ for (int i = (int) heap->n - 1; i >= 0 && nret < result_size; i--) { struct rspamd_html_cta_entry *entry = &heap->a[i]; if (entry && entry->url) { - struct rspamd_lua_url *lua_url; + struct rspamd_url *chosen = entry->url; + + if (!return_original && chosen->ext && chosen->ext->linked_url && + chosen->ext->linked_url != chosen) { + chosen = chosen->ext->linked_url; + } + + if (g_hash_table_lookup(seen, chosen)) { + continue; + } - lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url)); - rspamd_lua_setclass(L, rspamd_url_classname, -1); - lua_url->url = entry->url; - lua_rawseti(L, -2, ++nret); + g_hash_table_insert(seen, chosen, chosen); + + if (include_weights) { + lua_createtable(L, 0, 2); + struct rspamd_lua_url *lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url)); + rspamd_lua_setclass(L, rspamd_url_classname, -1); + lua_url->url = chosen; + lua_setfield(L, -2, "url"); + lua_pushnumber(L, entry->weight); + lua_setfield(L, -2, "weight"); + lua_rawseti(L, -2, ++nret); + } + else { + struct rspamd_lua_url *lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url)); + rspamd_lua_setclass(L, rspamd_url_classname, -1); + lua_url->url = chosen; + lua_rawseti(L, -2, ++nret); + } } } + g_hash_table_unref(seen); + return 1; } diff --git a/src/plugins/lua/url_redirector.lua b/src/plugins/lua/url_redirector.lua index c1fa85cae7..1c61c1de41 100644 --- a/src/plugins/lua/url_redirector.lua +++ b/src/plugins/lua/url_redirector.lua @@ -344,25 +344,76 @@ local function url_redirector_process_url(task, url) end local function url_redirector_handler(task) - local sp_urls = lua_util.extract_specific_urls({ - task = task, - limit = settings.max_urls, - filter = function(url) - local host = url:get_host() - if settings.redirector_hosts_map:get_key(host) then - lua_util.debugm(N, task, 'check url %s', tostring(url)) - return true + local selected = {} + local seen = {} + + local text_parts = task:get_text_parts() + if text_parts then + for _, part in ipairs(text_parts) do + if part:is_html() and part.get_cta_urls then + local cta_urls = part:get_cta_urls(settings.max_urls, true) + if cta_urls then + for _, url in ipairs(cta_urls) do + local host = url:get_host() + if host and settings.redirector_hosts_map:get_key(host) then + local key = tostring(url) + if not seen[key] then + lua_util.debugm(N, task, 'prefer CTA url %s for redirector', key) + table.insert(selected, url) + seen[key] = true + if #selected >= settings.max_urls then + break + end + end + end + end + end + end + + if #selected >= settings.max_urls then + break + end + end + end + + local remaining = settings.max_urls - #selected + + if remaining > 0 then + local sp_urls = lua_util.extract_specific_urls({ + task = task, + limit = remaining, + filter = function(url) + local host = url:get_host() + if host and settings.redirector_hosts_map:get_key(host) then + local key = tostring(url) + if not seen[key] then + lua_util.debugm(N, task, 'consider redirector url %s', key) + return true + end + end + return false + end, + no_cache = true, + need_content = true, + }) + + if sp_urls then + for _, u in ipairs(sp_urls) do + local key = tostring(u) + if not seen[key] then + table.insert(selected, u) + seen[key] = true + if #selected >= settings.max_urls then + break + end + end end - end, - no_cache = true, - need_content = true, - }) - - if sp_urls then - for _, u in ipairs(sp_urls) do - url_redirector_process_url(task, u) end end + + for _, u in ipairs(selected) do + url_redirector_process_url(task, u) + end end local opts = rspamd_config:get_all_opt('url_redirector')