git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Rework] Prioritize CTA URLs in redirector and Lua helpers
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 6 Nov 2025 18:22:08 +0000 (18:22 +0000)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 6 Nov 2025 18:22:08 +0000 (18:22 +0000)
.overcommit.yml
lualib/lua_util.lua
src/lua/lua_mimepart.c
src/plugins/lua/url_redirector.lua

diff --git a/.overcommit.yml b/.overcommit.yml
index 9212c33b32944e44fa636af802bb8fe0414767aa..0c62a7d27798c2e693fe87ee237e719e1b6f149f 100644 (file)
@@ -15,6 +15,8 @@
 #
 # Uncomment the following lines to make the configuration take effect.
 
+concurrency: 1
+
 PreCommit:
   TrailingWhitespace:
     enabled: true
diff --git a/lualib/lua_util.lua b/lualib/lua_util.lua
index 88127f7e016d80330560e630da4fe10a1ce62f1e..4851e566fb8cd3b2342c5da0a151b4ccef5d6423 100644 (file)
@@ -883,6 +883,44 @@ exports.filter_specific_urls = function(urls, params)
   local res = {}
   local nres = 0
 
+  local cta_priority_map
+
+  if params.task and params.task.get_text_parts then
+    local text_parts = params.task:get_text_parts()
+    if text_parts then
+      cta_priority_map = {}
+      for _, part in ipairs(text_parts) do
+        if part.is_html and part:is_html() and part.get_cta_urls then
+          local entries = part:get_cta_urls({original = true, with_weights = true})
+          if type(entries) == 'table' then
+            for _, entry in ipairs(entries) do
+              if entry and entry.url then
+                local url = entry.url
+                local str = tostring(url)
+                local weight = entry.weight or 0
+                local score = 6 + math.floor(weight * 10 + 0.5)
+                if not cta_priority_map[str] or score > cta_priority_map[str] then
+                  cta_priority_map[str] = score
+                end
+                local redir = url:get_redirected()
+                if redir then
+                  local rstr = tostring(redir)
+                  if not cta_priority_map[rstr] or score > cta_priority_map[rstr] then
+                    cta_priority_map[rstr] = score
+                  end
+                end
+              end
+            end
+          end
+        end
+      end
+
+      if next(cta_priority_map) == nil then
+        cta_priority_map = nil
+      end
+    end
+  end
+
   local function insert_url(str, u)
     if not res[str] then
       res[str] = u
@@ -927,6 +965,20 @@ exports.filter_specific_urls = function(urls, params)
     local esld = u:get_tld()
     local str_hash = tostring(u)
 
+    if cta_priority_map then
+      local cta_pr = cta_priority_map[str_hash]
+      if not cta_pr and flags.redirected then
+        local redir_url = u:get_redirected()
+        if redir_url then
+          cta_pr = cta_priority_map[tostring(redir_url)]
+        end
+      end
+
+      if cta_pr then
+        priority = math.max(priority, cta_pr)
+      end
+    end
+
     if esld then
       -- Special cases
       if (u:get_protocol() ~= 'mailto') and (not flags.html_displayed) then
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index 21b3f6bbe7585dceb92fc6ecb6d3824d411dea46..97b1349d0cae3df35260fca8f7ce79407ac73114 100644 (file)
@@ -1441,14 +1441,51 @@ lua_textpart_get_cta_urls(lua_State *L)
        struct rspamd_mime_text_part *part = lua_check_textpart(L);
        unsigned int max_urls = 0;
        unsigned int nret = 0;
+       gboolean return_original = FALSE;
+       gboolean include_weights = FALSE;
 
        if (part == NULL) {
                return luaL_error(L, "invalid arguments");
        }
 
-       /* Get optional max_urls parameter */
-       if (lua_gettop(L) >= 2 && lua_isnumber(L, 2)) {
-               max_urls = lua_tointeger(L, 2);
+       int top = lua_gettop(L);
+
+       if (top >= 2) {
+               if (lua_istable(L, 2)) {
+                       lua_getfield(L, 2, "max");
+                       if (lua_isnumber(L, -1)) {
+                               max_urls = lua_tointeger(L, -1);
+                       }
+                       lua_pop(L, 1);
+                       lua_getfield(L, 2, "original");
+                       if (lua_isboolean(L, -1)) {
+                               return_original = lua_toboolean(L, -1);
+                       }
+                       lua_pop(L, 1);
+                       lua_getfield(L, 2, "with_weights");
+                       if (lua_isboolean(L, -1)) {
+                               include_weights = lua_toboolean(L, -1);
+                       }
+                       lua_pop(L, 1);
+               }
+               else if (lua_isnumber(L, 2)) {
+                       max_urls = lua_tointeger(L, 2);
+                       if (top >= 3 && lua_isboolean(L, 3)) {
+                               return_original = lua_toboolean(L, 3);
+                       }
+                       if (top >= 4 && lua_isboolean(L, 4)) {
+                               include_weights = lua_toboolean(L, 4);
+                       }
+               }
+               else if (lua_isboolean(L, 2)) {
+                       return_original = lua_toboolean(L, 2);
+                       if (top >= 3 && lua_isnumber(L, 3)) {
+                               max_urls = lua_tointeger(L, 3);
+                       }
+                       if (top >= 4 && lua_isboolean(L, 4)) {
+                               include_weights = lua_toboolean(L, 4);
+                       }
+               }
        }
 
        /* Check if this HTML part has CTA URLs */
@@ -1462,21 +1499,48 @@ lua_textpart_get_cta_urls(lua_State *L)
 
        /* Heap is already top-K, but in min-heap order - need to reverse for descending */
        unsigned int result_size = max_urls > 0 ? MIN(max_urls, heap->n) : heap->n;
-       lua_createtable(L, result_size, 0);
+       lua_createtable(L, result_size, include_weights ? 0 : 0);
+
+       GHashTable *seen = g_hash_table_new(g_direct_hash, g_direct_equal);
 
        /* Iterate heap from end to start for descending order */
        for (int i = (int) heap->n - 1; i >= 0 && nret < result_size; i--) {
                struct rspamd_html_cta_entry *entry = &heap->a[i];
                if (entry && entry->url) {
-                       struct rspamd_lua_url *lua_url;
+                       struct rspamd_url *chosen = entry->url;
+
+                       if (!return_original && chosen->ext && chosen->ext->linked_url &&
+                               chosen->ext->linked_url != chosen) {
+                               chosen = chosen->ext->linked_url;
+                       }
+
+                       if (g_hash_table_lookup(seen, chosen)) {
+                               continue;
+                       }
 
-                       lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
-                       rspamd_lua_setclass(L, rspamd_url_classname, -1);
-                       lua_url->url = entry->url;
-                       lua_rawseti(L, -2, ++nret);
+                       g_hash_table_insert(seen, chosen, chosen);
+
+                       if (include_weights) {
+                               lua_createtable(L, 0, 2);
+                               struct rspamd_lua_url *lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
+                               rspamd_lua_setclass(L, rspamd_url_classname, -1);
+                               lua_url->url = chosen;
+                               lua_setfield(L, -2, "url");
+                               lua_pushnumber(L, entry->weight);
+                               lua_setfield(L, -2, "weight");
+                               lua_rawseti(L, -2, ++nret);
+                       }
+                       else {
+                               struct rspamd_lua_url *lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
+                               rspamd_lua_setclass(L, rspamd_url_classname, -1);
+                               lua_url->url = chosen;
+                               lua_rawseti(L, -2, ++nret);
+                       }
                }
        }
 
+       g_hash_table_unref(seen);
+
        return 1;
 }
 
diff --git a/src/plugins/lua/url_redirector.lua b/src/plugins/lua/url_redirector.lua
index c1fa85cae7d048d9530f394896462d10cc191beb..1c61c1de4143d47dc68c105c231d3f59367cda52 100644 (file)
@@ -344,25 +344,76 @@ local function url_redirector_process_url(task, url)
 end
 
 local function url_redirector_handler(task)
-  local sp_urls = lua_util.extract_specific_urls({
-    task = task,
-    limit = settings.max_urls,
-    filter = function(url)
-      local host = url:get_host()
-      if settings.redirector_hosts_map:get_key(host) then
-        lua_util.debugm(N, task, 'check url %s', tostring(url))
-        return true
+  local selected = {}
+  local seen = {}
+
+  local text_parts = task:get_text_parts()
+  if text_parts then
+    for _, part in ipairs(text_parts) do
+      if part:is_html() and part.get_cta_urls then
+        local cta_urls = part:get_cta_urls(settings.max_urls, true)
+        if cta_urls then
+          for _, url in ipairs(cta_urls) do
+            local host = url:get_host()
+            if host and settings.redirector_hosts_map:get_key(host) then
+              local key = tostring(url)
+              if not seen[key] then
+                lua_util.debugm(N, task, 'prefer CTA url %s for redirector', key)
+                table.insert(selected, url)
+                seen[key] = true
+                if #selected >= settings.max_urls then
+                  break
+                end
+              end
+            end
+          end
+        end
+      end
+
+      if #selected >= settings.max_urls then
+        break
+      end
+    end
+  end
+
+  local remaining = settings.max_urls - #selected
+
+  if remaining > 0 then
+    local sp_urls = lua_util.extract_specific_urls({
+      task = task,
+      limit = remaining,
+      filter = function(url)
+        local host = url:get_host()
+        if host and settings.redirector_hosts_map:get_key(host) then
+          local key = tostring(url)
+          if not seen[key] then
+            lua_util.debugm(N, task, 'consider redirector url %s', key)
+            return true
+          end
+        end
+        return false
+      end,
+      no_cache = true,
+      need_content = true,
+    })
+
+    if sp_urls then
+      for _, u in ipairs(sp_urls) do
+        local key = tostring(u)
+        if not seen[key] then
+          table.insert(selected, u)
+          seen[key] = true
+          if #selected >= settings.max_urls then
+            break
+          end
+        end
       end
-    end,
-    no_cache = true,
-    need_content = true,
-  })
-
-  if sp_urls then
-    for _, u in ipairs(sp_urls) do
-      url_redirector_process_url(task, u)
     end
   end
+
+  for _, u in ipairs(selected) do
+    url_redirector_process_url(task, u)
+  end
 end
 
 local opts = rspamd_config:get_all_opt('url_redirector')