git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Take button weight into consideration
author Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 8 Sep 2025 17:05:12 +0000 (18:05 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 8 Sep 2025 17:05:12 +0000 (18:05 +0100)
lualib/lua_cta.lua
src/libmime/message.c
src/libserver/html/html.cxx
src/libserver/html/html.h
src/libserver/html/html.hxx

index 7bc5dcd486be53df747941129252a501d05b7f65..fb96d54227b467f77531d3cd5d62c1d2d99dd953 100644 (file)
@@ -143,9 +143,11 @@ M.process_html_links = function(task, part, ctx)
     trackerish_ratio = (#cands > 0) and (trackerish / #cands) or 0,
   }
 
-  -- Simple CTA guess: first candidate with display_mismatch or earliest order
+  -- Simple CTA guess: prefer higher C-side weight, then display_mismatch, then earliest order
   if #cands > 0 then
     table.sort(cands, function(a, b)
+      local aw, bw = tonumber(a.weight) or 0, tonumber(b.weight) or 0
+      if aw ~= bw then return aw > bw end
       if a.display_mismatch ~= b.display_mismatch then return a.display_mismatch end
       if a.order ~= b.order then return a.order < b.order end
       return a.part_order < b.part_order
index b59215270444c321a296bf20a48bc764821b6d93..4721a65c59a3cbf210328a57fe97004793580730 100644 (file)
@@ -898,6 +898,10 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
                                        lua_pushstring(L, "part_order");
                                        lua_pushinteger(L, (lua_Integer) u->part_order);
                                        lua_settable(L, -3);
+                                       /* cta weight from C heuristics */
+                                       lua_pushstring(L, "weight");
+                                       lua_pushnumber(L, (lua_Number) rspamd_html_url_button_weight(text_part->html, u));
+                                       lua_settable(L, -3);
                                        /* etld1 computed in Lua if needed */
                                        lua_rawseti(L, -2, ++nadded);
                                }
@@ -925,6 +929,23 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
                                        rspamd_mempool_set_variable(task->task_pool, "html_cta_weight", &w, NULL);
                                }
                                lua_pop(L, 1);
+                               /* If no weight set by Lua, derive from C heuristic */
+                               if (!rspamd_mempool_get_variable(task->task_pool, "html_cta_weight") &&
+                                       text_part->html && text_part->mime_part && text_part->mime_part->urls) {
+                                       double best_w = 0.0;
+                                       unsigned int ui;
+                                       for (ui = 0; ui < text_part->mime_part->urls->len; ui++) {
+                                               struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, ui);
+                                               if (!u) continue;
+                                               if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
+                                               if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
+                                               float cw = rspamd_html_url_button_weight(text_part->html, u);
+                                               if (cw > best_w) best_w = cw;
+                                       }
+                                       if (best_w > 0.0) {
+                                               rspamd_mempool_set_variable(task->task_pool, "html_cta_weight", &best_w, NULL);
+                                       }
+                               }
                                lua_pushstring(L, "affiliated_ratio");
                                lua_gettable(L, -2);
                                if (lua_isnumber(L, -1)) {
index d231260860b072a2988ff35483f026cfa6238c76..8ef1520db293eb1cd4db8be80f926979e7523b13 100644 (file)
@@ -2273,6 +2273,26 @@ auto html_process_input(struct rspamd_task *task,
                                        if (cnt > hc->features.links.max_links_single_domain) {
                                                hc->features.links.max_links_single_domain = cnt;
                                        }
+                                       /* Heuristic button weight */
+                                       float w = 0.0f;
+                                       if (url->ext && url->ext->linked_url && url->ext->linked_url != url) {
+                                               w += 0.5f; /* display mismatch bonus */
+                                       }
+                                       w += 0.2f * (url->order == 0 ? 1.0f : 1.0f / (float) url->order);
+                                       if (cur_tag->block && cur_tag->block->is_visible()) {
+                                               if (cur_tag->block->has_display()) {
+                                                       w += 0.1f;
+                                               }
+                                               if (cur_tag->block->width > 0 && cur_tag->block->height > 0) {
+                                                       w += std::min(0.2f, (cur_tag->block->width * cur_tag->block->height) / 100000.0f);
+                                               }
+                                               if (cur_tag->block->font_size >= 14) {
+                                                       w += 0.1f;
+                                               }
+                                       }
+                                       if (w > 0) {
+                                               hc->url_button_weights[url] += w;
+                                       }
                                        /* same eTLD+1 as first-party? */
                                        if (!hc->first_party_etld1.empty()) {
                                                rspamd_ftok_t tld2;
@@ -3165,6 +3185,18 @@ rspamd_html_tag_by_id(int id)
        return nullptr;
 }
 
+float rspamd_html_url_button_weight(void *html_content, struct rspamd_url *u)
+{
+       if (html_content == NULL || u == NULL) return 0.0f;
+       auto *hc = rspamd::html::html_content::from_ptr(html_content);
+       auto it = hc->url_button_weights.find(u);
+       if (it != hc->url_button_weights.end()) {
+               return it->second;
+       }
+
+       return 0.0f;
+}
+
 const struct rspamd_html_features *
 rspamd_html_get_features(void *html_content)
 {
index c0fa2b9c341b263955f759149c015de6b12cd25d..368a22b08c97c32aa2080677a39998ae7791cbff 100644 (file)
@@ -130,6 +130,11 @@ bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest);
  */
 gsize rspamd_html_get_tags_count(void *html_content);
 
+/**
+ * Returns the heuristic button weight stored for URL `u` in this HTML content, or 0 if either argument is NULL or no weight is stored
+ */
+float rspamd_html_url_button_weight(void *html_content, struct rspamd_url *u);
+
 /**
  * Returns an immutable pointer to aggregated html features
  */
index 509697264f5a026974ecb9f32c9f84e8a9741c29..3e295ea60d7013ab417e958ab80accb9d0aba102 100644 (file)
@@ -56,6 +56,8 @@ struct html_content {
        struct rspamd_html_features features;
        /* Helper: per-domain link counts */
        ankerl::unordered_dense::map<std::string, unsigned int> link_domain_counts;
+       /* Heuristic weights for button-like links */
+       ankerl::unordered_dense::map<struct rspamd_url *, float> url_button_weights;
        /* First-party eTLD+1 derived from message (e.g. From:) */
        std::string first_party_etld1;