From 848ea6502bab0e4c3685720e0fe3ad5e60a5d894 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 8 Sep 2025 18:05:12 +0100
Subject: [PATCH] [Project] Take button weight into consideration

Compute a heuristic per-URL "button weight" while HTML links are processed
and expose it through rspamd_html_url_button_weight(). The weight is pushed
to Lua as the `weight` field next to `part_order`, lua_cta uses it as the
first sort key for CTA candidates, and message.c falls back to the best C
weight for the `html_cta_weight` pool variable when that variable is unset.

The fallback value is allocated from the task pool: the pool variable keeps
the pointer it is given, so it must not point to a stack local.

---
 lualib/lua_cta.lua          |  4 +++-
 src/libmime/message.c       | 24 ++++++++++++++++++++++++
 src/libserver/html/html.cxx | 32 ++++++++++++++++++++++++++++++++
 src/libserver/html/html.h   |  5 +++++
 src/libserver/html/html.hxx |  2 ++
 5 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/lualib/lua_cta.lua b/lualib/lua_cta.lua
index 7bc5dcd486..fb96d54227 100644
--- a/lualib/lua_cta.lua
+++ b/lualib/lua_cta.lua
@@ -143,9 +143,11 @@ M.process_html_links = function(task, part, ctx)
     trackerish_ratio = (#cands > 0) and (trackerish / #cands) or 0,
   }
 
-  -- Simple CTA guess: first candidate with display_mismatch or earliest order
+  -- Simple CTA guess: prefer higher C-side weight, then display_mismatch, then earliest order
   if #cands > 0 then
     table.sort(cands, function(a, b)
+      local aw, bw = tonumber(a.weight) or 0, tonumber(b.weight) or 0
+      if aw ~= bw then return aw > bw end
       if a.display_mismatch ~= b.display_mismatch then return a.display_mismatch end
       if a.order ~= b.order then return a.order < b.order end
       return a.part_order < b.part_order
diff --git a/src/libmime/message.c b/src/libmime/message.c
index b592152704..4721a65c59 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -898,6 +898,10 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
 						lua_pushstring(L, "part_order");
 						lua_pushinteger(L, (lua_Integer) u->part_order);
 						lua_settable(L, -3);
+						/* cta weight from C heuristics */
+						lua_pushstring(L, "weight");
+						lua_pushnumber(L, (lua_Number) rspamd_html_url_button_weight(text_part->html, u));
+						lua_settable(L, -3);
 						/* etld1 computed in Lua if needed */
 						lua_rawseti(L, -2, ++nadded);
 					}
@@ -925,6 +929,26 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
 					rspamd_mempool_set_variable(task->task_pool, "html_cta_weight", &w, NULL);
 				}
 				lua_pop(L, 1);
+				/* If no weight set by Lua, derive from C heuristic */
+				if (!rspamd_mempool_get_variable(task->task_pool, "html_cta_weight") &&
+					text_part->html && text_part->mime_part && text_part->mime_part->urls) {
+					double best_w = 0.0;
+					unsigned int ui;
+					for (ui = 0; ui < text_part->mime_part->urls->len; ui++) {
+						struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, ui);
+						if (!u) continue;
+						if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
+						if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
+						float cw = rspamd_html_url_button_weight(text_part->html, u);
+						if (cw > best_w) best_w = cw;
+					}
+					if (best_w > 0.0) {
+						/* The pool keeps this pointer, so the value must outlive this block */
+						double *pw = rspamd_mempool_alloc(task->task_pool, sizeof(*pw));
+						*pw = best_w;
+						rspamd_mempool_set_variable(task->task_pool, "html_cta_weight", pw, NULL);
+					}
+				}
 				lua_pushstring(L, "affiliated_ratio");
 				lua_gettable(L, -2);
 				if (lua_isnumber(L, -1)) {
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index d231260860..8ef1520db2 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -2273,6 +2273,26 @@ auto html_process_input(struct rspamd_task *task,
 						if (cnt > hc->features.links.max_links_single_domain) {
 							hc->features.links.max_links_single_domain = cnt;
 						}
+						/* Heuristic button weight */
+						float w = 0.0f;
+						if (url->ext && url->ext->linked_url && url->ext->linked_url != url) {
+							w += 0.5f; /* display mismatch bonus */
+						}
+						w += 0.2f * (url->order == 0 ? 1.0f : 1.0f / (float) url->order);
+						if (cur_tag->block && cur_tag->block->is_visible()) {
+							if (cur_tag->block->has_display()) {
+								w += 0.1f;
+							}
+							if (cur_tag->block->width > 0 && cur_tag->block->height > 0) {
+								w += std::min(0.2f, (cur_tag->block->width * cur_tag->block->height) / 100000.0f);
+							}
+							if (cur_tag->block->font_size >= 14) {
+								w += 0.1f;
+							}
+						}
+						if (w > 0) {
+							hc->url_button_weights[url] += w;
+						}
 						/* same eTLD+1 as first-party? */
 						if (!hc->first_party_etld1.empty()) {
 							rspamd_ftok_t tld2;
@@ -3165,6 +3185,18 @@ rspamd_html_tag_by_id(int id)
 	return nullptr;
 }
 
+float rspamd_html_url_button_weight(void *html_content, struct rspamd_url *u)
+{
+	if (html_content == nullptr || u == nullptr) return 0.0f;
+	auto *hc = rspamd::html::html_content::from_ptr(html_content);
+	auto it = hc->url_button_weights.find(u);
+	if (it != hc->url_button_weights.end()) {
+		return it->second;
+	}
+
+	return 0.0f;
+}
+
 const struct rspamd_html_features *
 rspamd_html_get_features(void *html_content)
 {
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index c0fa2b9c34..368a22b08c 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -130,6 +130,11 @@ bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest);
  */
 gsize rspamd_html_get_tags_count(void *html_content);
 
+/**
+ * Returns heuristic button weight for a given URL within this HTML content
+ */
+float rspamd_html_url_button_weight(void *html_content, struct rspamd_url *u);
+
 /**
  * Returns an immutable pointer to aggregated html features
  */
diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx
index 509697264f..3e295ea60d 100644
--- a/src/libserver/html/html.hxx
+++ b/src/libserver/html/html.hxx
@@ -56,6 +56,8 @@ struct html_content {
 	struct rspamd_html_features features;
 	/* Helper: per-domain link counts */
 	ankerl::unordered_dense::map link_domain_counts;
+	/* Heuristic weights for button-like links */
+	ankerl::unordered_dense::map<struct rspamd_url *, float> url_button_weights;
 	/* First-party eTLD+1 derived from message (e.g. From:) */
 	std::string first_party_etld1;
 
-- 
2.47.3