trackerish_ratio = (#cands > 0) and (trackerish / #cands) or 0,
}
- -- Simple CTA guess: first candidate with display_mismatch or earliest order
+ -- Simple CTA guess: prefer higher C-side weight, then display_mismatch, then earliest order
if #cands > 0 then
table.sort(cands, function(a, b)
+ local aw, bw = tonumber(a.weight) or 0, tonumber(b.weight) or 0
+ if aw ~= bw then return aw > bw end
if a.display_mismatch ~= b.display_mismatch then return a.display_mismatch end
if a.order ~= b.order then return a.order < b.order end
return a.part_order < b.part_order
lua_pushstring(L, "part_order");
lua_pushinteger(L, (lua_Integer) u->part_order);
lua_settable(L, -3);
+ /* cta weight from C heuristics */
+ lua_pushstring(L, "weight");
+ lua_pushnumber(L, (lua_Number) rspamd_html_url_button_weight(text_part->html, u));
+ lua_settable(L, -3);
/* etld1 computed in Lua if needed */
lua_rawseti(L, -2, ++nadded);
}
rspamd_mempool_set_variable(task->task_pool, "html_cta_weight", &w, NULL);
}
lua_pop(L, 1);
+ /*
+  * If no weight was set by Lua, derive one from the C heuristic: take the
+  * largest button weight among the HTTP(S) URLs of this part that are not
+  * flagged as invisible.
+  */
+ if (!rspamd_mempool_get_variable(task->task_pool, "html_cta_weight") &&
+     text_part->html && text_part->mime_part && text_part->mime_part->urls) {
+     double best_w = 0.0;
+     unsigned int ui;
+     for (ui = 0; ui < text_part->mime_part->urls->len; ui++) {
+         struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, ui);
+         if (!u) continue;
+         if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
+         if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
+         float cw = rspamd_html_url_button_weight(text_part->html, u);
+         if (cw > best_w) best_w = cw;
+     }
+     if (best_w > 0.0) {
+         /*
+          * set_variable receives only a pointer (no size), so the stored
+          * value must outlive this block: passing &best_w would leave a
+          * dangling pointer to a dead stack slot. Keep the value in
+          * pool-owned memory instead; no destructor is needed as the pool
+          * owns the allocation.
+          */
+         double *stored_w = (double *) rspamd_mempool_alloc(task->task_pool, sizeof(*stored_w));
+         *stored_w = best_w;
+         rspamd_mempool_set_variable(task->task_pool, "html_cta_weight", stored_w, NULL);
+     }
+ }
lua_pushstring(L, "affiliated_ratio");
lua_gettable(L, -2);
if (lua_isnumber(L, -1)) {
if (cnt > hc->features.links.max_links_single_domain) {
hc->features.links.max_links_single_domain = cnt;
}
+ /* Heuristic button weight: build a score `w` for this url from the signals
+  * below, then add it to the per-URL entry in hc->url_button_weights.
+  * NOTE(review): the meaning of url->ext->linked_url and url->order is
+  * defined outside this fragment - confirm against their declarations. */
+ float w = 0.0f;
+ if (url->ext && url->ext->linked_url && url->ext->linked_url != url) {
+ w += 0.5f; /* display mismatch bonus */
+ }
+ w += 0.2f * (url->order == 0 ? 1.0f : 1.0f / (float) url->order); /* 0.2 scaled by 1/order; order 0 gets the full 0.2 and avoids a division by zero */
+ if (cur_tag->block && cur_tag->block->is_visible()) { /* block-based bonuses only when a block exists and reports itself visible */
+ if (cur_tag->block->has_display()) {
+ w += 0.1f;
+ }
+ if (cur_tag->block->width > 0 && cur_tag->block->height > 0) {
+ w += std::min(0.2f, (cur_tag->block->width * cur_tag->block->height) / 100000.0f); /* area bonus: width*height / 100000, capped at 0.2 */
+ }
+ if (cur_tag->block->font_size >= 14) {
+ w += 0.1f;
+ }
+ }
+ if (w > 0) {
+ hc->url_button_weights[url] += w; /* += accumulates if this url already has an entry */
+ }
/* same eTLD+1 as first-party? */
if (!hc->first_party_etld1.empty()) {
rspamd_ftok_t tld2;
return nullptr;
}
+float rspamd_html_url_button_weight(void *html_content, struct rspamd_url *u)
+{
+	/* Without both a parsed document and a URL there is nothing to look up */
+	if (!html_content || !u) {
+		return 0.0f;
+	}
+
+	const auto *content = rspamd::html::html_content::from_ptr(html_content);
+	const auto &weights = content->url_button_weights;
+	const auto found = weights.find(u);
+
+	/* URLs with no recorded weight report 0 */
+	return found == weights.end() ? 0.0f : found->second;
+}
+
const struct rspamd_html_features *
rspamd_html_get_features(void *html_content)
{
*/
gsize rspamd_html_get_tags_count(void *html_content);
+/**
+ * Returns heuristic button weight for a given URL within this HTML content
+ * @param html_content opaque pointer to the parsed HTML content; may be NULL
+ * @param u URL to look up; may be NULL
+ * @return the weight recorded for `u`, or 0.0f when either argument is NULL
+ *         or no weight has been recorded for this URL
+ */
+float rspamd_html_url_button_weight(void *html_content, struct rspamd_url *u);
+
/**
* Returns an immutable pointer to aggregated html features
*/
struct rspamd_html_features features;
/* Helper: per-domain link counts */
ankerl::unordered_dense::map<std::string, unsigned int> link_domain_counts;
+ /* Heuristic weights for button-like links */
+ ankerl::unordered_dense::map<struct rspamd_url *, float> url_button_weights;
/* First-party eTLD+1 derived from message (e.g. From:) */
std::string first_party_etld1;