From: Vsevolod Stakhov
Date: Mon, 8 Sep 2025 16:51:21 +0000 (+0100)
Subject: [Project] Add CTA analytics engine
X-Git-Tag: 3.13.0~6^2~4
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4273c0c77aa27689936d9016b498b09771cff5e5;p=thirdparty%2Frspamd.git

[Project] Add CTA analytics engine
---

diff --git a/lualib/lua_cta.lua b/lualib/lua_cta.lua
new file mode 100644
index 0000000000..7bc5dcd486
--- /dev/null
+++ b/lualib/lua_cta.lua
@@ -0,0 +1,200 @@
+--[[
+CTA and link affiliation analysis
+
+Purpose:
+- Given a capped list of candidate links extracted in C during HTML parsing,
+  compute simple affiliation scores between those links and the sender’s
+  first-party domain, and pick a likely CTA (call-to-action) link.
+
+How it is called:
+- C code (message processing after HTML parsing) loads this function via
+  `rspamd_lua_require_function(L, "lua_cta", "process_html_links")` and calls
+  `process_html_links(task, part, ctx)` once per HTML text part.
+
+Inputs (ctx table):
+- links_total: total number of links in the part (summary; may be omitted)
+- domains_total: number of distinct link domains (summary)
+- max_links_single_domain: maximum links seen for a single domain (summary)
+- candidates: array (capped in C, default 24) of small objects with fields:
+  - host: link host (string)
+  - idn, numeric, has_port, has_query, display_mismatch: booleans
+  - order, part_order: integers (ordering hints)
+  - etld1: optional eTLD+1 (if not set, this module approximates from host)
+
+Outputs (returned table):
+- cta_affiliated: boolean – whether the selected CTA appears affiliated
+- cta_weight: number – simple weight hint (1.0 if display mismatch, else 0.5)
+- affiliated_ratio: number – fraction of candidates considered affiliated
+- trackerish_ratio: number – fraction of candidates that look trackerish
+(cta_affiliated and cta_weight are only set when there is at least one candidate)
+
+Configuration (rspamd.conf):
+- Use the `link_affiliation { ... }` section.
+- Options:
+  - stopwords: map (set) used to strip common tracking tokens from
+    domains when computing token overlap
+  - whitelist / blacklist: optional maps (set) to tweak affiliation
+    (loaded here, but not consulted by the scoring code yet)
+  - min_similarity: number (default 0.5) – Jaccard threshold for affiliation
+  - max_candidates: number (default 24) – extra Lua-side cap (C caps as well)
+
+This module keeps all heavy config logic in Lua using lua_maps and only relies
+on C to provide a bounded set of safe, pre-filtered candidates.
+]]
+local M = {}
+
+local lua_util = require "lua_util"
+local lua_maps = require "lua_maps"
+
+local settings = {
+  min_similarity = 0.5,
+  max_candidates = 24,
+  stopwords = nil,
+  whitelist = nil,
+  blacklist = nil,
+}
+
+-- Options that hold map definitions and must be turned into map objects
+local map_options = { 'stopwords', 'whitelist', 'blacklist' }
+
+-- Merges the `link_affiliation` section over the defaults and converts map
+-- definitions into map objects (values that already expose `get_key` are kept)
+local function load_settings()
+  local cfg = rawget(_G, 'rspamd_config')
+  local opts = (cfg and cfg:get_all_opt('link_affiliation')) or {}
+  settings = lua_util.override_defaults(settings, opts)
+  for _, name in ipairs(map_options) do
+    local def = settings[name]
+    if def and (type(def) ~= 'table' or not def.get_key) then
+      settings[name] = lua_maps.map_add_from_ucl(def, 'set', 'link affiliation ' .. name)
+    end
+  end
+end
+
+load_settings()
+
+-- Approximates eTLD+1 by keeping the last two labels of a host name; hosts
+-- with fewer than two labels are returned unchanged
+local function approx_etld1(host)
+  local h = tostring(host or '')
+  local p1, p2 = string.match(h, "([^.]+)%.([^.]+)$")
+  if p1 and p2 then
+    return p1 .. "." .. p2
+  end
+  return h
+end
+
+-- Returns the domain used to score a candidate: the eTLD+1 supplied by the
+-- caller when present, otherwise an approximation derived from the host
+local function candidate_domain(c)
+  if c.etld1 and c.etld1 ~= '' then
+    return tostring(c.etld1)
+  end
+  return approx_etld1(c.host)
+end
+
+-- Splits a domain into a set of lowercase alphanumeric tokens, dropping
+-- tokens found in the stopwords map
+local function etld1_tokens(dom)
+  local t = {}
+  for token in string.gmatch(string.lower(dom or ''), "[a-z0-9]+") do
+    if not (settings.stopwords and settings.stopwords:get_key(token)) then
+      t[token] = true
+    end
+  end
+  return t
+end
+
+-- Jaccard similarity (|a ∩ b| / |a ∪ b|) of two token sets; 0 if both are empty
+local function jaccard(a, b)
+  local inter, uni = 0, 0
+  for k in pairs(a) do
+    if b[k] then
+      inter = inter + 1
+    end
+    uni = uni + 1
+  end
+  for k in pairs(b) do
+    if not a[k] then
+      uni = uni + 1
+    end
+  end
+  if uni == 0 then
+    return 0
+  end
+  return inter / uni
+end
+
+-- CTA ordering: display-mismatch links first, then by `order`, then by
+-- `part_order`; missing fields are treated as false / 0 so that the
+-- comparator never compares nil
+local function cta_before(a, b)
+  local am, bm = not not a.display_mismatch, not not b.display_mismatch
+  if am ~= bm then
+    return am
+  end
+  local ao, bo = a.order or 0, b.order or 0
+  if ao ~= bo then
+    return ao < bo
+  end
+  return (a.part_order or 0) < (b.part_order or 0)
+end
+
+-- Scores candidate links of one HTML part against the From: domain.
+-- task: rspamd task (used to read the MIME From: address)
+-- part: text part (currently unused)
+-- ctx: table described in the header comment; a missing ctx is treated as empty
+-- Returns the result table described in the header comment.
+M.process_html_links = function(task, part, ctx)
+  ctx = ctx or {}
+
+  -- Derive first-party from From:, reduced the same way as candidate hosts so
+  -- that subdomains of the sender do not dilute the token overlap
+  local first_party
+  do
+    local from = task:get_from('mime') or {}
+    if from[1] and from[1].domain then
+      first_party = approx_etld1(from[1].domain)
+    end
+  end
+  local fp_tokens = etld1_tokens(first_party)
+
+  -- Work on a capped private copy: the list is sorted below and the caller's
+  -- table must not be reordered
+  local all = ctx.candidates or {}
+  local limit = math.min(#all, tonumber(settings.max_candidates) or #all)
+  local cands = {}
+  for i = 1, limit do
+    cands[i] = all[i]
+  end
+
+  local affiliated, trackerish = 0, 0
+  for _, c in ipairs(cands) do
+    local toks = etld1_tokens(candidate_domain(c))
+    if jaccard(fp_tokens, toks) >= settings.min_similarity then
+      affiliated = affiliated + 1
+    end
+    -- very naive trackerish: nothing is left once stopwords are removed
+    if next(toks) == nil then
+      trackerish = trackerish + 1
+    end
+  end
+
+  local res = {
+    affiliated_ratio = (#cands > 0) and (affiliated / #cands) or 0,
+    trackerish_ratio = (#cands > 0) and (trackerish / #cands) or 0,
+  }
+
+  -- Simple CTA guess: first candidate with display_mismatch or earliest order
+  if #cands > 0 then
+    table.sort(cands, cta_before)
+    local cta = cands[1]
+    local sim = jaccard(fp_tokens, etld1_tokens(candidate_domain(cta)))
+    res.cta_affiliated = (sim >= settings.min_similarity)
+    res.cta_weight = (cta.display_mismatch and 1.0 or 0.5)
+  end
+
+  return res
+end
+
+return M
diff --git a/lualib/lua_meta.lua b/lualib/lua_meta.lua
index 446c2b4830..ecfabca002 100644
--- a/lualib/lua_meta.lua
+++ b/lualib/lua_meta.lua
@@ -330,6 +330,24 @@ local function meta_html_features_function(task)
   }
 end
 
+-- Exposes the CTA/link-affiliation values stored in the task mempool as
+-- metatokens; every value is read as a double and a missing variable
+-- yields 0
+local function meta_cta_function(task)
+  local mp = task:get_mempool()
+  local cta_aff = mp:get_variable("html_cta_affiliated", "double") or 0
+  local cta_w = mp:get_variable("html_cta_weight", "double") or 0
+  local aff_ratio = mp:get_variable("html_affiliated_links_ratio", "double") or 0
+  local tr_ratio = mp:get_variable("html_trackerish_ratio", "double") or 0
+
+  return {
+    cta_aff,
+    cta_w,
+    aff_ratio,
+    tr_ratio,
+  }
+end
+
 local metafunctions = {
   {
     cb = meta_size_function,
@@ -480,6 +498,22 @@ local metafunctions = {
   - reciprocal of total forms
   - ratio of forms posting to unaffiliated domains
   - ratio of forms posting to affiliated domains
+]]
+  },
+  {
+    cb = meta_cta_function,
+    ninputs = 4,
+    names = {
+      'html_cta_affiliated',
+      'html_cta_weight',
+      'html_affiliated_links_ratio',
+      'html_trackerish_ratio',
+    },
+    description = [[CTA and affiliation metrics from lua_cta:
+  - CTA affiliated flag
+  - CTA weight heuristic
+  - affiliated links ratio among candidates
+  - trackerish domains ratio among candidates
 ]]
   },
 }
@@ -605,7 +639,7 @@ end
 
 exports.rspamd_count_metatokens = rspamd_count_metatokens
 exports.count_metatokens = rspamd_count_metatokens
-exports.version = 2 -- MUST be increased on each change of metatokens
+exports.version = 3 -- MUST be increased on each change of metatokens
 
 exports.add_metafunction = function(tbl)
   local ret, err = meta_schema(tbl)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index c5bb003970..b592152704
100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -818,6 +818,125 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
 		rspamd_mempool_set_variable(task->task_pool, "html_forms_post_affiliated",
 									(void *) &hf->forms_post_affiliated, NULL);
 	}
+
+	/*
+	 * Optionally call the CTA/affiliation Lua hook with capped candidates.
+	 * Skipped without HTML features: the summary counters below read them.
+	 */
+	if (task->cfg && task->cfg->lua_state && text_part->html_features) {
+		lua_State *L = task->cfg->lua_state;
+		int old_top = lua_gettop(L);
+
+		if (rspamd_lua_require_function(L, "lua_cta", "process_html_links")) {
+			/* Traceback handler first, then a copy of the function to call */
+			lua_pushcfunction(L, &rspamd_lua_traceback);
+			int err_idx = lua_gettop(L);
+			lua_pushvalue(L, -2); /* function */
+
+			/* Arg1: task */
+			struct rspamd_task **ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
+			rspamd_lua_setclass(L, rspamd_task_classname, -1);
+			*ptask = task;
+			/* Arg2: text part */
+			struct rspamd_mime_text_part **ptxt = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *));
+			rspamd_lua_setclass(L, rspamd_textpart_classname, -1);
+			*ptxt = text_part;
+			/* Arg3: ctx table; the first-party domain is derived in Lua from From: */
+			lua_createtable(L, 0, 4);
+			/* Summary counters */
+			lua_pushinteger(L, (lua_Integer) text_part->html_features->links.total_links);
+			lua_setfield(L, -2, "links_total");
+			lua_pushinteger(L, (lua_Integer) text_part->html_features->links.domains_total);
+			lua_setfield(L, -2, "domains_total");
+			lua_pushinteger(L, (lua_Integer) text_part->html_features->links.max_links_single_domain);
+			lua_setfield(L, -2, "max_links_single_domain");
+
+			/* candidates array */
+			const unsigned int max_candidates = 24; /* TODO: make configurable */
+			unsigned int nadded = 0;
+			lua_createtable(L, (int) max_candidates, 0);
+
+			if (text_part->mime_part && text_part->mime_part->urls && text_part->mime_part->urls->len > 0) {
+				unsigned int i;
+
+				for (i = 0; i < text_part->mime_part->urls->len && nadded < max_candidates; i++) {
+					struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, i);
+
+					if (!u) continue;
+					/* filter: only http/https, visible */
+					if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
+					if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
+
+					/* Build small table */
+					lua_createtable(L, 0, 8);
+					/* host: left unset when the URL has no host */
+					if (u->hostlen > 0) {
+						lua_pushlstring(L, rspamd_url_host_unsafe(u), u->hostlen);
+						lua_setfield(L, -2, "host");
+					}
+					/* flags */
+					lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_IDN));
+					lua_setfield(L, -2, "idn");
+					lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_NUMERIC));
+					lua_setfield(L, -2, "numeric");
+					lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_HAS_PORT));
+					lua_setfield(L, -2, "has_port");
+					lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_QUERY));
+					lua_setfield(L, -2, "has_query");
+					lua_pushboolean(L, (u->ext && u->ext->linked_url && u->ext->linked_url != u));
+					lua_setfield(L, -2, "display_mismatch");
+					/* order */
+					lua_pushinteger(L, (lua_Integer) u->order);
+					lua_setfield(L, -2, "order");
+					lua_pushinteger(L, (lua_Integer) u->part_order);
+					lua_setfield(L, -2, "part_order");
+					/* etld1 computed in Lua if needed */
+					lua_rawseti(L, -2, (int) ++nadded);
+				}
+			}
+			lua_setfield(L, -2, "candidates"); /* ctx.candidates = [...] */
+
+			if (lua_pcall(L, 3, 1, err_idx) != 0) {
+				msg_err_task("lua_cta.process_html_links error: %s", lua_tostring(L, -1));
+			}
+			else if (lua_istable(L, -1)) {
+				/*
+				 * Copy every result into task-pool memory: the pool variable keeps
+				 * the pointer it is given, so function-local static storage would be
+				 * shared between tasks. All values are stored as doubles, which is
+				 * how meta_cta_function in lua_meta reads them.
+				 */
+				static const char *const num_fields[][2] = {
+					{"cta_weight", "html_cta_weight"},
+					{"affiliated_ratio", "html_affiliated_links_ratio"},
+					{"trackerish_ratio", "html_trackerish_ratio"},
+				};
+				unsigned int j;
+
+				lua_getfield(L, -1, "cta_affiliated");
+				if (lua_isboolean(L, -1)) {
+					double *pflag = rspamd_mempool_alloc(task->task_pool, sizeof(*pflag));
+
+					*pflag = lua_toboolean(L, -1) ? 1.0 : 0.0;
+					rspamd_mempool_set_variable(task->task_pool, "html_cta_affiliated", pflag, NULL);
+				}
+				lua_pop(L, 1);
+
+				for (j = 0; j < G_N_ELEMENTS(num_fields); j++) {
+					lua_getfield(L, -1, num_fields[j][0]);
+					if (lua_isnumber(L, -1)) {
+						double *pnum = rspamd_mempool_alloc(task->task_pool, sizeof(*pnum));
+
+						*pnum = lua_tonumber(L, -1);
+						rspamd_mempool_set_variable(task->task_pool, num_fields[j][1], pnum, NULL);
+					}
+					lua_pop(L, 1);
+				}
+			}
+		}
+
+		lua_settop(L, old_top);
+	}
 	rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
 
 	if (text_part->utf_content.len == 0) {