--- /dev/null
+--[[
+CTA and link affiliation analysis
+
+Purpose:
+- Given a capped list of candidate links extracted in C during HTML parsing,
+ compute simple affiliation scores between those links and the sender’s
+ first-party domain, and pick a likely CTA (call-to-action) link.
+
+How it is called:
+- C code (message processing after HTML parsing) loads this function via
+ `rspamd_lua_require_function(L, "lua_cta", "process_html_links")` and calls
+ `process_html_links(task, part, ctx)` once per HTML text part.
+
+Inputs (ctx table):
+- links_total: total number of links in the part (summary; may be omitted)
+- domains_total: number of distinct link domains (summary)
+- max_links_single_domain: maximum links seen for a single domain (summary)
+- candidates: array (capped in C, default 24) of small objects with fields:
+ - host: link host (string)
+ - idn, numeric, has_port, has_query, display_mismatch: booleans
+ - order, part_order: integers (ordering hints)
+ - etld1: optional eTLD+1 (if not set, this module approximates from host)
+
+Outputs (returned table):
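+- cta_affiliated: boolean – whether the selected CTA appears affiliated
+ (only set when there is at least one candidate)
+- cta_weight: number – simple weight hint: 1.0 if the selected CTA has a
+ display mismatch, 0.5 otherwise (only set when there is at least one candidate)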
+- affiliated_ratio: number – fraction of candidates considered affiliated
+- trackerish_ratio: number – fraction of candidates that look trackerish
+
+Configuration (rspamd.conf):
+- Use the `link_affiliation { ... }` section.
+- Options:
+ - stopwords: map definition (always loaded as a `set` map) used to strip
+ common tracking tokens from domains when computing token overlap
+ - whitelist / blacklist: optional maps (set); they are loaded at startup, but
+ the scoring in this module does not consult them yet
+ - min_similarity: number (default 0.5) – Jaccard threshold for affiliation
+ - max_candidates: number (default 24) – extra Lua-side cap (C caps as well)
+
+This module keeps all heavy config logic in Lua using lua_maps and only relies
+on C to provide a bounded set of safe, pre-filtered candidates.
+]]
+local M = {}
+
+local lua_util = require "lua_util"
+local lua_maps = require "lua_maps"
+
+-- Module defaults. load_settings() merges the `link_affiliation` config
+-- section over this table via lua_util.override_defaults.
+local settings = {
+ -- Similarity at or above this value counts a link as affiliated
+ min_similarity = 0.5,
+ -- Lua-side cap on the number of candidates that are examined
+ max_candidates = 24,
+ -- Optional map; tokens found in it are dropped by etld1_tokens()
+ stopwords = nil,
+ -- Optional maps; converted to `set` maps by load_settings() when configured
+ whitelist = nil,
+ blacklist = nil,
+}
+
+--- Merges the `link_affiliation` options over the defaults and converts any
+-- configured map definitions into `set` maps.
+-- Options come from the global `rspamd_config` when it exists; otherwise an
+-- empty option table is used and the defaults stay in effect.
+local function load_settings()
+  local rspamd_cfg = rawget(_G, 'rspamd_config')
+  local user_opts = (rspamd_cfg and rspamd_cfg:get_all_opt('link_affiliation')) or {}
+  settings = lua_util.override_defaults(settings, user_opts)
+
+  -- { option name, map description }, handled in this fixed order
+  local map_options = {
+    { 'stopwords', 'link affiliation stopwords' },
+    { 'whitelist', 'link affiliation whitelist' },
+    { 'blacklist', 'link affiliation blacklist' },
+  }
+
+  for _, entry in ipairs(map_options) do
+    local key, description = entry[1], entry[2]
+    local value = settings[key]
+    -- A table exposing get_key is treated as an already constructed map
+    local is_map = type(value) == 'table' and value.get_key
+    if value and not is_map then
+      settings[key] = lua_maps.map_add_from_ucl(value, 'set', description)
+    end
+  end
+end
+
+load_settings()
+
+--- Splits a domain into a set of lowercase alphanumeric tokens.
+-- Tokens found in the configured stopwords map are left out.
+-- @param dom domain string; nil is treated as an empty string
+-- @return table keyed by token, every value being `true`
+local function etld1_tokens(dom)
+  local token_set = {}
+  local stop = settings.stopwords
+  local lowered = string.lower(dom or '')
+  for word in string.gmatch(lowered, "[a-z0-9]+") do
+    if not stop or not stop:get_key(word) then
+      token_set[word] = true
+    end
+  end
+  return token_set
+end
+
+--- Jaccard similarity of two token sets (tables keyed by token).
+-- @param a first set
+-- @param b second set
+-- @return size of the intersection divided by size of the union,
+--         or 0 when both sets are empty
+local function jaccard(a, b)
+  local shared, total = 0, 0
+  -- Every key of `a` belongs to the union; those also in `b` are shared
+  for tok in pairs(a) do
+    total = total + 1
+    if b[tok] then
+      shared = shared + 1
+    end
+  end
+  -- Keys only present in `b` complete the union
+  for tok in pairs(b) do
+    if not a[tok] then
+      total = total + 1
+    end
+  end
+  if total > 0 then
+    return shared / total
+  end
+  return 0
+end
+
+--- Rough eTLD+1 guess: keeps only the last two dot-separated labels.
+-- Values with fewer than two labels are returned unchanged (as a string).
+local function approx_etld1(host)
+  local h = tostring(host or '')
+  local p1, p2 = string.match(h, "([^.]+)%.([^.]+)$")
+  if p1 and p2 then
+    return p1 .. "." .. p2
+  end
+  return h
+end
+
+--- Domain used to score a candidate: the provided `etld1` verbatim when
+-- present, otherwise an approximation derived from `host`.
+local function candidate_domain(c)
+  if c.etld1 and c.etld1 ~= '' then
+    -- Trust the supplied value; truncating it would turn e.g.
+    -- "example.co.uk" into "co.uk"
+    return tostring(c.etld1)
+  end
+  return approx_etld1(c.host)
+end
+
+--- Strict ordering for CTA selection: display-mismatch links first, then the
+-- lowest `order`, then the lowest `part_order`. Missing fields count as
+-- false / 0 so the comparison never raises.
+local function cta_precedes(a, b)
+  local am, bm = not not a.display_mismatch, not not b.display_mismatch
+  if am ~= bm then
+    return am
+  end
+  local ao, bo = a.order or 0, b.order or 0
+  if ao ~= bo then
+    return ao < bo
+  end
+  return (a.part_order or 0) < (b.part_order or 0)
+end
+
+--- Scores candidate links against the sender's first-party domain and picks
+-- a likely CTA link.
+-- @param task object providing `get_from('mime')`; the domain of the first
+--        From address is the first party (reduced with the same two-label
+--        approximation that is applied to link hosts)
+-- @param part text part (currently unused)
+-- @param ctx table with an optional `candidates` array; it is not modified
+-- @return table with `affiliated_ratio` and `trackerish_ratio`, plus
+--         `cta_affiliated` and `cta_weight` when at least one candidate exists
+M.process_html_links = function(task, part, ctx)
+  ctx = ctx or {}
+
+  local first_party = nil
+  do
+    local from = task:get_from('mime') or {}
+    if from[1] and from[1].domain then
+      -- Normalise like the link hosts, otherwise sub-domain labels of the
+      -- sender (e.g. "mail") take part in the token overlap
+      first_party = approx_etld1(from[1].domain)
+    end
+  end
+  local fp_tokens = etld1_tokens(first_party)
+
+  local all = ctx.candidates or {}
+  local limit = math.min(#all, settings.max_candidates)
+
+  local seen, affiliated, trackerish = 0, 0, 0
+  local cta, cta_sim = nil, 0
+
+  for i = 1, limit do
+    local c = all[i]
+    if type(c) ~= 'table' then
+      -- Stop at the first hole or malformed entry
+      break
+    end
+    seen = seen + 1
+
+    local toks = etld1_tokens(candidate_domain(c))
+    local sim = jaccard(fp_tokens, toks)
+    if sim >= settings.min_similarity then
+      affiliated = affiliated + 1
+    end
+
+    -- Very naive trackerish check: nothing is left once stopwords are removed
+    if next(toks) == nil then
+      trackerish = trackerish + 1
+    end
+
+    -- Single pass selection instead of sorting: the caller's array keeps its
+    -- order and the first candidate wins among equals
+    if cta == nil or cta_precedes(c, cta) then
+      cta, cta_sim = c, sim
+    end
+  end
+
+  local res = {
+    affiliated_ratio = (seen > 0) and (affiliated / seen) or 0,
+    trackerish_ratio = (seen > 0) and (trackerish / seen) or 0,
+  }
+
+  if cta then
+    res.cta_affiliated = (cta_sim >= settings.min_similarity)
+    res.cta_weight = (cta.display_mismatch and 1.0 or 0.5)
+  end
+
+  return res
+end
+
+return M
}
end
+--- Collects the CTA/affiliation metatokens from the task memory pool.
+-- Each variable is read as a "double"; a missing variable yields 0.
+-- @param task object providing `get_mempool()`
+-- @return array of four numbers: CTA affiliated, CTA weight,
+--         affiliated links ratio, trackerish ratio (in that order)
+local function meta_cta_function(task)
+  local pool = task:get_mempool()
+  local variables = {
+    "html_cta_affiliated",
+    "html_cta_weight",
+    "html_affiliated_links_ratio",
+    "html_trackerish_ratio",
+  }
+
+  local values = {}
+  for idx, var_name in ipairs(variables) do
+    values[idx] = pool:get_variable(var_name, "double") or 0
+  end
+
+  return values
+end
+
local metafunctions = {
{
cb = meta_size_function,
- reciprocal of total forms
- ratio of forms posting to unaffiliated domains
- ratio of forms posting to affiliated domains
+]]
+ },
+ {
+ -- CTA/affiliation metatokens; `names` are listed in the same order in
+ -- which meta_cta_function returns its four values
+ cb = meta_cta_function,
+ ninputs = 4,
+ names = {
+ 'html_cta_affiliated',
+ 'html_cta_weight',
+ 'html_affiliated_links_ratio',
+ 'html_trackerish_ratio',
+ },
+ description = [[CTA and affiliation metrics from lua_cta:
+ - CTA affiliated flag
+ - CTA weight heuristic
+ - affiliated links ratio among candidates
+ - trackerish domains ratio among candidates
]]
},
}
exports.rspamd_count_metatokens = rspamd_count_metatokens
exports.count_metatokens = rspamd_count_metatokens
-exports.version = 2 -- MUST be increased on each change of metatokens
+exports.version = 3 -- MUST be increased on each change of metatokens
exports.add_metafunction = function(tbl)
local ret, err = meta_schema(tbl)
rspamd_mempool_set_variable(task->task_pool, "html_forms_post_affiliated",
(void *) &hf->forms_post_affiliated, NULL);
}
+
+	/*
+	 * Optionally call the CTA/affiliation Lua hook
+	 * (lua_cta.process_html_links) with a capped list of candidate links and
+	 * publish the returned metrics as task mempool variables.
+	 *
+	 * All published values are doubles allocated from the task pool: the Lua
+	 * metatokens code reads every one of them with the "double" type, and
+	 * storage that is static would be shared by all tasks.
+	 */
+	if (task->cfg && task->cfg->lua_state) {
+		lua_State *L = task->cfg->lua_state;
+		int old_top = lua_gettop(L);
+
+		if (rspamd_lua_require_function(L, "lua_cta", "process_html_links")) {
+			/* Stack: [..., func]; add the traceback handler and a copy of func */
+			lua_pushcfunction(L, &rspamd_lua_traceback);
+			int err_idx = lua_gettop(L);
+			lua_pushvalue(L, -2); /* function */
+
+			/* Arg1: task */
+			struct rspamd_task **ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
+			rspamd_lua_setclass(L, rspamd_task_classname, -1);
+			*ptask = task;
+			/* Arg2: text part */
+			struct rspamd_mime_text_part **ptxt = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *));
+			rspamd_lua_setclass(L, rspamd_textpart_classname, -1);
+			*ptxt = text_part;
+			/* Arg3: ctx table; the first-party domain is derived in Lua from From: */
+			lua_createtable(L, 0, 4);
+
+			/* Summary counters: only available when html_features is set */
+			if (text_part->html_features) {
+				lua_pushstring(L, "links_total");
+				lua_pushinteger(L, (lua_Integer) text_part->html_features->links.total_links);
+				lua_settable(L, -3);
+				lua_pushstring(L, "domains_total");
+				lua_pushinteger(L, (lua_Integer) text_part->html_features->links.domains_total);
+				lua_settable(L, -3);
+				lua_pushstring(L, "max_links_single_domain");
+				lua_pushinteger(L, (lua_Integer) text_part->html_features->links.max_links_single_domain);
+				lua_settable(L, -3);
+			}
+
+			/* candidates array */
+			lua_pushstring(L, "candidates");
+			const int max_candidates = 24; /* TODO: make configurable */
+			lua_createtable(L, max_candidates, 0);
+			int nadded = 0;
+
+			if (text_part->mime_part && text_part->mime_part->urls && text_part->mime_part->urls->len > 0) {
+				unsigned int i;
+
+				for (i = 0; i < text_part->mime_part->urls->len && nadded < max_candidates; i++) {
+					struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, i);
+
+					if (!u) continue;
+					/* filter: only http/https, visible */
+					if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
+					if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
+
+					/* Build small table */
+					lua_createtable(L, 0, 8);
+					/* host */
+					lua_pushstring(L, "host");
+					if (u->hostlen > 0) {
+						lua_pushlstring(L, rspamd_url_host_unsafe(u), u->hostlen);
+					}
+					else {
+						lua_pushnil(L);
+					}
+					lua_settable(L, -3);
+					/* flags */
+					lua_pushstring(L, "idn");
+					lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_IDN));
+					lua_settable(L, -3);
+					lua_pushstring(L, "numeric");
+					lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_NUMERIC));
+					lua_settable(L, -3);
+					lua_pushstring(L, "has_port");
+					lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_HAS_PORT));
+					lua_settable(L, -3);
+					lua_pushstring(L, "has_query");
+					lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_QUERY));
+					lua_settable(L, -3);
+					lua_pushstring(L, "display_mismatch");
+					lua_pushboolean(L, (u->ext && u->ext->linked_url && u->ext->linked_url != u));
+					lua_settable(L, -3);
+					/* order */
+					lua_pushstring(L, "order");
+					lua_pushinteger(L, (lua_Integer) u->order);
+					lua_settable(L, -3);
+					lua_pushstring(L, "part_order");
+					lua_pushinteger(L, (lua_Integer) u->part_order);
+					lua_settable(L, -3);
+					/* etld1 computed in Lua if needed */
+					lua_rawseti(L, -2, ++nadded);
+				}
+			}
+			lua_settable(L, -3); /* ctx.candidates = [...] */
+
+			if (lua_pcall(L, 3, 1, err_idx) != 0) {
+				msg_debug_task("lua_cta.process_html_links error: %s", lua_tostring(L, -1));
+			}
+			else if (lua_istable(L, -1)) {
+				/* read result and expose mempool variables */
+				lua_pushstring(L, "cta_affiliated");
+				lua_gettable(L, -2);
+				if (lua_isboolean(L, -1)) {
+					/* Stored as 0.0/1.0 so that a "double" read is valid */
+					double *val = rspamd_mempool_alloc(task->task_pool, sizeof(*val));
+					*val = lua_toboolean(L, -1) ? 1.0 : 0.0;
+					rspamd_mempool_set_variable(task->task_pool, "html_cta_affiliated", val, NULL);
+				}
+				lua_pop(L, 1);
+				lua_pushstring(L, "cta_weight");
+				lua_gettable(L, -2);
+				if (lua_isnumber(L, -1)) {
+					double *w = rspamd_mempool_alloc(task->task_pool, sizeof(*w));
+					*w = lua_tonumber(L, -1);
+					rspamd_mempool_set_variable(task->task_pool, "html_cta_weight", w, NULL);
+				}
+				lua_pop(L, 1);
+				lua_pushstring(L, "affiliated_ratio");
+				lua_gettable(L, -2);
+				if (lua_isnumber(L, -1)) {
+					double *r = rspamd_mempool_alloc(task->task_pool, sizeof(*r));
+					*r = lua_tonumber(L, -1);
+					rspamd_mempool_set_variable(task->task_pool, "html_affiliated_links_ratio", r, NULL);
+				}
+				lua_pop(L, 1);
+				lua_pushstring(L, "trackerish_ratio");
+				lua_gettable(L, -2);
+				if (lua_isnumber(L, -1)) {
+					double *tr = rspamd_mempool_alloc(task->task_pool, sizeof(*tr));
+					*tr = lua_tonumber(L, -1);
+					rspamd_mempool_set_variable(task->task_pool, "html_trackerish_ratio", tr, NULL);
+				}
+				lua_pop(L, 1);
+			}
+		}
+
+		/* Restore the stack on every path, including a failed require */
+		lua_settop(L, old_top);
+	}
rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
if (text_part->utf_content.len == 0) {