git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Add CTA analytics engine
author Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 8 Sep 2025 16:51:21 +0000 (17:51 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 8 Sep 2025 16:51:21 +0000 (17:51 +0100)
lualib/lua_cta.lua [new file with mode: 0644]
lualib/lua_meta.lua
src/libmime/message.c

diff --git a/lualib/lua_cta.lua b/lualib/lua_cta.lua
new file mode 100644 (file)
index 0000000..7bc5dcd
--- /dev/null
@@ -0,0 +1,169 @@
+--[[
+CTA and link affiliation analysis
+
+Purpose:
+- Given a capped list of candidate links extracted in C during HTML parsing,
+  compute simple affiliation scores between those links and the sender’s
+  first-party domain, and pick a likely CTA (call-to-action) link.
+
+How it is called:
+- C code (message processing after HTML parsing) loads this function via
+  `rspamd_lua_require_function(L, "lua_cta", "process_html_links")` and calls
+  `process_html_links(task, part, ctx)` once per HTML text part.
+
+Inputs (ctx table):
+- links_total: total number of links in the part (summary; may be omitted)
+- domains_total: number of distinct link domains (summary)
+- max_links_single_domain: maximum links seen for a single domain (summary)
+- candidates: array (capped in C, default 24) of small objects with fields:
+  - host: link host (string)
+  - idn, numeric, has_port, has_query, display_mismatch: booleans
+  - order, part_order: integers (ordering hints)
+  - etld1: optional eTLD+1 (if not set, this module approximates from host)
+
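+Outputs (returned table):
+- cta_affiliated: boolean – whether the selected CTA appears affiliated
+  (absent when there are no candidates)
+- cta_weight: number – simple weight hint: 1.0 if the selected CTA has a
+  display mismatch, 0.5 otherwise (absent when there are no candidates)
+- affiliated_ratio: number – fraction of candidates considered affiliated
+  (0 when there are no candidates)
+- trackerish_ratio: number – fraction of candidates that look trackerish
+  (0 when there are no candidates)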
+
+Configuration (rspamd.conf):
+- Use the `link_affiliation { ... }` section.
+- Options:
+  - stopwords: map (set/regexp/glob) used to strip common tracking tokens from
+               domains when computing token overlap
+  - whitelist / blacklist: optional maps (set) to tweak affiliation
+  - min_similarity: number (default 0.5) – Jaccard threshold for affiliation
+  - max_candidates: number (default 24) – extra Lua-side cap (C caps as well)
+
+This module keeps all heavy config logic in Lua using lua_maps and only relies
+on C to provide a bounded set of safe, pre-filtered candidates.
+]]
+local M = {}
+
+local lua_util = require "lua_util"
+local lua_maps = require "lua_maps"
+
+-- Module defaults; load_settings() overlays the `link_affiliation` options on
+-- top of these values.
+local settings = {
+  -- minimum Jaccard score for a link domain to count as affiliated
+  min_similarity = 0.5,
+  -- Lua-side cap on the number of candidates examined per call
+  max_candidates = 24,
+  -- map definition (or ready map); tokens found in it are dropped by etld1_tokens()
+  stopwords = nil,
+  -- map definition (or ready map); converted to a 'set' map by load_settings()
+  whitelist = nil,
+  -- map definition (or ready map); converted to a 'set' map by load_settings()
+  blacklist = nil,
+}
+
+--[[
+Overlays the `link_affiliation` options (read from the global `rspamd_config`
+when it exists) on the defaults held in `settings`, then turns every configured
+map definition into a 'set' map through lua_maps.
+]]
+local function load_settings()
+  local cfg = rawget(_G, 'rspamd_config')
+  local opts = {}
+  if cfg then
+    opts = cfg:get_all_opt('link_affiliation') or opts
+  end
+  settings = lua_util.override_defaults(settings, opts)
+
+  -- Maps are registered in this fixed order
+  for _, name in ipairs({ 'stopwords', 'whitelist', 'blacklist' }) do
+    local def = settings[name]
+    -- A table that already exposes `get_key` is taken as a ready-to-use map
+    local is_ready = type(def) == 'table' and def.get_key
+    if def and not is_ready then
+      settings[name] = lua_maps.map_add_from_ucl(def, 'set', 'link affiliation ' .. name)
+    end
+  end
+end
+
+load_settings()
+
+-- Splits a domain into a set of lower-case alphanumeric tokens (token -> true).
+-- A nil `dom` is treated as an empty string. Tokens present in the configured
+-- stopwords map, if there is one, are left out.
+local function etld1_tokens(dom)
+  local stop = settings.stopwords
+  local lowered = string.lower(dom or '')
+  local set = {}
+  for tok in string.gmatch(lowered, "[a-z0-9]+") do
+    if not stop or not stop:get_key(tok) then
+      set[tok] = true
+    end
+  end
+  return set
+end
+
+-- Jaccard similarity of two sets stored as key -> truthy tables:
+-- size of the intersection divided by size of the union, or 0 when both
+-- sets are empty.
+local function jaccard(a, b)
+  local shared, total = 0, 0
+  -- every key of `a` belongs to the union; keys also in `b` are shared
+  for key in pairs(a) do
+    total = total + 1
+    if b[key] then
+      shared = shared + 1
+    end
+  end
+  -- keys found only in `b` complete the union
+  for key in pairs(b) do
+    if not a[key] then
+      total = total + 1
+    end
+  end
+  if total > 0 then
+    return shared / total
+  end
+  return 0
+end
+
+-- Returns the domain used for affiliation checks of candidate `c`: the eTLD+1
+-- supplied by the caller when present (used as is), otherwise an approximation
+-- built from the last two labels of the host.
+local function candidate_domain(c)
+  if c.etld1 then
+    return tostring(c.etld1)
+  end
+  local host = tostring(c.host or '')
+  local p1, p2 = string.match(host, "([^.]+)%.([^.]+)$")
+  if p1 and p2 then
+    return p1 .. "." .. p2
+  end
+  return host
+end
+
+-- Decides whether `domain` (with token set `toks`) is affiliated with the
+-- first party. The optional blacklist/whitelist maps override the similarity
+-- test, the blacklist taking precedence.
+local function is_affiliated(domain, toks, fp_tokens)
+  local key = string.lower(domain)
+  if settings.blacklist and settings.blacklist:get_key(key) then
+    return false
+  end
+  if settings.whitelist and settings.whitelist:get_key(key) then
+    return true
+  end
+  return jaccard(fp_tokens, toks) >= settings.min_similarity
+end
+
+-- CTA ordering: candidates with a display mismatch come first, then lower
+-- `order`, then lower `part_order`. Missing ordering hints sort last instead
+-- of raising a comparison error.
+local function cta_precedes(a, b)
+  local am = a.display_mismatch and true or false
+  local bm = b.display_mismatch and true or false
+  if am ~= bm then
+    return am
+  end
+  local ao, bo = a.order or math.huge, b.order or math.huge
+  if ao ~= bo then
+    return ao < bo
+  end
+  return (a.part_order or math.huge) < (b.part_order or math.huge)
+end
+
+--[[
+Computes affiliation metrics for the link candidates of one HTML part.
+
+Parameters:
+- task: task object; only `task:get_from('mime')` is used, to derive the
+  first-party domain from the first From: address
+- part: text part (currently unused)
+- ctx: table with a `candidates` array (see the module header); may be nil
+
+Returns a table with `affiliated_ratio` and `trackerish_ratio` (both 0 when
+there are no candidates) and, when at least one candidate exists,
+`cta_affiliated` and `cta_weight` for the selected CTA link.
+
+`ctx.candidates` is never modified.
+]]
+M.process_html_links = function(task, part, ctx)
+  local first_party
+  local from = task:get_from('mime')
+  if from and from[1] and from[1].domain then
+    first_party = from[1].domain
+  end
+  local fp_tokens = etld1_tokens(first_party)
+
+  local all = (ctx and ctx.candidates) or {}
+  -- Only the first `max_candidates` entries are examined
+  local limit = math.min(#all, settings.max_candidates)
+
+  local affiliated, trackerish = 0, 0
+  local cta, cta_is_affiliated
+
+  for i = 1, limit do
+    local c = all[i]
+    local domain = candidate_domain(c)
+    local toks = etld1_tokens(domain)
+    local aff = is_affiliated(domain, toks, fp_tokens)
+
+    if aff then
+      affiliated = affiliated + 1
+    end
+
+    -- very naive trackerish: nothing is left once stopwords are stripped
+    if next(toks) == nil then
+      trackerish = trackerish + 1
+    end
+
+    -- Track the best CTA candidate in the same pass; the first of equally
+    -- ranked candidates wins, which keeps the choice deterministic.
+    if not cta or cta_precedes(c, cta) then
+      cta, cta_is_affiliated = c, aff
+    end
+  end
+
+  local res = {
+    affiliated_ratio = (limit > 0) and (affiliated / limit) or 0,
+    trackerish_ratio = (limit > 0) and (trackerish / limit) or 0,
+  }
+
+  if cta then
+    res.cta_affiliated = cta_is_affiliated
+    res.cta_weight = (cta.display_mismatch and 1.0 or 0.5)
+  end
+
+  return res
+end
+
+return M
index 446c2b4830aac834d536b90e1714232e65636a1a..ecfabca002efc25faba283cb3b3beb3ac3414bf5 100644 (file)
@@ -330,6 +330,21 @@ local function meta_html_features_function(task)
   }
 end
 
+-- Collects the four CTA/affiliation values from the task mempool, each read
+-- with the "double" type; a variable that is not set contributes 0.
+-- The result order matches the `names` list below.
+local function meta_cta_function(task)
+  local pool = task:get_mempool()
+  local names = {
+    "html_cta_affiliated",
+    "html_cta_weight",
+    "html_affiliated_links_ratio",
+    "html_trackerish_ratio",
+  }
+  local values = {}
+  for i, name in ipairs(names) do
+    values[i] = pool:get_variable(name, "double") or 0
+  end
+  return values
+end
+
 local metafunctions = {
   {
     cb = meta_size_function,
@@ -480,6 +495,22 @@ local metafunctions = {
     - reciprocal of total forms
     - ratio of forms posting to unaffiliated domains
     - ratio of forms posting to affiliated domains
+]]
+  },
+  {
+    cb = meta_cta_function,
+    ninputs = 4,
+    names = {
+      'html_cta_affiliated',
+      'html_cta_weight',
+      'html_affiliated_links_ratio',
+      'html_trackerish_ratio',
+    },
+    description = [[CTA and affiliation metrics from lua_cta:
+    - CTA affiliated flag
+    - CTA weight heuristic
+    - affiliated links ratio among candidates
+    - trackerish domains ratio among candidates
 ]]
   },
 }
@@ -605,7 +636,7 @@ end
 
 exports.rspamd_count_metatokens = rspamd_count_metatokens
 exports.count_metatokens = rspamd_count_metatokens
-exports.version = 2 -- MUST be increased on each change of metatokens
+exports.version = 3 -- MUST be increased on each change of metatokens
 
 exports.add_metafunction = function(tbl)
   local ret, err = meta_schema(tbl)
index c5bb0039707bcd9429a267df8ac71917c613d747..b59215270444c321a296bf20a48bc764821b6d93 100644 (file)
@@ -818,6 +818,134 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
                rspamd_mempool_set_variable(task->task_pool, "html_forms_post_affiliated",
                                                                        (void *) &hf->forms_post_affiliated, NULL);
        }
+
+       /*
+        * Optional CTA/affiliation hook: pass a capped list of link candidates to
+        * lua_cta.process_html_links(task, part, ctx) and export the metrics it
+        * returns as task mempool variables.
+        */
+       if (task->cfg && task->cfg->lua_state) {
+               lua_State *L = task->cfg->lua_state;
+               int old_top = lua_gettop(L);
+               if (rspamd_lua_require_function(L, "lua_cta", "process_html_links")) {
+                       /* Stack: function, traceback handler, function copy, args... */
+                       lua_pushcfunction(L, &rspamd_lua_traceback);
+                       int err_idx = lua_gettop(L);
+                       lua_pushvalue(L, -2); /* function */
+
+                       /* Arg1: task */
+                       struct rspamd_task **ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
+                       rspamd_lua_setclass(L, rspamd_task_classname, -1);
+                       *ptask = task;
+                       /* Arg2: text part */
+                       struct rspamd_mime_text_part **ptxt = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *));
+                       rspamd_lua_setclass(L, rspamd_textpart_classname, -1);
+                       *ptxt = text_part;
+                       /* Arg3: ctx table */
+                       lua_createtable(L, 0, 4);
+                       /* The first-party domain is not passed here; Lua derives it from From: */
+
+                       /* Summary counters; left out when no HTML features are available */
+                       if (text_part->html_features) {
+                               lua_pushstring(L, "links_total");
+                               lua_pushinteger(L, (lua_Integer) text_part->html_features->links.total_links);
+                               lua_settable(L, -3);
+                               lua_pushstring(L, "domains_total");
+                               lua_pushinteger(L, (lua_Integer) text_part->html_features->links.domains_total);
+                               lua_settable(L, -3);
+                               lua_pushstring(L, "max_links_single_domain");
+                               lua_pushinteger(L, (lua_Integer) text_part->html_features->links.max_links_single_domain);
+                               lua_settable(L, -3);
+                       }
+
+                       /* candidates array */
+                       lua_pushstring(L, "candidates");
+                       int max_candidates = 24; /* TODO: make configurable */
+                       lua_createtable(L, max_candidates, 0);
+                       int nadded = 0;
+                       if (text_part->mime_part && text_part->mime_part->urls && text_part->mime_part->urls->len > 0) {
+                               unsigned int i;
+                               for (i = 0; i < text_part->mime_part->urls->len && nadded < max_candidates; i++) {
+                                       struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, i);
+                                       if (!u) continue;
+                                       /* filter: only http/https, visible */
+                                       if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
+                                       if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
+                                       /* Build small table */
+                                       lua_createtable(L, 0, 8);
+                                       /* host */
+                                       lua_pushstring(L, "host");
+                                       if (u->hostlen > 0) {
+                                               lua_pushlstring(L, rspamd_url_host_unsafe(u), u->hostlen);
+                                       }
+                                       else {
+                                               lua_pushnil(L);
+                                       }
+                                       lua_settable(L, -3);
+                                       /* flags */
+                                       lua_pushstring(L, "idn");
+                                       lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_IDN));
+                                       lua_settable(L, -3);
+                                       lua_pushstring(L, "numeric");
+                                       lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_NUMERIC));
+                                       lua_settable(L, -3);
+                                       lua_pushstring(L, "has_port");
+                                       lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_HAS_PORT));
+                                       lua_settable(L, -3);
+                                       lua_pushstring(L, "has_query");
+                                       lua_pushboolean(L, !!(u->flags & RSPAMD_URL_FLAG_QUERY));
+                                       lua_settable(L, -3);
+                                       lua_pushstring(L, "display_mismatch");
+                                       lua_pushboolean(L, (u->ext && u->ext->linked_url && u->ext->linked_url != u));
+                                       lua_settable(L, -3);
+                                       /* order */
+                                       lua_pushstring(L, "order");
+                                       lua_pushinteger(L, (lua_Integer) u->order);
+                                       lua_settable(L, -3);
+                                       lua_pushstring(L, "part_order");
+                                       lua_pushinteger(L, (lua_Integer) u->part_order);
+                                       lua_settable(L, -3);
+                                       /* etld1 computed in Lua if needed */
+                                       lua_rawseti(L, -2, ++nadded);
+                               }
+                       }
+                       lua_settable(L, -3); /* ctx.candidates = [...] */
+
+                       if (lua_pcall(L, 3, 1, err_idx) != 0) {
+                               msg_debug_task("lua_cta.process_html_links error: %s", lua_tostring(L, -1));
+                       }
+                       else if (lua_istable(L, -1)) {
+                               /*
+                                * Read the result and expose it as mempool variables.
+                                * The pool variable keeps the pointer it is given, so each
+                                * value gets its own task-pool storage (static storage would
+                                * be shared by all tasks). Every value is stored as a double,
+                                * which is the type lua_meta reads these variables with.
+                                */
+                               static const struct {
+                                       const char *field; /* key in the returned Lua table */
+                                       const char *var;   /* mempool variable name */
+                                       gboolean is_flag;  /* TRUE: Lua boolean, FALSE: Lua number */
+                               } cta_exports[] = {
+                                       {"cta_affiliated", "html_cta_affiliated", TRUE},
+                                       {"cta_weight", "html_cta_weight", FALSE},
+                                       {"affiliated_ratio", "html_affiliated_links_ratio", FALSE},
+                                       {"trackerish_ratio", "html_trackerish_ratio", FALSE},
+                               };
+
+                               for (unsigned int k = 0; k < G_N_ELEMENTS(cta_exports); k++) {
+                                       lua_pushstring(L, cta_exports[k].field);
+                                       lua_gettable(L, -2);
+
+                                       gboolean present = cta_exports[k].is_flag ? lua_isboolean(L, -1) : lua_isnumber(L, -1);
+
+                                       if (present) {
+                                               double *pv = rspamd_mempool_alloc(task->task_pool, sizeof(*pv));
+
+                                               if (cta_exports[k].is_flag) {
+                                                       *pv = lua_toboolean(L, -1) ? 1.0 : 0.0;
+                                               }
+                                               else {
+                                                       *pv = lua_tonumber(L, -1);
+                                               }
+                                               rspamd_mempool_set_variable(task->task_pool, cta_exports[k].var, pv, NULL);
+                                       }
+                                       lua_pop(L, 1);
+                               }
+                       }
+
+                       /* Drops the function, traceback handler and any result/error */
+                       lua_settop(L, old_top);
+               }
+       }
        rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
 
        if (text_part->utf_content.len == 0) {