]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Simplify features transition 5608/head
author Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 11 Sep 2025 11:49:13 +0000 (12:49 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 11 Sep 2025 11:49:13 +0000 (12:49 +0100)
lualib/lua_meta.lua
src/libmime/message.c
src/libserver/html/html.cxx
src/libserver/html/html_features.h
src/lua/lua_html.cxx

index ecfabca002efc25faba283cb3b3beb3ac3414bf5..d969fd156df1494c28c19e455b4305c3931f8493 100644 (file)
@@ -19,6 +19,7 @@ local exports = {}
 local N = "metatokens"
 local ts = require("tableshape").types
 local logger = require "rspamd_logger"
+local lua_mime = require "lua_mime"
 
 -- Metafunctions
 local function meta_size_function(task)
@@ -279,17 +280,28 @@ local function meta_words_function(task)
 end
 
 local function meta_html_features_function(task)
-  local mp = task:get_mempool()
-  local lt = mp:get_variable("html_links_total", "int") or 0
-  local http = mp:get_variable("html_links_http", "int") or 0
-  local ql = mp:get_variable("html_links_query", "int") or 0
-  local same = mp:get_variable("html_links_same_etld1", "int") or 0
-  local dom_total = mp:get_variable("html_links_domains_total", "int") or 0
-  local max_per_dom = mp:get_variable("html_links_max_per_domain", "int") or 0
+  local lt, http, ql, same, dom_total, max_per_dom = 0, 0, 0, 0, 0, 0
+  local ft, fua, fa = 0, 0, 0
+
+  local sel_part = lua_mime.get_displayed_text_part(task)
+  if sel_part and sel_part:is_html() then
+    local html = sel_part:get_html()
+    if html and html.get_features then
+      local f = html:get_features()
+      if f and f.links then
+        lt = f.links.total_links or 0
+        http = f.links.http_links or 0
+        ql = f.links.query_links or 0
+        same = f.links.same_etld1_links or 0
+        dom_total = f.links.domains_total or 0
+        max_per_dom = f.links.max_links_single_domain or 0
+      end
+      ft = f.forms_count or 0
+      fua = f.forms_post_unaffiliated or 0
+      fa = f.forms_post_affiliated or 0
+    end
+  end
 
-  local ft = mp:get_variable("html_forms_total", "int") or 0
-  local fua = mp:get_variable("html_forms_post_unaffiliated", "int") or 0
-  local fa = mp:get_variable("html_forms_post_affiliated", "int") or 0
 
   local nhtml_links = 0
   local http_ratio = 0
@@ -345,6 +357,44 @@ local function meta_cta_function(task)
   }
 end
 
+local function meta_html_visibility_function(task)
+  local sel_part = lua_mime.get_displayed_text_part(task)
+  local hidden_ratio, transparent_ratio = 0, 0
+  local blkh, blkt, off, mref, mrefu = 0, 0, 0, 0, 0
+  -- Every output keeps its 0 default unless the displayed part is HTML and its html object exposes get_features
+  if sel_part and sel_part:is_html() then
+    local html = sel_part:get_html()
+    if html and html.get_features then
+      local f = html:get_features() or {} -- empty-table fallback so the field reads below never index nil
+      local vis = f.text_visible or 0
+      local hid = f.text_hidden or 0
+      local transp = f.text_transparent or 0
+      local total = vis + hid -- NOTE(review): transp is not part of this denominator, so transparent_ratio is relative to visible + hidden only and stays 0 when total is 0 - confirm intended
+      if total > 0 then -- both ratios are computed only for a non-zero denominator
+        hidden_ratio = hid / total
+        transparent_ratio = transp / total
+      end
+      blkh = f.blocks_hidden or 0
+      blkt = f.blocks_transparent or 0
+      off = f.offscreen_blocks or 0
+      mref = f.meta_refresh or 0
+      mrefu = f.meta_refresh_urls or 0
+    end
+  end
+
+  -- no mempool fallback; individual mempool exports were removed
+  -- Positional result: the order follows the `names` list of the metafunctions entry that uses this callback
+  return {
+    hidden_ratio,
+    transparent_ratio,
+    blkh,
+    blkt,
+    off,
+    mref,
+    mrefu,
+  }
+end
+
 local metafunctions = {
   {
     cb = meta_size_function,
@@ -511,6 +561,28 @@ local metafunctions = {
     - CTA weight heuristic
     - affiliated links ratio among candidates
     - trackerish domains ratio among candidates
+]]
+  },
+  {
+    cb = meta_html_visibility_function,
+    ninputs = 7,
+    names = {
+      'html_hidden_text_ratio',
+      'html_transparent_text_ratio',
+      'html_hidden_blocks',
+      'html_transparent_blocks',
+      'html_offscreen_blocks',
+      'html_meta_refresh',
+      'html_meta_refresh_urls',
+    },
+    description = [[HTML hidden/offscreen/obfuscation features and meta refresh counters:
+    - ratio of hidden text to total text
+    - ratio of transparent text to total text
+    - number of hidden text blocks appended
+    - number of transparent blocks
+    - number of offscreen-styled blocks
+    - number of meta refresh tags
+    - number of meta refresh URLs extracted
 ]]
   },
 }
index 4721a65c59a3cbf210328a57fe97004793580730..bff822a40fef728bfb3ad575b7faa598137c8ab3 100644 (file)
@@ -794,30 +794,6 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
 
        /* Wire aggregated HTML features */
        text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html);
-       /* Expose a few mempool variables for Lua meta to start experimenting */
-       if (text_part->html_features) {
-               const struct rspamd_html_features *hf = text_part->html_features;
-               rspamd_mempool_set_variable(task->task_pool, "html_links_total",
-                                                                       (void *) &hf->links.total_links, NULL);
-               rspamd_mempool_set_variable(task->task_pool, "html_links_http",
-                                                                       (void *) &hf->links.http_links, NULL);
-               rspamd_mempool_set_variable(task->task_pool, "html_links_query",
-                                                                       (void *) &hf->links.query_links, NULL);
-               rspamd_mempool_set_variable(task->task_pool, "html_links_same_etld1",
-                                                                       (void *) &hf->links.same_etld1_links, NULL);
-               rspamd_mempool_set_variable(task->task_pool, "html_links_domains_total",
-                                                                       (void *) &hf->links.domains_total, NULL);
-               rspamd_mempool_set_variable(task->task_pool, "html_links_max_per_domain",
-                                                                       (void *) &hf->links.max_links_single_domain, NULL);
-               rspamd_mempool_set_variable(task->task_pool, "html_images_total",
-                                                                       (void *) &hf->images_total, NULL);
-               rspamd_mempool_set_variable(task->task_pool, "html_forms_total",
-                                                                       (void *) &hf->forms_count, NULL);
-               rspamd_mempool_set_variable(task->task_pool, "html_forms_post_unaffiliated",
-                                                                       (void *) &hf->forms_post_unaffiliated, NULL);
-               rspamd_mempool_set_variable(task->task_pool, "html_forms_post_affiliated",
-                                                                       (void *) &hf->forms_post_affiliated, NULL);
-       }
 
        /* Optionally call CTA/affiliation Lua hook with capped candidates */
        if (task->cfg && task->cfg->lua_state) {
index 8ef1520db293eb1cd4db8be80f926979e7523b13..c0a2b90e99cf2fa7f631cb9c199c30f4813b7be2 100644 (file)
@@ -1677,6 +1677,48 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
        if (maybe_bgcolor) {
                tag->block->set_bgcolor(maybe_bgcolor->to_color().value());
        }
+
+       /* Offscreen heuristic: negative text-indent or large negative left/top */
+       if (auto style = tag->find_style()) {
+               auto sv = *style;
+               /* text-indent */
+               auto p_ti = rspamd_substring_search_caseless(sv.data(), sv.size(), "text-indent", sizeof("text-indent") - 1);
+               if (p_ti != -1) {
+                       /* look ahead for '-' before a digit */
+                       for (std::size_t i = p_ti; i < sv.size(); i++) {
+                               char c = sv[i];
+                               if (c == '-') {
+                                       /* consider offscreen */
+                                       hc->features.offscreen_blocks++;
+                                       break;
+                               }
+                               if (g_ascii_isdigit(c)) break;
+                       }
+               }
+               /* left/top negative absolute */
+               auto p_left = rspamd_substring_search_caseless(sv.data(), sv.size(), "left", sizeof("left") - 1);
+               if (p_left != -1) {
+                       for (std::size_t i = p_left; i < sv.size(); i++) {
+                               char c = sv[i];
+                               if (c == '-') {
+                                       hc->features.offscreen_blocks++;
+                                       break;
+                               }
+                               if (g_ascii_isdigit(c)) break;
+                       }
+               }
+               auto p_top = rspamd_substring_search_caseless(sv.data(), sv.size(), "top", sizeof("top") - 1);
+               if (p_top != -1) {
+                       for (std::size_t i = p_top; i < sv.size(); i++) {
+                               char c = sv[i];
+                               if (c == '-') {
+                                       hc->features.offscreen_blocks++;
+                                       break;
+                               }
+                               if (g_ascii_isdigit(c)) break;
+                       }
+               }
+       }
 }
 
 static inline auto
@@ -1740,6 +1782,21 @@ html_append_parsed(struct html_content *hc,
                                return !g_ascii_isspace(c);
                        },
                        ' ');
+               /* Accumulate transparent text bytes */
+               hc->features.text_transparent += (unsigned int) nlen;
+               hc->features.blocks_transparent++;
+       }
+       else {
+               /* Visible or hidden text accounted outside; keep helper here for visible path */
+               if (&dest == &hc->parsed) {
+                       /* Visible text */
+                       hc->features.text_visible += (unsigned int) nlen;
+               }
+               else if (&dest == &hc->invisible) {
+                       /* Hidden text */
+                       hc->features.text_hidden += (unsigned int) nlen;
+                       hc->features.blocks_hidden++;
+               }
        }
 
        return nlen;
@@ -2143,6 +2200,37 @@ auto html_process_input(struct rspamd_task *task,
                                }
                        }
                        break;
+               case Tag_META: {
+                       /* Detect meta refresh */
+                       auto http_equiv = cur_tag->find_component<html_component_http_equiv>();
+                       if (http_equiv) {
+                               auto hv = http_equiv.value()->value;
+                               if (hv.size() >= sizeof("refresh") - 1 &&
+                                       g_ascii_strncasecmp(hv.data(), "refresh", hv.size()) == 0) {
+                                       hc->features.meta_refresh++;
+                                       /* Try to extract URL from content */
+                                       if (auto content = cur_tag->find_component<html_component_content>()) {
+                                               auto cv = content.value()->value;
+                                               /* naive parse: look for 'url=' and capture token */
+                                               auto p = rspamd_substring_search_caseless(cv.data(), cv.size(), "url=", sizeof("url=") - 1);
+                                               if (p != -1) {
+                                                       std::string_view urlv{cv.data() + p + (sizeof("url=") - 1), cv.size() - (p + (sizeof("url=") - 1))};
+                                                       /* Trim quotes/spaces and trailing separators */
+                                                       while (!urlv.empty() && (g_ascii_isspace(urlv.front()) || urlv.front() == '\'' || urlv.front() == '"')) urlv.remove_prefix(1);
+                                                       while (!urlv.empty() && (urlv.back() == ';' || urlv.back() == '\'' || urlv.back() == '"' || g_ascii_isspace(urlv.back()))) urlv.remove_suffix(1);
+                                                       if (!urlv.empty()) {
+                                                               /* validate and count; do not add to urls set */
+                                                               auto maybe_url = html_process_url(pool, urlv);
+                                                               if (maybe_url) {
+                                                                       hc->features.meta_refresh_urls++;
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+                       break;
+               }
                case Tag_INPUT: {
                        if (auto type_comp = cur_tag->find_component<html_component_type>()) {
                                auto tv = type_comp.value()->get_string_value();
@@ -3030,6 +3118,22 @@ auto html_process_input(struct rspamd_task *task,
        /* Mirror parser flags into features */
        hc->features.flags = (unsigned int) hc->flags;
 
+       /* Clamp visibility counters to input length */
+       {
+               unsigned int total_decoded = hc->features.text_visible + hc->features.text_hidden;
+               if (total_decoded > (unsigned int) (end - start)) {
+                       /* Best-effort clamp */
+                       unsigned int excess = total_decoded - (unsigned int) (end - start);
+                       if (hc->features.text_hidden >= excess) {
+                               hc->features.text_hidden -= excess;
+                       }
+                       else {
+                               hc->features.text_visible -= (excess - hc->features.text_hidden);
+                               hc->features.text_hidden = 0;
+                       }
+               }
+       }
+
        return hc;
 }
 
index 5b02c36a1937575cb4e3bbb7628acd37ffd07fad..9caf4b34ac6e867965bfc9f55410a8ad1ea3540f 100644 (file)
@@ -78,6 +78,18 @@ struct rspamd_html_features {
        unsigned int tags_count;
        unsigned int max_dom_depth;
 
+       /* Visibility/text stats */
+       unsigned int text_visible;       /* bytes of visible decoded text */
+       unsigned int text_hidden;        /* bytes of hidden/offscreen decoded text */
+       unsigned int text_transparent;   /* bytes of text rendered transparent */
+       unsigned int blocks_hidden;      /* segments appended to hidden text */
+       unsigned int blocks_transparent; /* segments that were masked as transparent */
+       unsigned int offscreen_blocks;   /* blocks with offscreen style tricks */
+
+       /* Meta/obfuscation */
+       unsigned int meta_refresh;      /* count of <meta http-equiv=refresh> */
+       unsigned int meta_refresh_urls; /* count of URLs extracted from meta refresh */
+
        /* Parser/quality flags mirror (bitset, reserved) */
        unsigned int flags;
 };
index 9b0deed45c1b85aba115da4fd0e5ebdfe79122c8..f9a2ac5a4978363dcfeb792f3c7efbf37a1d329e 100644 (file)
@@ -110,6 +110,12 @@ LUA_FUNCTION_DEF(html, foreach_tag);
  * @return
  */
 LUA_FUNCTION_DEF(html, get_invisible);
+/***
+ * @method html:get_features()
+ * Returns aggregated HTML features (link, form, image, DOM, visibility and meta refresh counters) as a Lua table
+ * @return {table} features table with a nested `links` subtable, or nil when the html object cannot be obtained
+ */
+LUA_FUNCTION_DEF(html, get_features);
 
 static const struct luaL_reg htmllib_m[] = {
        LUA_INTERFACE_DEF(html, has_tag),
@@ -117,6 +123,7 @@ static const struct luaL_reg htmllib_m[] = {
        LUA_INTERFACE_DEF(html, get_images),
        LUA_INTERFACE_DEF(html, foreach_tag),
        LUA_INTERFACE_DEF(html, get_invisible),
+       LUA_INTERFACE_DEF(html, get_features),
        {"__tostring", rspamd_lua_class_tostring},
        {NULL, NULL}};
 
@@ -550,6 +557,132 @@ lua_html_get_invisible(lua_State *L)
        return 1;
 }
 
+static int
+lua_html_get_features(lua_State *L)
+{
+       LUA_TRACE_POINT;
+       auto *hc = lua_check_html(L, 1);
+
+       if (hc == NULL) { /* no html object obtained from argument 1: the single result is nil */
+               lua_pushnil(L);
+               return 1;
+       }
+
+       const struct rspamd_html_features *hf = &hc->features;
+
+       /* Top-level result table; 16 is only a preallocation hint (21 keys are set below) */
+       lua_createtable(L, 0, 16);
+
+       /* version */
+       lua_pushstring(L, "version");
+       lua_pushinteger(L, hf->version);
+       lua_settable(L, -3);
+
+       /* links subtable: the key is pushed first, then the table, so the lua_settable after the macro block can attach it */
+       lua_pushstring(L, "links");
+       lua_createtable(L, 0, 20);
+       /* PUSH_FIELD(x): stores hf->links.x under the key "x" in the table currently on top of the stack */
+#define PUSH_FIELD(name)                \
+       lua_pushstring(L, #name);           \
+       lua_pushinteger(L, hf->links.name); \
+       lua_settable(L, -3)
+
+       PUSH_FIELD(total_links);
+       PUSH_FIELD(affiliated_links);
+       PUSH_FIELD(unaffiliated_links);
+       PUSH_FIELD(confusable_like_from_links);
+       PUSH_FIELD(punycode_links);
+       PUSH_FIELD(ip_links);
+       PUSH_FIELD(port_links);
+       PUSH_FIELD(long_query_links);
+       PUSH_FIELD(trackerish_links);
+       PUSH_FIELD(display_mismatch_links);
+       PUSH_FIELD(js_scheme_links);
+       PUSH_FIELD(data_scheme_links);
+       PUSH_FIELD(mailto_links);
+       PUSH_FIELD(http_links);
+       PUSH_FIELD(query_links);
+       PUSH_FIELD(same_etld1_links);
+       PUSH_FIELD(domains_total);
+       PUSH_FIELD(max_links_single_domain);
+
+#undef PUSH_FIELD
+
+       /* Stack is [features, "links", links table]: store the subtable as features.links */
+       lua_settable(L, -3);
+
+       /* forms */
+       lua_pushstring(L, "forms_count");
+       lua_pushinteger(L, hf->forms_count);
+       lua_settable(L, -3);
+       lua_pushstring(L, "forms_post_unaffiliated");
+       lua_pushinteger(L, hf->forms_post_unaffiliated);
+       lua_settable(L, -3);
+       lua_pushstring(L, "forms_post_affiliated");
+       lua_pushinteger(L, hf->forms_post_affiliated);
+       lua_settable(L, -3);
+       lua_pushstring(L, "has_password_input");
+       lua_pushinteger(L, hf->has_password_input); /* exported as an integer, not as a Lua boolean */
+       lua_settable(L, -3);
+
+       /* images */
+       lua_pushstring(L, "images_total");
+       lua_pushinteger(L, hf->images_total);
+       lua_settable(L, -3);
+       lua_pushstring(L, "images_external");
+       lua_pushinteger(L, hf->images_external);
+       lua_settable(L, -3);
+       lua_pushstring(L, "images_data");
+       lua_pushinteger(L, hf->images_data);
+       lua_settable(L, -3);
+       lua_pushstring(L, "images_tiny_external");
+       lua_pushinteger(L, hf->images_tiny_external);
+       lua_settable(L, -3);
+
+       /* dom */
+       lua_pushstring(L, "tags_count");
+       lua_pushinteger(L, hf->tags_count);
+       lua_settable(L, -3);
+       lua_pushstring(L, "max_dom_depth");
+       lua_pushinteger(L, hf->max_dom_depth);
+       lua_settable(L, -3);
+
+       /* visibility/text */
+       lua_pushstring(L, "text_visible");
+       lua_pushinteger(L, hf->text_visible);
+       lua_settable(L, -3);
+       lua_pushstring(L, "text_hidden");
+       lua_pushinteger(L, hf->text_hidden);
+       lua_settable(L, -3);
+       lua_pushstring(L, "text_transparent");
+       lua_pushinteger(L, hf->text_transparent);
+       lua_settable(L, -3);
+       lua_pushstring(L, "blocks_hidden");
+       lua_pushinteger(L, hf->blocks_hidden);
+       lua_settable(L, -3);
+       lua_pushstring(L, "blocks_transparent");
+       lua_pushinteger(L, hf->blocks_transparent);
+       lua_settable(L, -3);
+       lua_pushstring(L, "offscreen_blocks");
+       lua_pushinteger(L, hf->offscreen_blocks);
+       lua_settable(L, -3);
+
+       /* meta/obfuscation */
+       lua_pushstring(L, "meta_refresh");
+       lua_pushinteger(L, hf->meta_refresh);
+       lua_settable(L, -3);
+       lua_pushstring(L, "meta_refresh_urls");
+       lua_pushinteger(L, hf->meta_refresh_urls);
+       lua_settable(L, -3);
+
+       /* flags */
+       lua_pushstring(L, "flags");
+       lua_pushinteger(L, hf->flags);
+       lua_settable(L, -3);
+
+       return 1; /* one result: the features table left on top of the stack */
+}
+
 static int
 lua_html_tag_get_type(lua_State *L)
 {