From: Vsevolod Stakhov Date: Thu, 11 Sep 2025 11:49:13 +0000 (+0100) Subject: [Minor] Simplify features transition X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=refs%2Fpull%2F5608%2Fhead;p=thirdparty%2Frspamd.git [Minor] Simplify features transition --- diff --git a/lualib/lua_meta.lua b/lualib/lua_meta.lua index ecfabca002..d969fd156d 100644 --- a/lualib/lua_meta.lua +++ b/lualib/lua_meta.lua @@ -19,6 +19,7 @@ local exports = {} local N = "metatokens" local ts = require("tableshape").types local logger = require "rspamd_logger" +local lua_mime = require "lua_mime" -- Metafunctions local function meta_size_function(task) @@ -279,17 +280,28 @@ local function meta_words_function(task) end local function meta_html_features_function(task) - local mp = task:get_mempool() - local lt = mp:get_variable("html_links_total", "int") or 0 - local http = mp:get_variable("html_links_http", "int") or 0 - local ql = mp:get_variable("html_links_query", "int") or 0 - local same = mp:get_variable("html_links_same_etld1", "int") or 0 - local dom_total = mp:get_variable("html_links_domains_total", "int") or 0 - local max_per_dom = mp:get_variable("html_links_max_per_domain", "int") or 0 + local lt, http, ql, same, dom_total, max_per_dom = 0, 0, 0, 0, 0, 0 + local ft, fua, fa = 0, 0, 0 + + local sel_part = lua_mime.get_displayed_text_part(task) + if sel_part and sel_part:is_html() then + local html = sel_part:get_html() + if html and html.get_features then + local f = html:get_features() + if f and f.links then + lt = f.links.total_links or 0 + http = f.links.http_links or 0 + ql = f.links.query_links or 0 + same = f.links.same_etld1_links or 0 + dom_total = f.links.domains_total or 0 + max_per_dom = f.links.max_links_single_domain or 0 + end + ft = f.forms_count or 0 + fua = f.forms_post_unaffiliated or 0 + fa = f.forms_post_affiliated or 0 + end + end - local ft = mp:get_variable("html_forms_total", "int") or 0 - local fua = mp:get_variable("html_forms_post_unaffiliated", 
"int") or 0 - local fa = mp:get_variable("html_forms_post_affiliated", "int") or 0 local nhtml_links = 0 local http_ratio = 0 @@ -345,6 +357,44 @@ local function meta_cta_function(task) } end +local function meta_html_visibility_function(task) + local sel_part = lua_mime.get_displayed_text_part(task) + local hidden_ratio, transparent_ratio = 0, 0 + local blkh, blkt, off, mref, mrefu = 0, 0, 0, 0, 0 + + if sel_part and sel_part:is_html() then + local html = sel_part:get_html() + if html and html.get_features then + local f = html:get_features() or {} + local vis = f.text_visible or 0 + local hid = f.text_hidden or 0 + local transp = f.text_transparent or 0 + local total = vis + hid + if total > 0 then + hidden_ratio = hid / total + transparent_ratio = transp / total + end + blkh = f.blocks_hidden or 0 + blkt = f.blocks_transparent or 0 + off = f.offscreen_blocks or 0 + mref = f.meta_refresh or 0 + mrefu = f.meta_refresh_urls or 0 + end + end + + -- no mempool fallback; individual mempool exports were removed + + return { + hidden_ratio, + transparent_ratio, + blkh, + blkt, + off, + mref, + mrefu, + } +end + local metafunctions = { { cb = meta_size_function, @@ -511,6 +561,28 @@ local metafunctions = { - CTA weight heuristic - affiliated links ratio among candidates - trackerish domains ratio among candidates +]] + }, + { + cb = meta_html_visibility_function, + ninputs = 7, + names = { + 'html_hidden_text_ratio', + 'html_transparent_text_ratio', + 'html_hidden_blocks', + 'html_transparent_blocks', + 'html_offscreen_blocks', + 'html_meta_refresh', + 'html_meta_refresh_urls', + }, + description = [[HTML hidden/offscreen/obfuscation features and meta refresh counters: + - ratio of hidden text to total text + - ratio of transparent text to total text + - number of hidden text blocks appended + - number of transparent blocks + - number of offscreen-styled blocks + - number of meta refresh tags + - number of meta refresh URLs extracted ]] }, } diff --git 
a/src/libmime/message.c b/src/libmime/message.c index 4721a65c59..bff822a40f 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -794,30 +794,6 @@ rspamd_message_process_html_text_part(struct rspamd_task *task, /* Wire aggregated HTML features */ text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html); - /* Expose a few mempool variables for Lua meta to start experimenting */ - if (text_part->html_features) { - const struct rspamd_html_features *hf = text_part->html_features; - rspamd_mempool_set_variable(task->task_pool, "html_links_total", - (void *) &hf->links.total_links, NULL); - rspamd_mempool_set_variable(task->task_pool, "html_links_http", - (void *) &hf->links.http_links, NULL); - rspamd_mempool_set_variable(task->task_pool, "html_links_query", - (void *) &hf->links.query_links, NULL); - rspamd_mempool_set_variable(task->task_pool, "html_links_same_etld1", - (void *) &hf->links.same_etld1_links, NULL); - rspamd_mempool_set_variable(task->task_pool, "html_links_domains_total", - (void *) &hf->links.domains_total, NULL); - rspamd_mempool_set_variable(task->task_pool, "html_links_max_per_domain", - (void *) &hf->links.max_links_single_domain, NULL); - rspamd_mempool_set_variable(task->task_pool, "html_images_total", - (void *) &hf->images_total, NULL); - rspamd_mempool_set_variable(task->task_pool, "html_forms_total", - (void *) &hf->forms_count, NULL); - rspamd_mempool_set_variable(task->task_pool, "html_forms_post_unaffiliated", - (void *) &hf->forms_post_unaffiliated, NULL); - rspamd_mempool_set_variable(task->task_pool, "html_forms_post_affiliated", - (void *) &hf->forms_post_affiliated, NULL); - } /* Optionally call CTA/affiliation Lua hook with capped candidates */ if (task->cfg && task->cfg->lua_state) { diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 8ef1520db2..c0a2b90e99 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1677,6 
+1677,48 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, if (maybe_bgcolor) { tag->block->set_bgcolor(maybe_bgcolor->to_color().value()); } + + /* Offscreen heuristic: negative text-indent or large negative left/top */ + if (auto style = tag->find_style()) { + auto sv = *style; + /* text-indent */ + auto p_ti = rspamd_substring_search_caseless(sv.data(), sv.size(), "text-indent", sizeof("text-indent") - 1); + if (p_ti != -1) { + /* look ahead for '-' before a digit */ + for (std::size_t i = p_ti; i < sv.size(); i++) { + char c = sv[i]; + if (c == '-') { + /* consider offscreen */ + hc->features.offscreen_blocks++; + break; + } + if (g_ascii_isdigit(c)) break; + } + } + /* left/top negative absolute */ + auto p_left = rspamd_substring_search_caseless(sv.data(), sv.size(), "left", sizeof("left") - 1); + if (p_left != -1) { + for (std::size_t i = p_left; i < sv.size(); i++) { + char c = sv[i]; + if (c == '-') { + hc->features.offscreen_blocks++; + break; + } + if (g_ascii_isdigit(c)) break; + } + } + auto p_top = rspamd_substring_search_caseless(sv.data(), sv.size(), "top", sizeof("top") - 1); + if (p_top != -1) { + for (std::size_t i = p_top; i < sv.size(); i++) { + char c = sv[i]; + if (c == '-') { + hc->features.offscreen_blocks++; + break; + } + if (g_ascii_isdigit(c)) break; + } + } + } } static inline auto @@ -1740,6 +1782,21 @@ html_append_parsed(struct html_content *hc, return !g_ascii_isspace(c); }, ' '); + /* Accumulate transparent text bytes */ + hc->features.text_transparent += (unsigned int) nlen; + hc->features.blocks_transparent++; + } + else { + /* Visible or hidden text accounted outside; keep helper here for visible path */ + if (&dest == &hc->parsed) { + /* Visible text */ + hc->features.text_visible += (unsigned int) nlen; + } + else if (&dest == &hc->invisible) { + /* Hidden text */ + hc->features.text_hidden += (unsigned int) nlen; + hc->features.blocks_hidden++; + } } return nlen; @@ -2143,6 +2200,37 @@ auto 
html_process_input(struct rspamd_task *task, } } break; + case Tag_META: { + /* Detect meta refresh */ + auto http_equiv = cur_tag->find_component(); + if (http_equiv) { + auto hv = http_equiv.value()->value; + if (hv.size() >= sizeof("refresh") - 1 && + g_ascii_strncasecmp(hv.data(), "refresh", hv.size()) == 0) { + hc->features.meta_refresh++; + /* Try to extract URL from content */ + if (auto content = cur_tag->find_component()) { + auto cv = content.value()->value; + /* naive parse: look for 'url=' and capture token */ + auto p = rspamd_substring_search_caseless(cv.data(), cv.size(), "url=", sizeof("url=") - 1); + if (p != -1) { + std::string_view urlv{cv.data() + p + (sizeof("url=") - 1), cv.size() - (p + (sizeof("url=") - 1))}; + /* Trim quotes/spaces and trailing separators */ + while (!urlv.empty() && (g_ascii_isspace(urlv.front()) || urlv.front() == '\'' || urlv.front() == '"')) urlv.remove_prefix(1); + while (!urlv.empty() && (urlv.back() == ';' || urlv.back() == '\'' || urlv.back() == '"' || g_ascii_isspace(urlv.back()))) urlv.remove_suffix(1); + if (!urlv.empty()) { + /* validate and count; do not add to urls set */ + auto maybe_url = html_process_url(pool, urlv); + if (maybe_url) { + hc->features.meta_refresh_urls++; + } + } + } + } + } + } + break; + } case Tag_INPUT: { if (auto type_comp = cur_tag->find_component()) { auto tv = type_comp.value()->get_string_value(); @@ -3030,6 +3118,22 @@ auto html_process_input(struct rspamd_task *task, /* Mirror parser flags into features */ hc->features.flags = (unsigned int) hc->flags; + /* Clamp visibility counters to input length */ + { + unsigned int total_decoded = hc->features.text_visible + hc->features.text_hidden; + if (total_decoded > (unsigned int) (end - start)) { + /* Best-effort clamp */ + unsigned int excess = total_decoded - (unsigned int) (end - start); + if (hc->features.text_hidden >= excess) { + hc->features.text_hidden -= excess; + } + else { + hc->features.text_visible -= (excess - 
hc->features.text_hidden); + hc->features.text_hidden = 0; + } + } + } + return hc; } diff --git a/src/libserver/html/html_features.h b/src/libserver/html/html_features.h index 5b02c36a19..9caf4b34ac 100644 --- a/src/libserver/html/html_features.h +++ b/src/libserver/html/html_features.h @@ -78,6 +78,18 @@ struct rspamd_html_features { unsigned int tags_count; unsigned int max_dom_depth; + /* Visibility/text stats */ + unsigned int text_visible; /* bytes of visible decoded text */ + unsigned int text_hidden; /* bytes of hidden/offscreen decoded text */ + unsigned int text_transparent; /* bytes of text rendered transparent */ + unsigned int blocks_hidden; /* segments appended to hidden text */ + unsigned int blocks_transparent; /* segments that were masked as transparent */ + unsigned int offscreen_blocks; /* blocks with offscreen style tricks */ + + /* Meta/obfuscation */ + unsigned int meta_refresh; /* count of <meta http-equiv="refresh"> tags */ + unsigned int meta_refresh_urls; /* count of URLs extracted from meta refresh */ + /* Parser/quality flags mirror (bitset, reserved) */ unsigned int flags; }; diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx index 9b0deed45c..f9a2ac5a49 100644 --- a/src/lua/lua_html.cxx +++ b/src/lua/lua_html.cxx @@ -110,6 +110,12 @@ LUA_FUNCTION_DEF(html, foreach_tag); * @return */ LUA_FUNCTION_DEF(html, get_invisible); +/*** + * @method html:get_features() + * Returns aggregated HTML features as a Lua table + * @return {table} features table + */ +LUA_FUNCTION_DEF(html, get_features); static const struct luaL_reg htmllib_m[] = { LUA_INTERFACE_DEF(html, has_tag), @@ -117,6 +123,7 @@ static const struct luaL_reg htmllib_m[] = { LUA_INTERFACE_DEF(html, get_images), LUA_INTERFACE_DEF(html, foreach_tag), LUA_INTERFACE_DEF(html, get_invisible), + LUA_INTERFACE_DEF(html, get_features), {"__tostring", rspamd_lua_class_tostring}, {NULL, NULL}}; @@ -550,6 +557,132 @@ lua_html_get_invisible(lua_State *L) return 1; } +static int +lua_html_get_features(lua_State *L) +{ +
LUA_TRACE_POINT; + auto *hc = lua_check_html(L, 1); + + if (hc == NULL) { + lua_pushnil(L); + return 1; + } + + const struct rspamd_html_features *hf = &hc->features; + + /* Top-level table */ + lua_createtable(L, 0, 16); + + /* version */ + lua_pushstring(L, "version"); + lua_pushinteger(L, hf->version); + lua_settable(L, -3); + + /* links subtable */ + lua_pushstring(L, "links"); + lua_createtable(L, 0, 20); + +#define PUSH_FIELD(name) \ + lua_pushstring(L, #name); \ + lua_pushinteger(L, hf->links.name); \ + lua_settable(L, -3) + + PUSH_FIELD(total_links); + PUSH_FIELD(affiliated_links); + PUSH_FIELD(unaffiliated_links); + PUSH_FIELD(confusable_like_from_links); + PUSH_FIELD(punycode_links); + PUSH_FIELD(ip_links); + PUSH_FIELD(port_links); + PUSH_FIELD(long_query_links); + PUSH_FIELD(trackerish_links); + PUSH_FIELD(display_mismatch_links); + PUSH_FIELD(js_scheme_links); + PUSH_FIELD(data_scheme_links); + PUSH_FIELD(mailto_links); + PUSH_FIELD(http_links); + PUSH_FIELD(query_links); + PUSH_FIELD(same_etld1_links); + PUSH_FIELD(domains_total); + PUSH_FIELD(max_links_single_domain); + +#undef PUSH_FIELD + + /* set links */ + lua_settable(L, -3); + + /* forms */ + lua_pushstring(L, "forms_count"); + lua_pushinteger(L, hf->forms_count); + lua_settable(L, -3); + lua_pushstring(L, "forms_post_unaffiliated"); + lua_pushinteger(L, hf->forms_post_unaffiliated); + lua_settable(L, -3); + lua_pushstring(L, "forms_post_affiliated"); + lua_pushinteger(L, hf->forms_post_affiliated); + lua_settable(L, -3); + lua_pushstring(L, "has_password_input"); + lua_pushinteger(L, hf->has_password_input); + lua_settable(L, -3); + + /* images */ + lua_pushstring(L, "images_total"); + lua_pushinteger(L, hf->images_total); + lua_settable(L, -3); + lua_pushstring(L, "images_external"); + lua_pushinteger(L, hf->images_external); + lua_settable(L, -3); + lua_pushstring(L, "images_data"); + lua_pushinteger(L, hf->images_data); + lua_settable(L, -3); + lua_pushstring(L, "images_tiny_external"); + 
lua_pushinteger(L, hf->images_tiny_external); + lua_settable(L, -3); + + /* dom */ + lua_pushstring(L, "tags_count"); + lua_pushinteger(L, hf->tags_count); + lua_settable(L, -3); + lua_pushstring(L, "max_dom_depth"); + lua_pushinteger(L, hf->max_dom_depth); + lua_settable(L, -3); + + /* visibility/text */ + lua_pushstring(L, "text_visible"); + lua_pushinteger(L, hf->text_visible); + lua_settable(L, -3); + lua_pushstring(L, "text_hidden"); + lua_pushinteger(L, hf->text_hidden); + lua_settable(L, -3); + lua_pushstring(L, "text_transparent"); + lua_pushinteger(L, hf->text_transparent); + lua_settable(L, -3); + lua_pushstring(L, "blocks_hidden"); + lua_pushinteger(L, hf->blocks_hidden); + lua_settable(L, -3); + lua_pushstring(L, "blocks_transparent"); + lua_pushinteger(L, hf->blocks_transparent); + lua_settable(L, -3); + lua_pushstring(L, "offscreen_blocks"); + lua_pushinteger(L, hf->offscreen_blocks); + lua_settable(L, -3); + + /* meta/obfuscation */ + lua_pushstring(L, "meta_refresh"); + lua_pushinteger(L, hf->meta_refresh); + lua_settable(L, -3); + lua_pushstring(L, "meta_refresh_urls"); + lua_pushinteger(L, hf->meta_refresh_urls); + lua_settable(L, -3); + + /* flags */ + lua_pushstring(L, "flags"); + lua_pushinteger(L, hf->flags); + lua_settable(L, -3); + + return 1; +} + static int lua_html_tag_get_type(lua_State *L) {