local N = "metatokens"
local ts = require("tableshape").types
local logger = require "rspamd_logger"
+local lua_mime = require "lua_mime"
-- Metafunctions
local function meta_size_function(task)
end
local function meta_html_features_function(task)
- local mp = task:get_mempool()
- local lt = mp:get_variable("html_links_total", "int") or 0
- local http = mp:get_variable("html_links_http", "int") or 0
- local ql = mp:get_variable("html_links_query", "int") or 0
- local same = mp:get_variable("html_links_same_etld1", "int") or 0
- local dom_total = mp:get_variable("html_links_domains_total", "int") or 0
- local max_per_dom = mp:get_variable("html_links_max_per_domain", "int") or 0
+  -- Link counters (total, http, with query, same eTLD+1, distinct domains,
+  -- max links per single domain) and form counters; all default to 0 when
+  -- the displayed part is not HTML or exposes no features.
+  local lt, http, ql, same, dom_total, max_per_dom = 0, 0, 0, 0, 0, 0
+  local ft, fua, fa = 0, 0, 0
+
+  local sel_part = lua_mime.get_displayed_text_part(task)
+  if sel_part and sel_part:is_html() then
+    local html = sel_part:get_html()
+    if html and html.get_features then
+      -- get_features() can return nil; fall back to an empty table so the
+      -- form counters below never index a nil value
+      local f = html:get_features() or {}
+      local links = f.links
+      if links then
+        lt = links.total_links or 0
+        http = links.http_links or 0
+        ql = links.query_links or 0
+        same = links.same_etld1_links or 0
+        dom_total = links.domains_total or 0
+        max_per_dom = links.max_links_single_domain or 0
+      end
+      ft = f.forms_count or 0
+      fua = f.forms_post_unaffiliated or 0
+      fa = f.forms_post_affiliated or 0
+    end
+  end
- local ft = mp:get_variable("html_forms_total", "int") or 0
- local fua = mp:get_variable("html_forms_post_unaffiliated", "int") or 0
- local fa = mp:get_variable("html_forms_post_affiliated", "int") or 0
local nhtml_links = 0
local http_ratio = 0
}
end
+--- Collect HTML visibility/obfuscation features of the displayed text part.
+-- Reads the aggregated feature table exposed by `html:get_features()`.
+-- Every value defaults to 0 when the displayed part is not HTML, the html
+-- object has no `get_features` method, or a counter is missing.
+-- @param task rspamd task
+-- @return array of 7 numbers, in this order: hidden text ratio, transparent
+--   text ratio, hidden blocks, transparent blocks, offscreen blocks,
+--   meta refresh tags, meta refresh URLs
+local function meta_html_visibility_function(task)
+  local hidden_ratio, transparent_ratio = 0, 0
+  local blkh, blkt, off, mref, mrefu = 0, 0, 0, 0, 0
+
+  local sel_part = lua_mime.get_displayed_text_part(task)
+  if sel_part and sel_part:is_html() then
+    local html = sel_part:get_html()
+    if html and html.get_features then
+      local f = html:get_features() or {}
+      local vis = f.text_visible or 0
+      local hid = f.text_hidden or 0
+      local transp = f.text_transparent or 0
+      -- Transparent bytes are reported in their own counter, separate from
+      -- the visible and hidden ones, so they must be part of the denominator.
+      -- Otherwise the transparent ratio could exceed 1, or stay 0 for a part
+      -- whose text is entirely transparent.
+      local total = vis + hid + transp
+      if total > 0 then
+        hidden_ratio = hid / total
+        transparent_ratio = transp / total
+      end
+      blkh = f.blocks_hidden or 0
+      blkt = f.blocks_transparent or 0
+      off = f.offscreen_blocks or 0
+      mref = f.meta_refresh or 0
+      mrefu = f.meta_refresh_urls or 0
+    end
+  end
+
+  -- The feature table is the only data source here: there is no fallback to
+  -- per-counter mempool variables.
+
+  return {
+    hidden_ratio,
+    transparent_ratio,
+    blkh,
+    blkt,
+    off,
+    mref,
+    mrefu,
+  }
+end
+
local metafunctions = {
{
cb = meta_size_function,
- CTA weight heuristic
- affiliated links ratio among candidates
- trackerish domains ratio among candidates
+]]
+ },
+ {
+ cb = meta_html_visibility_function,
+ ninputs = 7,
+ names = {
+ 'html_hidden_text_ratio',
+ 'html_transparent_text_ratio',
+ 'html_hidden_blocks',
+ 'html_transparent_blocks',
+ 'html_offscreen_blocks',
+ 'html_meta_refresh',
+ 'html_meta_refresh_urls',
+ },
+ description = [[HTML hidden/offscreen/obfuscation features and meta refresh counters:
+ - ratio of hidden text to total text
+ - ratio of transparent text to total text
+ - number of hidden text blocks appended
+ - number of transparent blocks
+ - number of offscreen-styled blocks
+ - number of meta refresh tags
+ - number of meta refresh URLs extracted
]]
},
}
/* Wire aggregated HTML features */
text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html);
- /* Expose a few mempool variables for Lua meta to start experimenting */
- if (text_part->html_features) {
- const struct rspamd_html_features *hf = text_part->html_features;
- rspamd_mempool_set_variable(task->task_pool, "html_links_total",
- (void *) &hf->links.total_links, NULL);
- rspamd_mempool_set_variable(task->task_pool, "html_links_http",
- (void *) &hf->links.http_links, NULL);
- rspamd_mempool_set_variable(task->task_pool, "html_links_query",
- (void *) &hf->links.query_links, NULL);
- rspamd_mempool_set_variable(task->task_pool, "html_links_same_etld1",
- (void *) &hf->links.same_etld1_links, NULL);
- rspamd_mempool_set_variable(task->task_pool, "html_links_domains_total",
- (void *) &hf->links.domains_total, NULL);
- rspamd_mempool_set_variable(task->task_pool, "html_links_max_per_domain",
- (void *) &hf->links.max_links_single_domain, NULL);
- rspamd_mempool_set_variable(task->task_pool, "html_images_total",
- (void *) &hf->images_total, NULL);
- rspamd_mempool_set_variable(task->task_pool, "html_forms_total",
- (void *) &hf->forms_count, NULL);
- rspamd_mempool_set_variable(task->task_pool, "html_forms_post_unaffiliated",
- (void *) &hf->forms_post_unaffiliated, NULL);
- rspamd_mempool_set_variable(task->task_pool, "html_forms_post_affiliated",
- (void *) &hf->forms_post_affiliated, NULL);
- }
/* Optionally call CTA/affiliation Lua hook with capped candidates */
if (task->cfg && task->cfg->lua_state) {
if (maybe_bgcolor) {
tag->block->set_bgcolor(maybe_bgcolor->to_color().value());
}
+
+	/*
+	 * Offscreen heuristic: the block is counted once when its inline style
+	 * declares `text-indent`, `left` or `top` with a negative value.
+	 * Any negative value qualifies; the magnitude is not checked.
+	 */
+	if (auto style = tag->find_style()) {
+		auto sv = *style;
+
+		/*
+		 * Returns true when `sv` contains a declaration `<prop> : -<number>`.
+		 * `prop` must be a whole property name, so `left` does not match
+		 * `margin-left`, and the '-' must start the value of that very
+		 * declaration (the hyphen inside `text-indent` itself or a '-' in a
+		 * later declaration does not count).
+		 */
+		auto has_negative_decl = [&sv](const char *prop, std::size_t prop_len) -> bool {
+			std::size_t from = 0;
+
+			while (from < sv.size()) {
+				auto rel = rspamd_substring_search_caseless(sv.data() + from, sv.size() - from, prop, prop_len);
+				if (rel == -1) {
+					return false;
+				}
+
+				std::size_t name_start = from + (std::size_t) rel;
+				std::size_t i = name_start + prop_len;
+				/* Any further search resumes after this occurrence */
+				from = i;
+
+				/* Reject suffix matches such as `margin-left` or `stop` */
+				if (name_start > 0) {
+					char prev = sv[name_start - 1];
+					if (g_ascii_isalnum(prev) || prev == '-' || prev == '_') {
+						continue;
+					}
+				}
+
+				/* The name must be followed by ':' (rules out `text-align: left`) */
+				while (i < sv.size() && g_ascii_isspace(sv[i])) {
+					i++;
+				}
+				if (i >= sv.size() || sv[i] != ':') {
+					continue;
+				}
+				i++;
+				while (i < sv.size() && g_ascii_isspace(sv[i])) {
+					i++;
+				}
+
+				/* Negative number: '-' immediately followed by a digit or '.' */
+				if (i + 1 < sv.size() && sv[i] == '-' &&
+					(g_ascii_isdigit(sv[i + 1]) || sv[i + 1] == '.')) {
+					return true;
+				}
+			}
+
+			return false;
+		};
+
+		if (has_negative_decl("text-indent", sizeof("text-indent") - 1) ||
+			has_negative_decl("left", sizeof("left") - 1) ||
+			has_negative_decl("top", sizeof("top") - 1)) {
+			/* One increment per block, however many properties matched */
+			hc->features.offscreen_blocks++;
+		}
+	}
}
static inline auto
return !g_ascii_isspace(c);
},
' ');
+ /* Accumulate transparent text bytes */
+ hc->features.text_transparent += (unsigned int) nlen;
+ hc->features.blocks_transparent++;
+ }
+ else {
+ /* Account decoded bytes by destination buffer: hc->parsed counts as visible text, hc->invisible as hidden text */
+ if (&dest == &hc->parsed) {
+ /* Visible text */
+ hc->features.text_visible += (unsigned int) nlen;
+ }
+ else if (&dest == &hc->invisible) {
+ /* Hidden text */
+ hc->features.text_hidden += (unsigned int) nlen;
+ hc->features.blocks_hidden++;
+ }
}
return nlen;
}
}
break;
+	case Tag_META: {
+		/*
+		 * Meta refresh detection: count <meta http-equiv="refresh"> tags and,
+		 * separately, those whose content carries a `url=` token that
+		 * html_process_url() accepts. The URL is only validated and counted
+		 * here; it is not added to the urls set.
+		 */
+		auto equiv_comp = cur_tag->find_component<html_component_http_equiv>();
+		if (!equiv_comp) {
+			break;
+		}
+
+		auto equiv = equiv_comp.value()->value;
+		if (equiv.size() < sizeof("refresh") - 1 ||
+			g_ascii_strncasecmp(equiv.data(), "refresh", equiv.size()) != 0) {
+			break;
+		}
+
+		hc->features.meta_refresh++;
+
+		auto content_comp = cur_tag->find_component<html_component_content>();
+		if (!content_comp) {
+			break;
+		}
+
+		/* Naive parse: take everything after the first `url=` */
+		auto body = content_comp.value()->value;
+		auto key_pos = rspamd_substring_search_caseless(body.data(), body.size(), "url=", sizeof("url=") - 1);
+		if (key_pos == -1) {
+			break;
+		}
+
+		const auto value_off = key_pos + (sizeof("url=") - 1);
+		std::string_view target{body.data() + value_off, body.size() - value_off};
+
+		auto is_quote = [](char c) {
+			return c == '\'' || c == '"';
+		};
+
+		/* Strip leading spaces/quotes, then trailing ';', quotes and spaces */
+		while (!target.empty() && (g_ascii_isspace(target.front()) || is_quote(target.front()))) {
+			target.remove_prefix(1);
+		}
+		while (!target.empty() &&
+			   (target.back() == ';' || is_quote(target.back()) || g_ascii_isspace(target.back()))) {
+			target.remove_suffix(1);
+		}
+
+		if (target.empty()) {
+			break;
+		}
+
+		if (html_process_url(pool, target)) {
+			hc->features.meta_refresh_urls++;
+		}
+
+		break;
+	}
case Tag_INPUT: {
if (auto type_comp = cur_tag->find_component<html_component_type>()) {
auto tv = type_comp.value()->get_string_value();
/* Mirror parser flags into features */
hc->features.flags = (unsigned int) hc->flags;
+	/*
+	 * Keep text_visible + text_hidden within the raw input length (end - start).
+	 * Any surplus is taken from the hidden counter first; only what remains
+	 * after hidden reaches zero is taken from the visible counter.
+	 */
+	{
+		const unsigned int input_len = (unsigned int) (end - start);
+		const unsigned int counted = hc->features.text_visible + hc->features.text_hidden;
+
+		if (counted > input_len) {
+			unsigned int surplus = counted - input_len;
+
+			if (surplus <= hc->features.text_hidden) {
+				hc->features.text_hidden -= surplus;
+			}
+			else {
+				surplus -= hc->features.text_hidden;
+				hc->features.text_hidden = 0;
+				hc->features.text_visible -= surplus;
+			}
+		}
+	}
+
return hc;
}
unsigned int tags_count;
unsigned int max_dom_depth;
+ /* Visibility/text stats */
+ unsigned int text_visible; /* bytes of visible decoded text */
+ unsigned int text_hidden; /* bytes of hidden/offscreen decoded text */
+ unsigned int text_transparent; /* bytes of text rendered transparent */
+ unsigned int blocks_hidden; /* segments appended to hidden text */
+ unsigned int blocks_transparent; /* segments that were masked as transparent */
+ unsigned int offscreen_blocks; /* blocks with offscreen style tricks */
+
+ /* Meta/obfuscation */
+ unsigned int meta_refresh; /* count of <meta http-equiv=refresh> */
+ unsigned int meta_refresh_urls; /* count of URLs extracted from meta refresh */
+
/* Parser/quality flags mirror (bitset, reserved) */
unsigned int flags;
};
* @return
*/
LUA_FUNCTION_DEF(html, get_invisible);
+/***
+ * @method html:get_features()
+ * Returns aggregated HTML features as a Lua table.
+ * All values are integers. Top-level keys: `version`, `forms_count`,
+ * `forms_post_unaffiliated`, `forms_post_affiliated`, `has_password_input`,
+ * `images_total`, `images_external`, `images_data`, `images_tiny_external`,
+ * `tags_count`, `max_dom_depth`, `text_visible`, `text_hidden`,
+ * `text_transparent`, `blocks_hidden`, `blocks_transparent`,
+ * `offscreen_blocks`, `meta_refresh`, `meta_refresh_urls`, `flags`.
+ * The `links` key holds a nested table of per-link counters (for example
+ * `total_links`, `http_links`, `query_links`, `same_etld1_links`,
+ * `domains_total`, `max_links_single_domain`).
+ * @return {table} features table, or nil when the underlying HTML content is not available
+ */
+LUA_FUNCTION_DEF(html, get_features);
/* Method registration table for html objects; the {NULL, NULL} entry is the terminating sentinel */
static const struct luaL_reg htmllib_m[] = {
LUA_INTERFACE_DEF(html, has_tag),
LUA_INTERFACE_DEF(html, get_images),
LUA_INTERFACE_DEF(html, foreach_tag),
LUA_INTERFACE_DEF(html, get_invisible),
+ LUA_INTERFACE_DEF(html, get_features), /* aggregated feature counters as a Lua table */
{"__tostring", rspamd_lua_class_tostring},
{NULL, NULL}};
return 1;
}
+/*
+ * Lua binding for html:get_features().
+ * Pushes one table holding the aggregated feature counters as integers, with
+ * the per-link counters grouped in the nested `links` table. Pushes nil when
+ * no html content is attached to the argument. Always returns one value.
+ */
+static int
+lua_html_get_features(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	auto *hc = lua_check_html(L, 1);
+
+	if (hc == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	const struct rspamd_html_features *hf = &hc->features;
+
+/* Store hf-><name> as an integer under key "<name>" in the table on top of the stack */
+#define FEATURE(name)                    \
+	do {                                 \
+		lua_pushinteger(L, hf->name);    \
+		lua_setfield(L, -2, #name);      \
+	} while (0)
+
+/* Same as FEATURE, but reads the counter from the hf->links sub-structure */
+#define LINK_FEATURE(name)                  \
+	do {                                    \
+		lua_pushinteger(L, hf->links.name); \
+		lua_setfield(L, -2, #name);         \
+	} while (0)
+
+	/* Result table */
+	lua_createtable(L, 0, 16);
+
+	FEATURE(version);
+
+	/* Nested `links` table: filled while on top of the stack, then attached */
+	lua_createtable(L, 0, 20);
+	LINK_FEATURE(total_links);
+	LINK_FEATURE(affiliated_links);
+	LINK_FEATURE(unaffiliated_links);
+	LINK_FEATURE(confusable_like_from_links);
+	LINK_FEATURE(punycode_links);
+	LINK_FEATURE(ip_links);
+	LINK_FEATURE(port_links);
+	LINK_FEATURE(long_query_links);
+	LINK_FEATURE(trackerish_links);
+	LINK_FEATURE(display_mismatch_links);
+	LINK_FEATURE(js_scheme_links);
+	LINK_FEATURE(data_scheme_links);
+	LINK_FEATURE(mailto_links);
+	LINK_FEATURE(http_links);
+	LINK_FEATURE(query_links);
+	LINK_FEATURE(same_etld1_links);
+	LINK_FEATURE(domains_total);
+	LINK_FEATURE(max_links_single_domain);
+	lua_setfield(L, -2, "links");
+
+	/* Forms */
+	FEATURE(forms_count);
+	FEATURE(forms_post_unaffiliated);
+	FEATURE(forms_post_affiliated);
+	FEATURE(has_password_input);
+
+	/* Images */
+	FEATURE(images_total);
+	FEATURE(images_external);
+	FEATURE(images_data);
+	FEATURE(images_tiny_external);
+
+	/* DOM */
+	FEATURE(tags_count);
+	FEATURE(max_dom_depth);
+
+	/* Visibility/text */
+	FEATURE(text_visible);
+	FEATURE(text_hidden);
+	FEATURE(text_transparent);
+	FEATURE(blocks_hidden);
+	FEATURE(blocks_transparent);
+	FEATURE(offscreen_blocks);
+
+	/* Meta/obfuscation */
+	FEATURE(meta_refresh);
+	FEATURE(meta_refresh_urls);
+
+	/* Parser flags mirror */
+	FEATURE(flags);
+
+#undef LINK_FEATURE
+#undef FEATURE
+
+	return 1;
+}
+
static int
lua_html_tag_get_type(lua_State *L)
{