From: Vsevolod Stakhov Date: Sat, 6 Sep 2025 12:45:02 +0000 (+0100) Subject: [Project] Extract more features from HTML messages X-Git-Tag: 3.13.0~6^2~7 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8487ea2055b12405a2c936bed54da03ed6087cca;p=thirdparty%2Frspamd.git [Project] Extract more features from HTML messages --- diff --git a/lualib/lua_meta.lua b/lualib/lua_meta.lua index de006df8e7..446c2b4830 100644 --- a/lualib/lua_meta.lua +++ b/lualib/lua_meta.lua @@ -278,6 +278,58 @@ local function meta_words_function(task) return ret end +local function meta_html_features_function(task) + local mp = task:get_mempool() + local lt = mp:get_variable("html_links_total", "int") or 0 + local http = mp:get_variable("html_links_http", "int") or 0 + local ql = mp:get_variable("html_links_query", "int") or 0 + local same = mp:get_variable("html_links_same_etld1", "int") or 0 + local dom_total = mp:get_variable("html_links_domains_total", "int") or 0 + local max_per_dom = mp:get_variable("html_links_max_per_domain", "int") or 0 + + local ft = mp:get_variable("html_forms_total", "int") or 0 + local fua = mp:get_variable("html_forms_post_unaffiliated", "int") or 0 + local fa = mp:get_variable("html_forms_post_affiliated", "int") or 0 + + local nhtml_links = 0 + local http_ratio = 0 + local query_ratio = 0 + local same_etld1_ratio = 0 + local domains_per_link_ratio = 0 + local max_links_per_domain_ratio = 0 + + if lt > 0 then + nhtml_links = 1.0 / lt + http_ratio = http / lt + query_ratio = ql / lt + same_etld1_ratio = same / lt + domains_per_link_ratio = dom_total / lt + max_links_per_domain_ratio = max_per_dom / lt + end + + local nhtml_forms = 0 + local forms_unaff_ratio = 0 + local forms_aff_ratio = 0 + + if ft > 0 then + nhtml_forms = 1.0 / ft + forms_unaff_ratio = fua / ft + forms_aff_ratio = fa / ft + end + + return { + nhtml_links, + http_ratio, + query_ratio, + same_etld1_ratio, + domains_per_link_ratio, + max_links_per_domain_ratio, + 
nhtml_forms, + forms_unaff_ratio, + forms_aff_ratio, + } +end + local metafunctions = { { cb = meta_size_function, @@ -402,6 +454,32 @@ local metafunctions = { - rate of non-ascii characters - rate of capital letters - rate of numbers +]] + }, + { + cb = meta_html_features_function, + ninputs = 9, + names = { + 'nhtml_links', + 'nhtml_http_links_ratio', + 'nhtml_query_links_ratio', + 'nhtml_same_etld1_links_ratio', + 'nhtml_domains_per_link_ratio', + 'nhtml_max_links_per_domain_ratio', + 'nhtml_forms', + 'nhtml_forms_unaffiliated_ratio', + 'nhtml_forms_affiliated_ratio', + }, + description = [[HTML link/form aggregated features: + - reciprocal of total links + - ratio of http(s) links + - ratio of links with query + - ratio of links with same eTLD+1 as first-party + - domains per link ratio + - max links per single domain ratio + - reciprocal of total forms + - ratio of forms posting to unaffiliated domains + - ratio of forms posting to affiliated domains ]] }, } @@ -527,7 +605,7 @@ end exports.rspamd_count_metatokens = rspamd_count_metatokens exports.count_metatokens = rspamd_count_metatokens -exports.version = 1 -- MUST be increased on each change of metatokens +exports.version = 2 -- MUST be increased on each change of metatokens exports.add_metafunction = function(tbl) local ret, err = meta_schema(tbl) diff --git a/src/libmime/message.c b/src/libmime/message.c index 8442c80ac8..c5bb003970 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -791,6 +791,33 @@ rspamd_message_process_html_text_part(struct rspamd_task *task, text_part->mime_part->urls, task->cfg ? 
task->cfg->enable_css_parser : true, cur_url_order); + + /* Wire aggregated HTML features: keep the pointer returned by rspamd_html_get_features() for this part's parsed HTML (nothing is copied; the cast only drops the const qualifier) */ + text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html); + /* Expose a few counters as task mempool variables for Lua meta to start experimenting; each variable points at a field inside hf rather than at a copy */ + if (text_part->html_features) { + const struct rspamd_html_features *hf = text_part->html_features; + rspamd_mempool_set_variable(task->task_pool, "html_links_total", + (void *) &hf->links.total_links, NULL); + rspamd_mempool_set_variable(task->task_pool, "html_links_http", + (void *) &hf->links.http_links, NULL); + rspamd_mempool_set_variable(task->task_pool, "html_links_query", + (void *) &hf->links.query_links, NULL); + rspamd_mempool_set_variable(task->task_pool, "html_links_same_etld1", + (void *) &hf->links.same_etld1_links, NULL); + rspamd_mempool_set_variable(task->task_pool, "html_links_domains_total", + (void *) &hf->links.domains_total, NULL); + rspamd_mempool_set_variable(task->task_pool, "html_links_max_per_domain", + (void *) &hf->links.max_links_single_domain, NULL); + rspamd_mempool_set_variable(task->task_pool, "html_images_total", + (void *) &hf->images_total, NULL); + rspamd_mempool_set_variable(task->task_pool, "html_forms_total", + (void *) &hf->forms_count, NULL); + rspamd_mempool_set_variable(task->task_pool, "html_forms_post_unaffiliated", + (void *) &hf->forms_post_unaffiliated, NULL); + rspamd_mempool_set_variable(task->task_pool, "html_forms_post_affiliated", + (void *) &hf->forms_post_affiliated, NULL); + } rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content); if (text_part->utf_content.len == 0) { diff --git a/src/libmime/message.h b/src/libmime/message.h index e6b4543625..83f36ff192 100644 --- a/src/libmime/message.h +++ 
b/src/libmime/message.h @@ -29,6 +29,7 @@ struct rspamd_task; struct controller_session; struct rspamd_image; struct rspamd_archive; +struct rspamd_html_features; enum rspamd_mime_part_flags { RSPAMD_MIME_PART_ATTACHEMENT = (1u << 
1u), @@ -145,6 +146,8 @@ struct rspamd_mime_text_part { GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ void *html; + /* Optional HTML features collected during parsing; set from rspamd_html_get_features(), which returns NULL when there is no HTML content */ + struct rspamd_html_features *html_features; GList *exceptions; /**< list of offsets of urls */ struct rspamd_mime_part *mime_part; diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 78a6a975c9..26a105f355 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1605,6 +1605,19 @@ html_process_img_tag(rspamd_mempool_t *pool, hc->images.push_back(img); + /* Update image-related features */ + hc->features.images_total++; + if (img->flags & RSPAMD_HTML_FLAG_IMAGE_DATA) { + hc->features.images_data++; + } + if (img->flags & RSPAMD_HTML_FLAG_IMAGE_EXTERNAL) { + hc->features.images_external++; + /* tiny external pixel tracking: both dimensions are non-zero and the area is at most 4 */ + if (img->width > 0 && img->height > 0 && (img->width * img->height) <= 4u) { + hc->features.images_tiny_external++; + } + } + if (std::holds_alternative(tag->extra)) { tag->extra = img; } @@ -1928,6 +1941,16 @@ html_append_tag_content(rspamd_mempool_t *pool, {hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)}, tag, exceptions, url_set, initial_parsed_offset); + /* Count display URL mismatches when URL is present */ + if (std::holds_alternative(tag->extra)) { + auto *u = std::get(tag->extra); + if (u && (u->flags & RSPAMD_URL_FLAG_DISPLAY_URL) && (u->flags & RSPAMD_URL_FLAG_HTML_DISPLAYED)) { + /* html_process_displayed_href_tag sets linked_url when display URL differs */ + if (u->ext && u->ext->linked_url && u->ext->linked_url != u) { + hc->features.links.display_mismatch_links++; + } + } + } } else if (tag->id == Tag_IMG) { /* Process ALT if presented */ @@ -2023,6 +2046,26 @@ auto html_process_input(struct rspamd_task *task, auto *hc = new 
html_content; rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc); + /* Derive first-party eTLD+1 from 
From: if present */ + if (MESSAGE_FIELD(task, from_mime) && MESSAGE_FIELD(task, from_mime)->len > 0) { + struct rspamd_email_address *addr = (struct rspamd_email_address *) g_ptr_array_index(MESSAGE_FIELD(task, from_mime), 0); + if (addr && addr->domain && addr->domain_len > 0) { + rspamd_ftok_t tld; + if (rspamd_url_find_tld(addr->domain, addr->domain_len, &tld)) { + /* eTLD+1: take the last label before tld and the tld; the stored string runs from that label to the end of the domain */ + const char *dom = addr->domain; + const char *dom_end = addr->domain + addr->domain_len; + const char *tld_begin = tld.begin; + /* Find start of the registrable part: step back from the tld start until the previous character is a dot or the domain start is reached */ + const char *p = tld_begin; + while (p > dom && *(p - 1) != '.') { + p--; + } + hc->first_party_etld1.assign(p, dom_end - p); + } + } + } + if (task->cfg && in->len > task->cfg->max_html_len) { msg_notice_task("html input is too big: %z, limit is %z", in->len, @@ -2065,6 +2108,52 @@ auto html_process_input(struct rspamd_task *task, hc->tags_seen[cur_tag->id] = true; } + /* Simple feature collection on opening */ + switch (cur_tag->id) { + case Tag_FORM: + hc->features.forms_count++; + /* If action present and absolute, compare eTLD+1 with first-party (case-insensitive); the form method is not inspected here, and a host is counted as unaffiliated when first-party is empty or the names differ */ + if (auto href = cur_tag->find_href()) { + if (html_is_absolute_url(*href)) { + auto maybe_url = html_process_url(pool, *href); + if (maybe_url) { + struct rspamd_url *u = maybe_url.value(); + if (u->hostlen > 0) { + /* Find eTLD+1 of action host */ + rspamd_ftok_t tld2; + if (rspamd_url_find_tld(rspamd_url_host_unsafe(u), u->hostlen, &tld2)) { + const char *host = rspamd_url_host_unsafe(u); + const char *p2 = tld2.begin; + while (p2 > host && *(p2 - 1) != '.') { + p2--; + } + std::string etld1_action{p2, host + u->hostlen - p2}; + if 
(!hc->first_party_etld1.empty() && !g_ascii_strcasecmp(etld1_action.c_str(), hc->first_party_etld1.c_str())) { + hc->features.forms_post_affiliated++; + } + else { + hc->features.forms_post_unaffiliated++; + } + } + } + } + } + } + break; + case Tag_INPUT: { + if (auto type_comp = 
cur_tag->find_component()) { + auto tv = type_comp.value()->get_string_value(); + if (tv.size() == sizeof("password") - 1 && + g_ascii_strncasecmp(tv.data(), "password", tv.size()) == 0) { + hc->features.has_password_input = 1u; + } + } + break; + } + default: + break; + } + /* Shift to the first unclosed tag */ auto *pt = parent_tag; while (pt && (pt->flags & FL_CLOSED)) { @@ -2137,6 +2226,71 @@ auto html_process_input(struct rspamd_task *task, g_ptr_array_add(part_urls, url); } + /* Minimal link features collection */ + hc->features.links.total_links++; + if (url->flags & RSPAMD_URL_FLAG_IDN) { + hc->features.links.punycode_links++; + } + if (url->flags & RSPAMD_URL_FLAG_NUMERIC) { + hc->features.links.ip_links++; + } + if (url->flags & RSPAMD_URL_FLAG_HAS_PORT) { + hc->features.links.port_links++; + } + if (url->flags & RSPAMD_URL_FLAG_QUERY) { + /* Heuristic: long query length (querylen above 64) */ + if (url->querylen > 64) { + hc->features.links.long_query_links++; + } + } + /* Scheme type */ + if (url->protocol == PROTOCOL_MAILTO) { + hc->features.links.mailto_links++; + } + else if (url->protocol == PROTOCOL_HTTP || url->protocol == PROTOCOL_HTTPS) { + hc->features.links.http_links++; + } + /* data/javascript schemes are only looked for when the protocol is unknown */ + if (url->protocol == PROTOCOL_UNKNOWN) { + /* We don't have explicit scheme enum for data/js; check raw prefix quickly */ + if (url->raw && url->rawlen >= 5) { + if (g_ascii_strncasecmp(url->raw, "data:", 5) == 0) { + hc->features.links.data_scheme_links++; + } + else if (url->rawlen >= 11 && g_ascii_strncasecmp(url->raw, "javascript:", 11) == 0) { + hc->features.links.js_scheme_links++; + } + } + } + /* Domain counting + affiliation; the full host string is the counting key */ + if (url->hostlen > 0) { + std::string host{rspamd_url_host_unsafe(url), url->hostlen}; + auto &cnt = hc->link_domain_counts[host]; + cnt++; + if (cnt > hc->features.links.max_links_single_domain) { + 
hc->features.links.max_links_single_domain = cnt; + } + /* same eTLD+1 as 
first-party? */ + if (!hc->first_party_etld1.empty()) { + rspamd_ftok_t tld2; + if (rspamd_url_find_tld(host.c_str(), host.size(), &tld2)) { + const char *h = host.c_str(); + const char *p2 = tld2.begin; + while (p2 > h && *(p2 - 1) != '.') { + p2--; + } + std::string etld1_link{p2, h + host.size() - p2}; + if (!g_ascii_strcasecmp(etld1_link.c_str(), hc->first_party_etld1.c_str())) { + hc->features.links.same_etld1_links++; + } + } + } + } + /* Query presence */ + if (url->querylen > 0) { + hc->features.links.query_links++; + } + href_offset = hc->parsed.size(); } } @@ -2172,6 +2326,18 @@ auto html_process_input(struct rspamd_task *task, part_urls); } + /* Track DOM tag count and max depth; depth is the number of parent links followed from the current tag */ + hc->features.tags_count++; + { + unsigned int depth = 0; + for (auto *pdepth = cur_tag->parent; pdepth != nullptr; pdepth = pdepth->parent) { + depth++; + } + if (depth > hc->features.max_dom_depth) { + hc->features.max_dom_depth = depth; + } + } + if (!(cur_tag->flags & CM_EMPTY)) { html_process_block_tag(pool, cur_tag, hc); } @@ -2833,6 +2999,14 @@ auto html_process_input(struct rspamd_task *task, } } + /* Finalize derived link domain counters: number of distinct hosts recorded for links */ + if (!hc->link_domain_counts.empty()) { + hc->features.links.domains_total = (unsigned int) hc->link_domain_counts.size(); + } + + /* Mirror parser flags into features */ + hc->features.flags = (unsigned int) hc->flags; + return hc; } @@ -2988,6 +3162,17 @@ rspamd_html_tag_by_id(int id) return nullptr; } +const struct rspamd_html_features * +rspamd_html_get_features(void *html_content) +{ + if (html_content == NULL) { + return NULL; + } + + auto *hc = rspamd::html::html_content::from_ptr(html_content); + return &hc->features; +} + const char * rspamd_html_tag_name(void *p, gsize *len) { diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h index 1bab2d10ed..c0fa2b9c34 100644 --- a/src/libserver/html/html.h 
+++ b/src/libserver/html/html.h @@ -20,6 +20,7 @@ #include "config.h" #include "libutil/mem_pool.h" #include 
"libserver/url.h" +#include "libserver/html/html_features.h" #ifdef __cplusplus extern "C" { @@ -129,6 +130,11 @@ bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest); */ gsize rspamd_html_get_tags_count(void *html_content); +/** + * Returns an immutable pointer to aggregated html features, or NULL if html_content is NULL + */ +const struct rspamd_html_features *rspamd_html_get_features(void *html_content); + #ifdef __cplusplus } diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx index 4d69559949..509697264f 100644 --- a/src/libserver/html/html.hxx +++ b/src/libserver/html/html.hxx @@ -22,10 +22,12 @@ #include "libserver/url.h" #include "libserver/html/html_tag.hxx" #include "libserver/html/html.h" +#include "libserver/html/html_features.h" #include "libserver/html/html_tags.h" #include +#include "contrib/ankerl/unordered_dense.h" #include #include #include "function2/function2.hpp" @@ -50,12 +52,21 @@ struct html_content { std::string invisible; std::shared_ptr css_style; + /* Aggregated HTML features; zeroed in the constructor, with version set to 1 */ + struct rspamd_html_features features; + /* Helper: per-domain link counts, keyed by the full URL host string */ + ankerl::unordered_dense::map link_domain_counts; + /* First-party eTLD+1 derived from message (e.g. From:) */ + std::string first_party_etld1; + /* Preallocate and reserve all internal structures */ html_content() { tags_seen.resize(Tag_MAX, false); all_tags.reserve(128); parsed.reserve(256); + memset(&features, 0, sizeof(features)); + features.version = 1u; } static void html_content_dtor(void *ptr) diff --git a/src/libserver/html/html_features.h b/src/libserver/html/html_features.h new file mode 100644 index 0000000000..5b02c36a19 --- /dev/null +++ b/src/libserver/html/html_features.h @@ -0,0 +1,89 @@ +/*- + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_HTML_FEATURES_H +#define RSPAMD_HTML_FEATURES_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Per-message HTML link features collected during HTML parsing. + */ +struct rspamd_html_link_features { + /* Total number of link-like elements with a parsed URL (incremented once per URL handled by the link feature collection) */ + unsigned int total_links; + + /* Links considered affiliated with first-party (From/DKIM/etc.); NOTE(review): no writer for these two counters is visible in this patch - confirm where they are filled */ + unsigned int affiliated_links; + unsigned int unaffiliated_links; + + /* Phishing-oriented link properties */ + unsigned int confusable_like_from_links; /* Unicode confusable with first-party; NOTE(review): no writer visible in this patch */ + unsigned int punycode_links; /* Host contains xn-- (counted from RSPAMD_URL_FLAG_IDN) */ + unsigned int ip_links; /* Host is an IP (counted from RSPAMD_URL_FLAG_NUMERIC) */ + unsigned int port_links; /* Has explicit non-default port (counted from RSPAMD_URL_FLAG_HAS_PORT) */ + unsigned int long_query_links; /* Heuristically long query string (query length above 64) */ + unsigned int trackerish_links; /* Domain tokens dominated by tracker words; NOTE(review): no writer visible in this patch */ + unsigned int display_mismatch_links; /* Visible URL text domain != href domain */ + unsigned int js_scheme_links; /* javascript: scheme (raw URL prefix, unknown protocol only) */ + unsigned int data_scheme_links; /* data: scheme (raw URL prefix, unknown protocol only) 
*/ + unsigned int mailto_links; /* mailto: links */ + unsigned int http_links; /* http/https links */ + unsigned int query_links; /* links with any query (non-zero query length) */ + unsigned int same_etld1_links; /* href eTLD+1 equals first-party (case-insensitive comparison) */ + + /* Domain distribution */ + unsigned int domains_total; /* Distinct domains among links (distinct URL host strings) */ + unsigned int max_links_single_domain; /* Max links observed for one domain (one URL host string) */ +}; + +/* + * Aggregate HTML features for a text part; extendable in future. 
+ */ +struct rspamd_html_features { + /* Version of the structure for serialization/caching if needed (set to 1 by the html_content constructor) */ + unsigned int version; + + /* Link-related features */ + struct rspamd_html_link_features links; + + /* Forms: forms_count is incremented for every FORM tag; the two post counters are updated only for an absolute action URL whose host has a recognised tld */ + unsigned int forms_count; + unsigned int forms_post_unaffiliated; + unsigned int forms_post_affiliated; + unsigned int has_password_input; /* 0/1: set to 1 when an INPUT tag carries the value "password" (case-insensitive) */ + + /* Images: data and external are counted from the image flags; tiny_external is an external image with non-zero dimensions and area of at most 4 */ + unsigned int images_total; + unsigned int images_external; + unsigned int images_data; + unsigned int images_tiny_external; + + /* DOM / layout: max_dom_depth is the largest number of ancestors seen for a tag */ + unsigned int tags_count; + unsigned int max_dom_depth; + + /* Parser/quality flags mirror (bitset, reserved); copied from the html_content flags at the end of html_process_input */ + unsigned int flags; +}; + +#ifdef __cplusplus +} +#endif + +#endif