return ret
end
+--- Builds the HTML link/form metatokens from integer counters stored in the
+-- task mempool.
+-- A missing mempool variable is treated as zero.
+-- @param task task object providing `get_mempool()`
+-- @return array of 9 numbers: reciprocal of total links, five per-link ratios
+--   (http, query, same eTLD+1, domains, max-per-domain), reciprocal of total
+--   forms, and two per-form ratios (unaffiliated, affiliated)
+local function meta_html_features_function(task)
+  local pool = task:get_mempool()
+
+  -- Reads one integer counter, defaulting to zero when it is not set
+  local function counter(name)
+    return pool:get_variable(name, "int") or 0
+  end
+
+  local links_total = counter("html_links_total")
+  local link_parts = {
+    counter("html_links_http"),
+    counter("html_links_query"),
+    counter("html_links_same_etld1"),
+    counter("html_links_domains_total"),
+    counter("html_links_max_per_domain"),
+  }
+
+  local forms_total = counter("html_forms_total")
+  local form_parts = {
+    counter("html_forms_post_unaffiliated"),
+    counter("html_forms_post_affiliated"),
+  }
+
+  local features = {}
+
+  -- Appends 1/total followed by part/total for every part; when total is
+  -- not positive all appended values are zero, so no division takes place
+  local function append_normalised(total, parts)
+    local usable = total > 0
+    features[#features + 1] = usable and (1.0 / total) or 0
+    for i = 1, #parts do
+      features[#features + 1] = usable and (parts[i] / total) or 0
+    end
+  end
+
+  append_normalised(links_total, link_parts)
+  append_normalised(forms_total, form_parts)
+
+  return features
+end
+
local metafunctions = {
{
cb = meta_size_function,
- rate of non-ascii characters
- rate of capital letters
- rate of numbers
+]]
+ },
+ {
+ cb = meta_html_features_function,
+ ninputs = 9,
+ names = {
+ 'nhtml_links',
+ 'nhtml_http_links_ratio',
+ 'nhtml_query_links_ratio',
+ 'nhtml_same_etld1_links_ratio',
+ 'nhtml_domains_per_link_ratio',
+ 'nhtml_max_links_per_domain_ratio',
+ 'nhtml_forms',
+ 'nhtml_forms_unaffiliated_ratio',
+ 'nhtml_forms_affiliated_ratio',
+ },
+ description = [[HTML link/form aggregated features:
+ - reciprocal of total links
+ - ratio of http(s) links
+ - ratio of links with query
+ - ratio of links with same eTLD+1 as first-party
+ - domains per link ratio
+ - max links per single domain ratio
+ - reciprocal of total forms
+ - ratio of forms posting to unaffiliated domains
+ - ratio of forms posting to affiliated domains
]]
},
}
exports.rspamd_count_metatokens = rspamd_count_metatokens
exports.count_metatokens = rspamd_count_metatokens
-exports.version = 1 -- MUST be increased on each change of metatokens
+exports.version = 2 -- MUST be increased on each change of metatokens
exports.add_metafunction = function(tbl)
local ret, err = meta_schema(tbl)
text_part->mime_part->urls,
task->cfg ? task->cfg->enable_css_parser : true,
cur_url_order);
+
+ /* Wire aggregated HTML features */
+ text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html);
+ /* Expose a few mempool variables for Lua meta to start experimenting */
+ if (text_part->html_features) {
+ const struct rspamd_html_features *hf = text_part->html_features;
+ rspamd_mempool_set_variable(task->task_pool, "html_links_total",
+ (void *) &hf->links.total_links, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "html_links_http",
+ (void *) &hf->links.http_links, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "html_links_query",
+ (void *) &hf->links.query_links, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "html_links_same_etld1",
+ (void *) &hf->links.same_etld1_links, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "html_links_domains_total",
+ (void *) &hf->links.domains_total, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "html_links_max_per_domain",
+ (void *) &hf->links.max_links_single_domain, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "html_images_total",
+ (void *) &hf->images_total, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "html_forms_total",
+ (void *) &hf->forms_count, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "html_forms_post_unaffiliated",
+ (void *) &hf->forms_post_unaffiliated, NULL);
+ rspamd_mempool_set_variable(task->task_pool, "html_forms_post_affiliated",
+ (void *) &hf->forms_post_affiliated, NULL);
+ }
rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
if (text_part->utf_content.len == 0) {
struct controller_session;
struct rspamd_image;
struct rspamd_archive;
+struct rspamd_html_features;
enum rspamd_mime_part_flags {
RSPAMD_MIME_PART_ATTACHEMENT = (1u << 1u),
GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
void *html;
+ /* Optional HTML features collected during parsing */
+ struct rspamd_html_features *html_features;
GList *exceptions; /**< list of offsets of urls */
struct rspamd_mime_part *mime_part;
hc->images.push_back(img);
+ /* Update image-related features */
+ hc->features.images_total++;
+ if (img->flags & RSPAMD_HTML_FLAG_IMAGE_DATA) {
+ hc->features.images_data++;
+ }
+ if (img->flags & RSPAMD_HTML_FLAG_IMAGE_EXTERNAL) {
+ hc->features.images_external++;
+ /* tiny external pixel tracking */
+ if (img->width > 0 && img->height > 0 && (img->width * img->height) <= 4u) {
+ hc->features.images_tiny_external++;
+ }
+ }
+
if (std::holds_alternative<std::monostate>(tag->extra)) {
tag->extra = img;
}
{hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
tag, exceptions,
url_set, initial_parsed_offset);
+ /* Count display URL mismatches when URL is present */
+ if (std::holds_alternative<rspamd_url *>(tag->extra)) {
+ auto *u = std::get<rspamd_url *>(tag->extra);
+ if (u && (u->flags & RSPAMD_URL_FLAG_DISPLAY_URL) && (u->flags & RSPAMD_URL_FLAG_HTML_DISPLAYED)) {
+ /* html_process_displayed_href_tag sets linked_url when display URL differs */
+ if (u->ext && u->ext->linked_url && u->ext->linked_url != u) {
+ hc->features.links.display_mismatch_links++;
+ }
+ }
+ }
}
else if (tag->id == Tag_IMG) {
/* Process ALT if presented */
auto *hc = new html_content;
rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
+ /* Derive first-party eTLD+1 from From: if present */
+ if (MESSAGE_FIELD(task, from_mime) && MESSAGE_FIELD(task, from_mime)->len > 0) {
+ struct rspamd_email_address *addr = (struct rspamd_email_address *) g_ptr_array_index(MESSAGE_FIELD(task, from_mime), 0);
+ if (addr && addr->domain && addr->domain_len > 0) {
+ rspamd_ftok_t tld;
+ if (rspamd_url_find_tld(addr->domain, addr->domain_len, &tld)) {
+ /* eTLD+1: take the last label before tld and the tld */
+ const char *dom = addr->domain;
+ const char *dom_end = addr->domain + addr->domain_len;
+ const char *tld_begin = tld.begin;
+ /* Find start of the registrable part */
+ const char *p = tld_begin;
+ while (p > dom && *(p - 1) != '.') {
+ p--;
+ }
+ hc->first_party_etld1.assign(p, dom_end - p);
+ }
+ }
+ }
+
if (task->cfg && in->len > task->cfg->max_html_len) {
msg_notice_task("html input is too big: %z, limit is %z",
in->len,
hc->tags_seen[cur_tag->id] = true;
}
+ /* Simple feature collection on opening */
+ switch (cur_tag->id) {
+ case Tag_FORM:
+ hc->features.forms_count++;
+ /* If action present and absolute, compare eTLD+1 with first-party */
+ if (auto href = cur_tag->find_href()) {
+ if (html_is_absolute_url(*href)) {
+ auto maybe_url = html_process_url(pool, *href);
+ if (maybe_url) {
+ struct rspamd_url *u = maybe_url.value();
+ if (u->hostlen > 0) {
+ /* Find eTLD+1 of action host */
+ rspamd_ftok_t tld2;
+ if (rspamd_url_find_tld(rspamd_url_host_unsafe(u), u->hostlen, &tld2)) {
+ const char *host = rspamd_url_host_unsafe(u);
+ const char *p2 = tld2.begin;
+ while (p2 > host && *(p2 - 1) != '.') {
+ p2--;
+ }
+ std::string etld1_action{p2, host + u->hostlen - p2};
+ if (!hc->first_party_etld1.empty() && !g_ascii_strcasecmp(etld1_action.c_str(), hc->first_party_etld1.c_str())) {
+ hc->features.forms_post_affiliated++;
+ }
+ else {
+ hc->features.forms_post_unaffiliated++;
+ }
+ }
+ }
+ }
+ }
+ }
+ break;
+ case Tag_INPUT: {
+ if (auto type_comp = cur_tag->find_component<html_component_type>()) {
+ auto tv = type_comp.value()->get_string_value();
+ if (tv.size() == sizeof("password") - 1 &&
+ g_ascii_strncasecmp(tv.data(), "password", tv.size()) == 0) {
+ hc->features.has_password_input = 1u;
+ }
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
/* Shift to the first unclosed tag */
auto *pt = parent_tag;
while (pt && (pt->flags & FL_CLOSED)) {
g_ptr_array_add(part_urls, url);
}
+ /* Minimal link features collection */
+ hc->features.links.total_links++;
+ if (url->flags & RSPAMD_URL_FLAG_IDN) {
+ hc->features.links.punycode_links++;
+ }
+ if (url->flags & RSPAMD_URL_FLAG_NUMERIC) {
+ hc->features.links.ip_links++;
+ }
+ if (url->flags & RSPAMD_URL_FLAG_HAS_PORT) {
+ hc->features.links.port_links++;
+ }
+ if (url->flags & RSPAMD_URL_FLAG_QUERY) {
+ /* Heuristic: long query length */
+ if (url->querylen > 64) {
+ hc->features.links.long_query_links++;
+ }
+ }
+ /* Scheme type */
+ if (url->protocol == PROTOCOL_MAILTO) {
+ hc->features.links.mailto_links++;
+ }
+ else if (url->protocol == PROTOCOL_HTTP || url->protocol == PROTOCOL_HTTPS) {
+ hc->features.links.http_links++;
+ }
+ /* data/javascript schemes can be detected by flags set during parsing */
+ if (url->protocol == PROTOCOL_UNKNOWN) {
+ /* We don't have explicit scheme enum for data/js; check raw prefix quickly */
+ if (url->raw && url->rawlen >= 5) {
+ if (g_ascii_strncasecmp(url->raw, "data:", 5) == 0) {
+ hc->features.links.data_scheme_links++;
+ }
+ else if (url->rawlen >= 11 && g_ascii_strncasecmp(url->raw, "javascript:", 11) == 0) {
+ hc->features.links.js_scheme_links++;
+ }
+ }
+ }
+ /* Domain counting + affiliation */
+ if (url->hostlen > 0) {
+ std::string host{rspamd_url_host_unsafe(url), url->hostlen};
+ auto &cnt = hc->link_domain_counts[host];
+ cnt++;
+ if (cnt > hc->features.links.max_links_single_domain) {
+ hc->features.links.max_links_single_domain = cnt;
+ }
+ /* same eTLD+1 as first-party? */
+ if (!hc->first_party_etld1.empty()) {
+ rspamd_ftok_t tld2;
+ if (rspamd_url_find_tld(host.c_str(), host.size(), &tld2)) {
+ const char *h = host.c_str();
+ const char *p2 = tld2.begin;
+ while (p2 > h && *(p2 - 1) != '.') {
+ p2--;
+ }
+ std::string etld1_link{p2, h + host.size() - p2};
+ if (!g_ascii_strcasecmp(etld1_link.c_str(), hc->first_party_etld1.c_str())) {
+ hc->features.links.same_etld1_links++;
+ }
+ }
+ }
+ }
+ /* Query presence */
+ if (url->querylen > 0) {
+ hc->features.links.query_links++;
+ }
+
href_offset = hc->parsed.size();
}
}
part_urls);
}
+ /* Track DOM tag count and max depth */
+ hc->features.tags_count++;
+ {
+ unsigned int depth = 0;
+ for (auto *pdepth = cur_tag->parent; pdepth != nullptr; pdepth = pdepth->parent) {
+ depth++;
+ }
+ if (depth > hc->features.max_dom_depth) {
+ hc->features.max_dom_depth = depth;
+ }
+ }
+
if (!(cur_tag->flags & CM_EMPTY)) {
html_process_block_tag(pool, cur_tag, hc);
}
}
}
+ /* Finalize derived link domain counters */
+ if (!hc->link_domain_counts.empty()) {
+ hc->features.links.domains_total = (unsigned int) hc->link_domain_counts.size();
+ }
+
+ /* Mirror parser flags into features */
+ hc->features.flags = (unsigned int) hc->flags;
+
return hc;
}
return nullptr;
}
+/*
+ * Returns a read-only pointer to the `features` member of the given opaque
+ * html content handle, or a null pointer when no handle is supplied.
+ */
+const struct rspamd_html_features *
+rspamd_html_get_features(void *html_content)
+{
+	if (html_content != nullptr) {
+		/* Recover the typed object behind the opaque handle */
+		const auto *parsed = rspamd::html::html_content::from_ptr(html_content);
+
+		return &parsed->features;
+	}
+
+	return nullptr;
+}
+
const char *
rspamd_html_tag_name(void *p, gsize *len)
{
#include "config.h"
#include "libutil/mem_pool.h"
#include "libserver/url.h"
+#include "libserver/html/html_features.h"
#ifdef __cplusplus
extern "C" {
*/
gsize rspamd_html_get_tags_count(void *html_content);
+/**
+ * Returns an immutable pointer to aggregated html features
+ */
+const struct rspamd_html_features *rspamd_html_get_features(void *html_content);
+
#ifdef __cplusplus
}
#include "libserver/url.h"
#include "libserver/html/html_tag.hxx"
#include "libserver/html/html.h"
+#include "libserver/html/html_features.h"
#include "libserver/html/html_tags.h"
#include <vector>
+#include "contrib/ankerl/unordered_dense.h"
#include <memory>
#include <string>
#include "function2/function2.hpp"
std::string invisible;
std::shared_ptr<css::css_style_sheet> css_style;
+ /* Aggregated HTML features */
+ struct rspamd_html_features features;
+ /* Helper: per-domain link counts */
+ ankerl::unordered_dense::map<std::string, unsigned int> link_domain_counts;
+ /* First-party eTLD+1 derived from message (e.g. From:) */
+ std::string first_party_etld1;
+
/* Preallocate and reserve all internal structures */
html_content()
{
tags_seen.resize(Tag_MAX, false);
all_tags.reserve(128);
parsed.reserve(256);
+ memset(&features, 0, sizeof(features));
+ features.version = 1u;
}
static void html_content_dtor(void *ptr)
--- /dev/null
+/*-
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_FEATURES_H
+#define RSPAMD_HTML_FEATURES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * HTML link features collected during HTML parsing.
+ * All members are plain counters. The structure is embedded in
+ * struct rspamd_html_features, which the owning html_content zero-fills
+ * in its constructor, so every counter starts at 0.
+ */
+struct rspamd_html_link_features {
+ /* Total number of <a> / link-like elements with a parsed URL */
+ unsigned int total_links;
+
+ /*
+  * Links considered affiliated with first-party (From/DKIM/etc.)
+  * NOTE(review): no writer for these two counters is visible in this
+  * change; they look reserved for later use - confirm.
+  */
+ unsigned int affiliated_links;
+ unsigned int unaffiliated_links;
+
+ /*
+  * Phishing-oriented link properties.
+  * NOTE(review): confusable_like_from_links and trackerish_links have no
+  * visible writer in this change either - confirm they are placeholders.
+  */
+ unsigned int confusable_like_from_links; /* Unicode confusable with first-party */
+ unsigned int punycode_links; /* URL carries RSPAMD_URL_FLAG_IDN */
+ unsigned int ip_links; /* URL carries RSPAMD_URL_FLAG_NUMERIC (host is an IP) */
+ unsigned int port_links; /* URL carries RSPAMD_URL_FLAG_HAS_PORT (explicit port) */
+ unsigned int long_query_links; /* RSPAMD_URL_FLAG_QUERY set and querylen > 64 */
+ unsigned int trackerish_links; /* Domain tokens dominated by tracker words */
+ unsigned int display_mismatch_links; /* Displayed URL is linked to a different URL object than the href */
+ unsigned int js_scheme_links; /* Unknown protocol whose raw text starts with "javascript:" */
+ unsigned int data_scheme_links; /* Unknown protocol whose raw text starts with "data:" */
+ unsigned int mailto_links; /* mailto: links */
+ unsigned int http_links; /* http/https links */
+ unsigned int query_links; /* links with a non-empty query (querylen > 0) */
+ unsigned int same_etld1_links; /* href eTLD+1 equals first-party; only counted when a first-party domain is known */
+
+ /* Domain distribution; the parser keys its per-domain map by the full host string */
+ unsigned int domains_total; /* Distinct hosts among links, set once parsing finishes */
+ unsigned int max_links_single_domain; /* Max links observed for one host */
+};
+
+/*
+ * Aggregate HTML features for a text part; extendable in future.
+ * An instance is embedded in each html_content object, zero-filled by its
+ * constructor and exposed to C code through rspamd_html_get_features().
+ */
+struct rspamd_html_features {
+ /* Version of the structure for serialization/caching if needed; the html_content constructor sets it to 1 */
+ unsigned int version;
+
+ /* Link-related features */
+ struct rspamd_html_link_features links;
+
+ /*
+  * Forms.
+  * forms_count is incremented for every opening <form> tag. The two
+  * forms_post_* counters are only updated for forms whose action is an
+  * absolute URL with a host for which a TLD could be found, so their sum
+  * may be lower than forms_count. The visible counting code does not
+  * inspect the form method despite the "post" in the names.
+  */
+ unsigned int forms_count;
+ unsigned int forms_post_unaffiliated;
+ unsigned int forms_post_affiliated;
+ unsigned int has_password_input; /* 0/1; set when an <input> has type "password" (case-insensitive) */
+
+ /* Images */
+ unsigned int images_total; /* Every processed image */
+ unsigned int images_external; /* Images flagged RSPAMD_HTML_FLAG_IMAGE_EXTERNAL */
+ unsigned int images_data; /* Images flagged RSPAMD_HTML_FLAG_IMAGE_DATA */
+ unsigned int images_tiny_external; /* External images with non-zero width and height and width * height <= 4 */
+
+ /* DOM / layout */
+ unsigned int tags_count; /* Number of tags counted by the parser */
+ unsigned int max_dom_depth; /* Largest number of ancestors (parent chain length) seen for a tag */
+
+ /* Copy of the html_content parser flags, taken when parsing finishes */
+ unsigned int flags;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif