git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Extract more features from HTML messages
author Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 6 Sep 2025 12:45:02 +0000 (13:45 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 6 Sep 2025 12:45:02 +0000 (13:45 +0100)
lualib/lua_meta.lua
src/libmime/message.c
src/libmime/message.h
src/libserver/html/html.cxx
src/libserver/html/html.h
src/libserver/html/html.hxx
src/libserver/html/html_features.h [new file with mode: 0644]

index de006df8e774c9bee069771091d23c57fd914784..446c2b4830aac834d536b90e1714232e65636a1a 100644 (file)
@@ -278,6 +278,58 @@ local function meta_words_function(task)
   return ret
 end
 
+--- Computes the aggregated HTML link/form metatokens for a task.
+-- Counters come from the task mempool (`html_links_*`, `html_forms_*`
+-- variables read as "int"); a missing counter is treated as 0.
+-- Each ratio is the counter divided by the total of its group.
+-- @param task object exposing `get_mempool()`
+-- @return array of 9 numbers: link reciprocal + 5 link ratios, then
+--   form reciprocal + 2 form ratios; a group is all 0 when its total is 0
+local function meta_html_features_function(task)
+  local pool = task:get_mempool()
+
+  -- Fetch a single integer counter, falling back to 0 when it is not set
+  local function counter(name)
+    return pool:get_variable(name, "int") or 0
+  end
+
+  -- Divide only by a positive total; otherwise the feature stays at 0
+  local function ratio(value, total)
+    if total > 0 then
+      return value / total
+    end
+    return 0
+  end
+
+  -- Link counters
+  local links_total = counter("html_links_total")
+  local links_http = counter("html_links_http")
+  local links_query = counter("html_links_query")
+  local links_same_etld1 = counter("html_links_same_etld1")
+  local links_domains = counter("html_links_domains_total")
+  local links_max_per_domain = counter("html_links_max_per_domain")
+
+  -- Form counters
+  local forms_total = counter("html_forms_total")
+  local forms_unaffiliated = counter("html_forms_post_unaffiliated")
+  local forms_affiliated = counter("html_forms_post_affiliated")
+
+  -- The order mirrors the `names` list of the metafunction entry
+  return {
+    -- links
+    ratio(1.0, links_total),
+    ratio(links_http, links_total),
+    ratio(links_query, links_total),
+    ratio(links_same_etld1, links_total),
+    ratio(links_domains, links_total),
+    ratio(links_max_per_domain, links_total),
+    -- forms
+    ratio(1.0, forms_total),
+    ratio(forms_unaffiliated, forms_total),
+    ratio(forms_affiliated, forms_total),
+  }
+end
+
 local metafunctions = {
   {
     cb = meta_size_function,
@@ -402,6 +454,32 @@ local metafunctions = {
     - rate of non-ascii characters
     - rate of capital letters
     - rate of numbers
+]]
+  },
+  {
+    cb = meta_html_features_function,
+    ninputs = 9,
+    names = {
+      'nhtml_links',
+      'nhtml_http_links_ratio',
+      'nhtml_query_links_ratio',
+      'nhtml_same_etld1_links_ratio',
+      'nhtml_domains_per_link_ratio',
+      'nhtml_max_links_per_domain_ratio',
+      'nhtml_forms',
+      'nhtml_forms_unaffiliated_ratio',
+      'nhtml_forms_affiliated_ratio',
+    },
+    description = [[HTML link/form aggregated features:
+    - reciprocal of total links
+    - ratio of http(s) links
+    - ratio of links with query
+    - ratio of links with same eTLD+1 as first-party
+    - domains per link ratio
+    - max links per single domain ratio
+    - reciprocal of total forms
+    - ratio of forms posting to unaffiliated domains
+    - ratio of forms posting to affiliated domains
 ]]
   },
 }
@@ -527,7 +605,7 @@ end
 
 exports.rspamd_count_metatokens = rspamd_count_metatokens
 exports.count_metatokens = rspamd_count_metatokens
-exports.version = 1 -- MUST be increased on each change of metatokens
+exports.version = 2 -- MUST be increased on each change of metatokens
 
 exports.add_metafunction = function(tbl)
   local ret, err = meta_schema(tbl)
index 8442c80ac82a5597034fb4bbad3f57ac982f4f8a..c5bb0039707bcd9429a267df8ac71917c613d747 100644 (file)
@@ -791,6 +791,33 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
                text_part->mime_part->urls,
                task->cfg ? task->cfg->enable_css_parser : true,
                cur_url_order);
+
+       /* Wire aggregated HTML features */
+       text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html);
+       /* Expose a few mempool variables for Lua meta to start experimenting */
+       if (text_part->html_features) {
+               const struct rspamd_html_features *hf = text_part->html_features;
+               rspamd_mempool_set_variable(task->task_pool, "html_links_total",
+                                                                       (void *) &hf->links.total_links, NULL);
+               rspamd_mempool_set_variable(task->task_pool, "html_links_http",
+                                                                       (void *) &hf->links.http_links, NULL);
+               rspamd_mempool_set_variable(task->task_pool, "html_links_query",
+                                                                       (void *) &hf->links.query_links, NULL);
+               rspamd_mempool_set_variable(task->task_pool, "html_links_same_etld1",
+                                                                       (void *) &hf->links.same_etld1_links, NULL);
+               rspamd_mempool_set_variable(task->task_pool, "html_links_domains_total",
+                                                                       (void *) &hf->links.domains_total, NULL);
+               rspamd_mempool_set_variable(task->task_pool, "html_links_max_per_domain",
+                                                                       (void *) &hf->links.max_links_single_domain, NULL);
+               rspamd_mempool_set_variable(task->task_pool, "html_images_total",
+                                                                       (void *) &hf->images_total, NULL);
+               rspamd_mempool_set_variable(task->task_pool, "html_forms_total",
+                                                                       (void *) &hf->forms_count, NULL);
+               rspamd_mempool_set_variable(task->task_pool, "html_forms_post_unaffiliated",
+                                                                       (void *) &hf->forms_post_unaffiliated, NULL);
+               rspamd_mempool_set_variable(task->task_pool, "html_forms_post_affiliated",
+                                                                       (void *) &hf->forms_post_affiliated, NULL);
+       }
        rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
 
        if (text_part->utf_content.len == 0) {
index e6b4543625178b29a53bb691f4629bb84845ffb3..83f36ff19203c485c8f4e3a23a211a751cae9b6f 100644 (file)
@@ -29,6 +29,7 @@ struct rspamd_task;
 struct controller_session;
 struct rspamd_image;
 struct rspamd_archive;
+struct rspamd_html_features;
 
 enum rspamd_mime_part_flags {
        RSPAMD_MIME_PART_ATTACHEMENT = (1u << 1u),
@@ -145,6 +146,8 @@ struct rspamd_mime_text_part {
 
        GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
        void *html;
+       /* Optional HTML features collected during parsing */
+       struct rspamd_html_features *html_features;
        GList *exceptions; /**< list of offsets of urls                                         */
        struct rspamd_mime_part *mime_part;
 
index 78a6a975c9486741e626d020c79c03ee542e18b6..26a105f355d9d62aa8f71bd0ef67eb9e36c4253d 100644 (file)
@@ -1605,6 +1605,19 @@ html_process_img_tag(rspamd_mempool_t *pool,
 
        hc->images.push_back(img);
 
+       /* Update image-related features */
+       hc->features.images_total++;
+       if (img->flags & RSPAMD_HTML_FLAG_IMAGE_DATA) {
+               hc->features.images_data++;
+       }
+       if (img->flags & RSPAMD_HTML_FLAG_IMAGE_EXTERNAL) {
+               hc->features.images_external++;
+               /* tiny external pixel tracking */
+               if (img->width > 0 && img->height > 0 && (img->width * img->height) <= 4u) {
+                       hc->features.images_tiny_external++;
+               }
+       }
+
        if (std::holds_alternative<std::monostate>(tag->extra)) {
                tag->extra = img;
        }
@@ -1928,6 +1941,16 @@ html_append_tag_content(rspamd_mempool_t *pool,
                                                                                        {hc->parsed.data() + initial_parsed_offset, std::size_t(written_len)},
                                                                                        tag, exceptions,
                                                                                        url_set, initial_parsed_offset);
+                       /* Count display URL mismatches when URL is present */
+                       if (std::holds_alternative<rspamd_url *>(tag->extra)) {
+                               auto *u = std::get<rspamd_url *>(tag->extra);
+                               if (u && (u->flags & RSPAMD_URL_FLAG_DISPLAY_URL) && (u->flags & RSPAMD_URL_FLAG_HTML_DISPLAYED)) {
+                                       /* html_process_displayed_href_tag sets linked_url when display URL differs */
+                                       if (u->ext && u->ext->linked_url && u->ext->linked_url != u) {
+                                               hc->features.links.display_mismatch_links++;
+                                       }
+                               }
+                       }
                }
                else if (tag->id == Tag_IMG) {
                        /* Process ALT if presented */
@@ -2023,6 +2046,26 @@ auto html_process_input(struct rspamd_task *task,
        auto *hc = new html_content;
        rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
 
+       /* Derive first-party eTLD+1 from From: if present */
+       if (MESSAGE_FIELD(task, from_mime) && MESSAGE_FIELD(task, from_mime)->len > 0) {
+               struct rspamd_email_address *addr = (struct rspamd_email_address *) g_ptr_array_index(MESSAGE_FIELD(task, from_mime), 0);
+               if (addr && addr->domain && addr->domain_len > 0) {
+                       rspamd_ftok_t tld;
+                       if (rspamd_url_find_tld(addr->domain, addr->domain_len, &tld)) {
+                               /* eTLD+1: take the last label before tld and the tld */
+                               const char *dom = addr->domain;
+                               const char *dom_end = addr->domain + addr->domain_len;
+                               const char *tld_begin = tld.begin;
+                               /* Find start of the registrable part */
+                               const char *p = tld_begin;
+                               while (p > dom && *(p - 1) != '.') {
+                                       p--;
+                               }
+                               hc->first_party_etld1.assign(p, dom_end - p);
+                       }
+               }
+       }
+
        if (task->cfg && in->len > task->cfg->max_html_len) {
                msg_notice_task("html input is too big: %z, limit is %z",
                                                in->len,
@@ -2065,6 +2108,52 @@ auto html_process_input(struct rspamd_task *task,
                        hc->tags_seen[cur_tag->id] = true;
                }
 
+               /* Simple feature collection on opening */
+               switch (cur_tag->id) {
+               case Tag_FORM:
+                       hc->features.forms_count++;
+                       /* If action present and absolute, compare eTLD+1 with first-party */
+                       if (auto href = cur_tag->find_href()) {
+                               if (html_is_absolute_url(*href)) {
+                                       auto maybe_url = html_process_url(pool, *href);
+                                       if (maybe_url) {
+                                               struct rspamd_url *u = maybe_url.value();
+                                               if (u->hostlen > 0) {
+                                                       /* Find eTLD+1 of action host */
+                                                       rspamd_ftok_t tld2;
+                                                       if (rspamd_url_find_tld(rspamd_url_host_unsafe(u), u->hostlen, &tld2)) {
+                                                               const char *host = rspamd_url_host_unsafe(u);
+                                                               const char *p2 = tld2.begin;
+                                                               while (p2 > host && *(p2 - 1) != '.') {
+                                                                       p2--;
+                                                               }
+                                                               std::string etld1_action{p2, host + u->hostlen - p2};
+                                                               if (!hc->first_party_etld1.empty() && !g_ascii_strcasecmp(etld1_action.c_str(), hc->first_party_etld1.c_str())) {
+                                                                       hc->features.forms_post_affiliated++;
+                                                               }
+                                                               else {
+                                                                       hc->features.forms_post_unaffiliated++;
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+                       break;
+               case Tag_INPUT: {
+                       if (auto type_comp = cur_tag->find_component<html_component_type>()) {
+                               auto tv = type_comp.value()->get_string_value();
+                               if (tv.size() == sizeof("password") - 1 &&
+                                       g_ascii_strncasecmp(tv.data(), "password", tv.size()) == 0) {
+                                       hc->features.has_password_input = 1u;
+                               }
+                       }
+                       break;
+               }
+               default:
+                       break;
+               }
+
                /* Shift to the first unclosed tag */
                auto *pt = parent_tag;
                while (pt && (pt->flags & FL_CLOSED)) {
@@ -2137,6 +2226,71 @@ auto html_process_input(struct rspamd_task *task,
                                        g_ptr_array_add(part_urls, url);
                                }
 
+                               /* Minimal link features collection */
+                               hc->features.links.total_links++;
+                               if (url->flags & RSPAMD_URL_FLAG_IDN) {
+                                       hc->features.links.punycode_links++;
+                               }
+                               if (url->flags & RSPAMD_URL_FLAG_NUMERIC) {
+                                       hc->features.links.ip_links++;
+                               }
+                               if (url->flags & RSPAMD_URL_FLAG_HAS_PORT) {
+                                       hc->features.links.port_links++;
+                               }
+                               if (url->flags & RSPAMD_URL_FLAG_QUERY) {
+                                       /* Heuristic: long query length */
+                                       if (url->querylen > 64) {
+                                               hc->features.links.long_query_links++;
+                                       }
+                               }
+                               /* Scheme type */
+                               if (url->protocol == PROTOCOL_MAILTO) {
+                                       hc->features.links.mailto_links++;
+                               }
+                               else if (url->protocol == PROTOCOL_HTTP || url->protocol == PROTOCOL_HTTPS) {
+                                       hc->features.links.http_links++;
+                               }
+                               /* data/javascript schemes can be detected by flags set during parsing */
+                               if (url->protocol == PROTOCOL_UNKNOWN) {
+                                       /* We don't have explicit scheme enum for data/js; check raw prefix quickly */
+                                       if (url->raw && url->rawlen >= 5) {
+                                               if (g_ascii_strncasecmp(url->raw, "data:", 5) == 0) {
+                                                       hc->features.links.data_scheme_links++;
+                                               }
+                                               else if (url->rawlen >= 11 && g_ascii_strncasecmp(url->raw, "javascript:", 11) == 0) {
+                                                       hc->features.links.js_scheme_links++;
+                                               }
+                                       }
+                               }
+                               /* Domain counting + affiliation */
+                               if (url->hostlen > 0) {
+                                       std::string host{rspamd_url_host_unsafe(url), url->hostlen};
+                                       auto &cnt = hc->link_domain_counts[host];
+                                       cnt++;
+                                       if (cnt > hc->features.links.max_links_single_domain) {
+                                               hc->features.links.max_links_single_domain = cnt;
+                                       }
+                                       /* same eTLD+1 as first-party? */
+                                       if (!hc->first_party_etld1.empty()) {
+                                               rspamd_ftok_t tld2;
+                                               if (rspamd_url_find_tld(host.c_str(), host.size(), &tld2)) {
+                                                       const char *h = host.c_str();
+                                                       const char *p2 = tld2.begin;
+                                                       while (p2 > h && *(p2 - 1) != '.') {
+                                                               p2--;
+                                                       }
+                                                       std::string etld1_link{p2, h + host.size() - p2};
+                                                       if (!g_ascii_strcasecmp(etld1_link.c_str(), hc->first_party_etld1.c_str())) {
+                                                               hc->features.links.same_etld1_links++;
+                                                       }
+                                               }
+                                       }
+                               }
+                               /* Query presence */
+                               if (url->querylen > 0) {
+                                       hc->features.links.query_links++;
+                               }
+
                                href_offset = hc->parsed.size();
                        }
                }
@@ -2172,6 +2326,18 @@ auto html_process_input(struct rspamd_task *task,
                                                                  part_urls);
                }
 
+               /* Track DOM tag count and max depth */
+               hc->features.tags_count++;
+               {
+                       unsigned int depth = 0;
+                       for (auto *pdepth = cur_tag->parent; pdepth != nullptr; pdepth = pdepth->parent) {
+                               depth++;
+                       }
+                       if (depth > hc->features.max_dom_depth) {
+                               hc->features.max_dom_depth = depth;
+                       }
+               }
+
                if (!(cur_tag->flags & CM_EMPTY)) {
                        html_process_block_tag(pool, cur_tag, hc);
                }
@@ -2833,6 +2999,14 @@ auto html_process_input(struct rspamd_task *task,
                }
        }
 
+       /* Finalize derived link domain counters */
+       if (!hc->link_domain_counts.empty()) {
+               hc->features.links.domains_total = (unsigned int) hc->link_domain_counts.size();
+       }
+
+       /* Mirror parser flags into features */
+       hc->features.flags = (unsigned int) hc->flags;
+
        return hc;
 }
 
@@ -2988,6 +3162,17 @@ rspamd_html_tag_by_id(int id)
        return nullptr;
 }
 
+/* Returns the features aggregated for a parsed HTML content, NULL if none */
+const struct rspamd_html_features *
+rspamd_html_get_features(void *html_content)
+{
+       if (html_content != nullptr) {
+               return &rspamd::html::html_content::from_ptr(html_content)->features;
+       }
+
+       return nullptr;
+}
+
 const char *
 rspamd_html_tag_name(void *p, gsize *len)
 {
index 1bab2d10ed1047121de3c3c614b166537b510a32..c0fa2b9c341b263955f759149c015de6b12cd25d 100644 (file)
@@ -20,6 +20,7 @@
 #include "config.h"
 #include "libutil/mem_pool.h"
 #include "libserver/url.h"
+#include "libserver/html/html_features.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -129,6 +130,11 @@ bool rspamd_html_get_parsed_content(void *html_content, rspamd_ftok_t *dest);
  */
 gsize rspamd_html_get_tags_count(void *html_content);
 
+/**
+ * Returns an immutable pointer to aggregated html features
+ */
+const struct rspamd_html_features *rspamd_html_get_features(void *html_content);
+
 
 #ifdef __cplusplus
 }
index 4d6955994906fd5e0a282047425e149772c5a867..509697264f5a026974ecb9f32c9f84e8a9741c29 100644 (file)
 #include "libserver/url.h"
 #include "libserver/html/html_tag.hxx"
 #include "libserver/html/html.h"
+#include "libserver/html/html_features.h"
 #include "libserver/html/html_tags.h"
 
 
 #include <vector>
+#include "contrib/ankerl/unordered_dense.h"
 #include <memory>
 #include <string>
 #include "function2/function2.hpp"
@@ -50,12 +52,21 @@ struct html_content {
        std::string invisible;
        std::shared_ptr<css::css_style_sheet> css_style;
 
+       /* Aggregated HTML features */
+       struct rspamd_html_features features;
+       /* Helper: per-domain link counts */
+       ankerl::unordered_dense::map<std::string, unsigned int> link_domain_counts;
+       /* First-party eTLD+1 derived from message (e.g. From:) */
+       std::string first_party_etld1;
+
        /* Preallocate and reserve all internal structures */
        html_content()
        {
                tags_seen.resize(Tag_MAX, false);
                all_tags.reserve(128);
                parsed.reserve(256);
+               memset(&features, 0, sizeof(features));
+               features.version = 1u;
        }
 
        static void html_content_dtor(void *ptr)
diff --git a/src/libserver/html/html_features.h b/src/libserver/html/html_features.h
new file mode 100644 (file)
index 0000000..5b02c36
--- /dev/null
@@ -0,0 +1,89 @@
+/*-
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_FEATURES_H
+#define RSPAMD_HTML_FEATURES_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Per-message HTML link features collected during HTML parsing.
+ */
+struct rspamd_html_link_features {
+       /* Total number of <a> / link-like elements with a parsed URL */
+       unsigned int total_links;
+
+       /* Links considered (un)affiliated with first-party (From/DKIM/etc.); NOTE(review): no increment visible in this change - confirm */
+       unsigned int affiliated_links;
+       unsigned int unaffiliated_links;
+
+       /* Phishing-oriented link properties */
+       unsigned int confusable_like_from_links; /* Unicode confusable with first-party; NOTE(review): no increment visible in this change - confirm */
+       unsigned int punycode_links;             /* Host contains xn-- (counted via RSPAMD_URL_FLAG_IDN) */
+       unsigned int ip_links;                   /* Host is an IP (counted via RSPAMD_URL_FLAG_NUMERIC) */
+       unsigned int port_links;                 /* Has explicit non-default port (counted via RSPAMD_URL_FLAG_HAS_PORT) */
+       unsigned int long_query_links;           /* Heuristically long query string: query flag set and querylen > 64 */
+       unsigned int trackerish_links;           /* Domain tokens dominated by tracker words; NOTE(review): no increment visible in this change - confirm */
+       unsigned int display_mismatch_links;     /* Visible URL text domain != href domain (displayed URL linked to a different URL) */
+       unsigned int js_scheme_links;            /* javascript: scheme (unknown protocol, raw URL prefix matched case-insensitively) */
+       unsigned int data_scheme_links;          /* data: scheme (unknown protocol, raw URL prefix matched case-insensitively) */
+       unsigned int mailto_links;               /* mailto: links */
+       unsigned int http_links;                 /* http/https links */
+       unsigned int query_links;                /* links with any query (querylen > 0) */
+       unsigned int same_etld1_links;           /* href eTLD+1 equals first-party (case-insensitive; only counted when first-party is known) */
+
+       /* Domain distribution (both counters are keyed by the full link hostname) */
+       unsigned int domains_total;           /* Distinct hostnames among links */
+       unsigned int max_links_single_domain; /* Max links observed for one hostname */
+};
+
+/*
+ * Aggregate HTML features for a text part; extendable in future.
+ */
+struct rspamd_html_features {
+       /* Version of the structure for serialization/caching if needed; set to 1 by the html_content constructor */
+       unsigned int version;
+
+       /* Link-related features */
+       struct rspamd_html_link_features links;
+
+       /* Forms: <form> tags seen; post_* split forms with an absolute action URL by eTLD+1 match with first-party (unknown first-party counts as unaffiliated) */
+       unsigned int forms_count;
+       unsigned int forms_post_unaffiliated;
+       unsigned int forms_post_affiliated;
+       unsigned int has_password_input; /* 0/1: set when an <input> whose type is "password" (case-insensitive) is seen */
+
+       /* Images: all processed <img>; external-flagged; data-flagged; external with 0 < width * height <= 4 */
+       unsigned int images_total;
+       unsigned int images_external;
+       unsigned int images_data;
+       unsigned int images_tiny_external;
+
+       /* DOM / layout: number of processed tags and the largest ancestor count of any tag */
+       unsigned int tags_count;
+       unsigned int max_dom_depth;
+
+       /* Parser/quality flags mirror (bitset, reserved); copied from the html_content flags at the end of parsing */
+       unsigned int flags;
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif