git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add ignore_link_domains option for HTML fuzzy rules
author Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 13 Feb 2026 07:23:20 +0000 (07:23 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 13 Feb 2026 07:23:20 +0000 (07:23 +0000)
Allow HTML fuzzy shingles to match purely on structure (tag skeleton +
classes) by stripping link domains from structural tokens. Domain hashes
are still computed separately for the metadata fields. A separate cache
key suffix (_nd) prevents cross-contamination with normal hashes.

src/libutil/shingles.h
src/libutil/shingles_html.cxx
src/lua/lua_mimepart.c
src/plugins/fuzzy_check.c

index 29e097c3185e423eca80b76981f635731713aa5e..c710f33fc07a1e59ac65ba316a47f2d666cc4de4 100644 (file)
@@ -117,7 +117,8 @@ struct rspamd_html_shingle *rspamd_shingles_from_html(void *html_content,
                                                                                                          rspamd_mempool_t *pool,
                                                                                                          rspamd_shingles_filter filter,
                                                                                                          gpointer filterd,
-                                                                                                         enum rspamd_shingle_alg alg);
+                                                                                                         enum rspamd_shingle_alg alg,
+                                                                                                         gboolean ignore_link_domains);
 
 /**
  * Compares two shingles and return result as a floating point value - 1.0
index 7dc7ec70e4df00baf062e58e782b0c335321315b..be729ff177d5922a5095aa42f9d4896158494743 100644 (file)
@@ -129,7 +129,8 @@ html_extract_structural_tokens(html_content *hc,
                                                           rspamd_mempool_t *pool,
                                                           std::vector<std::string> &tokens,
                                                           std::vector<std::string_view> &cta_domains,
-                                                          std::vector<std::string_view> &all_domains)
+                                                          std::vector<std::string_view> &all_domains,
+                                                          bool ignore_link_domains)
 {
        tokens.reserve(hc->all_tags.size());
        cta_domains.reserve(16);
@@ -175,10 +176,12 @@ html_extract_structural_tokens(html_content *hc,
                                auto etld1 = extract_etld1_from_url(url);
 
                                if (!etld1.empty()) {
-                                       token += '@';
-                                       token += etld1;
+                                       if (!ignore_link_domains) {
+                                               token += '@';
+                                               token += etld1;
+                                       }
 
-                                       /* Add to all_domains */
+                                       /* Domain collection for separate hashes still happens unconditionally */
                                        all_domains.push_back(etld1);
 
                                        /* Check if this is a CTA link using button weights */
@@ -426,7 +429,8 @@ rspamd_shingles_from_html(void *html_content,
                                                  rspamd_mempool_t *pool,
                                                  rspamd_shingles_filter filter,
                                                  gpointer filterd,
-                                                 enum rspamd_shingle_alg alg)
+                                                 enum rspamd_shingle_alg alg,
+                                                 gboolean ignore_link_domains)
 {
        if (!html_content) {
                return nullptr;
@@ -446,7 +450,7 @@ rspamd_shingles_from_html(void *html_content,
        /* 1. Extract structural tokens and domain lists using modern C++ */
        std::vector<std::string> tokens;
        std::vector<std::string_view> cta_domains, all_domains;
-       html_extract_structural_tokens(hc_ptr, pool, tokens, cta_domains, all_domains);
+       html_extract_structural_tokens(hc_ptr, pool, tokens, cta_domains, all_domains, ignore_link_domains);
 
        if (tokens.empty()) {
                /* Empty HTML structure after filtering */
index db60b24e2cb930f6ad92cc53559cd3b42c747cd5..0dec35f891acd555c61b7258f86b0ec90d19b18d 100644 (file)
@@ -1430,7 +1430,7 @@ lua_textpart_get_html_fuzzy_hashes(lua_State *L)
        /* Generate HTML shingles */
        html_sgl = rspamd_shingles_from_html(part->html, key, pool,
                                                                                 rspamd_shingles_default_filter,
-                                                                                NULL, RSPAMD_SHINGLES_MUMHASH);
+                                                                                NULL, RSPAMD_SHINGLES_MUMHASH, FALSE);
 
        if (html_sgl == NULL) {
                lua_pushnil(L);
index 7f0ba2367ca26ca8045cddacb966c998b8feb21d..b87a44681d9472067316a7080b3fb994c0a44737 100644 (file)
@@ -118,6 +118,7 @@ struct fuzzy_rule {
        gboolean html_shingles;     /* Enable HTML fuzzy hashing */
        gboolean text_hashes;       /* Enable/disable generation of text hashes */
        unsigned int min_html_tags; /* Minimum tags for HTML hash */
+       gboolean html_ignore_domains; /* Ignore link domains in HTML structural tokens */
        int learn_condition_cb;
        uint32_t retransmits;
        struct rspamd_hash_map_helper *skip_map;
@@ -489,6 +490,10 @@ fuzzy_rule_apply_checks(struct fuzzy_rule *rule,
                        else if ((opt = ucl_object_lookup(cur, "weight")) != NULL) {
                                rule->html_weight = ucl_obj_todouble(opt);
                        }
+
+                       if ((opt = ucl_object_lookup(cur, "ignore_link_domains")) != NULL) {
+                               rule->html_ignore_domains = ucl_obj_toboolean(opt);
+                       }
                }
                else {
                        /* Other checks are processed by lua_fuzzy; keep legacy behaviour */
@@ -533,6 +538,7 @@ fuzzy_rule_new(const char *default_symbol, rspamd_mempool_t *pool)
        rule->html_shingles = FALSE;
        rule->text_hashes = TRUE;
        rule->min_html_tags = 10;
+       rule->html_ignore_domains = FALSE;
 
        return rule;
 }
@@ -2543,6 +2549,15 @@ int fuzzy_check_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
                                                           0,
                                                           NULL,
                                                           0);
+       rspamd_rcl_add_doc_by_path(cfg,
+                                                          "fuzzy_check.rule",
+                                                          "Ignore link domains in HTML structural tokens (for template-only matching)",
+                                                          "html_ignore_domains",
+                                                          UCL_BOOLEAN,
+                                                          NULL,
+                                                          0,
+                                                          "false",
+                                                          0);
        rspamd_rcl_add_doc_by_path(cfg,
                                                           "fuzzy_check.rule",
                                                           "Content hashing checks configuration object (e.g. { text = { enabled = true; }, html = { enabled = true; } })",
@@ -3757,8 +3772,9 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task,
        struct rspamd_cached_shingles **html_cached_ptr;
 
        memcpy(&key_part, rule->shingles_key->str, sizeof(key_part));
-       rspamd_snprintf(html_cache_key, sizeof(html_cache_key), "%s%d_html",
-                                       rule->algorithm_str, key_part);
+       rspamd_snprintf(html_cache_key, sizeof(html_cache_key), "%s%d_html%s",
+                                       rule->algorithm_str, key_part,
+                                       rule->html_ignore_domains ? "_nd" : "");
 
        html_cached_ptr = (struct rspamd_cached_shingles **) rspamd_mempool_get_variable(
                task->task_pool, html_cache_key);
@@ -3801,7 +3817,7 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task,
                html_sh = rspamd_shingles_from_html(part->html,
                                                                                        (const unsigned char *) rule->shingles_key->str, task->task_pool,
                                                                                        rspamd_shingles_default_filter, NULL,
-                                                                                       rule->alg);
+                                                                                       rule->alg, rule->html_ignore_domains);
 
                if (html_sh != NULL) {
                        /* Use structure shingles for fuzzy matching */