From: Vsevolod Stakhov Date: Fri, 13 Feb 2026 07:23:20 +0000 (+0000) Subject: [Feature] Add ignore_link_domains option for HTML fuzzy rules X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=03be93671d00b3b0a523af899867d79e7fa66b61;p=thirdparty%2Frspamd.git [Feature] Add ignore_link_domains option for HTML fuzzy rules Allow HTML fuzzy shingles to match purely on structure (tag skeleton + classes) by stripping link domains from structural tokens. Domain hashes are still computed separately for the metadata fields. A separate cache key suffix (_nd) prevents cross-contamination with normal hashes. --- diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h index 29e097c318..c710f33fc0 100644 --- a/src/libutil/shingles.h +++ b/src/libutil/shingles.h @@ -117,7 +117,8 @@ struct rspamd_html_shingle *rspamd_shingles_from_html(void *html_content, rspamd_mempool_t *pool, rspamd_shingles_filter filter, gpointer filterd, - enum rspamd_shingle_alg alg); + enum rspamd_shingle_alg alg, + gboolean ignore_link_domains); /** * Compares two shingles and return result as a floating point value - 1.0 diff --git a/src/libutil/shingles_html.cxx b/src/libutil/shingles_html.cxx index 7dc7ec70e4..be729ff177 100644 --- a/src/libutil/shingles_html.cxx +++ b/src/libutil/shingles_html.cxx @@ -129,7 +129,8 @@ html_extract_structural_tokens(html_content *hc, rspamd_mempool_t *pool, std::vector &tokens, std::vector &cta_domains, - std::vector &all_domains) + std::vector &all_domains, + bool ignore_link_domains) { tokens.reserve(hc->all_tags.size()); cta_domains.reserve(16); @@ -175,10 +176,12 @@ html_extract_structural_tokens(html_content *hc, auto etld1 = extract_etld1_from_url(url); if (!etld1.empty()) { - token += '@'; - token += etld1; + if (!ignore_link_domains) { + token += '@'; + token += etld1; + } - /* Add to all_domains */ + /* Domain collection for separate hashes still happens unconditionally */ all_domains.push_back(etld1); /* Check if this is a CTA link using button weights */ @@ -426,7 +429,8 @@ rspamd_shingles_from_html(void *html_content, rspamd_mempool_t *pool, rspamd_shingles_filter filter, gpointer filterd, - enum rspamd_shingle_alg alg) + enum rspamd_shingle_alg alg, + gboolean ignore_link_domains) { if (!html_content) { return nullptr; @@ -446,7 +450,7 @@ rspamd_shingles_from_html(void *html_content, /* 1. Extract structural tokens and domain lists using modern C++ */ std::vector tokens; std::vector cta_domains, all_domains; - html_extract_structural_tokens(hc_ptr, pool, tokens, cta_domains, all_domains); + html_extract_structural_tokens(hc_ptr, pool, tokens, cta_domains, all_domains, ignore_link_domains); if (tokens.empty()) { /* Empty HTML structure after filtering */ diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index db60b24e2c..0dec35f891 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -1430,7 +1430,7 @@ lua_textpart_get_html_fuzzy_hashes(lua_State *L) /* Generate HTML shingles */ html_sgl = rspamd_shingles_from_html(part->html, key, pool, rspamd_shingles_default_filter, - NULL, RSPAMD_SHINGLES_MUMHASH); + NULL, RSPAMD_SHINGLES_MUMHASH, FALSE); if (html_sgl == NULL) { lua_pushnil(L); diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index 7f0ba2367c..b87a44681d 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -118,6 +118,7 @@ struct fuzzy_rule { gboolean html_shingles; /* Enable HTML fuzzy hashing */ gboolean text_hashes; /* Enable/disable generation of text hashes */ unsigned int min_html_tags; /* Minimum tags for HTML hash */ + gboolean html_ignore_domains; /* Ignore link domains in HTML structural tokens */ int learn_condition_cb; uint32_t retransmits; struct rspamd_hash_map_helper *skip_map; @@ -489,6 +490,10 @@ fuzzy_rule_apply_checks(struct fuzzy_rule *rule, else if ((opt = ucl_object_lookup(cur, "weight")) != NULL) { rule->html_weight = ucl_obj_todouble(opt); } + + if ((opt = ucl_object_lookup(cur, "ignore_link_domains")) != NULL) { + rule->html_ignore_domains = ucl_obj_toboolean(opt); + } } else { /* Other checks are processed by lua_fuzzy; keep legacy behaviour */ @@ -533,6 +538,7 @@ fuzzy_rule_new(const char *default_symbol, rspamd_mempool_t *pool) rule->html_shingles = FALSE; rule->text_hashes = TRUE; rule->min_html_tags = 10; + rule->html_ignore_domains = FALSE; return rule; } @@ -2543,6 +2549,15 @@ int fuzzy_check_module_init(struct rspamd_config *cfg, struct module_ctx **ctx) 0, NULL, 0); + rspamd_rcl_add_doc_by_path(cfg, + "fuzzy_check.rule", + "Ignore link domains in HTML structural tokens (for template-only matching)", + "html_ignore_domains", + UCL_BOOLEAN, + NULL, + 0, + "false", + 0); rspamd_rcl_add_doc_by_path(cfg, "fuzzy_check.rule", "Content hashing checks configuration object (e.g. { text = { enabled = true; }, html = { enabled = true; } })", @@ -3757,8 +3772,9 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task, struct rspamd_cached_shingles **html_cached_ptr; memcpy(&key_part, rule->shingles_key->str, sizeof(key_part)); - rspamd_snprintf(html_cache_key, sizeof(html_cache_key), "%s%d_html", - rule->algorithm_str, key_part); + rspamd_snprintf(html_cache_key, sizeof(html_cache_key), "%s%d_html%s", + rule->algorithm_str, key_part, + rule->html_ignore_domains ? "_nd" : ""); html_cached_ptr = (struct rspamd_cached_shingles **) rspamd_mempool_get_variable( task->task_pool, html_cache_key); @@ -3801,7 +3817,7 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task, html_sh = rspamd_shingles_from_html(part->html, (const unsigned char *) rule->shingles_key->str, task->task_pool, rspamd_shingles_default_filter, NULL, - rule->alg); + rule->alg, rule->html_ignore_domains); if (html_sh != NULL) { /* Use structure shingles for fuzzy matching */