From: Vsevolod Stakhov Date: Sat, 4 Oct 2025 21:22:52 +0000 (+0100) Subject: [Fix] Fix CSS class normalization in HTML fuzzy tokens X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=8f41c948a4805f900746985db80d076504cac766;p=thirdparty%2Frspamd.git [Fix] Fix CSS class normalization in HTML fuzzy tokens Multiple CSS classes (space-separated) were concatenated incorrectly, causing token instability. Now take only first class for consistency. Example: - Before: class="button primary" → token "a.buttonprimary" - After: class="button primary" → token "a.button" This ensures HTML structure tokens are stable across variations. --- diff --git a/src/libutil/shingles_html.cxx b/src/libutil/shingles_html.cxx index 4b8fbf63d0..8792f3ee84 100644 --- a/src/libutil/shingles_html.cxx +++ b/src/libutil/shingles_html.cxx @@ -89,21 +89,30 @@ normalize_class(const char *cls, gsize len, rspamd_mempool_t *pool) return nullptr; } + /* For multiple classes (space-separated), take only first class */ + gsize first_class_len = len; + for (gsize i = 0; i < len; i++) { + if (g_ascii_isspace(cls[i])) { + first_class_len = i; + break; + } + } + /* Skip if mostly digits */ unsigned int digit_count = 0; - for (gsize i = 0; i < len; i++) { + for (gsize i = 0; i < first_class_len; i++) { if (g_ascii_isdigit(cls[i])) { digit_count++; } } - if (digit_count > len / 2) { + if (digit_count > first_class_len / 2) { return nullptr; } - auto *result = static_cast<char *>(rspamd_mempool_alloc(pool, len + 1)); + auto *result = static_cast<char *>(rspamd_mempool_alloc(pool, first_class_len + 1)); gsize out_len = 0; - for (gsize i = 0; i < len && out_len < 32; i++) { + for (gsize i = 0; i < first_class_len && out_len < 32; i++) { char c = cls[i]; if (g_ascii_isalnum(c) || c == '-' || c == '_') { result[out_len++] = g_ascii_tolower(c); @@ -399,7 +408,12 @@ rspamd_shingles_from_html(void *html_content, auto *hc_ptr = html_content::from_ptr(html_content); - if (!hc_ptr || hc_ptr->all_tags.empty()) { + if (!hc_ptr) 
{ + return nullptr; + } + + if (hc_ptr->all_tags.empty()) { + /* Empty HTML - no tags */ return nullptr; } @@ -409,7 +423,7 @@ rspamd_shingles_from_html(void *html_content, html_extract_structural_tokens(hc_ptr, pool, tokens, cta_domains, all_domains); if (tokens.empty()) { - /* Empty HTML structure */ + /* Empty HTML structure after filtering */ return nullptr; } diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index 8d87ff1e35..3876666dcc 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -2116,17 +2116,13 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task, unsigned int additional_length; unsigned char *additional_data; - msg_debug_fuzzy_check("fuzzy_cmd_from_html_part called for rule %s", rule->name); - /* Check if HTML shingles are enabled for this rule */ if (!rule->html_shingles) { - msg_debug_fuzzy_check("HTML shingles disabled for rule %s", rule->name); return NULL; } /* Check if this is an HTML part */ if (!IS_TEXT_PART_HTML(part) || part->html == NULL) { - msg_debug_fuzzy_check("Part is not HTML or html is NULL"); return NULL; } @@ -2137,13 +2133,24 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task, return NULL; } - msg_debug_fuzzy_check("Proceeding to generate HTML fuzzy hash, tags_count=%d", - part->html_features ? part->html_features->tags_count : 0); + /* + * HTML fuzzy uses separate cache key to avoid conflicts with text fuzzy. + * Text parts can have both text hash (short text, no shingles) and HTML hash. 
+ */ + char html_cache_key[64]; + int key_part; + struct rspamd_cached_shingles **html_cached_ptr; - cached = fuzzy_cmd_get_cached(rule, task, mp); + memcpy(&key_part, rule->shingles_key->str, sizeof(key_part)); + rspamd_snprintf(html_cache_key, sizeof(html_cache_key), "%s%d_html", + rule->algorithm_str, key_part); - if (cached) { - /* Copy from cache */ + html_cached_ptr = (struct rspamd_cached_shingles **) rspamd_mempool_get_variable( + task->task_pool, html_cache_key); + + if (html_cached_ptr && html_cached_ptr[mp->part_number]) { + cached = html_cached_ptr[mp->part_number]; + /* Copy from HTML-specific cache */ additional_length = cached->additional_length; additional_data = cached->additional_data; @@ -2176,11 +2183,8 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task, sizeof(*encshcmd) + additional_length); shcmd = &encshcmd->cmd; - msg_debug_fuzzy_check("generating HTML shingles for part with %d tags", - part->html_features ? part->html_features->tags_count : 0); - html_sh = rspamd_shingles_from_html(part->html, - rule->shingles_key->str, task->task_pool, + (const unsigned char *) rule->shingles_key->str, task->task_pool, rspamd_shingles_default_filter, NULL, rule->alg); @@ -3519,22 +3523,13 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, if (rule->html_shingles && !(flags & FUZZY_CHECK_FLAG_NOHTML)) { struct fuzzy_cmd_io *html_io; - msg_debug_fuzzy_check("Attempting HTML fuzzy hash for rule %s", rule->name); html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value, part, mime_part); if (html_io) { /* Add HTML hash as separate command */ - msg_debug_fuzzy_check("HTML fuzzy hash generated and added to commands"); g_ptr_array_add(res, html_io); } - else { - msg_debug_fuzzy_check("HTML fuzzy hash generation returned NULL"); - } - } - else { - msg_debug_fuzzy_check("HTML fuzzy skipped: html_shingles=%d, NOHTML flag=%d", - rule->html_shingles, !!(flags & FUZZY_CHECK_FLAG_NOHTML)); } } else if (mime_part->part_type == 
RSPAMD_MIME_PART_IMAGE && diff --git a/test/functional/cases/120_fuzzy/html-fuzzy.robot b/test/functional/cases/120_fuzzy/html-fuzzy.robot index b27e9bd8c8..9efb36e8d1 100644 --- a/test/functional/cases/120_fuzzy/html-fuzzy.robot +++ b/test/functional/cases/120_fuzzy/html-fuzzy.robot @@ -6,6 +6,7 @@ Resource lib.robot *** Variables *** ${HTML_TEMPLATE_1} ${RSPAMD_TESTDIR}/messages/html_template_1.eml ${HTML_TEMPLATE_1_VAR} ${RSPAMD_TESTDIR}/messages/html_template_1_variation.eml +${HTML_TEMPLATE_1_FUZZY} ${RSPAMD_TESTDIR}/messages/html_template_1_fuzzy.eml ${HTML_PHISHING} ${RSPAMD_TESTDIR}/messages/html_phishing.eml *** Keywords *** @@ -32,13 +33,23 @@ HTML Fuzzy Check Test Scan File ${HTML_TEMPLATE_1} Expect Symbol ${FLAG1_SYMBOL} -HTML Fuzzy Variation Test - [Documentation] Check variation of same template (different text, same HTML structure) +HTML Fuzzy Exact Match Variation Test + [Documentation] Check exact match with different text but identical HTML structure IF ${RSPAMD_FUZZY_HTML_ADD} == 0 Fail "HTML Fuzzy Add was not run" END Scan File ${HTML_TEMPLATE_1_VAR} - # Should match via HTML shingles despite different text + # Should match exactly - same HTML structure, only text differs + Expect Symbol ${FLAG1_SYMBOL} + +HTML Fuzzy Similarity Test + [Documentation] Check fuzzy (similarity) match with slightly different HTML structure + IF ${RSPAMD_FUZZY_HTML_ADD} == 0 + Fail "HTML Fuzzy Add was not run" + END + Scan File ${HTML_TEMPLATE_1_FUZZY} + # Should match via shingles - similar but not identical HTML structure + # (added spacer div, extra paragraph, second article) Expect Symbol ${FLAG1_SYMBOL} HTML Fuzzy Phishing Test @@ -69,8 +80,11 @@ HTML Fuzzy Add HTML Fuzzy Exact Match HTML Fuzzy Check Test -HTML Fuzzy Template Variation - HTML Fuzzy Variation Test +HTML Fuzzy Exact Match With Text Variation + HTML Fuzzy Exact Match Variation Test + +HTML Fuzzy Similarity Match + HTML Fuzzy Similarity Test HTML Fuzzy Phishing Detection HTML Fuzzy Phishing Test