git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Fix CSS class normalization in HTML fuzzy tokens
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 4 Oct 2025 21:22:52 +0000 (22:22 +0100)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 4 Oct 2025 21:22:52 +0000 (22:22 +0100)
Multiple space-separated CSS classes were concatenated incorrectly,
causing token instability. Now only the first class is used, for consistency.

Example:
- Before: class="button primary" → token "a.buttonprimary"
- After:  class="button primary" → token "a.button"

This ensures HTML structure tokens are stable across variations.

src/libutil/shingles_html.cxx
src/plugins/fuzzy_check.c
test/functional/cases/120_fuzzy/html-fuzzy.robot

index 4b8fbf63d0df34ca5e6f10b73218c265a7311e96..8792f3ee8464fdc5cb506c55d38d69c3d4cf8a6c 100644 (file)
@@ -89,21 +89,30 @@ normalize_class(const char *cls, gsize len, rspamd_mempool_t *pool)
                return nullptr;
        }
 
+       /* For multiple classes (space-separated), take only first class */
+       gsize first_class_len = len;
+       for (gsize i = 0; i < len; i++) {
+               if (g_ascii_isspace(cls[i])) {
+                       first_class_len = i;
+                       break;
+               }
+       }
+
        /* Skip if mostly digits */
        unsigned int digit_count = 0;
-       for (gsize i = 0; i < len; i++) {
+       for (gsize i = 0; i < first_class_len; i++) {
                if (g_ascii_isdigit(cls[i])) {
                        digit_count++;
                }
        }
-       if (digit_count > len / 2) {
+       if (digit_count > first_class_len / 2) {
                return nullptr;
        }
 
-       auto *result = static_cast<char *>(rspamd_mempool_alloc(pool, len + 1));
+       auto *result = static_cast<char *>(rspamd_mempool_alloc(pool, first_class_len + 1));
        gsize out_len = 0;
 
-       for (gsize i = 0; i < len && out_len < 32; i++) {
+       for (gsize i = 0; i < first_class_len && out_len < 32; i++) {
                char c = cls[i];
                if (g_ascii_isalnum(c) || c == '-' || c == '_') {
                        result[out_len++] = g_ascii_tolower(c);
@@ -399,7 +408,12 @@ rspamd_shingles_from_html(void *html_content,
 
        auto *hc_ptr = html_content::from_ptr(html_content);
 
-       if (!hc_ptr || hc_ptr->all_tags.empty()) {
+       if (!hc_ptr) {
+               return nullptr;
+       }
+
+       if (hc_ptr->all_tags.empty()) {
+               /* Empty HTML - no tags */
                return nullptr;
        }
 
@@ -409,7 +423,7 @@ rspamd_shingles_from_html(void *html_content,
        html_extract_structural_tokens(hc_ptr, pool, tokens, cta_domains, all_domains);
 
        if (tokens.empty()) {
-               /* Empty HTML structure */
+               /* Empty HTML structure after filtering */
                return nullptr;
        }
 
index 8d87ff1e35552061f5fee9b07aada8240179fb3b..3876666dcc31b4fdb073ad32b99b2b62b021e9f0 100644 (file)
@@ -2116,17 +2116,13 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task,
        unsigned int additional_length;
        unsigned char *additional_data;
 
-       msg_debug_fuzzy_check("fuzzy_cmd_from_html_part called for rule %s", rule->name);
-
        /* Check if HTML shingles are enabled for this rule */
        if (!rule->html_shingles) {
-               msg_debug_fuzzy_check("HTML shingles disabled for rule %s", rule->name);
                return NULL;
        }
 
        /* Check if this is an HTML part */
        if (!IS_TEXT_PART_HTML(part) || part->html == NULL) {
-               msg_debug_fuzzy_check("Part is not HTML or html is NULL");
                return NULL;
        }
 
@@ -2137,13 +2133,24 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task,
                return NULL;
        }
 
-       msg_debug_fuzzy_check("Proceeding to generate HTML fuzzy hash, tags_count=%d",
-                                                 part->html_features ? part->html_features->tags_count : 0);
+       /*
+        * HTML fuzzy uses separate cache key to avoid conflicts with text fuzzy.
+        * Text parts can have both text hash (short text, no shingles) and HTML hash.
+        */
+       char html_cache_key[64];
+       int key_part;
+       struct rspamd_cached_shingles **html_cached_ptr;
 
-       cached = fuzzy_cmd_get_cached(rule, task, mp);
+       memcpy(&key_part, rule->shingles_key->str, sizeof(key_part));
+       rspamd_snprintf(html_cache_key, sizeof(html_cache_key), "%s%d_html",
+                                       rule->algorithm_str, key_part);
 
-       if (cached) {
-               /* Copy from cache */
+       html_cached_ptr = (struct rspamd_cached_shingles **) rspamd_mempool_get_variable(
+               task->task_pool, html_cache_key);
+
+       if (html_cached_ptr && html_cached_ptr[mp->part_number]) {
+               cached = html_cached_ptr[mp->part_number];
+               /* Copy from HTML-specific cache */
                additional_length = cached->additional_length;
                additional_data = cached->additional_data;
 
@@ -2176,11 +2183,8 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task,
                                                                                 sizeof(*encshcmd) + additional_length);
                shcmd = &encshcmd->cmd;
 
-               msg_debug_fuzzy_check("generating HTML shingles for part with %d tags",
-                                                         part->html_features ? part->html_features->tags_count : 0);
-
                html_sh = rspamd_shingles_from_html(part->html,
-                                                                                       rule->shingles_key->str, task->task_pool,
+                                                                                       (const unsigned char *) rule->shingles_key->str, task->task_pool,
                                                                                        rspamd_shingles_default_filter, NULL,
                                                                                        rule->alg);
 
@@ -3519,22 +3523,13 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                        if (rule->html_shingles && !(flags & FUZZY_CHECK_FLAG_NOHTML)) {
                                                struct fuzzy_cmd_io *html_io;
 
-                                               msg_debug_fuzzy_check("Attempting HTML fuzzy hash for rule %s", rule->name);
                                                html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value,
                                                                                                                   part, mime_part);
 
                                                if (html_io) {
                                                        /* Add HTML hash as separate command */
-                                                       msg_debug_fuzzy_check("HTML fuzzy hash generated and added to commands");
                                                        g_ptr_array_add(res, html_io);
                                                }
-                                               else {
-                                                       msg_debug_fuzzy_check("HTML fuzzy hash generation returned NULL");
-                                               }
-                                       }
-                                       else {
-                                               msg_debug_fuzzy_check("HTML fuzzy skipped: html_shingles=%d, NOHTML flag=%d",
-                                                                                         rule->html_shingles, !!(flags & FUZZY_CHECK_FLAG_NOHTML));
                                        }
                                }
                                else if (mime_part->part_type == RSPAMD_MIME_PART_IMAGE &&
index b27e9bd8c8e735a7e9a5eb573777bf277961e08b..9efb36e8d1eac4ef26c40fb50698e24867845fb2 100644 (file)
@@ -6,6 +6,7 @@ Resource        lib.robot
 *** Variables ***
 ${HTML_TEMPLATE_1}         ${RSPAMD_TESTDIR}/messages/html_template_1.eml
 ${HTML_TEMPLATE_1_VAR}     ${RSPAMD_TESTDIR}/messages/html_template_1_variation.eml
+${HTML_TEMPLATE_1_FUZZY}   ${RSPAMD_TESTDIR}/messages/html_template_1_fuzzy.eml
 ${HTML_PHISHING}           ${RSPAMD_TESTDIR}/messages/html_phishing.eml
 
 *** Keywords ***
@@ -32,13 +33,23 @@ HTML Fuzzy Check Test
   Scan File  ${HTML_TEMPLATE_1}
   Expect Symbol  ${FLAG1_SYMBOL}
 
-HTML Fuzzy Variation Test
-  [Documentation]  Check variation of same template (different text, same HTML structure)
+HTML Fuzzy Exact Match Variation Test
+  [Documentation]  Check exact match with different text but identical HTML structure
   IF  ${RSPAMD_FUZZY_HTML_ADD} == 0
     Fail  "HTML Fuzzy Add was not run"
   END
   Scan File  ${HTML_TEMPLATE_1_VAR}
-  # Should match via HTML shingles despite different text
+  # Should match exactly - same HTML structure, only text differs
+  Expect Symbol  ${FLAG1_SYMBOL}
+
+HTML Fuzzy Similarity Test
+  [Documentation]  Check fuzzy (similarity) match with slightly different HTML structure
+  IF  ${RSPAMD_FUZZY_HTML_ADD} == 0
+    Fail  "HTML Fuzzy Add was not run"
+  END
+  Scan File  ${HTML_TEMPLATE_1_FUZZY}
+  # Should match via shingles - similar but not identical HTML structure
+  # (added spacer div, extra paragraph, second article)
   Expect Symbol  ${FLAG1_SYMBOL}
 
 HTML Fuzzy Phishing Test
@@ -69,8 +80,11 @@ HTML Fuzzy Add
 HTML Fuzzy Exact Match
   HTML Fuzzy Check Test
 
-HTML Fuzzy Template Variation
-  HTML Fuzzy Variation Test
+HTML Fuzzy Exact Match With Text Variation
+  HTML Fuzzy Exact Match Variation Test
+
+HTML Fuzzy Similarity Match
+  HTML Fuzzy Similarity Test
 
 HTML Fuzzy Phishing Detection
   HTML Fuzzy Phishing Test