git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Add debug logging to HTML fuzzy hash generation
author Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 4 Oct 2025 20:30:53 +0000 (21:30 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 4 Oct 2025 20:30:53 +0000 (21:30 +0100)
Add detailed debug messages to track HTML fuzzy hash generation flow:
- Log when fuzzy_cmd_from_html_part is called
- Log HTML shingles enabled/disabled status
- Log HTML part detection
- Log tag count checks
- Log successful/failed hash generation

This helps diagnose issues with HTML fuzzy matching in tests.

src/plugins/fuzzy_check.c
test/functional/cases/120_fuzzy/html-fuzzy.robot

index 0afdfba67aa6f18304923cc259687cae3c2f0842..8d87ff1e35552061f5fee9b07aada8240179fb3b 100644 (file)
@@ -2116,13 +2116,17 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task,
        unsigned int additional_length;
        unsigned char *additional_data;
 
+       msg_debug_fuzzy_check("fuzzy_cmd_from_html_part called for rule %s", rule->name);
+
        /* Check if HTML shingles are enabled for this rule */
        if (!rule->html_shingles) {
+               msg_debug_fuzzy_check("HTML shingles disabled for rule %s", rule->name);
                return NULL;
        }
 
        /* Check if this is an HTML part */
        if (!IS_TEXT_PART_HTML(part) || part->html == NULL) {
+               msg_debug_fuzzy_check("Part is not HTML or html is NULL");
                return NULL;
        }
 
@@ -2133,6 +2137,9 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task,
                return NULL;
        }
 
+       msg_debug_fuzzy_check("Proceeding to generate HTML fuzzy hash, tags_count=%d",
+                                                 part->html_features ? part->html_features->tags_count : 0);
+
        cached = fuzzy_cmd_get_cached(rule, task, mp);
 
        if (cached) {
@@ -3512,13 +3519,22 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                        if (rule->html_shingles && !(flags & FUZZY_CHECK_FLAG_NOHTML)) {
                                                struct fuzzy_cmd_io *html_io;
 
+                                               msg_debug_fuzzy_check("Attempting HTML fuzzy hash for rule %s", rule->name);
                                                html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value,
                                                                                                                   part, mime_part);
 
                                                if (html_io) {
                                                        /* Add HTML hash as separate command */
+                                                       msg_debug_fuzzy_check("HTML fuzzy hash generated and added to commands");
                                                        g_ptr_array_add(res, html_io);
                                                }
+                                               else {
+                                                       msg_debug_fuzzy_check("HTML fuzzy hash generation returned NULL");
+                                               }
+                                       }
+                                       else {
+                                               msg_debug_fuzzy_check("HTML fuzzy skipped: html_shingles=%d, NOHTML flag=%d",
+                                                                                         rule->html_shingles, !!(flags & FUZZY_CHECK_FLAG_NOHTML));
                                        }
                                }
                                else if (mime_part->part_type == RSPAMD_MIME_PART_IMAGE &&
index ee63c445d3e68a7151cf87f34f82417b3d553b4a..26be91392a4c0befb36ced0efc8e290338763fac 100644 (file)
@@ -46,12 +46,10 @@ HTML Fuzzy Phishing Test
   IF  ${RSPAMD_FUZZY_HTML_ADD} == 0
     Fail  "HTML Fuzzy Add was not run"
   END
-  Scan File  ${HTML_PHISHING}
   # Structure similar but CTA domains different
   # Might match with lower score or not match depending on CTA weight
-  # For now just verify no crash
-  ${result} =  Scan Message With Rspamc  ${HTML_PHISHING}
-  Should Be Equal As Numbers  ${result.returncode}  0
+  # Just verify scanning works without crash
+  Scan File  ${HTML_PHISHING}
 
 HTML Fuzzy Delete Test
   [Documentation]  Delete HTML fuzzy hash