git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Add safety checks for short HTML to prevent false positives
author Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 5 Oct 2025 07:32:54 +0000 (08:32 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 5 Oct 2025 07:32:54 +0000 (08:32 +0100)
Require minimum complexity for HTML fuzzy matching:
- At least 2 links (single-link emails too generic)
- A DOM depth of at least 3 (flat structures too common)

This prevents false positives on trivial HTML like:
  <html><body><p>text <a href="...">link</a></p></body></html>

Such simple structures are not unique enough for reliable fuzzy matching.

src/plugins/fuzzy_check.c

index 3876666dcc31b4fdb073ad32b99b2b62b021e9f0..9568cf8410dc87e30933c88ffb632d971bfd0799 100644 (file)
@@ -2133,6 +2133,24 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task,
                return NULL;
        }
 
+       /*
+        * Additional safety checks for short HTML to prevent false positives:
+        * - Require at least 2 links (single-link emails too generic)
+        * - Require a DOM depth of at least 3 (flat structure too common)
+        */
+       if (part->html_features) {
+               if (part->html_features->links.total_links < 2) {
+                       msg_debug_fuzzy_check("HTML part has only %d links, too few for reliable matching",
+                                                                 part->html_features->links.total_links);
+                       return NULL;
+               }
+               if (part->html_features->max_dom_depth < 3) {
+                       msg_debug_fuzzy_check("HTML part has depth %d, too shallow for reliable matching",
+                                                                 part->html_features->max_dom_depth);
+                       return NULL;
+               }
+       }
+
        /*
         * HTML fuzzy uses separate cache key to avoid conflicts with text fuzzy.
         * Text parts can have both text hash (short text, no shingles) and HTML hash.