git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Allow HTML-only fuzzy rules
author Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 30 Oct 2025 18:43:01 +0000 (18:43 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 30 Oct 2025 18:43:01 +0000 (18:43 +0000)
- add per-rule text_hashes toggle so HTML shingles can stand alone

- adjust lua/C logic and move HTML example into main fuzzy config

conf/modules.d/fuzzy_check.conf
conf/modules.d/fuzzy_check_html.conf [deleted file]
lualib/lua_fuzzy.lua
src/plugins/fuzzy_check.c

diff --git a/conf/modules.d/fuzzy_check.conf b/conf/modules.d/fuzzy_check.conf
index 73e280f7958471e65d236f232cdd2154cce5902a..e3908ade2021640c8cd1d84b1b7b013f2551adee 100644 (file)
@@ -42,6 +42,24 @@ fuzzy_check {
       }
     }
   }
+  # Example HTML-only fuzzy rule. Uncomment and adjust the settings below if you
+  # want to maintain HTML structure hashes on a dedicated fuzzy storage.
+  # rule "html_structure_example" {
+  #   servers = "html-fuzzy.example.com:11335";
+  #   text_hashes = false;          # disable text hashes for this rule
+  #   skip_images = true;           # optional: do not hash images
+  #   html_shingles = true;         # enable HTML structure hashing
+  #   min_html_tags = 20;           # require substantial HTML before hashing
+  #   html_weight = 1.0;            # adjust weight of HTML matches if needed
+  #   symbol = "FUZZY_HTML_STRUCTURE";
+  #   max_score = 25.0;
+  #   fuzzy_map = {
+  #     FUZZY_HTML_SPAM {
+  #       flag = 200;
+  #       max_score = 25.0;
+  #     }
+  #   }
+  # }
   # Include dynamic conf for the rule
   .include(try=true,priority=5) "${DBDIR}/dynamic/fuzzy_check.conf"
   .include(try=true,priority=1,duplicate=merge) "$LOCAL_CONFDIR/local.d/fuzzy_check.conf"
diff --git a/conf/modules.d/fuzzy_check_html.conf b/conf/modules.d/fuzzy_check_html.conf
deleted file mode 100644 (file)
index ed4631e..0000000
+++ /dev/null
@@ -1,114 +0,0 @@
-# HTML Fuzzy Hashing Configuration Example
-#
-# This configuration demonstrates how to use HTML fuzzy hashing for:
-# 1. Detecting spam campaigns with similar HTML structure
-# 2. Phishing detection (similar structure, different CTA domains)
-# 3. Brand protection (legitimate templates vs. fake emails)
-
-fuzzy_check {
-  # Example rule for HTML structure matching
-  rule "HTML_FUZZY" {
-    # Standard fuzzy storage configuration
-    servers = "localhost:11335";
-
-    # Encryption (optional, recommended for production)
-    # encryption_key = "your_base32_encoded_public_key";
-    # fuzzy_key = "your_hashing_key";
-    # fuzzy_shingles_key = "your_shingles_key";
-
-    # Algorithm for shingles (mumhash recommended for HTML)
-    algorithm = "mumhash";
-
-    # Enable HTML fuzzy hashing
-    html_shingles = true;
-
-    # Minimum number of HTML tags to generate hash
-    # (prevents hashing of trivial HTML snippets)
-    min_html_tags = 15;
-
-    # Weight multiplier for HTML fuzzy matches
-    # Can be < 1.0 to reduce impact, or > 1.0 to increase
-    html_weight = 1.0;
-
-    # Regular fuzzy check settings
-    symbol = "FUZZY_HTML";
-    max_score = 20.0;
-
-    # Fuzzy flag mappings
-    fuzzy_map = {
-      # Whitelist: known legitimate HTML structures
-      "FUZZY_HTML_WHITELIST" {
-        flag = 1;
-        max_score = 20.0;
-      }
-      # Blacklist: known spam/phishing HTML structures
-      "FUZZY_HTML_BLACKLIST" {
-        flag = 2;
-        max_score = 20.0;
-      }
-    }
-
-    # Optional: skip specific hashes
-    # skip_hashes = "${LOCAL_CONFDIR}/local.d/fuzzy_skip_html.map";
-  }
-
-  # Example: Combined text + HTML rule
-  rule "COMBINED_FUZZY" {
-    servers = "localhost:11335";
-    algorithm = "mumhash";
-
-    # Enable both text and HTML fuzzy hashing
-    html_shingles = true;
-    min_html_tags = 10;
-
-    # This rule will generate:
-    # - Text fuzzy hashes (from content)
-    # - HTML fuzzy hashes (from structure)
-    # Both sent to same storage with same flag
-
-    symbol = "FUZZY_COMBINED";
-    max_score = 30.0;
-
-    fuzzy_map = {
-      "FUZZY_COMBINED_WHITE" {
-        flag = 10;
-        max_score = 30.0;
-      }
-      "FUZZY_COMBINED_SPAM" {
-        flag = 11;
-        max_score = 30.0;
-      }
-    }
-  }
-
-  # Example: Phishing detection rule (higher weight for HTML)
-  rule "PHISHING_DETECTION" {
-    servers = "localhost:11335";
-    algorithm = "mumhash";
-
-    html_shingles = true;
-    min_html_tags = 20;
-
-    # Higher weight for HTML matches = prioritize structure over content
-    html_weight = 1.5;
-
-    symbol = "FUZZY_PHISHING";
-    max_score = 25.0;
-
-    fuzzy_map = {
-      # Known phishing HTML templates
-      "FUZZY_PHISHING_HTML" {
-        flag = 20;
-        max_score = 25.0;
-      }
-      # Known legitimate brands (for comparison)
-      "FUZZY_LEGIT_BRANDS" {
-        flag = 21;
-        max_score = -25.0;  # Negative score = whitelist
-      }
-    }
-  }
-}
-
-# Additional configuration for phishing detection rules
-# See rules/fuzzy_html_phishing.lua for Lua-based detection logic
diff --git a/lualib/lua_fuzzy.lua b/lualib/lua_fuzzy.lua
index 986d1a045bc4d38d2454d21c1b80e27de4a94a03..23757ebb7c3a489ffdc332741aa0e516a8d2e7af 100644 (file)
@@ -42,6 +42,7 @@ local policies = {
     scan_archives = true,
     short_text_direct_hash = true,
     text_shingles = true,
+  text_hashes = true,
     skip_images = false,
   }
 }
@@ -58,6 +59,7 @@ local schema_fields = {
   scan_archives = ts.boolean,
   short_text_direct_hash = ts.boolean,
   text_shingles = ts.boolean,
+  text_hashes = ts.boolean,
   skip_images = ts.boolean,
 }
 local policy_schema = ts.shape(schema_fields)
@@ -176,6 +178,12 @@ local function check_text_part(task, part, rule, text)
 
   local id = part:get_id()
   lua_util.debugm(N, task, 'check text part %s', id)
+
+  if rule.text_hashes == false then
+    lua_util.debugm(N, task, 'text hashes disabled, relying on HTML for part %s', id)
+    return rule.html_shingles == true, false
+  end
+
   local wcnt = text:get_words_count()
 
   if rule.text_shingles then
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index d07565d753f6103a2e0cdccca17a7d9dbd8e71ca..8554f76bbe8669e35424d80985473b82b8503b51 100644 (file)
@@ -110,6 +110,7 @@ struct fuzzy_rule {
        gboolean no_share;
        gboolean no_subject;
        gboolean html_shingles;     /* Enable HTML fuzzy hashing */
+       gboolean text_hashes;        /* Enable/disable generation of text hashes */
        unsigned int min_html_tags; /* Minimum tags for HTML hash */
        int learn_condition_cb;
        uint32_t retransmits;
@@ -440,6 +441,7 @@ fuzzy_rule_new(const char *default_symbol, rspamd_mempool_t *pool)
        rule->weight_threshold = NAN;
        rule->html_weight = 1.0;
        rule->html_shingles = FALSE;
+       rule->text_hashes = TRUE;
        rule->min_html_tags = 10;
 
        return rule;
@@ -2032,6 +2034,10 @@ fuzzy_parse_rule(struct rspamd_config *cfg, const ucl_object_t *obj,
                rule->weight_threshold = ucl_object_todouble(value);
        }
 
+       if ((value = ucl_object_lookup(obj, "text_hashes")) != NULL) {
+               rule->text_hashes = ucl_object_toboolean(value); /* FALSE disables text hashes for this rule */
+       }
+
        if ((value = ucl_object_lookup(obj, "html_shingles")) != NULL) {
                rule->html_shingles = ucl_object_toboolean(value);
        }
@@ -2397,6 +2403,42 @@ int fuzzy_check_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
                                                           0,
                                                           "true",
                                                           0);
+       rspamd_rcl_add_doc_by_path(cfg,
+                                  "fuzzy_check.rule",
+                                  "Enable hashing of text content (set to false to disable text hashes)",
+                                  "text_hashes",
+                                  UCL_BOOLEAN,
+                                  NULL,
+                                  0,
+                                  "true",
+                                  0);
+       rspamd_rcl_add_doc_by_path(cfg,
+                                  "fuzzy_check.rule",
+                                  "Enable HTML structure hashing for this rule",
+                                  "html_shingles",
+                                  UCL_BOOLEAN,
+                                  NULL,
+                                  0,
+                                  "false",
+                                  0);
+       rspamd_rcl_add_doc_by_path(cfg,
+                                  "fuzzy_check.rule",
+                                  "Minimum number of HTML tags required to generate HTML hashes",
+                                  "min_html_tags",
+                                  UCL_INT,
+                                  NULL,
+                                  0,
+                                  NULL,
+                                  0);
+       rspamd_rcl_add_doc_by_path(cfg,
+                                  "fuzzy_check.rule",
+                                  "Multiplier applied to HTML fuzzy matches",
+                                  "html_weight",
+                                  UCL_FLOAT,
+                                  NULL,
+                                  0,
+                                  NULL,
+                                  0);
        rspamd_rcl_add_doc_by_path(cfg,
                                                           "fuzzy_check.rule",
                                                           "Override module default min bytes for this rule",
@@ -5086,7 +5128,7 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                        g_ptr_array_add(res, io);
                }
 
-               goto end;
+               return res;
        }
        else if (c == FUZZY_PING) {
                res = g_ptr_array_sized_new(1);
@@ -5096,11 +5138,11 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                        g_ptr_array_add(res, io);
                }
 
-               goto end;
+               return res;
        }
 
        if (task->message == NULL) {
-               goto end;
+               return res;
        }
 
        res = g_ptr_array_sized_new(MESSAGE_FIELD(task, parts)->len + 1);
@@ -5118,21 +5160,25 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                if (mime_part->part_type == RSPAMD_MIME_PART_TEXT &&
                                        !(flags & FUZZY_CHECK_FLAG_NOTEXT)) {
                                        part = mime_part->specific.txt;
+                                       gboolean allow_html = rule->html_shingles &&
+                                               !(flags & FUZZY_CHECK_FLAG_NOHTML) &&
+                                               (check_part || !rule->text_hashes);
+
+                                       if (check_part && rule->text_hashes) {
+                                               io = fuzzy_cmd_from_text_part(task, rule,
+                                                                                       c,
+                                                                                       flag,
+                                                                                       value,
+                                                                                       !fuzzy_check,
+                                                                                       part,
+                                                                                       mime_part);
+                                       }
 
-                                       io = fuzzy_cmd_from_text_part(task, rule,
-                                                                                                 c,
-                                                                                                 flag,
-                                                                                                 value,
-                                                                                                 !fuzzy_check,
-                                                                                                 part,
-                                                                                                 mime_part);
-
-                                       /* Try HTML fuzzy hash if enabled and text hash generation succeeded/failed */
-                                       if (rule->html_shingles && !(flags & FUZZY_CHECK_FLAG_NOHTML)) {
+                                       if (allow_html && part != NULL) {
                                                struct fuzzy_cmd_io *html_io;
 
                                                html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value,
-                                                                                                                  part, mime_part);
+                                                                                       part, mime_part);
 
                                                if (html_io) {
                                                        /* Add HTML hash as separate command */
@@ -5140,17 +5186,17 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                                }
                                        }
                                }
-                               else if (mime_part->part_type == RSPAMD_MIME_PART_IMAGE &&
-                                                !(flags & FUZZY_CHECK_FLAG_NOIMAGES)) {
+                               else if (check_part && mime_part->part_type == RSPAMD_MIME_PART_IMAGE &&
+                                        !(flags & FUZZY_CHECK_FLAG_NOIMAGES)) {
                                        image = mime_part->specific.img;
 
                                        io = fuzzy_cmd_from_data_part(rule, c, flag, value,
-                                                                                                 task,
-                                                                                                 image->parent->digest,
-                                                                                                 mime_part);
+                                                                                         task,
+                                                                                         image->parent->digest,
+                                                                                         mime_part);
                                        io->flags |= FUZZY_CMD_FLAG_IMAGE;
                                }
-                               else if (mime_part->part_type == RSPAMD_MIME_PART_CUSTOM_LUA) {
+                               else if (check_part && mime_part->part_type == RSPAMD_MIME_PART_CUSTOM_LUA) {
                                        const struct rspamd_lua_specific_part *lua_spec;
 
                                        lua_spec = &mime_part->specific.lua_specific;
@@ -5189,10 +5235,10 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
 
                                                                if (hlen == rspamd_cryptobox_HASHBYTES) {
                                                                        io = fuzzy_cmd_from_data_part(rule, c,
-                                                                                                                                 flag, value,
-                                                                                                                                 task,
-                                                                                                                                 (unsigned char *) h,
-                                                                                                                                 mime_part);
+                                                                                             flag, value,
+                                                                                             task,
+                                                                                             (unsigned char *) h,
+                                                                                             mime_part);
 
                                                                        if (io) {
                                                                                io->flags |= FUZZY_CMD_FLAG_CONTENT;
@@ -5208,16 +5254,16 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                                 * Add part itself as well
                                                 */
                                                io = fuzzy_cmd_from_data_part(rule, c,
-                                                                                                         flag, value,
-                                                                                                         task,
-                                                                                                         mime_part->digest,
-                                                                                                         mime_part);
+                                                                                         flag, value,
+                                                                                         task,
+                                                                                         mime_part->digest,
+                                                                                         mime_part);
                                        }
                                }
-                               else {
+                               else if (check_part) {
                                        io = fuzzy_cmd_from_data_part(rule, c, flag, value,
-                                                                                                 task,
-                                                                                                 mime_part->digest, mime_part);
+                                                                                         task,
+                                                                                         mime_part->digest, mime_part);
                                }
 
                                if (io) {
@@ -5226,7 +5272,7 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                        PTR_ARRAY_FOREACH(res, j, cur)
                                        {
                                                if (memcmp(cur->cmd.digest, io->cmd.digest,
-                                                                  sizeof(io->cmd.digest)) == 0) {
+                                                          sizeof(io->cmd.digest)) == 0) {
                                                        skip_existing = TRUE;
                                                        break;
                                                }
@@ -5240,7 +5286,6 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                }
        }
 
-end:
        if (res && res->len == 0) {
                g_ptr_array_free(res, TRUE);