From: Vsevolod Stakhov Date: Thu, 30 Oct 2025 18:43:01 +0000 (+0000) Subject: [Feature] Allow HTML-only fuzzy rules X-Git-Tag: 3.14.0~26^2~3 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=055715b16836150b4b57459ff32d0512dcf02b56;p=thirdparty%2Frspamd.git [Feature] Allow HTML-only fuzzy rules - add per-rule text_hashes toggle so HTML shingles can stand alone - adjust lua/C logic and move HTML example into main fuzzy config --- diff --git a/conf/modules.d/fuzzy_check.conf b/conf/modules.d/fuzzy_check.conf index 73e280f795..e3908ade20 100644 --- a/conf/modules.d/fuzzy_check.conf +++ b/conf/modules.d/fuzzy_check.conf @@ -42,6 +42,24 @@ fuzzy_check { } } } + # Example HTML-only fuzzy rule. Uncomment and adjust the settings below if you + # want to maintain HTML structure hashes on a dedicated fuzzy storage. + # rule "html_structure_example" { + # servers = "html-fuzzy.example.com:11335"; + # text_hashes = false; # disable text hashes for this rule + # skip_images = true; # optional: do not hash images + # html_shingles = true; # enable HTML structure hashing + # min_html_tags = 20; # require substantial HTML before hashing + # html_weight = 1.0; # adjust weight of HTML matches if needed + # symbol = "FUZZY_HTML_STRUCTURE"; + # max_score = 25.0; + # fuzzy_map = { + # FUZZY_HTML_SPAM { + # flag = 200; + # max_score = 25.0; + # } + # } + # } # Include dynamic conf for the rule .include(try=true,priority=5) "${DBDIR}/dynamic/fuzzy_check.conf" .include(try=true,priority=1,duplicate=merge) "$LOCAL_CONFDIR/local.d/fuzzy_check.conf" diff --git a/conf/modules.d/fuzzy_check_html.conf b/conf/modules.d/fuzzy_check_html.conf deleted file mode 100644 index ed4631e9ad..0000000000 --- a/conf/modules.d/fuzzy_check_html.conf +++ /dev/null @@ -1,114 +0,0 @@ -# HTML Fuzzy Hashing Configuration Example -# -# This configuration demonstrates how to use HTML fuzzy hashing for: -# 1. Detecting spam campaigns with similar HTML structure -# 2. Phishing detection (similar structure, different CTA domains) -# 3. Brand protection (legitimate templates vs. fake emails) - -fuzzy_check { - # Example rule for HTML structure matching - rule "HTML_FUZZY" { - # Standard fuzzy storage configuration - servers = "localhost:11335"; - - # Encryption (optional, recommended for production) - # encryption_key = "your_base32_encoded_public_key"; - # fuzzy_key = "your_hashing_key"; - # fuzzy_shingles_key = "your_shingles_key"; - - # Algorithm for shingles (mumhash recommended for HTML) - algorithm = "mumhash"; - - # Enable HTML fuzzy hashing - html_shingles = true; - - # Minimum number of HTML tags to generate hash - # (prevents hashing of trivial HTML snippets) - min_html_tags = 15; - - # Weight multiplier for HTML fuzzy matches - # Can be < 1.0 to reduce impact, or > 1.0 to increase - html_weight = 1.0; - - # Regular fuzzy check settings - symbol = "FUZZY_HTML"; - max_score = 20.0; - - # Fuzzy flag mappings - fuzzy_map = { - # Whitelist: known legitimate HTML structures - "FUZZY_HTML_WHITELIST" { - flag = 1; - max_score = 20.0; - } - # Blacklist: known spam/phishing HTML structures - "FUZZY_HTML_BLACKLIST" { - flag = 2; - max_score = 20.0; - } - } - - # Optional: skip specific hashes - # skip_hashes = "${LOCAL_CONFDIR}/local.d/fuzzy_skip_html.map"; - } - - # Example: Combined text + HTML rule - rule "COMBINED_FUZZY" { - servers = "localhost:11335"; - algorithm = "mumhash"; - - # Enable both text and HTML fuzzy hashing - html_shingles = true; - min_html_tags = 10; - - # This rule will generate: - # - Text fuzzy hashes (from content) - # - HTML fuzzy hashes (from structure) - # Both sent to same storage with same flag - - symbol = "FUZZY_COMBINED"; - max_score = 30.0; - - fuzzy_map = { - "FUZZY_COMBINED_WHITE" { - flag = 10; - max_score = 30.0; - } - "FUZZY_COMBINED_SPAM" { - flag = 11; - max_score = 30.0; - } - } - } - - # Example: Phishing detection rule (higher weight for HTML) - rule "PHISHING_DETECTION" { - servers = "localhost:11335"; - algorithm = "mumhash"; - - html_shingles = true; - min_html_tags = 20; - - # Higher weight for HTML matches = prioritize structure over content - html_weight = 1.5; - - symbol = "FUZZY_PHISHING"; - max_score = 25.0; - - fuzzy_map = { - # Known phishing HTML templates - "FUZZY_PHISHING_HTML" { - flag = 20; - max_score = 25.0; - } - # Known legitimate brands (for comparison) - "FUZZY_LEGIT_BRANDS" { - flag = 21; - max_score = -25.0; # Negative score = whitelist - } - } - } -} - -# Additional configuration for phishing detection rules -# See rules/fuzzy_html_phishing.lua for Lua-based detection logic diff --git a/lualib/lua_fuzzy.lua b/lualib/lua_fuzzy.lua index 986d1a045b..23757ebb7c 100644 --- a/lualib/lua_fuzzy.lua +++ b/lualib/lua_fuzzy.lua @@ -42,6 +42,7 @@ local policies = { scan_archives = true, short_text_direct_hash = true, text_shingles = true, + text_hashes = true, skip_images = false, } } @@ -58,6 +59,7 @@ local schema_fields = { scan_archives = ts.boolean, short_text_direct_hash = ts.boolean, text_shingles = ts.boolean, + text_hashes = ts.boolean, skip_images = ts.boolean, } local policy_schema = ts.shape(schema_fields) @@ -176,6 +178,12 @@ local function check_text_part(task, part, rule, text) local id = part:get_id() lua_util.debugm(N, task, 'check text part %s', id) + + if rule.text_hashes == false then + lua_util.debugm(N, task, 'text hashes disabled, relying on HTML for part %s', id) + return rule.html_shingles == true, false + end + local wcnt = text:get_words_count() if rule.text_shingles then diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index d07565d753..8554f76bbe 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -110,6 +110,7 @@ struct fuzzy_rule { gboolean no_share; gboolean no_subject; gboolean html_shingles; /* Enable HTML fuzzy hashing */ + gboolean text_hashes; /* Enable/disable generation of text hashes */ unsigned int min_html_tags; /* Minimum tags for HTML hash */ int learn_condition_cb; uint32_t retransmits; @@ -440,6 +441,7 @@ fuzzy_rule_new(const char *default_symbol, rspamd_mempool_t *pool) rule->weight_threshold = NAN; rule->html_weight = 1.0; rule->html_shingles = FALSE; + rule->text_hashes = TRUE; rule->min_html_tags = 10; return rule; @@ -2032,6 +2034,10 @@ fuzzy_parse_rule(struct rspamd_config *cfg, const ucl_object_t *obj, rule->weight_threshold = ucl_object_todouble(value); } + if ((value = ucl_object_lookup(obj, "text_hashes")) != NULL) { + rule->text_hashes = ucl_obj_toboolean(value); + } + if ((value = ucl_object_lookup(obj, "html_shingles")) != NULL) { rule->html_shingles = ucl_object_toboolean(value); } @@ -2397,6 +2403,42 @@ int fuzzy_check_module_init(struct rspamd_config *cfg, struct module_ctx **ctx) 0, "true", 0); + rspamd_rcl_add_doc_by_path(cfg, + "fuzzy_check.rule", + "Enable hashing of text content (set to false to disable text hashes)", + "text_hashes", + UCL_BOOLEAN, + NULL, + 0, + "true", + 0); + rspamd_rcl_add_doc_by_path(cfg, + "fuzzy_check.rule", + "Enable HTML structure hashing for this rule", + "html_shingles", + UCL_BOOLEAN, + NULL, + 0, + "false", + 0); + rspamd_rcl_add_doc_by_path(cfg, + "fuzzy_check.rule", + "Minimum number of HTML tags required to generate HTML hashes", + "min_html_tags", + UCL_INT, + NULL, + 0, + NULL, + 0); + rspamd_rcl_add_doc_by_path(cfg, + "fuzzy_check.rule", + "Multiplier applied to HTML fuzzy matches", + "html_weight", + UCL_FLOAT, + NULL, + 0, + NULL, + 0); rspamd_rcl_add_doc_by_path(cfg, "fuzzy_check.rule", "Override module default min bytes for this rule", @@ -5086,7 +5128,7 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, g_ptr_array_add(res, io); } - goto end; + return res; } else if (c == FUZZY_PING) { res = g_ptr_array_sized_new(1); @@ -5096,11 +5138,11 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, g_ptr_array_add(res, io); } - goto end; + return res; } if (task->message == NULL) { - goto end; + return res; } res = g_ptr_array_sized_new(MESSAGE_FIELD(task, parts)->len + 1); @@ -5118,21 +5160,25 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, if (mime_part->part_type == RSPAMD_MIME_PART_TEXT && !(flags & FUZZY_CHECK_FLAG_NOTEXT)) { part = mime_part->specific.txt; + gboolean allow_html = rule->html_shingles && + !(flags & FUZZY_CHECK_FLAG_NOHTML) && + (check_part || !rule->text_hashes); + + if (check_part && rule->text_hashes) { + io = fuzzy_cmd_from_text_part(task, rule, + c, + flag, + value, + !fuzzy_check, + part, + mime_part); + } - io = fuzzy_cmd_from_text_part(task, rule, - c, - flag, - value, - !fuzzy_check, - part, - mime_part); - - /* Try HTML fuzzy hash if enabled and text hash generation succeeded/failed */ - if (rule->html_shingles && !(flags & FUZZY_CHECK_FLAG_NOHTML)) { + if (allow_html && part != NULL) { struct fuzzy_cmd_io *html_io; html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value, - part, mime_part); + part, mime_part); if (html_io) { /* Add HTML hash as separate command */ @@ -5140,17 +5186,17 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, } } } - else if (mime_part->part_type == RSPAMD_MIME_PART_IMAGE && - !(flags & FUZZY_CHECK_FLAG_NOIMAGES)) { + else if (check_part && mime_part->part_type == RSPAMD_MIME_PART_IMAGE && + !(flags & FUZZY_CHECK_FLAG_NOIMAGES)) { image = mime_part->specific.img; io = fuzzy_cmd_from_data_part(rule, c, flag, value, - task, - image->parent->digest, - mime_part); + task, + image->parent->digest, + mime_part); io->flags |= FUZZY_CMD_FLAG_IMAGE; } - else if (mime_part->part_type == RSPAMD_MIME_PART_CUSTOM_LUA) { + else if (check_part && mime_part->part_type == RSPAMD_MIME_PART_CUSTOM_LUA) { const struct rspamd_lua_specific_part *lua_spec; lua_spec = &mime_part->specific.lua_specific; @@ -5189,10 +5235,10 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, if (hlen == rspamd_cryptobox_HASHBYTES) { io = fuzzy_cmd_from_data_part(rule, c, - flag, value, - task, - (unsigned char *) h, - mime_part); + flag, value, + task, + (unsigned char *) h, + mime_part); if (io) { io->flags |= FUZZY_CMD_FLAG_CONTENT; @@ -5208,16 +5254,16 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, * Add part itself as well */ io = fuzzy_cmd_from_data_part(rule, c, - flag, value, - task, - mime_part->digest, - mime_part); + flag, value, + task, + mime_part->digest, + mime_part); } } - else { + else if (check_part) { io = fuzzy_cmd_from_data_part(rule, c, flag, value, - task, - mime_part->digest, mime_part); + task, + mime_part->digest, mime_part); } if (io) { @@ -5226,7 +5272,7 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, PTR_ARRAY_FOREACH(res, j, cur) { if (memcmp(cur->cmd.digest, io->cmd.digest, - sizeof(io->cmd.digest)) == 0) { + sizeof(io->cmd.digest)) == 0) { skip_existing = TRUE; break; } @@ -5240,7 +5286,6 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, } } -end: if (res && res->len == 0) { g_ptr_array_free(res, TRUE);