From: Vsevolod Stakhov Date: Sat, 4 Oct 2025 18:34:48 +0000 (+0100) Subject: [Feature] Integrate HTML fuzzy hashing into fuzzy_check module X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=28e67afe35d7e67a77ade47e15df16c1e98f4d50;p=thirdparty%2Frspamd.git [Feature] Integrate HTML fuzzy hashing into fuzzy_check module Add support for HTML structure fuzzy hashing in fuzzy_check plugin: Core integration: - Add FUZZY_CMD_FLAG_HTML flag and FUZZY_RESULT_HTML result type - Add html_shingles, min_html_tags, html_weight options to fuzzy_rule - Implement fuzzy_cmd_from_html_part() to generate HTML fuzzy commands - Integrate into fuzzy_generate_commands() for automatic hash generation - Handle HTML results with configurable weight multiplier Configuration: - html_shingles: enable/disable HTML fuzzy hashing per rule - min_html_tags: minimum HTML tags threshold (default 10) - html_weight: score multiplier for HTML matches (default 1.0) Use cases: 1. Brand protection: detect phishing with copied HTML but fake CTA 2. Spam campaigns: group messages by HTML structure 3. Template detection: identify newsletters/notifications 4. 
Phishing: text match + HTML CTA mismatch = suspicious Files added: - lualib/lua_fuzzy_html.lua: helper functions for mismatch detection - conf/modules.d/fuzzy_check_html.conf: configuration examples - test/functional/configs/fuzzy_html_test.conf: test configuration - rules/fuzzy_html_phishing.lua: phishing detection rules HTML fuzzy works alongside text fuzzy: - Both hashes generated and sent to storage - Separate result types allow different handling - CTA domain verification prevents false positives Next steps: - Performance testing on real email corpus - Fine-tune weights and thresholds - Collect legitimate brand templates for whitelisting --- diff --git a/conf/modules.d/fuzzy_check_html.conf b/conf/modules.d/fuzzy_check_html.conf new file mode 100644 index 0000000000..face9c916d --- /dev/null +++ b/conf/modules.d/fuzzy_check_html.conf @@ -0,0 +1,114 @@ +# HTML Fuzzy Hashing Configuration Example +# +# This configuration demonstrates how to use HTML fuzzy hashing for: +# 1. Detecting spam campaigns with similar HTML structure +# 2. Phishing detection (similar structure, different CTA domains) +# 3. Brand protection (legitimate templates vs. 
fake emails) + +fuzzy_check { + # Example rule for HTML structure matching + rule "HTML_FUZZY" { + # Standard fuzzy storage configuration + servers = "localhost:11335"; + + # Encryption (optional, recommended for production) + # encryption_key = "your_base32_encoded_public_key"; + # fuzzy_key = "your_hashing_key"; + # fuzzy_shingles_key = "your_shingles_key"; + + # Algorithm for shingles (mumhash recommended for HTML) + algorithm = "mumhash"; + + # Enable HTML fuzzy hashing + html_shingles = true; + + # Minimum number of HTML tags to generate hash + # (prevents hashing of trivial HTML snippets) + min_html_tags = 15; + + # Weight multiplier for HTML fuzzy matches + # Can be < 1.0 to reduce impact, or > 1.0 to increase + html_weight = 1.0; + + # Regular fuzzy check settings + symbol = "FUZZY_HTML"; + max_score = 20.0; + + # Fuzzy flag mappings + fuzzy_map = { + # Whitelist: known legitimate HTML structures + "FUZZY_HTML_WHITELIST" { + flag = 1; + max_score = 20.0; + } + # Blacklist: known spam/phishing HTML structures + "FUZZY_HTML_BLACKLIST" { + flag = 2; + max_score = 20.0; + } + } + + # Optional: skip specific hashes + # skip_hashes = "${LOCAL_CONFDIR}/local.d/fuzzy_skip_html.map"; + } + + # Example: Combined text + HTML rule + rule "COMBINED_FUZZY" { + servers = "localhost:11335"; + algorithm = "mumhash"; + + # Enable both text and HTML fuzzy hashing + html_shingles = true; + min_html_tags = 10; + + # This rule will generate: + # - Text fuzzy hashes (from content) + # - HTML fuzzy hashes (from structure) + # Both sent to same storage with same flag + + symbol = "FUZZY_COMBINED"; + max_score = 30.0; + + fuzzy_map = { + "FUZZY_COMBINED_WHITE" { + flag = 10; + max_score = 30.0; + } + "FUZZY_COMBINED_SPAM" { + flag = 11; + max_score = 30.0; + } + } + } + + # Example: Phishing detection rule (higher weight for HTML) + rule "PHISHING_DETECTION" { + servers = "localhost:11335"; + algorithm = "mumhash"; + + html_shingles = true; + min_html_tags = 20; + + # Higher weight 
for HTML matches = prioritize structure over content + html_weight = 1.5; + + symbol = "FUZZY_PHISHING"; + max_score = 25.0; + + fuzzy_map = { + # Known phishing HTML templates + "FUZZY_PHISHING_HTML" { + flag = 20; + max_score = 25.0; + } + # Known legitimate brands (for comparison) + "FUZZY_LEGIT_BRANDS" { + flag = 21; + max_score = -25.0; # Negative score = whitelist + } + } + } +} + +# Additional configuration for phishing detection rules +# See rules/fuzzy_html_phishing.lua for Lua-based detection logic diff --git a/lualib/lua_fuzzy_html.lua b/lualib/lua_fuzzy_html.lua new file mode 100644 index 0000000000..1b3b36cd9f --- /dev/null +++ b/lualib/lua_fuzzy_html.lua @@ -0,0 +1,98 @@ +--[[ +HTML Fuzzy Hashing Helper Module + +This module provides helper functions for HTML fuzzy hash matching +and phishing detection based on HTML structure vs. content mismatches. + +Use case: Detect phishing where HTML structure matches legitimate emails +but CTA (Call-To-Action) domains are different. +]] + +local exports = {} +local rspamd_logger = require "rspamd_logger" +local lua_util = require "lua_util" + +--[[ +Analyze fuzzy results to flag a text/HTML fuzzy mismatch: +- Text content fuzzy match with a score above 0.7 +- No HTML structure fuzzy match at all +- CTA domains are not inspected here; the missing HTML match is the only signal + +Returns: phishing_score, explanation (0, nil when nothing is flagged) +]] +exports.check_html_text_mismatch = function(task, fuzzy_results) + local html_matches = {} + local text_matches = {} + + -- Separate HTML and text fuzzy matches + for _, res in ipairs(fuzzy_results or {}) do + if res.type == 'html' then + table.insert(html_matches, res) + elseif res.type == 'txt' then + table.insert(text_matches, res) + end + end + + -- Phishing scenario: high text match but low/no HTML match + if #text_matches > 0 and #html_matches == 0 then + local max_text_score = 0 + for _, res in ipairs(text_matches) do + if res.score > max_text_score then + max_text_score = res.score + end + end + + -- High text match but 
no HTML match = suspicious + if max_text_score > 0.7 then + return max_text_score * 0.5, string.format( + "Text fuzzy match (%.2f) without HTML match - possible CTA substitution", + max_text_score) + end + end + + -- Inverse scenario: HTML match but no text match + -- (Could be template with varying content - less suspicious) + if #html_matches > 0 and #text_matches == 0 then + local max_html_score = 0 + for _, res in ipairs(html_matches) do + if res.score > max_html_score then + max_html_score = res.score + end + end + + -- This is expected for newsletters/notifications + lua_util.debugm('fuzzy_html', task, + 'HTML match (%.2f) without text match - likely template variation', + max_html_score) + end + + return 0, nil +end + +--[[ +Check if message has suspicious HTML fuzzy pattern: +- Known legitimate HTML structure +- But text content is different or manipulated +- Useful for brand protection + +Example: Amazon email template with phishing text +]] +exports.check_brand_hijack = function(task, html_fuzzy_result, text_fuzzy_result) + if not html_fuzzy_result then + return 0, nil + end + + -- High HTML match = known template + if html_fuzzy_result.score > 0.8 then + -- Check if text is suspicious + if not text_fuzzy_result or text_fuzzy_result.score < 0.3 then + return html_fuzzy_result.score * 0.6, + string.format("Known HTML template (%.2f) with unfamiliar text - possible brand hijacking", + html_fuzzy_result.score) + end + end + + return 0, nil +end + +return exports diff --git a/rules/fuzzy_html_phishing.lua b/rules/fuzzy_html_phishing.lua new file mode 100644 index 0000000000..77cc50c6ce --- /dev/null +++ b/rules/fuzzy_html_phishing.lua @@ -0,0 +1,115 @@ +--[[ +Copyright (c) 2025, Vsevolod Stakhov + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +]]-- + +--[[ +HTML Fuzzy Phishing Detection Rules + +Detects phishing based on fuzzy symbol mismatches: +1. A text fuzzy symbol matched with a score above 5.0 +2. But no HTML fuzzy symbol matched (CTA domains are not compared here) +3. The reverse case, HTML match without text match, is only logged + +This indicates possible template reuse for phishing. +]] + +local rspamd_logger = require "rspamd_logger" +local lua_util = require "lua_util" + +local N = 'fuzzy_html_phishing' + +local function check_fuzzy_mismatch(task) + local fuzzy_results = task:get_mempool():get_variable('fuzzy_result') + + if not fuzzy_results then + return false + end + + -- Collect results by type + local text_matches = {} + local html_matches = {} + + for _, hash_result in ipairs(fuzzy_results) do + local symbol = tostring(hash_result) + -- Parse fuzzy result format: "flag:hash:prob:type" + -- This is simplified - actual parsing depends on result format + + -- For now, check mempool variables set by fuzzy_insert_result + -- We need to enhance fuzzy_check to expose result types + end + + -- Get fuzzy check symbols from task results + local fuzzy_symbols = task:get_symbols_all() + local has_text_fuzzy = false + local has_html_fuzzy = false + local text_score = 0 + local html_score = 0 + + for _, sym in ipairs(fuzzy_symbols) do + if sym.name:match('FUZZY.*TEXT') or sym.name == 'R_FUZZY_HASH' then + has_text_fuzzy = true + text_score = math.max(text_score, sym.score or 0) + end + if sym.name:match('FUZZY.*HTML') then + has_html_fuzzy = true + html_score = math.max(html_score, sym.score or 0) + end + end 
+ + -- Scenario 1: Text matches legitimate but no HTML match + -- This could indicate phishing with copied text but fake HTML/CTA + if has_text_fuzzy and not has_html_fuzzy and text_score > 5.0 then + task:insert_result('FUZZY_HTML_PHISHING_MISMATCH', 0.5, + string.format('text_score:%.2f', text_score)) + lua_util.debugm(N, task, + 'Phishing suspect: text fuzzy match (%.2f) without HTML match', + text_score) + return true + end + + -- Scenario 2: HTML matches but text doesn't (less suspicious) + -- This is common for newsletters/notifications with varying content + if has_html_fuzzy and not has_text_fuzzy and html_score > 8.0 then + -- Only flag if HTML score is very high (known template) + lua_util.debugm(N, task, + 'HTML template match (%.2f) with varying text - likely legitimate newsletter', + html_score) + -- Could add negative score or just log + end + + return false +end + +-- Register symbol +rspamd_config:register_symbol{ + name = 'FUZZY_HTML_PHISHING_MISMATCH', + type = 'virtual', + score = 5.0, + description = 'Text fuzzy matches legitimate but HTML structure does not', + group = 'fuzzy' +} + +-- Register callback +local id = rspamd_config:register_symbol{ + name = 'FUZZY_HTML_PHISHING_CHECK', + type = 'callback', + callback = check_fuzzy_mismatch, + score = 0.0, + group = 'fuzzy', + description = 'Check for HTML/text fuzzy mismatches indicating phishing' +} + +-- Depends on fuzzy_check +rspamd_config:register_dependency('FUZZY_HTML_PHISHING_CHECK', 'FUZZY_CALLBACK') diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index 7dd5162ac7..9d83b7896d 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -94,10 +94,13 @@ struct fuzzy_rule { struct rspamd_cryptobox_pubkey *peer_key; double max_score; double weight_threshold; + double html_weight; /* Weight multiplier for HTML hashes (default 1.0) */ enum fuzzy_rule_mode mode; gboolean skip_unknown; gboolean no_share; gboolean no_subject; + gboolean html_shingles; /* Enable 
HTML fuzzy hashing */ + unsigned int min_html_tags; /* Minimum tags for HTML hash */ int learn_condition_cb; uint32_t retransmits; struct rspamd_hash_map_helper *skip_map; @@ -127,7 +130,8 @@ enum fuzzy_result_type { FUZZY_RESULT_TXT, FUZZY_RESULT_IMG, FUZZY_RESULT_CONTENT, - FUZZY_RESULT_BIN + FUZZY_RESULT_BIN, + FUZZY_RESULT_HTML }; struct fuzzy_client_result { @@ -174,10 +178,12 @@ struct fuzzy_learn_session { #define FUZZY_CMD_FLAG_SENT (1 << 1) #define FUZZY_CMD_FLAG_IMAGE (1 << 2) #define FUZZY_CMD_FLAG_CONTENT (1 << 3) +#define FUZZY_CMD_FLAG_HTML (1 << 4) #define FUZZY_CHECK_FLAG_NOIMAGES (1 << 0) #define FUZZY_CHECK_FLAG_NOATTACHMENTS (1 << 1) #define FUZZY_CHECK_FLAG_NOTEXT (1 << 2) +#define FUZZY_CHECK_FLAG_NOHTML (1 << 3) struct fuzzy_cmd_io { uint32_t tag; @@ -340,6 +346,9 @@ fuzzy_rule_new(const char *default_symbol, rspamd_mempool_t *pool) rule->mappings); rule->mode = fuzzy_rule_read_write; rule->weight_threshold = NAN; + rule->html_weight = 1.0; + rule->html_shingles = FALSE; + rule->min_html_tags = 10; return rule; } @@ -720,6 +729,18 @@ fuzzy_parse_rule(struct rspamd_config *cfg, const ucl_object_t *obj, rule->weight_threshold = ucl_object_todouble(value); } + if ((value = ucl_object_lookup(obj, "html_shingles")) != NULL) { + rule->html_shingles = ucl_object_toboolean(value); + } + + if ((value = ucl_object_lookup(obj, "min_html_tags")) != NULL) { + rule->min_html_tags = ucl_object_toint(value); + } + + if ((value = ucl_object_lookup(obj, "html_weight")) != NULL) { + rule->html_weight = ucl_object_todouble(value); + } + /* * Process rule in Lua */ @@ -2074,6 +2095,139 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task, return io; } +/* + * Create fuzzy command from HTML structure (if part is HTML) + */ +static struct fuzzy_cmd_io * +fuzzy_cmd_from_html_part(struct rspamd_task *task, + struct fuzzy_rule *rule, + int c, + int flag, + uint32_t weight, + struct rspamd_mime_text_part *part, + struct rspamd_mime_part *mp) +{ + struct 
rspamd_fuzzy_shingle_cmd *shcmd = NULL; + struct rspamd_fuzzy_encrypted_shingle_cmd *encshcmd = NULL; + struct rspamd_cached_shingles *cached = NULL; + struct rspamd_html_shingle *html_sh = NULL; + struct fuzzy_cmd_io *io; + unsigned int additional_length; + unsigned char *additional_data; + + /* Check if HTML shingles are enabled for this rule */ + if (!rule->html_shingles) { + return NULL; + } + + /* Check if this is an HTML part */ + if (!IS_TEXT_PART_HTML(part) || part->html == NULL) { + return NULL; + } + + /* Check minimum tags threshold */ + if (part->html_features && part->html_features->tags_count < rule->min_html_tags) { + msg_debug_fuzzy_check("HTML part has %d tags, less than minimum %d", + part->html_features->tags_count, rule->min_html_tags); + return NULL; + } + + cached = fuzzy_cmd_get_cached(rule, task, mp); + + if (cached) { + /* Copy from cache */ + additional_length = cached->additional_length; + additional_data = cached->additional_data; + + if (cached->sh) { + encshcmd = rspamd_mempool_alloc0(task->task_pool, + sizeof(*encshcmd) + additional_length); + shcmd = &encshcmd->cmd; + memcpy(&shcmd->sgl, cached->sh, sizeof(struct rspamd_shingle)); + memcpy(shcmd->basic.digest, cached->digest, sizeof(cached->digest)); + memcpy(((unsigned char *) encshcmd) + sizeof(*encshcmd), additional_data, + additional_length); + shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE; + } + else { + return NULL; + } + } + else { + /* Generate HTML shingles */ + additional_length = fuzzy_cmd_extension_length(task, rule); + cached = rspamd_mempool_alloc0(task->task_pool, sizeof(*cached) + additional_length); + cached->additional_length = additional_length; + cached->additional_data = ((unsigned char *) cached) + sizeof(*cached); + + if (additional_length > 0) { + fuzzy_cmd_write_extensions(task, rule, cached->additional_data, additional_length); + } + + encshcmd = rspamd_mempool_alloc0(task->task_pool, + sizeof(*encshcmd) + additional_length); + shcmd = &encshcmd->cmd; + 
+ msg_debug_fuzzy_check("generating HTML shingles for part with %d tags", + part->html_features ? part->html_features->tags_count : 0); + + html_sh = rspamd_shingles_from_html(part->html, + rule->shingles_key->str, task->task_pool, + rspamd_shingles_default_filter, NULL, + rule->alg); + + if (html_sh != NULL) { + /* Use structure shingles for fuzzy matching */ + memcpy(&shcmd->sgl, &html_sh->structure_shingles, sizeof(struct rspamd_shingle)); + /* Use direct hash as digest for exact matching */ + memcpy(shcmd->basic.digest, html_sh->direct_hash, sizeof(shcmd->basic.digest)); + shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE; + + /* Cache results */ + cached->sh = &html_sh->structure_shingles; + memcpy(cached->digest, html_sh->direct_hash, sizeof(cached->digest)); + additional_data = ((unsigned char *) encshcmd) + sizeof(*encshcmd); + memcpy(additional_data, cached->additional_data, additional_length); + } + else { + /* No HTML shingles generated */ + return NULL; + } + + fuzzy_cmd_set_cached(rule, task, mp, cached); + } + + io = rspamd_mempool_alloc(task->task_pool, sizeof(*io)); + io->part = mp; + + shcmd->basic.tag = ottery_rand_uint32(); + shcmd->basic.cmd = c; + shcmd->basic.version = RSPAMD_FUZZY_PLUGIN_VERSION; + + if (c != FUZZY_CHECK) { + shcmd->basic.flag = flag; + shcmd->basic.value = weight; + } + + io->tag = shcmd->basic.tag; + io->flags = FUZZY_CMD_FLAG_HTML; + memcpy(&io->cmd, &shcmd->basic, sizeof(io->cmd)); + + if (rule->peer_key) { + /* Encrypt data */ + fuzzy_encrypt_cmd(rule, &encshcmd->hdr, (unsigned char *) shcmd, + sizeof(*shcmd) + additional_length); + io->io.iov_base = encshcmd; + io->io.iov_len = sizeof(*encshcmd) + additional_length; + } + else { + io->io.iov_base = shcmd; + io->io.iov_len = sizeof(*shcmd) + additional_length; + } + + return io; +} + #if 0 static struct fuzzy_cmd_io * fuzzy_cmd_from_image_part (struct fuzzy_rule *rule, @@ -2443,6 +2597,15 @@ fuzzy_insert_result(struct fuzzy_client_session *session, type = "img"; 
res->type = FUZZY_RESULT_IMG; } + else if ((io->flags & FUZZY_CMD_FLAG_HTML)) { + /* HTML structural hash */ + nval *= sqrtf(rep->v1.prob); + /* Apply HTML weight multiplier from rule config */ + nval *= session->rule->html_weight; + + type = "html"; + res->type = FUZZY_RESULT_HTML; + } else { /* Calc real probability */ nval *= sqrtf(rep->v1.prob); @@ -3095,6 +3258,9 @@ fuzzy_controller_io_callback(int fd, short what, void *arg) if ((io->flags & FUZZY_CMD_FLAG_IMAGE)) { ftype = "img"; } + else if ((io->flags & FUZZY_CMD_FLAG_HTML)) { + ftype = "html"; + } else if (io->flags & FUZZY_CMD_FLAG_CONTENT) { ftype = "content"; } @@ -3340,6 +3506,19 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule, !fuzzy_check, part, mime_part); + + /* Try HTML fuzzy hash if enabled and text hash generation succeeded/failed */ + if (rule->html_shingles && !(flags & FUZZY_CHECK_FLAG_NOHTML)) { + struct fuzzy_cmd_io *html_io; + + html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value, + part, mime_part); + + if (html_io) { + /* Add HTML hash as separate command */ + g_ptr_array_add(res, html_io); + } + } } else if (mime_part->part_type == RSPAMD_MIME_PART_IMAGE && !(flags & FUZZY_CHECK_FLAG_NOIMAGES)) { diff --git a/test/functional/configs/fuzzy_html_test.conf b/test/functional/configs/fuzzy_html_test.conf new file mode 100644 index 0000000000..4166e97b12 --- /dev/null +++ b/test/functional/configs/fuzzy_html_test.conf @@ -0,0 +1,53 @@ +# Test configuration for HTML fuzzy hashing + +.include(duplicate=append,priority=0) "{= env.TESTDIR =}/configs/plugins.conf" +.include(duplicate=merge,priority=0) "{= env.TESTDIR =}/configs/statistic.conf" + +fuzzy_check { + # Test rule for HTML fuzzy hashing + rule "TEST_HTML_FUZZY" { + servers = "localhost:11335"; + algorithm = "mumhash"; + + # Enable HTML fuzzy hashing + html_shingles = true; + min_html_tags = 5; # Low threshold for testing + html_weight = 1.0; + + symbol = "FUZZY_HTML_TEST"; + max_score = 10.0; + + # 
Skip encryption for testing + # encryption_key = ""; + + fuzzy_map = { + "FUZZY_HTML_WHITELIST" { + flag = 1; + max_score = 10.0; + } + "FUZZY_HTML_SPAM" { + flag = 2; + max_score = 10.0; + } + } + } + + # Rule with both text and HTML enabled + rule "TEST_COMBINED" { + servers = "localhost:11335"; + algorithm = "mumhash"; + + html_shingles = true; + min_html_tags = 3; + + symbol = "FUZZY_COMBINED_TEST"; + max_score = 15.0; + + fuzzy_map = { + "FUZZY_COMBINED_MATCH" { + flag = 10; + max_score = 15.0; + } + } + } +}