--- /dev/null
+# HTML Fuzzy Hashing Configuration Example
+#
+# This configuration demonstrates how to use HTML fuzzy hashing for:
+# 1. Detecting spam campaigns with similar HTML structure
+# 2. Phishing detection (similar structure, different CTA domains)
+# 3. Brand protection (legitimate templates vs. fake emails)
+
+fuzzy_check {
+ # Example rule for HTML structure matching
+ rule "HTML_FUZZY" {
+ # Standard fuzzy storage configuration
+ servers = "localhost:11335";
+
+ # Encryption and hashing keys (optional; encryption is recommended for production)
+ # encryption_key = "your_base32_encoded_public_key";
+ # fuzzy_key = "your_hashing_key";
+ # fuzzy_shingles_key = "your_shingles_key";
+
+ # Algorithm for shingles (mumhash recommended for HTML)
+ algorithm = "mumhash";
+
+ # Enable HTML fuzzy hashing
+ html_shingles = true;
+
+ # Minimum number of HTML tags to generate hash
+ # (prevents hashing of trivial HTML snippets)
+ min_html_tags = 15;
+
+ # Weight multiplier for HTML fuzzy matches
+ # Can be < 1.0 to reduce impact, or > 1.0 to increase
+ html_weight = 1.0;
+
+ # Regular fuzzy check settings
+ symbol = "FUZZY_HTML";
+ max_score = 20.0;
+
+ # Fuzzy flag mappings
+ fuzzy_map = {
+ # Whitelist: known legitimate HTML structures
+ "FUZZY_HTML_WHITELIST" {
+ flag = 1;
+ max_score = 20.0;
+ }
+ # Blacklist: known spam/phishing HTML structures
+ "FUZZY_HTML_BLACKLIST" {
+ flag = 2;
+ max_score = 20.0;
+ }
+ }
+
+ # Optional: skip specific hashes
+ # skip_hashes = "${LOCAL_CONFDIR}/local.d/fuzzy_skip_html.map";
+ }
+
+ # Example: Combined text + HTML rule
+ rule "COMBINED_FUZZY" {
+ servers = "localhost:11335";
+ algorithm = "mumhash";
+
+ # Enable both text and HTML fuzzy hashing
+ html_shingles = true;
+ min_html_tags = 10;
+
+ # This rule will generate:
+ # - Text fuzzy hashes (from content)
+ # - HTML fuzzy hashes (from structure)
+ # Both sent to same storage with same flag
+
+ symbol = "FUZZY_COMBINED";
+ max_score = 30.0;
+
+ fuzzy_map = {
+ "FUZZY_COMBINED_WHITE" {
+ flag = 10;
+ max_score = 30.0;
+ }
+ "FUZZY_COMBINED_SPAM" {
+ flag = 11;
+ max_score = 30.0;
+ }
+ }
+ }
+
+ # Example: Phishing detection rule (higher weight for HTML)
+ rule "PHISHING_DETECTION" {
+ servers = "localhost:11335";
+ algorithm = "mumhash";
+
+ html_shingles = true;
+ min_html_tags = 20;
+
+ # Higher weight for HTML matches = prioritize structure over content
+ html_weight = 1.5;
+
+ symbol = "FUZZY_PHISHING";
+ max_score = 25.0;
+
+ fuzzy_map = {
+ # Known phishing HTML templates
+ "FUZZY_PHISHING_HTML" {
+ flag = 20;
+ max_score = 25.0;
+ }
+ # Known legitimate brands (for comparison)
+ "FUZZY_LEGIT_BRANDS" {
+ flag = 21;
+ max_score = -25.0; # Negative score = whitelist
+ }
+ }
+ }
+}
+
+# Additional configuration for phishing detection rules
+# See rules/fuzzy_html_phishing.lua for Lua-based detection logic
--- /dev/null
+--[[
+HTML Fuzzy Hashing Helper Module
+
+This module provides helper functions for HTML fuzzy hash matching
+and phishing detection based on HTML structure vs. content mismatches.
+
+Use case: Detect phishing where HTML structure matches legitimate emails
+but CTA (Call-To-Action) domains are different.
+]]
+
+local exports = {}
+local rspamd_logger = require "rspamd_logger"
+local lua_util = require "lua_util"
+
+--[[
+Compare text and HTML fuzzy matches to spot a likely CTA substitution.
+
+`fuzzy_results` is a list of tables (nil is accepted and treated as empty).
+Only entries whose `type` is 'html' or 'txt' are considered; an entry with a
+missing or non-numeric `score` counts as 0.
+
+- Text matches whose best score is above 0.7, with no HTML match at all, are
+  reported as suspicious (known text placed into a different HTML structure).
+- HTML matches without any text match are only logged for debugging, since
+  this is expected for templates with varying content.
+
+Returns: phishing_score, explanation
+  phishing_score is half of the best text score, or 0 when nothing is flagged;
+  explanation is a string, or nil when phishing_score is 0.
+]]
+exports.check_html_text_mismatch = function(task, fuzzy_results)
+  local html_matches = {}
+  local text_matches = {}
+
+  -- Highest score in a list of matches (never below 0); entries without a
+  -- usable score are counted as 0 instead of raising on a nil comparison
+  local function best_score(matches)
+    local best = 0
+    for _, res in ipairs(matches) do
+      local score = tonumber(res.score) or 0
+      if score > best then
+        best = score
+      end
+    end
+    return best
+  end
+
+  -- Separate HTML and text fuzzy matches
+  for _, res in ipairs(fuzzy_results or {}) do
+    if res.type == 'html' then
+      html_matches[#html_matches + 1] = res
+    elseif res.type == 'txt' then
+      text_matches[#text_matches + 1] = res
+    end
+  end
+
+  -- Suspicious scenario: text matches while there is no HTML match at all
+  if #text_matches > 0 and #html_matches == 0 then
+    local max_text_score = best_score(text_matches)
+
+    -- High text match but no HTML match = suspicious
+    if max_text_score > 0.7 then
+      return max_text_score * 0.5, string.format(
+        "Text fuzzy match (%.2f) without HTML match - possible CTA substitution",
+        max_text_score)
+    end
+  end
+
+  -- Inverse scenario: HTML match but no text match
+  -- (could be a template with varying content - less suspicious, log only)
+  if #html_matches > 0 and #text_matches == 0 then
+    lua_util.debugm('fuzzy_html', task,
+      'HTML match (%.2f) without text match - likely template variation',
+      best_score(html_matches))
+  end
+
+  return 0, nil
+end
+
+--[[
+Brand-protection check: flags a message whose HTML structure closely matches
+a known template while its text does not match anything familiar.
+
+Example: an Amazon email template filled with phishing text.
+
+Returns: score, explanation
+  score is 0.6 of the HTML match score when that score is above 0.8 and the
+  text result is absent or scored below 0.3; otherwise 0 with a nil explanation.
+]]
+exports.check_brand_hijack = function(task, html_fuzzy_result, text_fuzzy_result)
+  -- Without an HTML result, or with a weak one, there is no known template
+  if not html_fuzzy_result or not (html_fuzzy_result.score > 0.8) then
+    return 0, nil
+  end
+
+  -- Text counts as unfamiliar when it has no fuzzy result or a low score
+  local text_unfamiliar = (not text_fuzzy_result) or text_fuzzy_result.score < 0.3
+  if not text_unfamiliar then
+    return 0, nil
+  end
+
+  local template_score = html_fuzzy_result.score
+  local explanation = string.format(
+    "Known HTML template (%.2f) with unfamiliar text - possible brand hijacking",
+    template_score)
+
+  return template_score * 0.6, explanation
+end
+
+return exports
--- /dev/null
+--[[
+Copyright (c) 2025, Vsevolod Stakhov <vsevolod@rspamd.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]--
+
+--[[
+HTML Fuzzy Phishing Detection Rules
+
+Detects phishing based on fuzzy hash mismatches:
+1. Text content matches known legitimate email (whitelist)
+2. But HTML structure doesn't match or has different CTA domains
+3. Or vice versa: HTML structure matches but text/CTA is suspicious
+
+This indicates possible template reuse for phishing.
+]]
+
+local rspamd_logger = require "rspamd_logger"
+local lua_util = require "lua_util"
+
+local N = 'fuzzy_html_phishing'
+
+-- Symbol callback: compares the fuzzy symbols already present in the task
+-- results and inserts FUZZY_HTML_PHISHING_MISMATCH when a text fuzzy symbol
+-- scored above 5.0 while no HTML fuzzy symbol is present.
+-- Returns true when the mismatch symbol was inserted, false otherwise.
+local function check_fuzzy_mismatch(task)
+  -- The mempool variable is used only as a "fuzzy check produced something"
+  -- flag; its contents are not parsed here.
+  -- TODO: fuzzy_check has to expose per-hash result types before the
+  -- individual results can be classified from this variable.
+  local fuzzy_results = task:get_mempool():get_variable('fuzzy_result')
+
+  if not fuzzy_results then
+    return false
+  end
+
+  -- Classify fuzzy symbols from the task results by their names
+  local has_text_fuzzy, has_html_fuzzy = false, false
+  local text_score, html_score = 0, 0
+
+  for _, sym in ipairs(task:get_symbols_all() or {}) do
+    if sym.name:match('FUZZY.*TEXT') or sym.name == 'R_FUZZY_HASH' then
+      has_text_fuzzy = true
+      text_score = math.max(text_score, sym.score or 0)
+    end
+    if sym.name:match('FUZZY.*HTML') then
+      has_html_fuzzy = true
+      html_score = math.max(html_score, sym.score or 0)
+    end
+  end
+
+  -- Scenario 1: Text matches legitimate but no HTML match
+  -- This could indicate phishing with copied text but fake HTML/CTA
+  if has_text_fuzzy and not has_html_fuzzy and text_score > 5.0 then
+    task:insert_result('FUZZY_HTML_PHISHING_MISMATCH', 0.5,
+      string.format('text_score:%.2f', text_score))
+    lua_util.debugm(N, task,
+      'Phishing suspect: text fuzzy match (%.2f) without HTML match',
+      text_score)
+    return true
+  end
+
+  -- Scenario 2: HTML matches but text doesn't (less suspicious)
+  -- This is common for newsletters/notifications with varying content,
+  -- so it is only logged and only when the HTML score is very high
+  if has_html_fuzzy and not has_text_fuzzy and html_score > 8.0 then
+    lua_util.debugm(N, task,
+      'HTML template match (%.2f) with varying text - likely legitimate newsletter',
+      html_score)
+  end
+
+  return false
+end
+
+-- Register the callback first: the virtual symbol below needs its id as parent
+local id = rspamd_config:register_symbol{
+  name = 'FUZZY_HTML_PHISHING_CHECK',
+  type = 'callback',
+  callback = check_fuzzy_mismatch,
+  score = 0.0,
+  group = 'fuzzy',
+  description = 'Check for HTML/text fuzzy mismatches indicating phishing'
+}
+
+-- Virtual symbol inserted by the callback through task:insert_result
+rspamd_config:register_symbol{
+  name = 'FUZZY_HTML_PHISHING_MISMATCH',
+  type = 'virtual',
+  parent = id,
+  score = 5.0,
+  description = 'Text fuzzy matches legitimate but HTML structure does not',
+  group = 'fuzzy'
+}
+
+-- Depends on fuzzy_check
+rspamd_config:register_dependency('FUZZY_HTML_PHISHING_CHECK', 'FUZZY_CALLBACK')
struct rspamd_cryptobox_pubkey *peer_key;
double max_score;
double weight_threshold;
+ double html_weight; /* Weight multiplier for HTML hashes (default 1.0) */
enum fuzzy_rule_mode mode;
gboolean skip_unknown;
gboolean no_share;
gboolean no_subject;
+ gboolean html_shingles; /* Enable HTML fuzzy hashing */
+ unsigned int min_html_tags; /* Minimum tags for HTML hash */
int learn_condition_cb;
uint32_t retransmits;
struct rspamd_hash_map_helper *skip_map;
FUZZY_RESULT_TXT,
FUZZY_RESULT_IMG,
FUZZY_RESULT_CONTENT,
- FUZZY_RESULT_BIN
+ FUZZY_RESULT_BIN,
+ FUZZY_RESULT_HTML
};
struct fuzzy_client_result {
#define FUZZY_CMD_FLAG_SENT (1 << 1)
#define FUZZY_CMD_FLAG_IMAGE (1 << 2)
#define FUZZY_CMD_FLAG_CONTENT (1 << 3)
+#define FUZZY_CMD_FLAG_HTML (1 << 4)
#define FUZZY_CHECK_FLAG_NOIMAGES (1 << 0)
#define FUZZY_CHECK_FLAG_NOATTACHMENTS (1 << 1)
#define FUZZY_CHECK_FLAG_NOTEXT (1 << 2)
+#define FUZZY_CHECK_FLAG_NOHTML (1 << 3)
struct fuzzy_cmd_io {
uint32_t tag;
rule->mappings);
rule->mode = fuzzy_rule_read_write;
rule->weight_threshold = NAN;
+ rule->html_weight = 1.0;
+ rule->html_shingles = FALSE;
+ rule->min_html_tags = 10;
return rule;
}
rule->weight_threshold = ucl_object_todouble(value);
}
+ if ((value = ucl_object_lookup(obj, "html_shingles")) != NULL) {
+ rule->html_shingles = ucl_object_toboolean(value);
+ }
+
+ if ((value = ucl_object_lookup(obj, "min_html_tags")) != NULL) {
+ rule->min_html_tags = ucl_object_toint(value);
+ }
+
+ if ((value = ucl_object_lookup(obj, "html_weight")) != NULL) {
+ rule->html_weight = ucl_object_todouble(value);
+ }
+
/*
* Process rule in Lua
*/
return io;
}
+/*
+ * Create fuzzy command from HTML structure (if part is HTML)
+ *
+ * Returns a command io allocated from task->task_pool and flagged with
+ * FUZZY_CMD_FLAG_HTML, or NULL when:
+ *  - rule->html_shingles is disabled;
+ *  - the part is not an HTML part or has no parsed HTML;
+ *  - html_features is present and reports fewer tags than rule->min_html_tags
+ *    (the threshold is not applied when html_features is NULL);
+ *  - a cached entry exists but carries no shingles;
+ *  - rspamd_shingles_from_html() returns NULL.
+ *
+ * `flag` and `weight` are written into the command only when `c` is not
+ * FUZZY_CHECK. When rule->peer_key is set the command is encrypted and the
+ * iovec covers the encrypted wrapper; otherwise it covers the plain command.
+ *
+ * NOTE(review): the cache is looked up by (rule, task, mp). Presumably the
+ * text-part command for the same mime part uses the same cache helpers -
+ * TODO confirm that HTML and text shingles cannot collide in that cache.
+ */
+static struct fuzzy_cmd_io *
+fuzzy_cmd_from_html_part(struct rspamd_task *task,
+ struct fuzzy_rule *rule,
+ int c,
+ int flag,
+ uint32_t weight,
+ struct rspamd_mime_text_part *part,
+ struct rspamd_mime_part *mp)
+{
+ struct rspamd_fuzzy_shingle_cmd *shcmd = NULL;
+ struct rspamd_fuzzy_encrypted_shingle_cmd *encshcmd = NULL;
+ struct rspamd_cached_shingles *cached = NULL;
+ struct rspamd_html_shingle *html_sh = NULL;
+ struct fuzzy_cmd_io *io;
+ unsigned int additional_length;
+ unsigned char *additional_data;
+
+ /* Check if HTML shingles are enabled for this rule */
+ if (!rule->html_shingles) {
+ return NULL;
+ }
+
+ /* Check if this is an HTML part */
+ if (!IS_TEXT_PART_HTML(part) || part->html == NULL) {
+ return NULL;
+ }
+
+ /* Check minimum tags threshold (skipped when html_features is NULL) */
+ if (part->html_features && part->html_features->tags_count < rule->min_html_tags) {
+ msg_debug_fuzzy_check("HTML part has %d tags, less than minimum %d",
+ part->html_features->tags_count, rule->min_html_tags);
+ return NULL;
+ }
+
+ cached = fuzzy_cmd_get_cached(rule, task, mp);
+
+ if (cached) {
+ /* Copy shingles, digest and extension data from the cached entry */
+ additional_length = cached->additional_length;
+ additional_data = cached->additional_data;
+
+ if (cached->sh) {
+ encshcmd = rspamd_mempool_alloc0(task->task_pool,
+ sizeof(*encshcmd) + additional_length);
+ shcmd = &encshcmd->cmd;
+ memcpy(&shcmd->sgl, cached->sh, sizeof(struct rspamd_shingle));
+ memcpy(shcmd->basic.digest, cached->digest, sizeof(cached->digest));
+ memcpy(((unsigned char *) encshcmd) + sizeof(*encshcmd), additional_data,
+ additional_length);
+ shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+ }
+ else {
+ return NULL;
+ }
+ }
+ else {
+ /* Generate HTML shingles; extension data is stored right after the cache header */
+ additional_length = fuzzy_cmd_extension_length(task, rule);
+ cached = rspamd_mempool_alloc0(task->task_pool, sizeof(*cached) + additional_length);
+ cached->additional_length = additional_length;
+ cached->additional_data = ((unsigned char *) cached) + sizeof(*cached);
+
+ if (additional_length > 0) {
+ fuzzy_cmd_write_extensions(task, rule, cached->additional_data, additional_length);
+ }
+
+ encshcmd = rspamd_mempool_alloc0(task->task_pool,
+ sizeof(*encshcmd) + additional_length);
+ shcmd = &encshcmd->cmd;
+
+ msg_debug_fuzzy_check("generating HTML shingles for part with %d tags",
+ part->html_features ? part->html_features->tags_count : 0);
+
+ html_sh = rspamd_shingles_from_html(part->html,
+ rule->shingles_key->str, task->task_pool,
+ rspamd_shingles_default_filter, NULL,
+ rule->alg);
+
+ if (html_sh != NULL) {
+ /* Use structure shingles for fuzzy matching */
+ memcpy(&shcmd->sgl, &html_sh->structure_shingles, sizeof(struct rspamd_shingle));
+ /* Use direct hash as digest for exact matching */
+ memcpy(shcmd->basic.digest, html_sh->direct_hash, sizeof(shcmd->basic.digest));
+ shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+
+ /* Cache results */
+ cached->sh = &html_sh->structure_shingles;
+ memcpy(cached->digest, html_sh->direct_hash, sizeof(cached->digest));
+ additional_data = ((unsigned char *) encshcmd) + sizeof(*encshcmd);
+ memcpy(additional_data, cached->additional_data, additional_length);
+ }
+ else {
+ /* No HTML shingles generated; nothing is stored in the cache */
+ return NULL;
+ }
+
+ fuzzy_cmd_set_cached(rule, task, mp, cached);
+ }
+
+ io = rspamd_mempool_alloc(task->task_pool, sizeof(*io));
+ io->part = mp;
+
+ shcmd->basic.tag = ottery_rand_uint32();
+ shcmd->basic.cmd = c;
+ shcmd->basic.version = RSPAMD_FUZZY_PLUGIN_VERSION;
+
+ if (c != FUZZY_CHECK) {
+ shcmd->basic.flag = flag;
+ shcmd->basic.value = weight;
+ }
+
+ io->tag = shcmd->basic.tag;
+ io->flags = FUZZY_CMD_FLAG_HTML;
+ memcpy(&io->cmd, &shcmd->basic, sizeof(io->cmd));
+
+ if (rule->peer_key) {
+ /* Encrypt data */
+ fuzzy_encrypt_cmd(rule, &encshcmd->hdr, (unsigned char *) shcmd,
+ sizeof(*shcmd) + additional_length);
+ io->io.iov_base = encshcmd;
+ io->io.iov_len = sizeof(*encshcmd) + additional_length;
+ }
+ else {
+ io->io.iov_base = shcmd;
+ io->io.iov_len = sizeof(*shcmd) + additional_length;
+ }
+
+ return io;
+}
+
#if 0
static struct fuzzy_cmd_io *
fuzzy_cmd_from_image_part (struct fuzzy_rule *rule,
type = "img";
res->type = FUZZY_RESULT_IMG;
}
+ else if ((io->flags & FUZZY_CMD_FLAG_HTML)) {
+ /* HTML structural hash */
+ nval *= sqrtf(rep->v1.prob);
+ /* Apply HTML weight multiplier from rule config */
+ nval *= session->rule->html_weight;
+
+ type = "html";
+ res->type = FUZZY_RESULT_HTML;
+ }
else {
/* Calc real probability */
nval *= sqrtf(rep->v1.prob);
if ((io->flags & FUZZY_CMD_FLAG_IMAGE)) {
ftype = "img";
}
+ else if ((io->flags & FUZZY_CMD_FLAG_HTML)) {
+ ftype = "html";
+ }
else if (io->flags & FUZZY_CMD_FLAG_CONTENT) {
ftype = "content";
}
!fuzzy_check,
part,
mime_part);
+
+ /* Additionally try the HTML fuzzy hash if enabled, whether or not a text hash was generated */
+ if (rule->html_shingles && !(flags & FUZZY_CHECK_FLAG_NOHTML)) {
+ struct fuzzy_cmd_io *html_io;
+
+ html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value,
+ part, mime_part);
+
+ if (html_io) {
+ /* Add HTML hash as separate command */
+ g_ptr_array_add(res, html_io);
+ }
+ }
}
else if (mime_part->part_type == RSPAMD_MIME_PART_IMAGE &&
!(flags & FUZZY_CHECK_FLAG_NOIMAGES)) {
--- /dev/null
+# Test configuration for HTML fuzzy hashing
+
+.include(duplicate=append,priority=0) "{= env.TESTDIR =}/configs/plugins.conf"
+.include(duplicate=merge,priority=0) "{= env.TESTDIR =}/configs/statistic.conf"
+
+fuzzy_check {
+ # Test rule for HTML fuzzy hashing
+ rule "TEST_HTML_FUZZY" {
+ servers = "localhost:11335";
+ algorithm = "mumhash";
+
+ # Enable HTML fuzzy hashing
+ html_shingles = true;
+ min_html_tags = 5; # Low threshold for testing
+ html_weight = 1.0;
+
+ symbol = "FUZZY_HTML_TEST";
+ max_score = 10.0;
+
+ # Skip encryption for testing
+ # encryption_key = "";
+
+ fuzzy_map = {
+ "FUZZY_HTML_WHITELIST" {
+ flag = 1;
+ max_score = 10.0;
+ }
+ "FUZZY_HTML_SPAM" {
+ flag = 2;
+ max_score = 10.0;
+ }
+ }
+ }
+
+ # Rule with both text and HTML enabled
+ rule "TEST_COMBINED" {
+ servers = "localhost:11335";
+ algorithm = "mumhash";
+
+ html_shingles = true;
+ min_html_tags = 3;
+
+ symbol = "FUZZY_COMBINED_TEST";
+ max_score = 15.0;
+
+ fuzzy_map = {
+ "FUZZY_COMBINED_MATCH" {
+ flag = 10;
+ max_score = 15.0;
+ }
+ }
+ }
+}