]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Integrate HTML fuzzy hashing into fuzzy_check module
authorVsevolod Stakhov <vsevolod@rspamd.com>
Sat, 4 Oct 2025 18:34:48 +0000 (19:34 +0100)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Sat, 4 Oct 2025 18:34:48 +0000 (19:34 +0100)
Add support for HTML structure fuzzy hashing in fuzzy_check plugin:

Core integration:
- Add FUZZY_CMD_FLAG_HTML flag and FUZZY_RESULT_HTML result type
- Add html_shingles, min_html_tags, html_weight options to fuzzy_rule
- Implement fuzzy_cmd_from_html_part() to generate HTML fuzzy commands
- Integrate into fuzzy_generate_commands() for automatic hash generation
- Handle HTML results with configurable weight multiplier

Configuration:
- html_shingles: enable/disable HTML fuzzy hashing per rule
- min_html_tags: minimum HTML tags threshold (default 10)
- html_weight: score multiplier for HTML matches (default 1.0)

Use cases:
1. Brand protection: detect phishing with copied HTML but fake CTA
2. Spam campaigns: group messages by HTML structure
3. Template detection: identify newsletters/notifications
4. Phishing: text match + HTML CTA mismatch = suspicious

Files added:
- lualib/lua_fuzzy_html.lua: helper functions for mismatch detection
- conf/modules.d/fuzzy_check_html.conf: configuration examples
- test/functional/configs/fuzzy_html_test.conf: test configuration
- rules/fuzzy_html_phishing.lua: phishing detection rules

HTML fuzzy works alongside text fuzzy:
- Both hashes generated and sent to storage
- Separate result types allow different handling
- CTA domain verification prevents false positives

Next steps:
- Performance testing on real email corpus
- Fine-tune weights and thresholds
- Collect legitimate brand templates for whitelisting

conf/modules.d/fuzzy_check_html.conf [new file with mode: 0644]
lualib/lua_fuzzy_html.lua [new file with mode: 0644]
rules/fuzzy_html_phishing.lua [new file with mode: 0644]
src/plugins/fuzzy_check.c
test/functional/configs/fuzzy_html_test.conf [new file with mode: 0644]

diff --git a/conf/modules.d/fuzzy_check_html.conf b/conf/modules.d/fuzzy_check_html.conf
new file mode 100644 (file)
index 0000000..face9c9
--- /dev/null
@@ -0,0 +1,114 @@
+# HTML Fuzzy Hashing Configuration Example
+#
+# This configuration demonstrates how to use HTML fuzzy hashing for:
+# 1. Detecting spam campaigns with similar HTML structure
+# 2. Phishing detection (similar structure, different CTA domains)
+# 3. Brand protection (legitimate templates vs. fake emails)
+
+fuzzy_check {
+  # Example rule for HTML structure matching
+  rule "HTML_FUZZY" {
+    # Standard fuzzy storage configuration
+    servers = "localhost:11335";
+    
+    # Encryption (optional, recommended for production)
+    # encryption_key = "your_base32_encoded_public_key";
+    # fuzzy_key = "your_hashing_key";
+    # fuzzy_shingles_key = "your_shingles_key";
+    
+    # Algorithm for shingles (mumhash recommended for HTML)
+    algorithm = "mumhash";
+    
+    # Enable HTML fuzzy hashing
+    html_shingles = true;
+    
+    # Minimum number of HTML tags to generate hash
+    # (prevents hashing of trivial HTML snippets)
+    min_html_tags = 15;
+    
+    # Weight multiplier for HTML fuzzy matches
+    # Can be < 1.0 to reduce impact, or > 1.0 to increase
+    html_weight = 1.0;
+    
+    # Regular fuzzy check settings
+    symbol = "FUZZY_HTML";
+    max_score = 20.0;
+    
+    # Fuzzy flag mappings
+    fuzzy_map = {
+      # Whitelist: known legitimate HTML structures
+      "FUZZY_HTML_WHITELIST" {
+        flag = 1;
+        max_score = 20.0;
+      }
+      # Blacklist: known spam/phishing HTML structures
+      "FUZZY_HTML_BLACKLIST" {
+        flag = 2;
+        max_score = 20.0;
+      }
+    }
+    
+    # Optional: skip specific hashes
+    # skip_hashes = "${LOCAL_CONFDIR}/local.d/fuzzy_skip_html.map";
+  }
+  
+  # Example: Combined text + HTML rule
+  rule "COMBINED_FUZZY" {
+    servers = "localhost:11335";
+    algorithm = "mumhash";
+    
+    # Enable both text and HTML fuzzy hashing
+    html_shingles = true;
+    min_html_tags = 10;
+    
+    # This rule will generate:
+    # - Text fuzzy hashes (from content)
+    # - HTML fuzzy hashes (from structure)
+    # Both sent to same storage with same flag
+    
+    symbol = "FUZZY_COMBINED";
+    max_score = 30.0;
+    
+    fuzzy_map = {
+      "FUZZY_COMBINED_WHITE" {
+        flag = 10;
+        max_score = 30.0;
+      }
+      "FUZZY_COMBINED_SPAM" {
+        flag = 11;
+        max_score = 30.0;
+      }
+    }
+  }
+  
+  # Example: Phishing detection rule (higher weight for HTML)
+  rule "PHISHING_DETECTION" {
+    servers = "localhost:11335";
+    algorithm = "mumhash";
+    
+    html_shingles = true;
+    min_html_tags = 20;
+    
+    # Higher weight for HTML matches = prioritize structure over content
+    html_weight = 1.5;
+    
+    symbol = "FUZZY_PHISHING";
+    max_score = 25.0;
+    
+    fuzzy_map = {
+      # Known phishing HTML templates
+      "FUZZY_PHISHING_HTML" {
+        flag = 20;
+        max_score = 25.0;
+      }
+      # Known legitimate brands (for comparison)
+      "FUZZY_LEGIT_BRANDS" {
+        flag = 21;
+        max_score = -25.0;  # Negative score = whitelist
+      }
+    }
+  }
+}
+
+# Additional configuration for phishing detection rules
+# See rules/fuzzy_html_phishing.lua for Lua-based detection logic
diff --git a/lualib/lua_fuzzy_html.lua b/lualib/lua_fuzzy_html.lua
new file mode 100644 (file)
index 0000000..1b3b36c
--- /dev/null
@@ -0,0 +1,98 @@
+--[[
+HTML Fuzzy Hashing Helper Module
+
+This module provides helper functions for HTML fuzzy hash matching
+and phishing detection based on HTML structure vs. content mismatches.
+
+Use case: Detect phishing where HTML structure matches legitimate emails
+but CTA (Call-To-Action) domains are different.
+]]
+
+local exports = {}
+local rspamd_logger = require "rspamd_logger"
+local lua_util = require "lua_util"
+
+--[[
+Look for a text/HTML fuzzy mismatch that may indicate phishing.
+
+`fuzzy_results` is an optional array of result tables. Only entries whose
+`type` field is 'html' or 'txt' are considered, and each of those is expected
+to carry a numeric `score`.
+
+- Text matches present, no HTML matches, best text score above 0.7:
+  reported as suspicious (possible CTA substitution)
+- HTML matches present, no text matches: treated as a template variation
+  and only reported through a debug message
+
+Returns: phishing_score, explanation (0, nil when nothing suspicious is found)
+]]
+exports.check_html_text_mismatch = function(task, fuzzy_results)
+  -- Largest `score` found in a list of results, never below 0
+  local function best_score(matches)
+    local best = 0
+    for i = 1, #matches do
+      local candidate = matches[i].score
+      if candidate > best then
+        best = candidate
+      end
+    end
+    return best
+  end
+
+  -- Bucket the results by hash type; any other type is ignored
+  local by_type = { html = {}, txt = {} }
+  for _, entry in ipairs(fuzzy_results or {}) do
+    local bucket = by_type[entry.type]
+    if bucket then
+      bucket[#bucket + 1] = entry
+    end
+  end
+
+  local html_count, text_count = #by_type.html, #by_type.txt
+
+  if text_count > 0 and html_count == 0 then
+    -- Text is recognised but the HTML structure is not
+    local top_text = best_score(by_type.txt)
+    if top_text > 0.7 then
+      local reason = string.format(
+        "Text fuzzy match (%.2f) without HTML match - possible CTA substitution",
+        top_text)
+      return top_text * 0.5, reason
+    end
+  elseif html_count > 0 and text_count == 0 then
+    -- Structure is recognised but the text is not: expected for
+    -- newsletters/notifications, so this is logged and not scored
+    lua_util.debugm('fuzzy_html', task,
+      'HTML match (%.2f) without text match - likely template variation',
+      best_score(by_type.html))
+  end
+
+  return 0, nil
+end
+
+--[[
+Brand-hijack heuristic: a strong match against a known HTML template whose
+text is either missing from the fuzzy results or only weakly matching.
+
+Example: Amazon email template with phishing text
+
+Returns: score, explanation (0, nil when the pattern is not present)
+]]
+exports.check_brand_hijack = function(task, html_fuzzy_result, text_fuzzy_result)
+  -- Nothing to judge without an HTML result
+  if not html_fuzzy_result then
+    return 0, nil
+  end
+
+  local html_score = html_fuzzy_result.score
+
+  -- Only a high HTML match counts as a known template
+  if not (html_score > 0.8) then
+    return 0, nil
+  end
+
+  -- Text that matches well enough is considered familiar
+  if text_fuzzy_result and not (text_fuzzy_result.score < 0.3) then
+    return 0, nil
+  end
+
+  return html_score * 0.6, string.format(
+    "Known HTML template (%.2f) with unfamiliar text - possible brand hijacking",
+    html_score)
+end
+
+return exports
diff --git a/rules/fuzzy_html_phishing.lua b/rules/fuzzy_html_phishing.lua
new file mode 100644 (file)
index 0000000..77cc50c
--- /dev/null
@@ -0,0 +1,115 @@
+--[[
+Copyright (c) 2025, Vsevolod Stakhov <vsevolod@rspamd.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]--
+
+--[[
+HTML Fuzzy Phishing Detection Rules
+
+Detects phishing based on fuzzy hash mismatches:
+1. Text content matches known legitimate email (whitelist)
+2. But HTML structure doesn't match or has different CTA domains
+3. Or vice versa: HTML structure matches but text/CTA is suspicious
+
+This indicates possible template reuse for phishing.
+]]
+
+local rspamd_logger = require "rspamd_logger"
+local lua_util = require "lua_util"
+
+local N = 'fuzzy_html_phishing'
+
+--- Flag messages whose text fuzzy-matches while the HTML structure does not.
+-- Runs only when the task mempool holds a 'fuzzy_result' variable, then scans
+-- the symbols already inserted for the task:
+--   * text fuzzy: name matches 'FUZZY.*TEXT' or equals 'R_FUZZY_HASH'
+--   * HTML fuzzy: name matches 'FUZZY.*HTML'
+-- @param task rspamd task object
+-- @return true if FUZZY_HTML_PHISHING_MISMATCH was inserted, false otherwise
+local function check_fuzzy_mismatch(task)
+  -- The variable is used purely as a presence check. Its contents are not
+  -- iterated: the former placeholder loop did no work, and ipairs() raises
+  -- under Lua 5.1/LuaJIT when handed a non-table value.
+  if not task:get_mempool():get_variable('fuzzy_result') then
+    return false
+  end
+
+  local has_text_fuzzy = false
+  local has_html_fuzzy = false
+  local text_score = 0
+  local html_score = 0
+
+  -- Classify the fuzzy symbols by name and keep the best score of each kind
+  for _, sym in ipairs(task:get_symbols_all() or {}) do
+    if sym.name:match('FUZZY.*TEXT') or sym.name == 'R_FUZZY_HASH' then
+      has_text_fuzzy = true
+      text_score = math.max(text_score, sym.score or 0)
+    end
+    if sym.name:match('FUZZY.*HTML') then
+      has_html_fuzzy = true
+      html_score = math.max(html_score, sym.score or 0)
+    end
+  end
+
+  -- Scenario 1: Text matches legitimate but no HTML match
+  -- This could indicate phishing with copied text but fake HTML/CTA
+  if has_text_fuzzy and not has_html_fuzzy and text_score > 5.0 then
+    task:insert_result('FUZZY_HTML_PHISHING_MISMATCH', 0.5,
+      string.format('text_score:%.2f', text_score))
+    lua_util.debugm(N, task,
+      'Phishing suspect: text fuzzy match (%.2f) without HTML match',
+      text_score)
+    return true
+  end
+
+  -- Scenario 2: HTML matches but text doesn't (less suspicious)
+  -- This is common for newsletters/notifications with varying content,
+  -- so a very high HTML score (known template) is only logged
+  if has_html_fuzzy and not has_text_fuzzy and html_score > 8.0 then
+    lua_util.debugm(N, task,
+      'HTML template match (%.2f) with varying text - likely legitimate newsletter',
+      html_score)
+  end
+
+  return false
+end
+
+-- Register the callback first: the virtual symbol below is attached to it
+-- through the id returned here
+local id = rspamd_config:register_symbol{
+  name = 'FUZZY_HTML_PHISHING_CHECK',
+  type = 'callback',
+  callback = check_fuzzy_mismatch,
+  score = 0.0,
+  group = 'fuzzy',
+  description = 'Check for HTML/text fuzzy mismatches indicating phishing'
+}
+
+-- Virtual symbol inserted by the callback above, tied to it as its parent
+rspamd_config:register_symbol{
+  name = 'FUZZY_HTML_PHISHING_MISMATCH',
+  type = 'virtual',
+  parent = id,
+  score = 5.0,
+  description = 'Text fuzzy matches legitimate but HTML structure does not',
+  group = 'fuzzy'
+}
+
+-- Depends on fuzzy_check
+rspamd_config:register_dependency('FUZZY_HTML_PHISHING_CHECK', 'FUZZY_CALLBACK')
index 7dd5162ac720f00728a50df5b4f865d9f4f9842b..9d83b7896d51ba248043779572fd2343e9db7e2d 100644 (file)
@@ -94,10 +94,13 @@ struct fuzzy_rule {
        struct rspamd_cryptobox_pubkey *peer_key;
        double max_score;
        double weight_threshold;
+       double html_weight; /* Weight multiplier for HTML hashes (default 1.0) */
        enum fuzzy_rule_mode mode;
        gboolean skip_unknown;
        gboolean no_share;
        gboolean no_subject;
+       gboolean html_shingles; /* Enable HTML fuzzy hashing */
+       unsigned int min_html_tags; /* Minimum tags for HTML hash */
        int learn_condition_cb;
        uint32_t retransmits;
        struct rspamd_hash_map_helper *skip_map;
@@ -127,7 +130,8 @@ enum fuzzy_result_type {
        FUZZY_RESULT_TXT,
        FUZZY_RESULT_IMG,
        FUZZY_RESULT_CONTENT,
-       FUZZY_RESULT_BIN
+       FUZZY_RESULT_BIN,
+       FUZZY_RESULT_HTML
 };
 
 struct fuzzy_client_result {
@@ -174,10 +178,12 @@ struct fuzzy_learn_session {
 #define FUZZY_CMD_FLAG_SENT (1 << 1)
 #define FUZZY_CMD_FLAG_IMAGE (1 << 2)
 #define FUZZY_CMD_FLAG_CONTENT (1 << 3)
+#define FUZZY_CMD_FLAG_HTML (1 << 4)
 
 #define FUZZY_CHECK_FLAG_NOIMAGES (1 << 0)
 #define FUZZY_CHECK_FLAG_NOATTACHMENTS (1 << 1)
 #define FUZZY_CHECK_FLAG_NOTEXT (1 << 2)
+#define FUZZY_CHECK_FLAG_NOHTML (1 << 3)
 
 struct fuzzy_cmd_io {
        uint32_t tag;
@@ -340,6 +346,9 @@ fuzzy_rule_new(const char *default_symbol, rspamd_mempool_t *pool)
                                                                  rule->mappings);
        rule->mode = fuzzy_rule_read_write;
        rule->weight_threshold = NAN;
+       rule->html_weight = 1.0;
+       rule->html_shingles = FALSE;
+       rule->min_html_tags = 10;
 
        return rule;
 }
@@ -720,6 +729,18 @@ fuzzy_parse_rule(struct rspamd_config *cfg, const ucl_object_t *obj,
                rule->weight_threshold = ucl_object_todouble(value);
        }
 
+       if ((value = ucl_object_lookup(obj, "html_shingles")) != NULL) {
+               rule->html_shingles = ucl_object_toboolean(value);
+       }
+
+       if ((value = ucl_object_lookup(obj, "min_html_tags")) != NULL) {
+               rule->min_html_tags = ucl_object_toint(value);
+       }
+
+       if ((value = ucl_object_lookup(obj, "html_weight")) != NULL) {
+               rule->html_weight = ucl_object_todouble(value);
+       }
+
        /*
         * Process rule in Lua
         */
@@ -2074,6 +2095,139 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task,
        return io;
 }
 
+/*
+ * Create fuzzy command from HTML structure (if part is HTML)
+ *
+ * Returns NULL when HTML shingles are disabled for the rule, when the part
+ * is not HTML (or carries no parsed HTML), when the part has fewer tags than
+ * rule->min_html_tags, when a cached entry has no shingles, or when
+ * rspamd_shingles_from_html() yields nothing. Otherwise returns a command
+ * allocated from task->task_pool and flagged with FUZZY_CMD_FLAG_HTML.
+ *
+ * `flag` and `weight` are written into the command only when `c` is not
+ * FUZZY_CHECK.
+ */
+static struct fuzzy_cmd_io *
+fuzzy_cmd_from_html_part(struct rspamd_task *task,
+                                                struct fuzzy_rule *rule,
+                                                int c,
+                                                int flag,
+                                                uint32_t weight,
+                                                struct rspamd_mime_text_part *part,
+                                                struct rspamd_mime_part *mp)
+{
+       struct rspamd_fuzzy_shingle_cmd *shcmd = NULL;
+       struct rspamd_fuzzy_encrypted_shingle_cmd *encshcmd = NULL;
+       struct rspamd_cached_shingles *cached = NULL;
+       struct rspamd_html_shingle *html_sh = NULL;
+       struct fuzzy_cmd_io *io;
+       unsigned int additional_length;
+       unsigned char *additional_data;
+
+       /* Check if HTML shingles are enabled for this rule */
+       if (!rule->html_shingles) {
+               return NULL;
+       }
+
+       /* Check if this is an HTML part */
+       if (!IS_TEXT_PART_HTML(part) || part->html == NULL) {
+               return NULL;
+       }
+
+       /* Check minimum tags threshold (skipped when html_features is not set) */
+       if (part->html_features && part->html_features->tags_count < rule->min_html_tags) {
+               msg_debug_fuzzy_check("HTML part has %d tags, less than minimum %d",
+                                                         part->html_features->tags_count, rule->min_html_tags);
+               return NULL;
+       }
+
+       /*
+        * NOTE(review): the lookup takes only (rule, task, mp); if the text
+        * command for the same mime part is cached under the same key, its
+        * shingles would be reused here as HTML shingles - confirm that the
+        * cache distinguishes hash types.
+        */
+       cached = fuzzy_cmd_get_cached(rule, task, mp);
+
+       if (cached) {
+               /* Copy from cache */
+               additional_length = cached->additional_length;
+               additional_data = cached->additional_data;
+
+               if (cached->sh) {
+                       /* Extensions are stored right after the encrypted command struct */
+                       encshcmd = rspamd_mempool_alloc0(task->task_pool,
+                                                                                        sizeof(*encshcmd) + additional_length);
+                       shcmd = &encshcmd->cmd;
+                       memcpy(&shcmd->sgl, cached->sh, sizeof(struct rspamd_shingle));
+                       memcpy(shcmd->basic.digest, cached->digest, sizeof(cached->digest));
+                       memcpy(((unsigned char *) encshcmd) + sizeof(*encshcmd), additional_data,
+                                  additional_length);
+                       shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+               }
+               else {
+                       /* Cached entry without shingles: nothing to send */
+                       return NULL;
+               }
+       }
+       else {
+               /* Generate HTML shingles */
+               additional_length = fuzzy_cmd_extension_length(task, rule);
+               /* The cache entry keeps its own copy of the extensions after the struct */
+               cached = rspamd_mempool_alloc0(task->task_pool, sizeof(*cached) + additional_length);
+               cached->additional_length = additional_length;
+               cached->additional_data = ((unsigned char *) cached) + sizeof(*cached);
+
+               if (additional_length > 0) {
+                       fuzzy_cmd_write_extensions(task, rule, cached->additional_data, additional_length);
+               }
+
+               encshcmd = rspamd_mempool_alloc0(task->task_pool,
+                                                                                sizeof(*encshcmd) + additional_length);
+               shcmd = &encshcmd->cmd;
+
+               msg_debug_fuzzy_check("generating HTML shingles for part with %d tags",
+                                                         part->html_features ? part->html_features->tags_count : 0);
+
+               html_sh = rspamd_shingles_from_html(part->html,
+                                                                                       rule->shingles_key->str, task->task_pool,
+                                                                                       rspamd_shingles_default_filter, NULL,
+                                                                                       rule->alg);
+
+               if (html_sh != NULL) {
+                       /* Use structure shingles for fuzzy matching */
+                       memcpy(&shcmd->sgl, &html_sh->structure_shingles, sizeof(struct rspamd_shingle));
+                       /* Use direct hash as digest for exact matching */
+                       memcpy(shcmd->basic.digest, html_sh->direct_hash, sizeof(shcmd->basic.digest));
+                       shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
+
+                       /* Cache results */
+                       cached->sh = &html_sh->structure_shingles;
+                       memcpy(cached->digest, html_sh->direct_hash, sizeof(cached->digest));
+                       additional_data = ((unsigned char *) encshcmd) + sizeof(*encshcmd);
+                       memcpy(additional_data, cached->additional_data, additional_length);
+               }
+               else {
+                       /* No HTML shingles generated */
+                       return NULL;
+               }
+
+               /* Stored only on success; the failure path above leaves the cache untouched */
+               fuzzy_cmd_set_cached(rule, task, mp, cached);
+       }
+
+       /* Not zeroed: the fields of io are assigned explicitly below */
+       io = rspamd_mempool_alloc(task->task_pool, sizeof(*io));
+       io->part = mp;
+
+       /* Random tag, also kept in io->tag */
+       shcmd->basic.tag = ottery_rand_uint32();
+       shcmd->basic.cmd = c;
+       shcmd->basic.version = RSPAMD_FUZZY_PLUGIN_VERSION;
+
+       /* Flag and value are only filled in for non-check commands */
+       if (c != FUZZY_CHECK) {
+               shcmd->basic.flag = flag;
+               shcmd->basic.value = weight;
+       }
+
+       io->tag = shcmd->basic.tag;
+       io->flags = FUZZY_CMD_FLAG_HTML;
+       /* Snapshot of the basic command taken before any encryption */
+       memcpy(&io->cmd, &shcmd->basic, sizeof(io->cmd));
+
+       if (rule->peer_key) {
+               /* Encrypt data */
+               fuzzy_encrypt_cmd(rule, &encshcmd->hdr, (unsigned char *) shcmd,
+                                                 sizeof(*shcmd) + additional_length);
+               io->io.iov_base = encshcmd;
+               io->io.iov_len = sizeof(*encshcmd) + additional_length;
+       }
+       else {
+               /*
+                * Plain command: the extensions were copied right after encshcmd;
+                * presumably `cmd` is its trailing member so they also follow
+                * shcmd directly - TODO confirm
+                */
+               io->io.iov_base = shcmd;
+               io->io.iov_len = sizeof(*shcmd) + additional_length;
+       }
+
+       return io;
+}
+
 #if 0
 static struct fuzzy_cmd_io *
 fuzzy_cmd_from_image_part (struct fuzzy_rule *rule,
@@ -2443,6 +2597,15 @@ fuzzy_insert_result(struct fuzzy_client_session *session,
                        type = "img";
                        res->type = FUZZY_RESULT_IMG;
                }
+               else if ((io->flags & FUZZY_CMD_FLAG_HTML)) {
+                       /* HTML structural hash */
+                       nval *= sqrtf(rep->v1.prob);
+                       /* Apply HTML weight multiplier from rule config */
+                       nval *= session->rule->html_weight;
+
+                       type = "html";
+                       res->type = FUZZY_RESULT_HTML;
+               }
                else {
                        /* Calc real probability */
                        nval *= sqrtf(rep->v1.prob);
@@ -3095,6 +3258,9 @@ fuzzy_controller_io_callback(int fd, short what, void *arg)
                                        if ((io->flags & FUZZY_CMD_FLAG_IMAGE)) {
                                                ftype = "img";
                                        }
+                                       else if ((io->flags & FUZZY_CMD_FLAG_HTML)) {
+                                               ftype = "html";
+                                       }
                                        else if (io->flags & FUZZY_CMD_FLAG_CONTENT) {
                                                ftype = "content";
                                        }
@@ -3340,6 +3506,19 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                                                                                  !fuzzy_check,
                                                                                                  part,
                                                                                                  mime_part);
+
+                                       /* Try HTML fuzzy hash if enabled and text hash generation succeeded/failed */
+                                       if (rule->html_shingles && !(flags & FUZZY_CHECK_FLAG_NOHTML)) {
+                                               struct fuzzy_cmd_io *html_io;
+
+                                               html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value,
+                                                                                                                  part, mime_part);
+
+                                               if (html_io) {
+                                                       /* Add HTML hash as separate command */
+                                                       g_ptr_array_add(res, html_io);
+                                               }
+                                       }
                                }
                                else if (mime_part->part_type == RSPAMD_MIME_PART_IMAGE &&
                                                 !(flags & FUZZY_CHECK_FLAG_NOIMAGES)) {
diff --git a/test/functional/configs/fuzzy_html_test.conf b/test/functional/configs/fuzzy_html_test.conf
new file mode 100644 (file)
index 0000000..4166e97
--- /dev/null
@@ -0,0 +1,53 @@
+# Test configuration for HTML fuzzy hashing
+
+.include(duplicate=append,priority=0) "{= env.TESTDIR =}/configs/plugins.conf"
+.include(duplicate=merge,priority=0) "{= env.TESTDIR =}/configs/statistic.conf"
+
+fuzzy_check {
+  # Test rule for HTML fuzzy hashing
+  rule "TEST_HTML_FUZZY" {
+    servers = "localhost:11335";
+    algorithm = "mumhash";
+    
+    # Enable HTML fuzzy hashing
+    html_shingles = true;
+    min_html_tags = 5;  # Low threshold for testing
+    html_weight = 1.0;
+    
+    symbol = "FUZZY_HTML_TEST";
+    max_score = 10.0;
+    
+    # Skip encryption for testing
+    # encryption_key = "";
+    
+    fuzzy_map = {
+      "FUZZY_HTML_WHITELIST" {
+        flag = 1;
+        max_score = 10.0;
+      }
+      "FUZZY_HTML_SPAM" {
+        flag = 2;
+        max_score = 10.0;
+      }
+    }
+  }
+  
+  # Rule with both text and HTML enabled
+  rule "TEST_COMBINED" {
+    servers = "localhost:11335";
+    algorithm = "mumhash";
+    
+    html_shingles = true;
+    min_html_tags = 3;
+    
+    symbol = "FUZZY_COMBINED_TEST";
+    max_score = 15.0;
+    
+    fuzzy_map = {
+      "FUZZY_COMBINED_MATCH" {
+        flag = 10;
+        max_score = 15.0;
+      }
+    }
+  }
+}