git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add structured fuzzy checks configuration
author Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 30 Oct 2025 20:46:41 +0000 (20:46 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 1 Nov 2025 09:45:20 +0000 (09:45 +0000)
- support new checks object while preserving legacy flags

- update lua helper and default config example to leverage structured checks

conf/modules.d/fuzzy_check.conf
lualib/lua_fuzzy.lua
src/plugins/fuzzy_check.c

index e3908ade2021640c8cd1d84b1b7b013f2551adee..9aef488e0f7e761438e06b9ac1f940cdff517f7f 100644 (file)
@@ -46,11 +46,15 @@ fuzzy_check {
   # want to maintain HTML structure hashes on a dedicated fuzzy storage.
   # rule "html_structure_example" {
   #   servers = "html-fuzzy.example.com:11335";
-  #   text_hashes = false;          # disable text hashes for this rule
-  #   skip_images = true;           # optional: do not hash images
-  #   html_shingles = true;         # enable HTML structure hashing
-  #   min_html_tags = 20;           # require substantial HTML before hashing
-  #   html_weight = 1.0;            # adjust weight of HTML matches if needed
+  #   checks = {
+  #     text { enabled = false; }    # disable text hashing for this rule
+  #     html {
+  #       enabled = true;
+  #       min_html_tags = 20;        # require substantial HTML before hashing
+  #       html_weight = 1.0;         # adjust weight of HTML matches if needed
+  #     }
+  #     image { enabled = false; }   # optional: do not hash images
+  #   }
   #   symbol = "FUZZY_HTML_STRUCTURE";
   #   max_score = 25.0;
   #   fuzzy_map = {
index 23757ebb7c3a489ffdc332741aa0e516a8d2e7af..e91033594cc0203079e4c7d7b09ce97868a85e94 100644 (file)
@@ -70,6 +70,155 @@ local policy_schema_open = ts.shape(schema_fields, {
 
 local exports = {}
 
+-- Maps the structured `checks` object of a fuzzy rule onto the legacy flat
+-- rule options and then drops `rule.checks`.
+--
+-- Recognised sections (names are matched case-insensitively):
+--   text               -> text_hashes, no_subject, short_text_direct_hash,
+--                         min_length, text_multiplier
+--   html               -> html_shingles, min_html_tags (alias: min_tags),
+--                         html_weight (alias: weight)
+--   image / images     -> skip_images (inverse of `enabled`), min_height,
+--                         min_width
+--   archive / archives -> scan_archives
+--
+-- A section that is present enables its check unless `enabled` is set to a
+-- false value. Only table-valued sections are honoured: a scalar such as
+-- `text = "off"` is ignored rather than being treated as an empty section,
+-- which would silently switch the check on and override the legacy flags.
+--
+-- If `rule.checks` is not a table the rule is left untouched.
+--
+-- @param {table} rule rule being processed; modified in place
+local function apply_checks_overrides(rule)
+  local checks = rule.checks
+
+  if type(checks) ~= 'table' then
+    return
+  end
+
+  -- Case-insensitive section lookup; values that are not tables never match
+  local function find_section(name)
+    local lname = name:lower()
+
+    for k, v in pairs(checks) do
+      if type(k) == 'string' and type(v) == 'table' and k:lower() == lname then
+        return v
+      end
+    end
+
+    return nil
+  end
+
+  -- Returns section[key] coerced by lua_util.toboolean, or nil when unset
+  local function bool_opt(section, key)
+    if section[key] == nil then
+      return nil
+    end
+
+    return lua_util.toboolean(section[key])
+  end
+
+  -- Returns section[key] converted by tonumber, or nil when unset/non-numeric
+  local function number_opt(section, key)
+    if section[key] == nil then
+      return nil
+    end
+
+    return tonumber(section[key])
+  end
+
+  -- `enabled` defaults to true for any section that is present
+  local function is_enabled(section)
+    local enabled = bool_opt(section, 'enabled')
+
+    if enabled == nil then
+      return true
+    end
+
+    return enabled
+  end
+
+  -- Stores section[key] (falling back to section[alias]) into rule[key];
+  -- the rule option is left as is when neither yields a value
+  local function override(section, getter, key, alias)
+    local value = getter(section, key)
+
+    if value == nil and alias then
+      value = getter(section, alias)
+    end
+
+    if value ~= nil then
+      rule[key] = value
+    end
+  end
+
+  local text_section = find_section('text')
+
+  if text_section then
+    rule.text_hashes = is_enabled(text_section)
+    override(text_section, bool_opt, 'no_subject')
+    override(text_section, bool_opt, 'short_text_direct_hash')
+    override(text_section, number_opt, 'min_length')
+    override(text_section, number_opt, 'text_multiplier')
+  end
+
+  local html_section = find_section('html')
+
+  if html_section then
+    rule.html_shingles = is_enabled(html_section)
+    override(html_section, number_opt, 'min_html_tags', 'min_tags')
+    override(html_section, number_opt, 'html_weight', 'weight')
+  end
+
+  local image_section = find_section('image') or find_section('images')
+
+  if image_section then
+    -- The legacy flag is inverted: enabled images means skip_images = false
+    rule.skip_images = not is_enabled(image_section)
+    override(image_section, number_opt, 'min_height')
+    override(image_section, number_opt, 'min_width')
+  end
+
+  local archive_section = find_section('archive') or find_section('archives')
+
+  if archive_section then
+    rule.scan_archives = is_enabled(archive_section)
+  end
+
+  -- The structured object has been folded into flat options; remove it
+  rule.checks = nil
+end
 
 --[[[
 -- @function lua_fuzzy.register_policy(name, policy)
@@ -107,6 +256,8 @@ exports.process_rule = function(rule)
   if policy then
     processed_rule = lua_util.override_defaults(policy, processed_rule)
 
+    apply_checks_overrides(processed_rule)
+
     local parsed_policy, err = policy_schema_open:transform(processed_rule)
 
     if not parsed_policy then
@@ -116,6 +267,7 @@ exports.process_rule = function(rule)
     end
   else
     rspamd_logger.warnx(rspamd_config, "unknown policy %s", processed_rule.policy)
+    apply_checks_overrides(processed_rule)
   end
 
   if processed_rule.mime_types then
index 8554f76bbe8669e35424d80985473b82b8503b51..bd10f52ffa4a5c5929e00fe37ba22ebcd1feb746 100644 (file)
@@ -28,6 +28,7 @@
  * - whitelist (map string): map of ip addresses that should not be checked with this module
  * - servers (string): list of fuzzy servers in format "server1:port,server2:port" - these servers would be used for checking and storing
  *   fuzzy hashes
+ * - checks (object): structured configuration of content hashing routines (e.g. checks { text { enabled = true; }, html { enabled = true; } })
  */
 
 #include "config.h"
@@ -412,6 +413,87 @@ parse_fuzzy_headers(struct rspamd_config *cfg, const char *str)
        return res;
 }
 
+/**
+ * Apply the structured `checks` object of a fuzzy rule to the rule flags.
+ *
+ * Section names are compared case-insensitively:
+ *  - `text`: sets text_hashes (from `enabled`, default TRUE) and no_subject
+ *  - `html`: sets html_shingles (from `enabled`, default TRUE), min_html_tags
+ *    (`min_html_tags`, else `min_tags`) and html_weight (`html_weight`, else
+ *    `weight`)
+ *  - `image(s)` / `archive(s)`: accepted without a warning and left to
+ *    lua_fuzzy
+ *  - anything else: a warning is logged
+ *
+ * A `checks` value or a section that is not an object is reported with a
+ * warning and skipped.
+ *
+ * @param rule rule whose flags are updated in place
+ * @param cfg configuration object (presumably consumed by msg_warn_config)
+ * @param checks value of the `checks` key; NULL is a no-op
+ */
+static void
+fuzzy_rule_apply_checks(struct fuzzy_rule *rule,
+                                               struct rspamd_config *cfg,
+                                               const ucl_object_t *checks)
+{
+       const ucl_object_t *cur, *opt;
+       ucl_object_iter_t it = NULL;
+       const char *rule_name;
+
+       if (checks == NULL) {
+               return;
+       }
+
+       /* Resolve the name used in log messages once */
+       rule_name = rule->name ? rule->name : (rule->symbol ? rule->symbol : "unknown");
+
+       if (checks->type != UCL_OBJECT) {
+               msg_warn_config("checks parameter for fuzzy rule %s must be an object", rule_name);
+               return;
+       }
+
+       while ((cur = ucl_object_iterate(checks, &it, true)) != NULL) {
+               const char *check_name = ucl_object_key(cur);
+
+               if (check_name == NULL) {
+                       continue;
+               }
+
+               if (cur->type != UCL_OBJECT) {
+                       msg_warn_config("check %s in fuzzy rule %s must be an object", check_name, rule_name);
+                       continue;
+               }
+
+               if (g_ascii_strcasecmp(check_name, "text") == 0) {
+                       /* A present section enables the check unless told otherwise */
+                       gboolean enabled = TRUE;
+
+                       if ((opt = ucl_object_lookup(cur, "enabled")) != NULL) {
+                               enabled = ucl_object_toboolean(opt);
+                       }
+
+                       rule->text_hashes = enabled;
+
+                       if ((opt = ucl_object_lookup(cur, "no_subject")) != NULL) {
+                               rule->no_subject = ucl_object_toboolean(opt);
+                       }
+               }
+               else if (g_ascii_strcasecmp(check_name, "html") == 0) {
+                       gboolean enabled = TRUE;
+
+                       if ((opt = ucl_object_lookup(cur, "enabled")) != NULL) {
+                               enabled = ucl_object_toboolean(opt);
+                       }
+
+                       rule->html_shingles = enabled;
+
+                       /* The long option name wins over its short alias */
+                       opt = ucl_object_lookup(cur, "min_html_tags");
+
+                       if (opt == NULL) {
+                               opt = ucl_object_lookup(cur, "min_tags");
+                       }
+
+                       if (opt != NULL) {
+                               rule->min_html_tags = ucl_object_toint(opt);
+                       }
+
+                       opt = ucl_object_lookup(cur, "html_weight");
+
+                       if (opt == NULL) {
+                               opt = ucl_object_lookup(cur, "weight");
+                       }
+
+                       if (opt != NULL) {
+                               rule->html_weight = ucl_object_todouble(opt);
+                       }
+               }
+               else {
+                       /* Other checks are processed by lua_fuzzy; keep legacy behaviour */
+                       if (g_ascii_strcasecmp(check_name, "images") != 0 &&
+                               g_ascii_strcasecmp(check_name, "image") != 0 &&
+                               g_ascii_strcasecmp(check_name, "archives") != 0 &&
+                               g_ascii_strcasecmp(check_name, "archive") != 0) {
+                               msg_warn_config("unknown check type '%s' in fuzzy rule %s", check_name, rule_name);
+                       }
+               }
+       }
+}
+
 static double
 fuzzy_normalize(int32_t in, double weight)
 {
@@ -2050,6 +2132,10 @@ fuzzy_parse_rule(struct rspamd_config *cfg, const ucl_object_t *obj,
                rule->html_weight = ucl_object_todouble(value);
        }
 
+       if ((value = ucl_object_lookup(obj, "checks")) != NULL) {
+               fuzzy_rule_apply_checks(rule, cfg, value);
+       }
+
        /* Initialize rate tracker */
        rule->rate_tracker.requests_count = 0;
        rule->rate_tracker.window_start = 0;
@@ -2439,6 +2525,15 @@ int fuzzy_check_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
                                   0,
                                   NULL,
                                   0);
+       rspamd_rcl_add_doc_by_path(cfg,
+                                  "fuzzy_check.rule",
+                                  "Content hashing checks configuration object (e.g. { text = { enabled = true; }, html = { enabled = true; } })",
+                                  "checks",
+                                  UCL_OBJECT,
+                                  NULL,
+                                  0,
+                                  NULL,
+                                  0);
        rspamd_rcl_add_doc_by_path(cfg,
                                                           "fuzzy_check.rule",
                                                           "Override module default min bytes for this rule",