From 1f3e0fe563c079efd3444f6427f9582a5e6aa70b Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Thu, 30 Oct 2025 20:46:41 +0000
Subject: [PATCH] [Feature] Add structured fuzzy checks configuration

- support new checks object while preserving legacy flags
- update lua helper and default config example to leverage structured checks
---
 conf/modules.d/fuzzy_check.conf |  14 +--
 lualib/lua_fuzzy.lua            | 170 ++++++++++++++++++++++++++++++++
 src/plugins/fuzzy_check.c       |  95 ++++++++++++++++++++
 3 files changed, 274 insertions(+), 5 deletions(-)

diff --git a/conf/modules.d/fuzzy_check.conf b/conf/modules.d/fuzzy_check.conf
index e3908ade20..9aef488e0f 100644
--- a/conf/modules.d/fuzzy_check.conf
+++ b/conf/modules.d/fuzzy_check.conf
@@ -46,11 +46,15 @@ fuzzy_check {
   # want to maintain HTML structure hashes on a dedicated fuzzy storage.
   # rule "html_structure_example" {
   #   servers = "html-fuzzy.example.com:11335";
-  #   text_hashes = false; # disable text hashes for this rule
-  #   skip_images = true; # optional: do not hash images
-  #   html_shingles = true; # enable HTML structure hashing
-  #   min_html_tags = 20; # require substantial HTML before hashing
-  #   html_weight = 1.0; # adjust weight of HTML matches if needed
+  #   checks = {
+  #     text { enabled = false; } # disable text hashing for this rule
+  #     html {
+  #       enabled = true;
+  #       min_html_tags = 20; # require substantial HTML before hashing
+  #       html_weight = 1.0; # adjust weight of HTML matches if needed
+  #     }
+  #     image { enabled = false; } # optional: do not hash images
+  #   }
   #   symbol = "FUZZY_HTML_STRUCTURE";
   #   max_score = 25.0;
   #   fuzzy_map = {
diff --git a/lualib/lua_fuzzy.lua b/lualib/lua_fuzzy.lua
index 23757ebb7c..e91033594c 100644
--- a/lualib/lua_fuzzy.lua
+++ b/lualib/lua_fuzzy.lua
@@ -70,6 +70,173 @@ local policy_schema_open = ts.shape(schema_fields, {
 
 local exports = {}
 
+-- Translates the structured `checks` object of a rule into the flat rule
+-- options (text_hashes, html_shingles, skip_images, scan_archives, ...).
+-- Section names are matched case-insensitively; a section that is present
+-- enables its check unless it sets `enabled` to a false value. The rule is
+-- modified in place and `rule.checks` is removed afterwards.
+-- @param {table} rule fuzzy rule being processed
+local function apply_checks_overrides(rule)
+  local checks = rule.checks
+
+  if type(checks) ~= 'table' then
+    return
+  end
+
+  -- Case-insensitive lookup of a section inside `checks`
+  local function find_section(name)
+    local lname = name:lower()
+
+    for k, v in pairs(checks) do
+      if type(k) == 'string' and k:lower() == lname then
+        return v
+      end
+    end
+
+    return nil
+  end
+
+  -- Optional boolean: nil means the key is not set
+  local function bool_opt(section, key)
+    if type(section) ~= 'table' then
+      return nil
+    end
+
+    local value = section[key]
+
+    if value == nil then
+      return nil
+    end
+
+    -- Native booleans are returned as is; lua_util.toboolean is only used
+    -- for other spellings such as strings or numbers.
+    -- NOTE(review): confirm how lua_util.toboolean treats boolean input
+    if type(value) == 'boolean' then
+      return value
+    end
+
+    return lua_util.toboolean(value)
+  end
+
+  -- Optional number: nil when the key is not set or tonumber() fails
+  local function number_opt(section, key)
+    if type(section) ~= 'table' then
+      return nil
+    end
+
+    if section[key] == nil then
+      return nil
+    end
+
+    return tonumber(section[key])
+  end
+
+  local text_section = find_section('text')
+
+  if text_section then
+    local enabled = bool_opt(text_section, 'enabled')
+
+    if enabled == nil then
+      enabled = true
+    end
+
+    rule.text_hashes = enabled
+
+    local opt = bool_opt(text_section, 'no_subject')
+
+    if opt ~= nil then
+      rule.no_subject = opt
+    end
+
+    opt = bool_opt(text_section, 'short_text_direct_hash')
+
+    if opt ~= nil then
+      rule.short_text_direct_hash = opt
+    end
+
+    local num = number_opt(text_section, 'min_length')
+
+    if num ~= nil then
+      rule.min_length = num
+    end
+
+    num = number_opt(text_section, 'text_multiplier')
+
+    if num ~= nil then
+      rule.text_multiplier = num
+    end
+  end
+
+  local html_section = find_section('html')
+
+  if html_section then
+    local enabled = bool_opt(html_section, 'enabled')
+
+    if enabled == nil then
+      enabled = true
+    end
+
+    rule.html_shingles = enabled
+
+    local num = number_opt(html_section, 'min_html_tags')
+
+    if num == nil then
+      num = number_opt(html_section, 'min_tags')
+    end
+
+    if num ~= nil then
+      rule.min_html_tags = num
+    end
+
+    num = number_opt(html_section, 'html_weight')
+
+    if num == nil then
+      num = number_opt(html_section, 'weight')
+    end
+
+    if num ~= nil then
+      rule.html_weight = num
+    end
+  end
+
+  local image_section = find_section('image') or find_section('images')
+
+  if image_section then
+    local enabled = bool_opt(image_section, 'enabled')
+
+    if enabled == nil then
+      enabled = true
+    end
+
+    rule.skip_images = not enabled
+
+    local num = number_opt(image_section, 'min_height')
+
+    if num ~= nil then
+      rule.min_height = num
+    end
+
+    num = number_opt(image_section, 'min_width')
+
+    if num ~= nil then
+      rule.min_width = num
+    end
+  end
+
+  local archive_section = find_section('archive') or find_section('archives')
+
+  if archive_section then
+    local enabled = bool_opt(archive_section, 'enabled')
+
+    if enabled == nil then
+      enabled = true
+    end
+
+    rule.scan_archives = enabled
+  end
+
+  rule.checks = nil
+end
 
 --[[[
 -- @function lua_fuzzy.register_policy(name, policy)
@@ -107,6 +274,8 @@ exports.process_rule = function(rule)
   if policy then
     processed_rule = lua_util.override_defaults(policy, processed_rule)
 
+    apply_checks_overrides(processed_rule)
+
     local parsed_policy, err = policy_schema_open:transform(processed_rule)
 
     if not parsed_policy then
@@ -116,6 +285,7 @@ exports.process_rule = function(rule)
     end
   else
     rspamd_logger.warnx(rspamd_config, "unknown policy %s", processed_rule.policy)
+    apply_checks_overrides(processed_rule)
   end
 
   if processed_rule.mime_types then
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index 8554f76bbe..bd10f52ffa 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -28,6 +28,7 @@
  * - whitelist (map string): map of ip addresses that should not be checked with this module
  * - servers (string): list of fuzzy servers in format "server1:port,server2:port" - these servers would be used for checking and storing
  * fuzzy hashes
+ * - checks (object): structured configuration of content hashing routines (e.g.
checks { text { enabled = true; }, html { enabled = true; } })
  */
 
 #include "config.h"
@@ -412,6 +413,87 @@ parse_fuzzy_headers(struct rspamd_config *cfg, const char *str)
 	return res;
 }
 
+/**
+ * Apply the structured `checks` object to a fuzzy rule (NULL is a no-op).
+ * Section names are case-insensitive; a present `text` or `html` section
+ * enables that check unless `enabled` is false. In `html`, `min_html_tags` and
+ * `html_weight` win over their `min_tags`/`weight` aliases. Image and archive
+ * sections are skipped here; any other section name is logged as unknown.
+ */
+static void
+fuzzy_rule_apply_checks(struct fuzzy_rule *rule,
+						struct rspamd_config *cfg,
+						const ucl_object_t *checks)
+{
+	const ucl_object_t *cur, *opt;
+	ucl_object_iter_t it = NULL;
+	const char *rule_name;
+
+	if (checks == NULL) {
+		return;
+	}
+
+	rule_name = rule->name ? rule->name : (rule->symbol ? rule->symbol : "unknown");
+
+	if (ucl_object_type(checks) != UCL_OBJECT) {
+		msg_warn_config("checks parameter for fuzzy rule %s must be an object", rule_name);
+		return;
+	}
+
+	while ((cur = ucl_object_iterate(checks, &it, true)) != NULL) {
+		const char *check_name = ucl_object_key(cur);
+
+		if (check_name == NULL) {
+			continue;
+		}
+
+		if (ucl_object_type(cur) != UCL_OBJECT) {
+			msg_warn_config("check %s in fuzzy rule %s must be an object", check_name, rule_name);
+			continue;
+		}
+
+		if (g_ascii_strcasecmp(check_name, "text") == 0) {
+			gboolean enabled = TRUE;
+
+			if ((opt = ucl_object_lookup(cur, "enabled")) != NULL) {
+				enabled = ucl_object_toboolean(opt);
+			}
+
+			rule->text_hashes = enabled;
+
+			if ((opt = ucl_object_lookup(cur, "no_subject")) != NULL) {
+				rule->no_subject = ucl_object_toboolean(opt);
+			}
+		}
+		else if (g_ascii_strcasecmp(check_name, "html") == 0) {
+			gboolean enabled = TRUE;
+
+			if ((opt = ucl_object_lookup(cur, "enabled")) != NULL) {
+				enabled = ucl_object_toboolean(opt);
+			}
+
+			rule->html_shingles = enabled;
+
+			if ((opt = ucl_object_lookup_any(cur, "min_html_tags", "min_tags", NULL)) != NULL) {
+				rule->min_html_tags = ucl_object_toint(opt);
+			}
+
+			if ((opt = ucl_object_lookup_any(cur, "html_weight", "weight", NULL)) != NULL) {
+				rule->html_weight = ucl_object_todouble(opt);
+			}
+		}
+		else {
+			/* Other checks are processed by lua_fuzzy; keep legacy behaviour */
+			if (g_ascii_strcasecmp(check_name, "images") != 0 &&
+				g_ascii_strcasecmp(check_name, "image") != 0 &&
+				g_ascii_strcasecmp(check_name, "archives") != 0 &&
+				g_ascii_strcasecmp(check_name, "archive") != 0) {
+				msg_warn_config("unknown check type '%s' in fuzzy rule %s", check_name, rule_name);
+			}
+		}
+	}
+}
+
 static double
 fuzzy_normalize(int32_t in, double weight)
 {
@@ -2050,6 +2132,10 @@ fuzzy_parse_rule(struct rspamd_config *cfg, const ucl_object_t *obj,
 		rule->html_weight = ucl_object_todouble(value);
 	}
 
+	if ((value = ucl_object_lookup(obj, "checks")) != NULL) {
+		fuzzy_rule_apply_checks(rule, cfg, value);
+	}
+
 	/* Initialize rate tracker */
 	rule->rate_tracker.requests_count = 0;
 	rule->rate_tracker.window_start = 0;
@@ -2439,6 +2525,15 @@ int fuzzy_check_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
 							   0,
 							   NULL,
 							   0);
+	rspamd_rcl_add_doc_by_path(cfg,
+							   "fuzzy_check.rule",
+							   "Content hashing checks configuration object (e.g. { text = { enabled = true; }, html = { enabled = true; } })",
+							   "checks",
+							   UCL_OBJECT,
+							   NULL,
+							   0,
+							   NULL,
+							   0);
 	rspamd_rcl_add_doc_by_path(cfg,
 							   "fuzzy_check.rule",
 							   "Override module default min bytes for this rule",
-- 
2.47.3