# want to maintain HTML structure hashes on a dedicated fuzzy storage.
# rule "html_structure_example" {
# servers = "html-fuzzy.example.com:11335";
- # text_hashes = false; # disable text hashes for this rule
- # skip_images = true; # optional: do not hash images
- # html_shingles = true; # enable HTML structure hashing
- # min_html_tags = 20; # require substantial HTML before hashing
- # html_weight = 1.0; # adjust weight of HTML matches if needed
+ # checks = {
+ # text { enabled = false; } # disable text hashing for this rule
+ # html {
+ # enabled = true;
+ # min_html_tags = 20; # require substantial HTML before hashing
+ # html_weight = 1.0; # adjust weight of HTML matches if needed
+ # }
+ # image { enabled = false; } # optional: do not hash images
+ # }
# symbol = "FUZZY_HTML_STRUCTURE";
# max_score = 25.0;
# fuzzy_map = {
local exports = {}
+--- Fold the structured `checks` table of a fuzzy rule into flat rule options.
+-- Recognised sections (names are matched case-insensitively): `text`, `html`,
+-- `image`/`images` and `archive`/`archives`. A section that is present but has
+-- no `enabled` key is treated as enabled. The `checks` key is removed from the
+-- rule once it has been processed.
+-- @param rule table of rule options; modified in place
+local function apply_checks_overrides(rule)
+  local checks = rule.checks
+
+  if type(checks) ~= 'table' then
+    return
+  end
+
+  -- Case-insensitive lookup of a section by name; nil when there is no such key
+  local function find_section(name)
+    local lname = name:lower()
+
+    for k, v in pairs(checks) do
+      if type(k) == 'string' and k:lower() == lname then
+        return v
+      end
+    end
+
+    return nil
+  end
+
+  -- Boolean value of `section[key]`, or nil when the section is not a table
+  -- or the key is absent
+  local function bool_opt(section, key)
+    if type(section) ~= 'table' then
+      return nil
+    end
+
+    local value = section[key]
+
+    if value == nil then
+      return nil
+    end
+
+    if type(value) == 'boolean' then
+      -- Already a boolean: return it untouched instead of converting it.
+      -- NOTE(review): lua_util.toboolean is documented for strings and
+      -- numbers; confirm how it treats boolean input.
+      return value
+    end
+
+    -- Parentheses keep only the first return value of the converter
+    return (lua_util.toboolean(value))
+  end
+
+  -- `enabled` flag of a section; a missing flag means the check is enabled
+  local function enabled_opt(section)
+    local enabled = bool_opt(section, 'enabled')
+
+    if enabled == nil then
+      return true
+    end
+
+    return enabled
+  end
+
+  -- First of the given keys whose value converts to a number, or nil.
+  -- Extra keys act as aliases that are tried in order.
+  local function number_opt(section, ...)
+    if type(section) ~= 'table' then
+      return nil
+    end
+
+    for i = 1, select('#', ...) do
+      local value = section[(select(i, ...))]
+
+      if value ~= nil then
+        local num = tonumber(value)
+
+        if num ~= nil then
+          return num
+        end
+      end
+    end
+
+    return nil
+  end
+
+  -- Copy boolean options that share their name between section and rule
+  local function copy_bools(section, keys)
+    for _, key in ipairs(keys) do
+      local opt = bool_opt(section, key)
+
+      if opt ~= nil then
+        rule[key] = opt
+      end
+    end
+  end
+
+  -- Copy numeric options that share their name between section and rule
+  local function copy_numbers(section, keys)
+    for _, key in ipairs(keys) do
+      local num = number_opt(section, key)
+
+      if num ~= nil then
+        rule[key] = num
+      end
+    end
+  end
+
+  local text_section = find_section('text')
+
+  if text_section then
+    rule.text_hashes = enabled_opt(text_section)
+    copy_bools(text_section, { 'no_subject', 'short_text_direct_hash' })
+    copy_numbers(text_section, { 'min_length', 'text_multiplier' })
+  end
+
+  local html_section = find_section('html')
+
+  if html_section then
+    rule.html_shingles = enabled_opt(html_section)
+
+    -- Short aliases are accepted when the long option name is not usable
+    local num = number_opt(html_section, 'min_html_tags', 'min_tags')
+
+    if num ~= nil then
+      rule.min_html_tags = num
+    end
+
+    num = number_opt(html_section, 'html_weight', 'weight')
+
+    if num ~= nil then
+      rule.html_weight = num
+    end
+  end
+
+  local image_section = find_section('image') or find_section('images')
+
+  if image_section then
+    -- The rule stores the inverse flag
+    rule.skip_images = not enabled_opt(image_section)
+    copy_numbers(image_section, { 'min_height', 'min_width' })
+  end
+
+  local archive_section = find_section('archive') or find_section('archives')
+
+  if archive_section then
+    rule.scan_archives = enabled_opt(archive_section)
+  end
+
+  -- The structured form has been consumed; drop it from the rule
+  rule.checks = nil
+end
--[[[
-- @function lua_fuzzy.register_policy(name, policy)
if policy then
processed_rule = lua_util.override_defaults(policy, processed_rule)
+ apply_checks_overrides(processed_rule)
+
local parsed_policy, err = policy_schema_open:transform(processed_rule)
if not parsed_policy then
end
else
rspamd_logger.warnx(rspamd_config, "unknown policy %s", processed_rule.policy)
+ apply_checks_overrides(processed_rule)
end
if processed_rule.mime_types then
* - whitelist (map string): map of ip addresses that should not be checked with this module
* - servers (string): list of fuzzy servers in format "server1:port,server2:port" - these servers would be used for checking and storing
* fuzzy hashes
+ * - checks (object): structured configuration of content hashing routines (e.g. checks { text { enabled = true; }, html { enabled = true; } })
*/
#include "config.h"
return res;
}
+/**
+ * Apply the structured `checks` object of a fuzzy rule to the rule itself.
+ *
+ * The `text` section sets text_hashes (from `enabled`, default on) and
+ * no_subject. The `html` section sets html_shingles (from `enabled`, default
+ * on), min_html_tags (`min_html_tags` or `min_tags`) and html_weight
+ * (`html_weight` or `weight`). The image(s) and archive(s) sections are
+ * accepted without action here; any other section name is reported with a
+ * warning.
+ *
+ * @param rule rule to update
+ * @param cfg configuration context; not referenced by name in this body,
+ *            presumably consumed by msg_warn_config (confirm against the macro)
+ * @param checks value of the `checks` key, may be NULL
+ */
+static void
+fuzzy_rule_apply_checks(struct fuzzy_rule *rule,
+						struct rspamd_config *cfg,
+						const ucl_object_t *checks)
+{
+	const ucl_object_t *cur, *opt;
+	ucl_object_iter_t it = NULL;
+	const char *rule_name;
+
+	if (checks == NULL) {
+		return;
+	}
+
+	/* Name used in diagnostics: rule name, then symbol, then a placeholder */
+	rule_name = rule->name ? rule->name : (rule->symbol ? rule->symbol : "unknown");
+
+	if (checks->type != UCL_OBJECT) {
+		msg_warn_config("checks parameter for fuzzy rule %s must be an object", rule_name);
+		return;
+	}
+
+	while ((cur = ucl_object_iterate(checks, &it, true)) != NULL) {
+		const char *check_name = ucl_object_key(cur);
+
+		if (check_name == NULL) {
+			continue;
+		}
+
+		if (cur->type != UCL_OBJECT) {
+			msg_warn_config("check %s in fuzzy rule %s must be an object", check_name, rule_name);
+			continue;
+		}
+
+		if (g_ascii_strcasecmp(check_name, "text") == 0) {
+			/* A section without an explicit `enabled` key turns the check on */
+			gboolean enabled = TRUE;
+
+			if ((opt = ucl_object_lookup(cur, "enabled")) != NULL) {
+				enabled = ucl_object_toboolean(opt);
+			}
+
+			rule->text_hashes = enabled;
+
+			if ((opt = ucl_object_lookup(cur, "no_subject")) != NULL) {
+				rule->no_subject = ucl_object_toboolean(opt);
+			}
+		}
+		else if (g_ascii_strcasecmp(check_name, "html") == 0) {
+			gboolean enabled = TRUE;
+
+			if ((opt = ucl_object_lookup(cur, "enabled")) != NULL) {
+				enabled = ucl_object_toboolean(opt);
+			}
+
+			rule->html_shingles = enabled;
+
+			/* The long option name takes precedence over its short alias */
+			opt = ucl_object_lookup(cur, "min_html_tags");
+
+			if (opt == NULL) {
+				opt = ucl_object_lookup(cur, "min_tags");
+			}
+
+			if (opt != NULL) {
+				rule->min_html_tags = ucl_object_toint(opt);
+			}
+
+			opt = ucl_object_lookup(cur, "html_weight");
+
+			if (opt == NULL) {
+				opt = ucl_object_lookup(cur, "weight");
+			}
+
+			if (opt != NULL) {
+				rule->html_weight = ucl_object_todouble(opt);
+			}
+		}
+		else if (g_ascii_strcasecmp(check_name, "images") != 0 &&
+				 g_ascii_strcasecmp(check_name, "image") != 0 &&
+				 g_ascii_strcasecmp(check_name, "archives") != 0 &&
+				 g_ascii_strcasecmp(check_name, "archive") != 0) {
+			/* Image and archive checks are processed by lua_fuzzy; anything else is unknown */
+			msg_warn_config("unknown check type '%s' in fuzzy rule %s", check_name, rule_name);
+		}
+	}
+}
+
+
static double
fuzzy_normalize(int32_t in, double weight)
{
rule->html_weight = ucl_object_todouble(value);
}
+ if ((value = ucl_object_lookup(obj, "checks")) != NULL) {
+ fuzzy_rule_apply_checks(rule, cfg, value);
+ }
+
/* Initialize rate tracker */
rule->rate_tracker.requests_count = 0;
rule->rate_tracker.window_start = 0;
0,
NULL,
0);
+ rspamd_rcl_add_doc_by_path(cfg,
+ "fuzzy_check.rule",
+ "Content hashing checks configuration object (e.g. { text = { enabled = true; }, html = { enabled = true; } })",
+ "checks",
+ UCL_OBJECT,
+ NULL,
+ 0,
+ NULL,
+ 0);
rspamd_rcl_add_doc_by_path(cfg,
"fuzzy_check.rule",
"Override module default min bytes for this rule",