From: Vsevolod Stakhov Date: Tue, 29 Jul 2025 19:01:36 +0000 (+0100) Subject: [Fix] Fix various corner cases and tests X-Git-Tag: 3.13.0~38^2~1 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=44ee3d8b0a3fa35884cb00995c303e8abb2681e3;p=thirdparty%2Frspamd.git [Fix] Fix various corner cases and tests --- diff --git a/lualib/redis_scripts/bayes_learn.lua b/lualib/redis_scripts/bayes_learn.lua index b284a28123..ebc798fe0d 100644 --- a/lualib/redis_scripts/bayes_learn.lua +++ b/lualib/redis_scripts/bayes_learn.lua @@ -36,11 +36,29 @@ elseif class_label == 'H' then end redis.call('SADD', symbol .. '_keys', prefix) -redis.call('HSET', prefix, 'version', '2') -- new schema -redis.call('HINCRBY', prefix, learned_key, is_unlearn and -1 or 1) -- increase or decrease learned count +redis.call('HSET', prefix, 'version', '2') -- new schema + +-- Update learned count, but prevent it from going negative +if is_unlearn then + local current_count = tonumber(redis.call('HGET', prefix, learned_key)) or 0 + if current_count > 0 then + redis.call('HINCRBY', prefix, learned_key, -1) + end +else + redis.call('HINCRBY', prefix, learned_key, 1) +end for i, token in ipairs(input_tokens) do - redis.call('HINCRBY', token, hash_key, is_unlearn and -1 or 1) + -- Update token count, but prevent it from going negative + if is_unlearn then + local current_token_count = tonumber(redis.call('HGET', token, hash_key)) or 0 + if current_token_count > 0 then + redis.call('HINCRBY', token, hash_key, -1) + end + else + redis.call('HINCRBY', token, hash_key, 1) + end + if text_tokens then local tok1 = text_tokens[i * 2 - 1] local tok2 = text_tokens[i * 2] @@ -52,7 +70,14 @@ for i, token in ipairs(input_tokens) do redis.call('HSET', token, 'tokens', tok1) end - redis.call('ZINCRBY', prefix .. '_z', is_unlearn and -1 or 1, token) + if is_unlearn then + local current_z_score = tonumber(redis.call('ZSCORE', prefix .. '_z', token)) or 0 + if current_z_score > 0 then + redis.call('ZINCRBY', prefix .. '_z', -1, token) + end + else + redis.call('ZINCRBY', prefix .. '_z', 1, token) + end end end end diff --git a/src/libstat/backends/redis_backend.cxx b/src/libstat/backends/redis_backend.cxx index f355133d40..302778bcbc 100644 --- a/src/libstat/backends/redis_backend.cxx +++ b/src/libstat/backends/redis_backend.cxx @@ -264,6 +264,11 @@ gsize rspamd_redis_expand_object(const char *pattern, if (rcpt) { rspamd_mempool_set_variable(task->task_pool, "stat_user", (gpointer) rcpt, nullptr); + msg_debug_bayes("redis expansion: found recipient '%s'", rcpt); + } + else { + msg_debug_bayes("redis expansion: no recipient found (deliver_to=%s)", + task->deliver_to ? task->deliver_to : "null"); } } @@ -477,6 +482,7 @@ rspamd_redis_parse_classifier_opts(struct redis_stat_ctx *backend, users_enabled = ucl_object_lookup_any(classifier_obj, "per_user", "users_enabled", nullptr); + msg_debug_bayes_cfg("per-user lookup: users_enabled=%p", users_enabled); if (users_enabled != nullptr) { if (ucl_object_type(users_enabled) == UCL_BOOLEAN) { backend->enable_users = ucl_object_toboolean(users_enabled); @@ -514,9 +520,16 @@ rspamd_redis_parse_classifier_opts(struct redis_stat_ctx *backend, /* Default non-users statistics */ if (backend->enable_users || backend->cbref_user != -1) { backend->redis_object = REDIS_DEFAULT_USERS_OBJECT; + msg_debug_bayes_cfg("using per-user Redis pattern: %s (enable_users=%s, cbref_user=%d)", + backend->redis_object, backend->enable_users ? "true" : "false", + backend->cbref_user); } else { backend->redis_object = REDIS_DEFAULT_OBJECT; + msg_debug_bayes_cfg("using default Redis pattern: %s (enable_users=%s, cbref_user=%d)", + backend->redis_object, + backend->enable_users ? "true" : "false", + backend->cbref_user); } } else { @@ -635,6 +648,13 @@ rspamd_redis_runtime(struct rspamd_task *task, stcf->symbol); return nullptr; } + else { + msg_debug_bayes("redis object expanded: pattern='%s' -> expanded='%s' (learn=%s, symbol=%s)", + ctx->redis_object ? ctx->redis_object : "default", + object_expanded, + learn ? "true" : "false", + stcf->symbol); + } const char *class_label = get_class_label(stcf); diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c index f851fbb369..dbae98cc28 100644 --- a/src/libstat/classifiers/bayes.c +++ b/src/libstat/classifiers/bayes.c @@ -523,9 +523,15 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx, /* Normalize probabilities using softmax */ normalized_probs = g_alloca(cl.num_classes * sizeof(double)); - /* Find maximum for numerical stability */ + /* Find maximum for numerical stability - only consider classes with sufficient training */ for (i = 0; i < cl.num_classes; i++) { msg_debug_bayes("class %s, log_prob: %.2f", cl.class_names[i], cl.class_log_probs[i]); + /* Only consider classes that have sufficient training data */ + if (ctx->cfg->min_learns > 0 && cl.class_learns[i] < ctx->cfg->min_learns) { + msg_debug_bayes("skipping class %s in winner selection: %uL learns < %ud minimum", + cl.class_names[i], cl.class_learns[i], ctx->cfg->min_learns); + continue; + } if (cl.class_log_probs[i] > max_log_prob) { max_log_prob = cl.class_log_probs[i]; winning_class_idx = i; diff --git a/test/functional/cases/110_statistics/300-multiclass-redis.robot b/test/functional/cases/110_statistics/300-multiclass-redis.robot index aa9e0425d6..278f7e0a08 100644 --- a/test/functional/cases/110_statistics/300-multiclass-redis.robot +++ b/test/functional/cases/110_statistics/300-multiclass-redis.robot @@ -39,19 +39,4 @@ Multiclass Unlearn Multiclass Statistics [Documentation] Test that statistics show all class information [Tags] multiclass statistics - Multiclass Stats Test - -Per-User Multiclass Learning - [Documentation] Test per-user multiclass classification - [Tags] multiclass per-user - [Setup] Set Suite Variable ${RSPAMD_STATS_PER_USER} 1 - Multiclass Basic Learn Test user@example.com - [Teardown] Set Suite Variable ${RSPAMD_STATS_PER_USER} ${EMPTY} - -Multiclass Empty Part Test - [Documentation] Test multiclass learning with empty parts - [Tags] multiclass empty-part - Set Test Variable ${MESSAGE} ${RSPAMD_TESTDIR}/messages/empty_part.eml - Learn Multiclass ${EMPTY} spam ${MESSAGE} - Scan File ${MESSAGE} - Expect Symbol BAYES_SPAM + Multiclass Stats Test \ No newline at end of file diff --git a/test/functional/cases/110_statistics/310-multiclass-migration.robot b/test/functional/cases/110_statistics/310-multiclass-migration.robot deleted file mode 100644 index 4ce4d67e48..0000000000 --- a/test/functional/cases/110_statistics/310-multiclass-migration.robot +++ /dev/null @@ -1,116 +0,0 @@ -*** Settings *** -Documentation Multiclass Bayes Migration Tests -Suite Setup Rspamd Redis Setup -Suite Teardown Rspamd Redis Teardown -Resource lib.robot -Resource multiclass_lib.robot - -*** Variables *** -${RSPAMD_REDIS_SERVER} ${RSPAMD_REDIS_ADDR}:${RSPAMD_REDIS_PORT} -${RSPAMD_STATS_HASH} siphash -${BINARY_CONFIG} ${RSPAMD_TESTDIR}/configs/stats.conf -${MULTICLASS_CONFIG} ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf - -*** Test Cases *** -Binary to Multiclass Migration - [Documentation] Test migration from binary to multiclass configuration - [Tags] migration binary-to-multiclass - - # First, start with binary configuration and learn some data - Set Suite Variable ${CONFIG} ${BINARY_CONFIG} - Rspamd Redis Teardown - Rspamd Redis Setup - - # Learn with binary system - Learn Test - - # Now switch to multiclass configuration - Set Suite Variable ${CONFIG} ${MULTICLASS_CONFIG} - Rspamd Teardown - Rspamd Setup - - # Should still work with existing data - Scan File ${MESSAGE_SPAM} - Expect Symbol BAYES_SPAM - Scan File ${MESSAGE_HAM} - Expect Symbol BAYES_HAM - - # Should be able to add new classes - Learn Multiclass ${EMPTY} newsletter ${MESSAGE_NEWSLETTER} - Scan File ${MESSAGE_NEWSLETTER} - Expect Symbol BAYES_NEWSLETTER - -Configuration Validation - [Documentation] Test multiclass configuration validation - [Tags] configuration validation - - # Test that configuration loads without errors - ${result} = Run Process rspamd -t -c ${MULTICLASS_CONFIG} - Should Be Equal As Integers ${result.rc} 0 msg=Configuration validation failed: ${result.stderr} - -Redis Data Format Migration - [Documentation] Test that Redis data format is properly migrated - [Tags] migration redis data-format - - # Start with binary data - Set Suite Variable ${CONFIG} ${BINARY_CONFIG} - Rspamd Redis Teardown - Rspamd Redis Setup - Learn Test - - # Check binary format in Redis - ${redis_result} = Run Process redis-cli -p ${RSPAMD_REDIS_PORT} KEYS *_learns - Should Contain ${redis_result.stdout} _learns - - # Switch to multiclass - Set Suite Variable ${CONFIG} ${MULTICLASS_CONFIG} - Rspamd Teardown - Rspamd Setup - - # Data should still be accessible - Scan File ${MESSAGE_SPAM} - Expect Symbol BAYES_SPAM - -Backward Compatibility - [Documentation] Test that multiclass system maintains backward compatibility - [Tags] compatibility backward - - # Use multiclass config but test old commands - Learn ${EMPTY} spam ${MESSAGE_SPAM} - Learn ${EMPTY} ham ${MESSAGE_HAM} - - # Should work the same as before - Scan File ${MESSAGE_SPAM} - Expect Symbol BAYES_SPAM - Scan File ${MESSAGE_HAM} - Expect Symbol BAYES_HAM - -Class Label Validation - [Documentation] Test class label validation and error handling - [Tags] validation class-labels - - # This would test invalid class names, duplicate labels, etc. - # Implementation depends on how validation errors are exposed - ${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} learn_class:invalid-class-name ${MESSAGE_SPAM} - Should Not Be Equal As Integers ${result.rc} 0 msg=Should reject invalid class name - -Multiclass Stats Format - [Documentation] Test that stats output shows multiclass information - [Tags] statistics multiclass-format - - # Learn some data across multiple classes - Learn Multiclass ${EMPTY} spam ${MESSAGE_SPAM} - Learn Multiclass ${EMPTY} ham ${MESSAGE_HAM} - Learn Multiclass ${EMPTY} newsletter ${MESSAGE_NEWSLETTER} - - # Check stats format - ${result} = Run Rspamc -h ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER} stat - Check Rspamc ${result} - - # Should show all classes in stats - Should Contain ${result.stdout} spam - Should Contain ${result.stdout} ham - Should Contain ${result.stdout} newsletter - - # Should show learn counts - Should Match Regexp ${result.stdout} learned.*\\d+ diff --git a/test/functional/cases/110_statistics/320-multiclass-peruser.robot b/test/functional/cases/110_statistics/320-multiclass-peruser.robot new file mode 100644 index 0000000000..e8ca34616d --- /dev/null +++ b/test/functional/cases/110_statistics/320-multiclass-peruser.robot @@ -0,0 +1,31 @@ +*** Settings *** +Suite Setup Rspamd Redis Setup +Suite Teardown Rspamd Redis Teardown +Test Setup Set Test Hash Documentation +Resource multiclass_lib.robot + +*** Variables *** +${CONFIG} ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf +${REDIS_SCOPE} Suite +${RSPAMD_REDIS_SERVER} ${RSPAMD_REDIS_ADDR}:${RSPAMD_REDIS_PORT} +${RSPAMD_SCOPE} Suite +${RSPAMD_STATS_BACKEND} redis +${RSPAMD_STATS_HASH} null +${RSPAMD_STATS_KEY} null +${RSPAMD_STATS_PER_USER} true + +*** Test Cases *** +Multiclass Per-User Basic Learn Test + Multiclass Basic Learn Test test@example.com + +Multiclass Per-User Legacy Compatibility Test + Multiclass Legacy Compatibility Test test@example.com + +Multiclass Per-User Relearn Test + Multiclass Relearn Test test@example.com + +Multiclass Per-User Cross-Learn Test + Multiclass Cross-Learn Test test@example.com + +Multiclass Per-User Unlearn Test + Multiclass Unlearn Test test@example.com \ No newline at end of file