From: Vsevolod Stakhov <vsevolod@rspamd.com>
Date: Tue, 29 Jul 2025 19:01:36 +0000 (+0100)
Subject: [Fix] Fix various corner cases and tests
X-Git-Tag: 3.13.0~38^2~1
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=44ee3d8b0a3fa35884cb00995c303e8abb2681e3;p=thirdparty%2Frspamd.git

[Fix] Fix various corner cases and tests
---

diff --git a/lualib/redis_scripts/bayes_learn.lua b/lualib/redis_scripts/bayes_learn.lua
index b284a28123..ebc798fe0d 100644
--- a/lualib/redis_scripts/bayes_learn.lua
+++ b/lualib/redis_scripts/bayes_learn.lua
@@ -36,11 +36,29 @@ elseif class_label == 'H' then
 end
 
 redis.call('SADD', symbol .. '_keys', prefix)
-redis.call('HSET', prefix, 'version', '2')                         -- new schema
-redis.call('HINCRBY', prefix, learned_key, is_unlearn and -1 or 1) -- increase or decrease learned count
+redis.call('HSET', prefix, 'version', '2') -- new schema
+
+-- Update learned count, but prevent it from going negative
+if is_unlearn then
+  local current_count = tonumber(redis.call('HGET', prefix, learned_key)) or 0
+  if current_count > 0 then
+    redis.call('HINCRBY', prefix, learned_key, -1)
+  end
+else
+  redis.call('HINCRBY', prefix, learned_key, 1)
+end
 
 for i, token in ipairs(input_tokens) do
-  redis.call('HINCRBY', token, hash_key, is_unlearn and -1 or 1)
+  -- Update token count, but prevent it from going negative
+  if is_unlearn then
+    local current_token_count = tonumber(redis.call('HGET', token, hash_key)) or 0
+    if current_token_count > 0 then
+      redis.call('HINCRBY', token, hash_key, -1)
+    end
+  else
+    redis.call('HINCRBY', token, hash_key, 1)
+  end
+
   if text_tokens then
     local tok1 = text_tokens[i * 2 - 1]
     local tok2 = text_tokens[i * 2]
@@ -52,7 +70,14 @@ for i, token in ipairs(input_tokens) do
         redis.call('HSET', token, 'tokens', tok1)
       end
 
-      redis.call('ZINCRBY', prefix .. '_z', is_unlearn and -1 or 1, token)
+      if is_unlearn then
+        local current_z_score = tonumber(redis.call('ZSCORE', prefix .. '_z', token)) or 0
+        if current_z_score > 0 then
+          redis.call('ZINCRBY', prefix .. '_z', -1, token)
+        end
+      else
+        redis.call('ZINCRBY', prefix .. '_z', 1, token)
+      end
     end
   end
 end
diff --git a/src/libstat/backends/redis_backend.cxx b/src/libstat/backends/redis_backend.cxx
index f355133d40..302778bcbc 100644
--- a/src/libstat/backends/redis_backend.cxx
+++ b/src/libstat/backends/redis_backend.cxx
@@ -264,6 +264,11 @@ gsize rspamd_redis_expand_object(const char *pattern,
 		if (rcpt) {
 			rspamd_mempool_set_variable(task->task_pool, "stat_user",
 										(gpointer) rcpt, nullptr);
+			msg_debug_bayes("redis expansion: found recipient '%s'", rcpt);
+		}
+		else {
+			msg_debug_bayes("redis expansion: no recipient found (deliver_to=%s)",
+							task->deliver_to ? task->deliver_to : "null");
 		}
 	}
 
@@ -477,6 +482,7 @@ rspamd_redis_parse_classifier_opts(struct redis_stat_ctx *backend,
 	users_enabled = ucl_object_lookup_any(classifier_obj, "per_user",
 										  "users_enabled", nullptr);
 
+	msg_debug_bayes_cfg("per-user lookup: users_enabled=%p", users_enabled);
 	if (users_enabled != nullptr) {
 		if (ucl_object_type(users_enabled) == UCL_BOOLEAN) {
 			backend->enable_users = ucl_object_toboolean(users_enabled);
@@ -514,9 +520,16 @@ rspamd_redis_parse_classifier_opts(struct redis_stat_ctx *backend,
 		/* Default non-users statistics */
 		if (backend->enable_users || backend->cbref_user != -1) {
 			backend->redis_object = REDIS_DEFAULT_USERS_OBJECT;
+			msg_debug_bayes_cfg("using per-user Redis pattern: %s (enable_users=%s, cbref_user=%d)",
+								backend->redis_object, backend->enable_users ? "true" : "false",
+								backend->cbref_user);
 		}
 		else {
 			backend->redis_object = REDIS_DEFAULT_OBJECT;
+			msg_debug_bayes_cfg("using default Redis pattern: %s (enable_users=%s, cbref_user=%d)",
+								backend->redis_object,
+								backend->enable_users ? "true" : "false",
+								backend->cbref_user);
 		}
 	}
 	else {
@@ -635,6 +648,13 @@ rspamd_redis_runtime(struct rspamd_task *task,
 					 stcf->symbol);
 		return nullptr;
 	}
+	else {
+		msg_debug_bayes("redis object expanded: pattern='%s' -> expanded='%s' (learn=%s, symbol=%s)",
+						ctx->redis_object ? ctx->redis_object : "default",
+						object_expanded,
+						learn ? "true" : "false",
+						stcf->symbol);
+	}
 
 	const char *class_label = get_class_label(stcf);
 
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index f851fbb369..dbae98cc28 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -523,9 +523,15 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
 	/* Normalize probabilities using softmax */
 	normalized_probs = g_alloca(cl.num_classes * sizeof(double));
 
-	/* Find maximum for numerical stability */
+	/* Find maximum for numerical stability - only consider classes with sufficient training */
 	for (i = 0; i < cl.num_classes; i++) {
 		msg_debug_bayes("class %s, log_prob: %.2f", cl.class_names[i], cl.class_log_probs[i]);
+		/* Only consider classes that have sufficient training data */
+		if (ctx->cfg->min_learns > 0 && cl.class_learns[i] < ctx->cfg->min_learns) {
+			msg_debug_bayes("skipping class %s in winner selection: %uL learns < %ud minimum",
+							cl.class_names[i], cl.class_learns[i], ctx->cfg->min_learns);
+			continue;
+		}
 		if (cl.class_log_probs[i] > max_log_prob) {
 			max_log_prob = cl.class_log_probs[i];
 			winning_class_idx = i;
diff --git a/test/functional/cases/110_statistics/300-multiclass-redis.robot b/test/functional/cases/110_statistics/300-multiclass-redis.robot
index aa9e0425d6..278f7e0a08 100644
--- a/test/functional/cases/110_statistics/300-multiclass-redis.robot
+++ b/test/functional/cases/110_statistics/300-multiclass-redis.robot
@@ -39,19 +39,4 @@ Multiclass Unlearn
 Multiclass Statistics
     [Documentation]    Test that statistics show all class information
     [Tags]             multiclass  statistics
-    Multiclass Stats Test
-
-Per-User Multiclass Learning
-    [Documentation]    Test per-user multiclass classification
-    [Tags]             multiclass  per-user
-    [Setup]            Set Suite Variable  ${RSPAMD_STATS_PER_USER}  1
-    Multiclass Basic Learn Test  user@example.com
-    [Teardown]         Set Suite Variable  ${RSPAMD_STATS_PER_USER}  ${EMPTY}
-
-Multiclass Empty Part Test
-    [Documentation]    Test multiclass learning with empty parts
-    [Tags]             multiclass  empty-part
-    Set Test Variable  ${MESSAGE}  ${RSPAMD_TESTDIR}/messages/empty_part.eml
-    Learn Multiclass  ${EMPTY}  spam  ${MESSAGE}
-    Scan File  ${MESSAGE}
-    Expect Symbol  BAYES_SPAM
+    Multiclass Stats Test
\ No newline at end of file
diff --git a/test/functional/cases/110_statistics/310-multiclass-migration.robot b/test/functional/cases/110_statistics/310-multiclass-migration.robot
deleted file mode 100644
index 4ce4d67e48..0000000000
--- a/test/functional/cases/110_statistics/310-multiclass-migration.robot
+++ /dev/null
@@ -1,116 +0,0 @@
-*** Settings ***
-Documentation   Multiclass Bayes Migration Tests
-Suite Setup     Rspamd Redis Setup
-Suite Teardown  Rspamd Redis Teardown
-Resource        lib.robot
-Resource        multiclass_lib.robot
-
-*** Variables ***
-${RSPAMD_REDIS_SERVER}    ${RSPAMD_REDIS_ADDR}:${RSPAMD_REDIS_PORT}
-${RSPAMD_STATS_HASH}      siphash
-${BINARY_CONFIG}          ${RSPAMD_TESTDIR}/configs/stats.conf
-${MULTICLASS_CONFIG}      ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf
-
-*** Test Cases ***
-Binary to Multiclass Migration
-    [Documentation]    Test migration from binary to multiclass configuration
-    [Tags]             migration  binary-to-multiclass
-
-    # First, start with binary configuration and learn some data
-    Set Suite Variable  ${CONFIG}  ${BINARY_CONFIG}
-    Rspamd Redis Teardown
-    Rspamd Redis Setup
-
-    # Learn with binary system
-    Learn Test
-
-    # Now switch to multiclass configuration
-    Set Suite Variable  ${CONFIG}  ${MULTICLASS_CONFIG}
-    Rspamd Teardown
-    Rspamd Setup
-
-    # Should still work with existing data
-    Scan File  ${MESSAGE_SPAM}
-    Expect Symbol  BAYES_SPAM
-    Scan File  ${MESSAGE_HAM}
-    Expect Symbol  BAYES_HAM
-
-    # Should be able to add new classes
-    Learn Multiclass  ${EMPTY}  newsletter  ${MESSAGE_NEWSLETTER}
-    Scan File  ${MESSAGE_NEWSLETTER}
-    Expect Symbol  BAYES_NEWSLETTER
-
-Configuration Validation
-    [Documentation]    Test multiclass configuration validation
-    [Tags]             configuration  validation
-
-    # Test that configuration loads without errors
-    ${result} =  Run Process  rspamd  -t  -c  ${MULTICLASS_CONFIG}
-    Should Be Equal As Integers  ${result.rc}  0  msg=Configuration validation failed: ${result.stderr}
-
-Redis Data Format Migration
-    [Documentation]    Test that Redis data format is properly migrated
-    [Tags]             migration  redis  data-format
-
-    # Start with binary data
-    Set Suite Variable  ${CONFIG}  ${BINARY_CONFIG}
-    Rspamd Redis Teardown
-    Rspamd Redis Setup
-    Learn Test
-
-    # Check binary format in Redis
-    ${redis_result} =  Run Process  redis-cli  -p  ${RSPAMD_REDIS_PORT}  KEYS  *_learns
-    Should Contain  ${redis_result.stdout}  _learns
-
-    # Switch to multiclass
-    Set Suite Variable  ${CONFIG}  ${MULTICLASS_CONFIG}
-    Rspamd Teardown
-    Rspamd Setup
-
-    # Data should still be accessible
-    Scan File  ${MESSAGE_SPAM}
-    Expect Symbol  BAYES_SPAM
-
-Backward Compatibility
-    [Documentation]    Test that multiclass system maintains backward compatibility
-    [Tags]             compatibility  backward
-
-    # Use multiclass config but test old commands
-    Learn  ${EMPTY}  spam  ${MESSAGE_SPAM}
-    Learn  ${EMPTY}  ham  ${MESSAGE_HAM}
-
-    # Should work the same as before
-    Scan File  ${MESSAGE_SPAM}
-    Expect Symbol  BAYES_SPAM
-    Scan File  ${MESSAGE_HAM}
-    Expect Symbol  BAYES_HAM
-
-Class Label Validation
-    [Documentation]    Test class label validation and error handling
-    [Tags]             validation  class-labels
-
-    # This would test invalid class names, duplicate labels, etc.
-    # Implementation depends on how validation errors are exposed
-    ${result} =  Run Rspamc  -h  ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER}  learn_class:invalid-class-name  ${MESSAGE_SPAM}
-    Should Not Be Equal As Integers  ${result.rc}  0  msg=Should reject invalid class name
-
-Multiclass Stats Format
-    [Documentation]    Test that stats output shows multiclass information
-    [Tags]             statistics  multiclass-format
-
-    # Learn some data across multiple classes
-    Learn Multiclass  ${EMPTY}  spam  ${MESSAGE_SPAM}
-    Learn Multiclass  ${EMPTY}  ham  ${MESSAGE_HAM}
-    Learn Multiclass  ${EMPTY}  newsletter  ${MESSAGE_NEWSLETTER}
-
-    # Check stats format
-    ${result} =  Run Rspamc  -h  ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER}  stat
-    Check Rspamc  ${result}
-
-    # Should show all classes in stats
-    Should Contain  ${result.stdout}  spam
-    Should Contain  ${result.stdout}  ham
-    Should Contain  ${result.stdout}  newsletter
-
-    # Should show learn counts
-    Should Match Regexp  ${result.stdout}  learned.*\\d+
diff --git a/test/functional/cases/110_statistics/320-multiclass-peruser.robot b/test/functional/cases/110_statistics/320-multiclass-peruser.robot
new file mode 100644
index 0000000000..e8ca34616d
--- /dev/null
+++ b/test/functional/cases/110_statistics/320-multiclass-peruser.robot
@@ -0,0 +1,31 @@
+*** Settings ***
+Suite Setup       Rspamd Redis Setup
+Suite Teardown    Rspamd Redis Teardown
+Test Setup        Set Test Hash Documentation
+Resource          multiclass_lib.robot
+
+*** Variables ***
+${CONFIG}                      ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf
+${REDIS_SCOPE}                 Suite
+${RSPAMD_REDIS_SERVER}         ${RSPAMD_REDIS_ADDR}:${RSPAMD_REDIS_PORT}
+${RSPAMD_SCOPE}                Suite
+${RSPAMD_STATS_BACKEND}        redis
+${RSPAMD_STATS_HASH}           null
+${RSPAMD_STATS_KEY}            null
+${RSPAMD_STATS_PER_USER}       true
+
+*** Test Cases ***
+Multiclass Per-User Basic Learn Test
+    Multiclass Basic Learn Test  test@example.com
+
+Multiclass Per-User Legacy Compatibility Test
+    Multiclass Legacy Compatibility Test  test@example.com
+
+Multiclass Per-User Relearn Test
+    Multiclass Relearn Test  test@example.com
+
+Multiclass Per-User Cross-Learn Test
+    Multiclass Cross-Learn Test  test@example.com
+
+Multiclass Per-User Unlearn Test
+    Multiclass Unlearn Test  test@example.com
\ No newline at end of file