]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Fix various corner cases and tests
authorVsevolod Stakhov <vsevolod@rspamd.com>
Tue, 29 Jul 2025 19:01:36 +0000 (20:01 +0100)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Tue, 29 Jul 2025 19:01:36 +0000 (20:01 +0100)
lualib/redis_scripts/bayes_learn.lua
src/libstat/backends/redis_backend.cxx
src/libstat/classifiers/bayes.c
test/functional/cases/110_statistics/300-multiclass-redis.robot
test/functional/cases/110_statistics/310-multiclass-migration.robot [deleted file]
test/functional/cases/110_statistics/320-multiclass-peruser.robot [new file with mode: 0644]

index b284a2812326e5e8b6ed26bb27c8f7927e6d0a7a..ebc798fe0d085d62414174ec627b090fa32d1639 100644 (file)
@@ -36,11 +36,29 @@ elseif class_label == 'H' then
 end
 
 redis.call('SADD', symbol .. '_keys', prefix)
-redis.call('HSET', prefix, 'version', '2')                         -- new schema
-redis.call('HINCRBY', prefix, learned_key, is_unlearn and -1 or 1) -- increase or decrease learned count
+redis.call('HSET', prefix, 'version', '2') -- new schema
+
+-- Update learned count, but prevent it from going negative
+if is_unlearn then
+  local current_count = tonumber(redis.call('HGET', prefix, learned_key)) or 0
+  if current_count > 0 then
+    redis.call('HINCRBY', prefix, learned_key, -1)
+  end
+else
+  redis.call('HINCRBY', prefix, learned_key, 1)
+end
 
 for i, token in ipairs(input_tokens) do
-  redis.call('HINCRBY', token, hash_key, is_unlearn and -1 or 1)
+  -- Update token count, but prevent it from going negative
+  if is_unlearn then
+    local current_token_count = tonumber(redis.call('HGET', token, hash_key)) or 0
+    if current_token_count > 0 then
+      redis.call('HINCRBY', token, hash_key, -1)
+    end
+  else
+    redis.call('HINCRBY', token, hash_key, 1)
+  end
+
   if text_tokens then
     local tok1 = text_tokens[i * 2 - 1]
     local tok2 = text_tokens[i * 2]
@@ -52,7 +70,14 @@ for i, token in ipairs(input_tokens) do
         redis.call('HSET', token, 'tokens', tok1)
       end
 
-      redis.call('ZINCRBY', prefix .. '_z', is_unlearn and -1 or 1, token)
+      if is_unlearn then
+        local current_z_score = tonumber(redis.call('ZSCORE', prefix .. '_z', token)) or 0
+        if current_z_score > 0 then
+          redis.call('ZINCRBY', prefix .. '_z', -1, token)
+        end
+      else
+        redis.call('ZINCRBY', prefix .. '_z', 1, token)
+      end
     end
   end
 end
index f355133d403374600b6381ee87d5c29b1b75b3ae..302778bcbc78cf81d119f2a70a16fffdc54855b1 100644 (file)
@@ -264,6 +264,11 @@ gsize rspamd_redis_expand_object(const char *pattern,
                if (rcpt) {
                        rspamd_mempool_set_variable(task->task_pool, "stat_user",
                                                                                (gpointer) rcpt, nullptr);
+                       msg_debug_bayes("redis expansion: found recipient '%s'", rcpt);
+               }
+               else {
+                       msg_debug_bayes("redis expansion: no recipient found (deliver_to=%s)",
+                                                       task->deliver_to ? task->deliver_to : "null");
                }
        }
 
@@ -477,6 +482,7 @@ rspamd_redis_parse_classifier_opts(struct redis_stat_ctx *backend,
        users_enabled = ucl_object_lookup_any(classifier_obj, "per_user",
                                                                                  "users_enabled", nullptr);
 
+       msg_debug_bayes_cfg("per-user lookup: users_enabled=%p", users_enabled);
        if (users_enabled != nullptr) {
                if (ucl_object_type(users_enabled) == UCL_BOOLEAN) {
                        backend->enable_users = ucl_object_toboolean(users_enabled);
@@ -514,9 +520,16 @@ rspamd_redis_parse_classifier_opts(struct redis_stat_ctx *backend,
                /* Default non-users statistics */
                if (backend->enable_users || backend->cbref_user != -1) {
                        backend->redis_object = REDIS_DEFAULT_USERS_OBJECT;
+                       msg_debug_bayes_cfg("using per-user Redis pattern: %s (enable_users=%s, cbref_user=%d)",
+                                                               backend->redis_object, backend->enable_users ? "true" : "false",
+                                                               backend->cbref_user);
                }
                else {
                        backend->redis_object = REDIS_DEFAULT_OBJECT;
+                       msg_debug_bayes_cfg("using default Redis pattern: %s (enable_users=%s, cbref_user=%d)",
+                                                               backend->redis_object,
+                                                               backend->enable_users ? "true" : "false",
+                                                               backend->cbref_user);
                }
        }
        else {
@@ -635,6 +648,13 @@ rspamd_redis_runtime(struct rspamd_task *task,
                                         stcf->symbol);
                return nullptr;
        }
+       else {
+               msg_debug_bayes("redis object expanded: pattern='%s' -> expanded='%s' (learn=%s, symbol=%s)",
+                                               ctx->redis_object ? ctx->redis_object : "default",
+                                               object_expanded,
+                                               learn ? "true" : "false",
+                                               stcf->symbol);
+       }
 
        const char *class_label = get_class_label(stcf);
 
index f851fbb369714b883173007a501a3375d81ce98e..dbae98cc28bba4ec9e4786a478505b47fc857cf9 100644 (file)
@@ -523,9 +523,15 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
        /* Normalize probabilities using softmax */
        normalized_probs = g_alloca(cl.num_classes * sizeof(double));
 
-       /* Find maximum for numerical stability */
+       /* Find maximum for numerical stability - only consider classes with sufficient training */
        for (i = 0; i < cl.num_classes; i++) {
                msg_debug_bayes("class %s, log_prob: %.2f", cl.class_names[i], cl.class_log_probs[i]);
+               /* Only consider classes that have sufficient training data */
+               if (ctx->cfg->min_learns > 0 && cl.class_learns[i] < ctx->cfg->min_learns) {
+                       msg_debug_bayes("skipping class %s in winner selection: %uL learns < %ud minimum",
+                                                       cl.class_names[i], cl.class_learns[i], ctx->cfg->min_learns);
+                       continue;
+               }
                if (cl.class_log_probs[i] > max_log_prob) {
                        max_log_prob = cl.class_log_probs[i];
                        winning_class_idx = i;
index aa9e0425d6cc5d5b34248236ce3997ee9f01c123..278f7e0a08ac46f0149e4276d7f0758641992181 100644 (file)
@@ -39,19 +39,4 @@ Multiclass Unlearn
 Multiclass Statistics
     [Documentation]    Test that statistics show all class information
     [Tags]             multiclass  statistics
-    Multiclass Stats Test
-
-Per-User Multiclass Learning
-    [Documentation]    Test per-user multiclass classification
-    [Tags]             multiclass  per-user
-    [Setup]            Set Suite Variable  ${RSPAMD_STATS_PER_USER}  1
-    Multiclass Basic Learn Test  user@example.com
-    [Teardown]         Set Suite Variable  ${RSPAMD_STATS_PER_USER}  ${EMPTY}
-
-Multiclass Empty Part Test
-    [Documentation]    Test multiclass learning with empty parts
-    [Tags]             multiclass  empty-part
-    Set Test Variable  ${MESSAGE}  ${RSPAMD_TESTDIR}/messages/empty_part.eml
-    Learn Multiclass  ${EMPTY}  spam  ${MESSAGE}
-    Scan File  ${MESSAGE}
-    Expect Symbol  BAYES_SPAM
+    Multiclass Stats Test
\ No newline at end of file
diff --git a/test/functional/cases/110_statistics/310-multiclass-migration.robot b/test/functional/cases/110_statistics/310-multiclass-migration.robot
deleted file mode 100644 (file)
index 4ce4d67..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-*** Settings ***
-Documentation   Multiclass Bayes Migration Tests
-Suite Setup     Rspamd Redis Setup
-Suite Teardown  Rspamd Redis Teardown
-Resource        lib.robot
-Resource        multiclass_lib.robot
-
-*** Variables ***
-${RSPAMD_REDIS_SERVER}    ${RSPAMD_REDIS_ADDR}:${RSPAMD_REDIS_PORT}
-${RSPAMD_STATS_HASH}      siphash
-${BINARY_CONFIG}          ${RSPAMD_TESTDIR}/configs/stats.conf
-${MULTICLASS_CONFIG}      ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf
-
-*** Test Cases ***
-Binary to Multiclass Migration
-    [Documentation]    Test migration from binary to multiclass configuration
-    [Tags]             migration  binary-to-multiclass
-
-    # First, start with binary configuration and learn some data
-    Set Suite Variable  ${CONFIG}  ${BINARY_CONFIG}
-    Rspamd Redis Teardown
-    Rspamd Redis Setup
-
-    # Learn with binary system
-    Learn Test
-
-    # Now switch to multiclass configuration
-    Set Suite Variable  ${CONFIG}  ${MULTICLASS_CONFIG}
-    Rspamd Teardown
-    Rspamd Setup
-
-    # Should still work with existing data
-    Scan File  ${MESSAGE_SPAM}
-    Expect Symbol  BAYES_SPAM
-    Scan File  ${MESSAGE_HAM}
-    Expect Symbol  BAYES_HAM
-
-    # Should be able to add new classes
-    Learn Multiclass  ${EMPTY}  newsletter  ${MESSAGE_NEWSLETTER}
-    Scan File  ${MESSAGE_NEWSLETTER}
-    Expect Symbol  BAYES_NEWSLETTER
-
-Configuration Validation
-    [Documentation]    Test multiclass configuration validation
-    [Tags]             configuration  validation
-
-    # Test that configuration loads without errors
-    ${result} =  Run Process  rspamd  -t  -c  ${MULTICLASS_CONFIG}
-    Should Be Equal As Integers  ${result.rc}  0  msg=Configuration validation failed: ${result.stderr}
-
-Redis Data Format Migration
-    [Documentation]    Test that Redis data format is properly migrated
-    [Tags]             migration  redis  data-format
-
-    # Start with binary data
-    Set Suite Variable  ${CONFIG}  ${BINARY_CONFIG}
-    Rspamd Redis Teardown
-    Rspamd Redis Setup
-    Learn Test
-
-    # Check binary format in Redis
-    ${redis_result} =  Run Process  redis-cli  -p  ${RSPAMD_REDIS_PORT}  KEYS  *_learns
-    Should Contain  ${redis_result.stdout}  _learns
-
-    # Switch to multiclass
-    Set Suite Variable  ${CONFIG}  ${MULTICLASS_CONFIG}
-    Rspamd Teardown
-    Rspamd Setup
-
-    # Data should still be accessible
-    Scan File  ${MESSAGE_SPAM}
-    Expect Symbol  BAYES_SPAM
-
-Backward Compatibility
-    [Documentation]    Test that multiclass system maintains backward compatibility
-    [Tags]             compatibility  backward
-
-    # Use multiclass config but test old commands
-    Learn  ${EMPTY}  spam  ${MESSAGE_SPAM}
-    Learn  ${EMPTY}  ham  ${MESSAGE_HAM}
-
-    # Should work the same as before
-    Scan File  ${MESSAGE_SPAM}
-    Expect Symbol  BAYES_SPAM
-    Scan File  ${MESSAGE_HAM}
-    Expect Symbol  BAYES_HAM
-
-Class Label Validation
-    [Documentation]    Test class label validation and error handling
-    [Tags]             validation  class-labels
-
-    # This would test invalid class names, duplicate labels, etc.
-    # Implementation depends on how validation errors are exposed
-    ${result} =  Run Rspamc  -h  ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER}  learn_class:invalid-class-name  ${MESSAGE_SPAM}
-    Should Not Be Equal As Integers  ${result.rc}  0  msg=Should reject invalid class name
-
-Multiclass Stats Format
-    [Documentation]    Test that stats output shows multiclass information
-    [Tags]             statistics  multiclass-format
-
-    # Learn some data across multiple classes
-    Learn Multiclass  ${EMPTY}  spam  ${MESSAGE_SPAM}
-    Learn Multiclass  ${EMPTY}  ham  ${MESSAGE_HAM}
-    Learn Multiclass  ${EMPTY}  newsletter  ${MESSAGE_NEWSLETTER}
-
-    # Check stats format
-    ${result} =  Run Rspamc  -h  ${RSPAMD_LOCAL_ADDR}:${RSPAMD_PORT_CONTROLLER}  stat
-    Check Rspamc  ${result}
-
-    # Should show all classes in stats
-    Should Contain  ${result.stdout}  spam
-    Should Contain  ${result.stdout}  ham
-    Should Contain  ${result.stdout}  newsletter
-
-    # Should show learn counts
-    Should Match Regexp  ${result.stdout}  learned.*\\d+
diff --git a/test/functional/cases/110_statistics/320-multiclass-peruser.robot b/test/functional/cases/110_statistics/320-multiclass-peruser.robot
new file mode 100644 (file)
index 0000000..e8ca346
--- /dev/null
@@ -0,0 +1,31 @@
+*** Settings ***
+Suite Setup       Rspamd Redis Setup
+Suite Teardown    Rspamd Redis Teardown
+Test Setup        Set Test Hash Documentation
+Resource          multiclass_lib.robot
+
+*** Variables ***
+${CONFIG}                      ${RSPAMD_TESTDIR}/configs/multiclass_bayes.conf
+${REDIS_SCOPE}                 Suite
+${RSPAMD_REDIS_SERVER}         ${RSPAMD_REDIS_ADDR}:${RSPAMD_REDIS_PORT}
+${RSPAMD_SCOPE}                Suite
+${RSPAMD_STATS_BACKEND}        redis
+${RSPAMD_STATS_HASH}           null
+${RSPAMD_STATS_KEY}            null
+${RSPAMD_STATS_PER_USER}       true
+
+*** Test Cases ***
+Multiclass Per-User Basic Learn Test
+    Multiclass Basic Learn Test  test@example.com
+
+Multiclass Per-User Legacy Compatibility Test
+    Multiclass Legacy Compatibility Test  test@example.com
+
+Multiclass Per-User Relearn Test
+    Multiclass Relearn Test  test@example.com
+
+Multiclass Per-User Cross-Learn Test
+    Multiclass Cross-Learn Test  test@example.com
+
+Multiclass Per-User Unlearn Test
+    Multiclass Unlearn Test  test@example.com
\ No newline at end of file