From: Vsevolod Stakhov <vsevolod@rspamd.com>
Date: Sat, 16 May 2026 20:13:29 +0000 (+0100)
Subject: [Test] neural: cover providers_digest rotation carryover
X-Git-Tag: 4.1.0~55^2
X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=7a84ee0da7672c1020a45f3d32bc0c99f8e9c690;p=thirdparty%2Frspamd.git

[Test] neural: cover providers_digest rotation carryover

Regression test for the symcache-driven profile rotation fix.

Drives a live rspamd + Redis through: train ANN with providers-only
input (metatokens, disable_symbols_input=true) -> verify NEURAL_SPAM /
NEURAL_HAM fire -> mutate set.symbols/set.digest in the scanner worker
(simulates a symcache shift) -> verify inference still fires after the
next check_anns poll.

Pre-fix the mutation pushes the symbol-list Levenshtein distance well
past the 30% tolerance, the worker rejects the trained profile, and
NEURAL_SPAM stops firing.  Post-fix the providers_digest stays
constant and is recognised as the authoritative schema fingerprint, so
the trained ANN is reloaded.

max_trains=1 because metatokens-only scans produce an identical
vector per message and Redis SADD deduplicates — one spam + one ham
scan are enough to fire training.
---

diff --git a/test/functional/cases/330_neural/003_carryover.robot b/test/functional/cases/330_neural/003_carryover.robot
new file mode 100644
index 0000000000..6f06353bab
--- /dev/null
+++ b/test/functional/cases/330_neural/003_carryover.robot
@@ -0,0 +1,49 @@
+*** Settings ***
+Suite Setup      Rspamd Redis Setup
+Suite Teardown   Rspamd Redis Teardown
+Library         Process
+Library         ${RSPAMD_TESTDIR}/lib/rspamd.py
+Resource        ${RSPAMD_TESTDIR}/lib/rspamd.robot
+Variables       ${RSPAMD_TESTDIR}/lib/vars.py
+
+*** Variables ***
+${CONFIG}          ${RSPAMD_TESTDIR}/configs/neural_rotation.conf
+${SPAM_MSG}        ${RSPAMD_TESTDIR}/messages/spam.eml
+${HAM_MSG}         ${RSPAMD_TESTDIR}/messages/ham.eml
+${REDIS_SCOPE}     Suite
+${RSPAMD_SCOPE}    Suite
+${RSPAMD_URL_TLD}  ${RSPAMD_TESTDIR}/../lua/unit/test_tld.dat
+
+*** Test Cases ***
+Train providers-driven ANN
+  # max_trains=1 means a single spam + single ham scan triggers training.
+  # Metatokens-only vector + disable_symbols_input=true makes the input
+  # vector independent of which symbols fire — providers_digest is the
+  # only schema fingerprint.
+  Sleep  2s  Wait for redis and initial check_anns
+  Scan File  ${SPAM_MSG}  Settings={symbols_enabled = ["SPAM_SYMBOL1", "SPAM_SYMBOL2", "SPAM_SYMBOL3"]}
+  Expect Symbol  SPAM_SYMBOL1
+  Scan File  ${HAM_MSG}   Settings={symbols_enabled = ["HAM_SYMBOL1", "HAM_SYMBOL2", "HAM_SYMBOL3"]}
+  Expect Symbol  HAM_SYMBOL1
+
+Inference fires before rotation
+  Sleep  5s  Wait for training to complete and ANN to be reloaded
+  Scan File  ${SPAM_MSG}  Settings={symbols_enabled = ["SPAM_SYMBOL1","SPAM_SYMBOL2","SPAM_SYMBOL3"];groups_enabled=["neural"];symbols_disabled = ["NEURAL_LEARN"]}
+  Expect Symbol  NEURAL_SPAM_SHORT
+  Scan File  ${HAM_MSG}   Settings={symbols_enabled = ["HAM_SYMBOL1","HAM_SYMBOL2","HAM_SYMBOL3"];groups_enabled=["neural"];symbols_disabled = ["NEURAL_LEARN"]}
+  Expect Symbol  NEURAL_HAM_SHORT
+
+Force symcache-style rotation
+  # Mutate set.symbols/set.digest in the scanner worker so the next
+  # check_anns poll re-runs profile selection.  With the fix, the
+  # providers_digest-based match preserves the trained ANN; pre-fix
+  # the symbol-digest shift would orphan it.
+  Scan File  ${SPAM_MSG}  Settings={symbols_enabled = ["FORCE_ROTATE_NEURAL"];symbols_disabled = ["NEURAL_LEARN","NEURAL_CHECK"]}
+  Expect Symbol  FORCE_ROTATE_NEURAL
+  Sleep  3s  Wait for check_anns periodic to reload after rotation
+
+Inference still fires after rotation
+  Scan File  ${SPAM_MSG}  Settings={symbols_enabled = ["SPAM_SYMBOL1","SPAM_SYMBOL2","SPAM_SYMBOL3"];groups_enabled=["neural"];symbols_disabled = ["NEURAL_LEARN"]}
+  Expect Symbol  NEURAL_SPAM_SHORT
+  Scan File  ${HAM_MSG}   Settings={symbols_enabled = ["HAM_SYMBOL1","HAM_SYMBOL2","HAM_SYMBOL3"];groups_enabled=["neural"];symbols_disabled = ["NEURAL_LEARN"]}
+  Expect Symbol  NEURAL_HAM_SHORT
diff --git a/test/functional/configs/neural_rotation.conf b/test/functional/configs/neural_rotation.conf
new file mode 100644
index 0000000000..95aa773a6c
--- /dev/null
+++ b/test/functional/configs/neural_rotation.conf
@@ -0,0 +1,82 @@
+options = {
+  url_tld = "{= env.URL_TLD =}"
+  pidfile = "{= env.TMPDIR =}/rspamd.pid"
+  lua_path = "{= env.INSTALLROOT =}/share/rspamd/lib/?.lua"
+  filters = [];
+  explicit_modules = ["settings"];
+}
+
+logging = {
+  type = "file",
+  level = "debug"
+  filename = "{= env.TMPDIR =}/rspamd.log"
+  log_usec = true;
+}
+metric = {
+  name = "default",
+  actions = {
+    reject = 100500,
+    add_header = 50500,
+  }
+  unknown_weight = 1
+}
+worker {
+  type = normal
+  bind_socket = "{= env.LOCAL_ADDR =}:{= env.PORT_NORMAL =}"
+  count = 1
+  task_timeout = 10s;
+}
+worker {
+  type = controller
+  bind_socket = "{= env.LOCAL_ADDR =}:{= env.PORT_CONTROLLER =}"
+  count = 1
+  secure_ip = ["127.0.0.1", "::1"];
+  stats_path = "{= env.TMPDIR =}/stats.ucl"
+}
+
+modules {
+  path = "{= env.TESTDIR =}/../../src/plugins/lua/"
+}
+
+lua = "{= env.TESTDIR =}/lua/test_coverage.lua";
+
+neural {
+  rules {
+      SHORT {
+          train {
+              learning_rate = 0.001;
+              max_usages = 2;
+              spam_score = 1;
+              ham_score = -1;
+              # metatokens-only vectors deduplicate per message in Redis SADD,
+              # so a single sample per class is enough — keep max_trains at 1
+              # so balanced-mode training fires after one spam + one ham scan.
+              max_trains = 1;
+              max_iterations = 250;
+              classes_bias = 0.0;
+          }
+          symbol_spam = "NEURAL_SPAM_SHORT";
+          symbol_ham = "NEURAL_HAM_SHORT";
+          ann_expire = 86400;
+          watch_interval = 0.5;
+          # Providers-driven input vector; symbol set is decoupled from the ANN.
+          # Rotating set.symbols/set.digest mid-life must not invalidate the
+          # trained model so long as providers_digest stays constant.
+          providers = [
+            { type = "metatokens"; }
+          ];
+          disable_symbols_input = true;
+          fusion {
+            include_meta = false;
+            normalization = "none";
+          }
+      }
+  }
+  allow_local = true;
+}
+redis {
+  servers = "{= env.REDIS_ADDR =}:{= env.REDIS_PORT =}";
+  expand_keys = true;
+}
+
+lua = "{= env.TESTDIR =}/lua/neural_rotation.lua";
diff --git a/test/functional/lua/neural_rotation.lua b/test/functional/lua/neural_rotation.lua
new file mode 100644
index 0000000000..4371f44e8e
--- /dev/null
+++ b/test/functional/lua/neural_rotation.lua
@@ -0,0 +1,73 @@
+-- Test helper for the providers-digest rotation scenario.
+--
+-- Mirrors the SPAM_SYMBOL{i}/HAM_SYMBOL{i} setup from neural.lua (so a Robot
+-- suite can drive autolearn via verdict scoring) and adds a callback symbol
+-- that forces a symcache-style rotation: mutates the loaded neural rule's
+-- settings to flip set.symbols and set.digest in place, then clears
+-- set.ann/set.training_profile so the next check_anns poll re-runs profile
+-- selection.
+--
+-- With providers configured + disable_symbols_input=true, the rotation must
+-- not invalidate the trained ANN: providers_digest stays constant, so the
+-- old profile is still compatible and must be reloaded.
+
+local lua_util = require "lua_util"
+local neural_common = require "plugins/neural"
+
+for i = 1, 14 do
+  rspamd_config:register_symbol({
+    name = 'SPAM_SYMBOL' .. tostring(i),
+    score = 5.0,
+    callback = function()
+      return true, 'Fires always'
+    end
+  })
+  rspamd_config:register_symbol({
+    name = 'HAM_SYMBOL' .. tostring(i),
+    score = -3.0,
+    callback = function()
+      return true, 'Fires always'
+    end
+  })
+end
+
+-- Force an in-place "symcache shift" on the loaded neural rule(s).
+-- Replaces set.symbols with freshly generated names, recomputes set.digest,
+-- and clears set.ann/set.training_profile so the next check_anns poll
+-- re-selects a profile from Redis.
+--
+-- IMPORTANT: registered WITHOUT explicit_disable so it stays subject to the
+-- symbols_enabled allowlist — otherwise it would fire on every training scan
+-- and trample set.can_store_vectors before training data can accumulate.
+-- Replace set.symbols with a wholly fresh list so the Levenshtein distance
+-- against the stored profile exceeds the legacy 30% tolerance — pre-fix this
+-- would orphan the trained ANN; with providers_digest matching it is still
+-- recognised as compatible.
+local rotation_counter = 0
+rspamd_config.FORCE_ROTATE_NEURAL = {
+  callback = function(task)
+    rotation_counter = rotation_counter + 1
+    for _, rule in pairs(neural_common.settings.rules or {}) do
+      for _, set in pairs(rule.settings or {}) do
+        if type(set) == 'table' and type(set.symbols) == 'table' then
+          local fresh = {}
+          for i = 1, math.max(#set.symbols * 2, 32) do
+            fresh[#fresh + 1] = string.format('ROTATED_SYM_%d_%d',
+              rotation_counter, i)
+          end
+          table.sort(fresh)
+          set.symbols = fresh
+          set.digest = lua_util.table_digest(fresh)
+          set.ann = nil
+          set.training_profile = nil
+          -- Leave set.can_store_vectors alone: check_anns has already
+          -- populated profile state for this set, the next poll will
+          -- reselect from Redis.
+        end
+      end
+    end
+    return true, 1.0, string.format('rotated_%d', rotation_counter)
+  end
+}
+
+dofile(rspamd_env.INSTALLROOT .. "/share/rspamd/rules/controller/init.lua")