From: Vsevolod Stakhov Date: Sat, 20 Jun 2026 19:20:19 +0000 (+0100) Subject: [Feature] neural: forced-learn fast path and first-class freeze X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;ds=sidebyside;p=thirdparty%2Frspamd.git [Feature] neural: forced-learn fast path and first-class freeze Two training controls plus a supporting task primitive: * train.forced_learn_minimal_scan (default on when disable_symbols_input): a high-priority neural prefilter disables every non-neural symbol on an ANN-Train scan, so a symbols-independent training vector is built without issuing RBL/DNS, fuzzy, bayes, ClickHouse or capture/cluster work. The stored vector and the profile key are byte-for-byte identical to the live full-scan path (asserted in tests). For symbol-dependent rules it stays off, and if any applicable neural rule needs symbols the whole task falls back to a full scan. * train.frozen: stops automatic training and auto-storing of live vectors so a frozen model's pools cannot drift into an imbalanced live set, while inference keeps serving the current ANN unchanged. An explicit ANN-Train still stores and retrains on demand (gated by a per-profile retrain marker). Supersedes the auto-learn side of store_set_only/store_pool_only; both keep working when frozen is unset. * task:disable_all_symbols([skip_mask]): Lua binding over the existing rspamd_symcache_disable_all_symbols "process only these" primitive (defaults to keeping explicit_disable symbols), used by the prefilter. Functional coverage in test/functional/cases/330_neural/006_forced_learn_minimal and 007_frozen. 
--- diff --git a/lualib/plugins/neural.lua b/lualib/plugins/neural.lua index d15dc31fb6..01520be2c8 100644 --- a/lualib/plugins/neural.lua +++ b/lualib/plugins/neural.lua @@ -54,6 +54,20 @@ local default_options = { store_pool_only = false, -- store tokens in cache only (disables autotrain); store_set_only = false, -- store ham and spam sets in Redis, but do not train ANN (autotrain must be enabled); -- neural_vec_mpack stores vector of training data in messagepack neural_profile_digest stores profile digest + -- frozen: first-class freeze. Stops automatic training and stops auto-storing + -- live vectors (so a frozen model's pools never accrue an imbalanced live + -- set), while inference keeps serving the current ANN unchanged. Explicit + -- ANN-Train (manual_train) still stores AND trains on demand. Supersedes the + -- auto-learn side of store_set_only/store_pool_only (those keep working when + -- frozen is not set). + frozen = false, + -- forced_learn_minimal_scan: when a manual-train scan (ANN-Train header) maps + -- to a disable_symbols_input rule, a high-priority neural prefilter disables + -- every non-neural symbol so the symbols-independent training vector is built + -- without issuing any RBL/DNS, fuzzy, ClickHouse, capture/cluster work. nil + -- means "default to disable_symbols_input" (resolved per-rule at init); set to + -- false to opt out and keep running the full pipeline for forced learns. + forced_learn_minimal_scan = nil, }, watch_interval = 60.0, lock_expire = 600, diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 44477f8fac..13a8141fb7 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -910,6 +910,19 @@ LUA_FUNCTION_DEF(task, enable_symbol); * @return {boolean} `true` if symbol has been found */ LUA_FUNCTION_DEF(task, disable_symbol); +/*** + * @method task:disable_all_symbols([skip_mask]) + * Disable execution of every symbol for this particular task except those whose + * type/flags intersect `skip_mask`. 
This is the "process only these" primitive: + * it mirrors what the `symbols_enabled` settings key does internally. Combine it + * with `task:enable_symbol()` (called afterwards) to run only a chosen subset of + * symbols. Typically invoked from a high-priority prefilter so that the disabled + * symbols never execute (no wasted DNS/Redis/HTTP work). No return value; no-op when the task has no symcache runtime. + * @param {number} skip_mask optional bitmask of SYMBOL_TYPE_* flags to keep + * enabled; defaults to `SYMBOL_TYPE_EXPLICIT_DISABLE` (i.e. symbols flagged + * `explicit_disable` are left running, matching the `symbols_enabled` default) + */ +LUA_FUNCTION_DEF(task, disable_all_symbols); /*** * @method task:get_date(type[, gmt]) * Returns timestamp for a connection or for a MIME message. This function can be called with a @@ -1479,6 +1492,7 @@ static const struct luaL_reg tasklib_m[] = { LUA_INTERFACE_DEF(task, has_symbol_regexp), LUA_INTERFACE_DEF(task, enable_symbol), LUA_INTERFACE_DEF(task, disable_symbol), + LUA_INTERFACE_DEF(task, disable_all_symbols), LUA_INTERFACE_DEF(task, get_date), LUA_INTERFACE_DEF(task, get_message_id), LUA_INTERFACE_DEF(task, get_timeval), @@ -5615,6 +5629,30 @@ lua_task_disable_symbol(lua_State *L) return 1; } +static int +lua_task_disable_all_symbols(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_task *task = lua_check_task(L, 1); + unsigned int skip_mask = SYMBOL_TYPE_EXPLICIT_DISABLE; + + if (task) { + if (lua_isnumber(L, 2)) { + skip_mask = (unsigned int) lua_tointeger(L, 2); + } + + /* No symcache runtime means we are not inside a scan; nothing to disable, return silently */ + if (task->symcache_runtime != NULL) { + rspamd_symcache_disable_all_symbols(task, task->cfg->cache, skip_mask); + } + } + else { + return luaL_error(L, "invalid arguments"); + } + + return 0; +} + static int lua_task_get_symbols(lua_State *L) { diff --git a/src/plugins/lua/neural.lua b/src/plugins/lua/neural.lua index 4f7af25366..d6d84c631b 100644 --- a/src/plugins/lua/neural.lua +++ b/src/plugins/lua/neural.lua @@ -242,6 +242,80 @@ 
local function get_ann_train_header(task) return nil end +-- High-priority prefilter for the forced-learn fast path. +-- +-- When a scan carries an explicit `ANN-Train: spam|ham` header and every neural +-- rule that applies to this task can train from a symbols-independent vector +-- (disable_symbols_input + train.forced_learn_minimal_scan), there is no reason +-- to run any non-neural symbol: the stored training vector is built purely from +-- registered providers + metatokens, all of which read parsed-message data +-- (text parts, URLs, headers) that is already available at the prefilter stage +-- (PROCESS_MESSAGE runs before PRE_FILTERS). So we disable every symbol except +-- the neural ones, which prevents RBL/DNS, fuzzy, bayes-scoring, ClickHouse, +-- capture/cluster and other idempotent work from ever being issued. +-- +-- This never changes the neural settings id or the profile key: with +-- disable_symbols_input the profile/digest are derived from providers_digest +-- (see process_settings_elt + is_profile_compatible), so the vector and the key +-- are byte-for-byte identical to what the live full-scan path would store. +local function neural_forced_learn_prefilter(task) + -- Only act on explicit manual-train scans + local hv = get_ann_train_header(task) + if not (hv == 'spam' or hv == 'ham') then + return + end + + -- Always keep the neural symbols runnable. NEURAL_LEARN is flagged + -- explicit_disable, so disable_all_symbols (skip_mask = explicit_disable) + -- leaves it alone; we still list it for clarity and re-enable NEURAL_CHECK and + -- every rule's spam/ham virtuals (these are NOT explicit_disable). 
+ local keep = { + NEURAL_CHECK = true, + NEURAL_LEARN = true, + } + local any_minimal = false + + for _, rule in pairs(settings.rules) do + local set = neural_common.get_rule_settings(task, rule) + if set then + keep[rule.symbol_spam] = true + keep[rule.symbol_ham] = true + local minimal = rule.disable_symbols_input and rule.train + and rule.train.forced_learn_minimal_scan + if minimal then + any_minimal = true + else + -- An applicable neural rule is NOT eligible for the minimal scan: either + -- its vector depends on symbols, or the operator opted out. Stripping + -- symbols could change that rule's stored vector (or just defeats an + -- explicit opt-out), so fall back to a full scan for the whole task. + lua_util.debugm(N, task, + 'forced-learn minimal scan disabled: rule %s is not eligible ' .. + '(disable_symbols_input=%s, forced_learn_minimal_scan=%s)', + rule.prefix, rule.disable_symbols_input, + rule.train and rule.train.forced_learn_minimal_scan) + return + end + end + end + + if not any_minimal then + return + end + + -- Disable every symbol-cache item except explicit_disable ones (keeps + -- NEURAL_LEARN), then re-enable the rest of the neural symbols. This is the + -- same "process only these" primitive that `symbols_enabled` uses internally, + -- so it reliably covers filter, postfilter and idempotent stages. 
+ task:disable_all_symbols() + for sym in pairs(keep) do + task:enable_symbol(sym) + end + + lua_util.debugm(N, task, + 'forced-learn minimal scan: disabled all non-neural symbols for %s training', hv) +end + local function ann_push_task_result(rule, task, verdict, score, set) local train_opts = rule.train local learn_spam, learn_ham @@ -296,7 +370,18 @@ local function ann_push_task_result(rule, task, verdict, score, set) has_symbols_provider = true end - if has_llm_provider and not manual_train then + if train_opts.frozen and not manual_train then + -- Frozen model: never auto-learn and never auto-store a live vector, so the + -- pools cannot accrue an imbalanced live set. Inference keeps serving the + -- current ANN unchanged; only an explicit ANN-Train (manual_train) below can + -- still store and (on demand) retrain. This supersedes the auto-learn side + -- of store_set_only/store_pool_only. + learn_spam = false + learn_ham = false + skip_reason = 'model is frozen (train.frozen): auto-learn disabled' + lua_util.debugm(N, task, '%s:%s is frozen, skip auto-store of live vector', + rule.prefix, set.name) + elseif has_llm_provider and not manual_train then -- Use expression-based autolearn conditions for LLM providers if rule.autolearn and rule.autolearn.enabled then local learn_type, reason = neural_learn.get_learn_type(task, rule) @@ -422,6 +507,28 @@ local function ann_push_task_result(rule, task, verdict, score, set) 'SADD', -- command { target_key, str } -- arguments ) + + -- A frozen model trains ONLY when an operator pushes a corpus via + -- ANN-Train; record that request so the controller's auto-train + -- trigger (which is otherwise short-circuited for frozen models) + -- knows to retrain from these manual vectors. The TTL stops a stale marker + -- forcing retrains long after the push; EX takes whole seconds (math.floor). + if rule.train.frozen and manual_train then + local marker_key = neural_common.pending_train_key(rule, set) .. 
'_retrain_req' + lua_redis.redis_make_request(task, + rule.redis, + nil, + true, -- is write + function(merr) + if merr then -- a lost marker means this push never retrains: log as an error + rspamd_logger.errx(task, 'cannot set frozen retrain marker %s: %s', + marker_key, merr) + end + end, + 'SET', + { marker_key, tostring(rspamd_util.get_time()), 'EX', tostring(math.floor(rule.ann_expire)) } + ) + end end if rule.providers and #rule.providers > 0 then @@ -1138,8 +1245,11 @@ local function maybe_train_existing_ann(worker, ev_base, rule, set, profiles) -- We have our ANN and that's train vectors, check if we can learn local ann_key = sel_elt.redis_key - -- Check if we need to train ann - if rule.train.store_set_only then + -- Check if we need to train ann. Frozen supersedes store_set_only: a frozen + -- model never auto-trains, but unlike store_set_only it still retrains when an + -- operator pushes a corpus via ANN-Train (gated on the retrain-request marker + -- below). When not frozen, the historical store_set_only behaviour applies. + if not rule.train.frozen and rule.train.store_set_only then lua_util.debugm(N, rspamd_config, "skiped check if ANN %s needs to be trained due to store_set_only", ann_key) return end @@ -1155,6 +1265,12 @@ local function maybe_train_existing_ann(worker, ev_base, rule, set, profiles) ann_key, pending_key, lens) lua_util.debugm(N, rspamd_config, 'maybe_train_existing_ann: initiating train for key=%s spam=%s ham=%s', ann_key, lens.spam or -1, lens.ham or -1) + -- Consume the frozen retrain-request marker now that an actual training is + -- starting, so one operator corpus push triggers exactly one retrain. + if rule.train.frozen then + lua_redis.redis_make_request_taskless(ev_base, rspamd_config, rule.redis, + nil, true, function(_, _) end, 'DEL', { pending_key .. '_retrain_req' }) + end do_train_ann(worker, ev_base, rule, set, ann_key) end @@ -1268,8 +1384,40 @@ local function maybe_train_existing_ann(worker, ev_base, rule, set, profiles) ) end - -- Start the chain - check_spam_len() + -- Start the chain. 
For a frozen model the controller's auto-train trigger is + -- short-circuited: it only proceeds when an operator-driven ANN-Train left a + -- retrain-request marker (the marker is consumed in initiate_train, so a + -- single corpus push yields a single retrain). + if rule.train.frozen then + local marker_key = pending_key .. '_retrain_req' + lua_redis.redis_make_request_taskless(ev_base, + rspamd_config, + rule.redis, + nil, + false, -- is read + function(err, data) + if err then + rspamd_logger.errx(rspamd_config, 'cannot read frozen retrain marker %s: %s', + marker_key, err) + return + end + -- Redis GET returns a boolean `false` (userdata/boolean) for a missing + -- key via lua_redis; treat anything non-string/empty as "no request". + if type(data) ~= 'string' or data == '' then + lua_util.debugm(N, rspamd_config, + 'frozen ANN %s: no pending ANN-Train retrain request, skip auto-train', ann_key) + return + end + lua_util.debugm(N, rspamd_config, + 'frozen ANN %s: ANN-Train retrain requested, counting vectors', ann_key) + check_spam_len() + end, + 'GET', + { marker_key } + ) + else + check_spam_len() + end end end @@ -1591,6 +1739,15 @@ for k, r in pairs(rules) do rule_elt.train.max_trains = rule_elt.train.max_train end + -- forced_learn_minimal_scan defaults to ON whenever the rule's training vector + -- is symbols-independent (disable_symbols_input): a forced ANN-Train scan then + -- skips the whole non-neural pipeline. Operators can set it to false to opt out + -- explicitly. For symbol-dependent rules it stays off (stripping symbols would + -- change the stored vector relative to the live full-scan path). 
+ if rule_elt.train.forced_learn_minimal_scan == nil then + rule_elt.train.forced_learn_minimal_scan = rule_elt.disable_symbols_input and true or false + end + if not rule_elt.profile then rule_elt.profile = {} end @@ -1671,6 +1828,19 @@ rspamd_config:register_symbol({ callback = ann_push_vector }) +-- Forced-learn fast path: a prefilter that, for qualifying ANN-Train scans of +-- disable_symbols_input rules, disables the whole non-neural pipeline. Priority +-- `high` runs it after the settings prefilters (priority `top`) so that +-- get_settings_id()/get_rule_settings() see the resolved settings, but before +-- the heavy DNS/Redis filter symbols. +rspamd_config:register_symbol({ + name = 'NEURAL_FORCED_LEARN_CHECK', + type = 'prefilter', + flags = 'empty,nostat,explicit_disable', + priority = lua_util.symbols_priorities.high, + callback = neural_forced_learn_prefilter +}) + -- We also need to deal with settings rspamd_config:add_post_init(neural_common.process_rules_settings) diff --git a/test/functional/cases/330_neural/006_forced_learn_minimal.robot b/test/functional/cases/330_neural/006_forced_learn_minimal.robot new file mode 100644 index 0000000000..5c83046665 --- /dev/null +++ b/test/functional/cases/330_neural/006_forced_learn_minimal.robot @@ -0,0 +1,85 @@ +*** Settings *** +Suite Setup Rspamd Redis Setup +Suite Teardown Rspamd Redis Teardown +Library Process +Library Collections +Library ${RSPAMD_TESTDIR}/lib/rspamd.py +Resource ${RSPAMD_TESTDIR}/lib/rspamd.robot +Variables ${RSPAMD_TESTDIR}/lib/vars.py + +*** Variables *** +${CONFIG} ${RSPAMD_TESTDIR}/configs/neural_forced_learn.conf +${SPAM_MSG} ${RSPAMD_TESTDIR}/messages/spam.eml +${HAM_MSG} ${RSPAMD_TESTDIR}/messages/ham.eml +${REDIS_SCOPE} Suite +${RSPAMD_SCOPE} Suite +${RSPAMD_URL_TLD} ${RSPAMD_TESTDIR}/../lua/unit/test_tld.dat + +*** Test Cases *** +Forced-learn minimal scan disables non-neural symbols + # A disable_symbols_input rule with forced_learn_minimal_scan=true: an + # ANN-Train scan must 
run the neural prefilter, which disables every non-neural + # symbol. SPAM_SYMBOL1 is a plain always-firing filter symbol, so it must NOT + # appear when ANN-Train is set, and MUST appear on a normal (full) scan. + Sleep 2s Wait for redis and initial check_anns + Scan File ${SPAM_MSG} ANN-Train=spam + Do Not Expect Symbol SPAM_SYMBOL1 + Do Not Expect Symbol SPAM_SYMBOL2 + Scan File ${SPAM_MSG} + Expect Symbol SPAM_SYMBOL1 + +Minimal scan stores the vector under the providers-digest profile key + # The forced-learn scan above must have stored a training vector under the + # providers-digest key (looked up below as rn_SHORT_*_spam_set), exactly the + # key the live full-scan path uses (disable_symbols_input keys on + # providers_digest, not on which symbols fired). + ${spam_set} = Get Neural Train Set spam + Should Not Be Empty ${spam_set} msg=no spam training set created by forced learn + ${n} = Redis SCARD ${spam_set} + Should Be True ${n} >= 1 msg=forced-learn scan did not store a training vector + +Minimal scan vector is byte-identical to the full-scan vector + # Re-scan the SAME message through the full pipeline. NEURAL_FORCED_LEARN_CHECK + # need not be disabled for this: without an ANN-Train header the prefilter does + # nothing, and the metatokens vector does not depend on which symbols fired. + # Both vectors go into the same Redis SET, so identical ones dedup to a single + # member. NOTE(review): SCARD == 1 also holds if the full scan stores nothing. + ${spam_set} = Get Neural Train Set spam + # ANN-Train scan again (minimal path) — identical vector, dedups + Scan File ${SPAM_MSG} ANN-Train=spam + # Full-pipeline auto-learn of the same message — identical metatokens vector + Scan File ${SPAM_MSG} + Expect Symbol SPAM_SYMBOL1 + Sleep 0.5s Let the async SADDs settle + ${n} = Redis SCARD ${spam_set} + Should Be Equal As Integers ${n} 1 + ... 
msg=minimal-scan and full-scan vectors for the same message are not identical + +Forced-learn corpus trains the model + # Add one ham vector via a minimal ANN-Train scan: with max_trains=1 the + # balanced trigger now fires (1 spam + 1 ham) and the model trains from the + # symbols-independent corpus. Inference must then fire on both classes. + Scan File ${HAM_MSG} ANN-Train=ham + Do Not Expect Symbol HAM_SYMBOL1 + Sleep 6s Wait for training to complete and ANN to be reloaded + Scan File ${SPAM_MSG} Settings={groups_enabled=["neural"];symbols_disabled=["NEURAL_LEARN"]} + Expect Symbol NEURAL_SPAM_SHORT + Scan File ${HAM_MSG} Settings={groups_enabled=["neural"];symbols_disabled=["NEURAL_LEARN"]} + Expect Symbol NEURAL_HAM_SHORT + +*** Keywords *** +Get Neural Train Set + [Arguments] ${class} + # The training set keys use the rn_ prefix (ANN blobs/sets), distinct from the + # rn3_ profile zset. Returns the first rn_SHORT_*_<class>_set key, or an empty string if none exists. + ${res} = Run Process redis-cli -h ${RSPAMD_REDIS_ADDR} -p ${RSPAMD_REDIS_PORT} + ... 
SCARD ${key} + ${n} = Convert To Integer ${res.stdout.strip()} + [Return] ${n} diff --git a/test/functional/cases/330_neural/007_frozen.robot b/test/functional/cases/330_neural/007_frozen.robot new file mode 100644 index 0000000000..c3781f3ce5 --- /dev/null +++ b/test/functional/cases/330_neural/007_frozen.robot @@ -0,0 +1,69 @@ +*** Settings *** +Suite Setup Rspamd Redis Setup +Suite Teardown Rspamd Redis Teardown +Library Process +Library Collections +Library ${RSPAMD_TESTDIR}/lib/rspamd.py +Resource ${RSPAMD_TESTDIR}/lib/rspamd.robot +Variables ${RSPAMD_TESTDIR}/lib/vars.py + +*** Variables *** +${CONFIG} ${RSPAMD_TESTDIR}/configs/neural_frozen.conf +${MESSAGE} ${RSPAMD_TESTDIR}/messages/spam_message.eml +${REDIS_SCOPE} Suite +${RSPAMD_SCOPE} Suite +${RSPAMD_URL_TLD} ${RSPAMD_TESTDIR}/../lua/unit/test_tld.dat + +*** Test Cases *** +Live traffic does not grow a frozen model's pools + # Identical to 001_autotrain's training drive, but the rule is train.frozen=true. + # Each scan reaches a spam/ham verdict that would normally auto-store a vector; + # a frozen model must store NOTHING from live traffic, so no training-set key is + # ever created. + Sleep 2s Wait for redis and initial check_anns + FOR ${INDEX} IN RANGE 4 14 + Scan File ${MESSAGE} Settings={symbols_enabled = ["SPAM_SYMBOL1", "SPAM_SYMBOL2", "SPAM_SYMBOL3", "SPAM_SYMBOL${INDEX}"]} + Expect Symbol SPAM_SYMBOL${INDEX} + Scan File ${MESSAGE} Settings={symbols_enabled = ["HAM_SYMBOL1", "HAM_SYMBOL2", "HAM_SYMBOL3", "HAM_SYMBOL${INDEX}"]} + Expect Symbol HAM_SYMBOL${INDEX} + END + Sleep 2s Give any (erroneous) auto-store a chance to land + ${nkeys} = Count Neural Train Set Keys + Should Be Equal As Integers ${nkeys} 0 + ... msg=frozen model accrued live training vectors (pools must not grow) + +Frozen model does not auto-train + # With no stored vectors and a short-circuited auto-train trigger, inference + # must stay dark — nothing has been trained from the live traffic above. 
+ Scan File ${MESSAGE} Settings={symbols_enabled = ["SPAM_SYMBOL1","SPAM_SYMBOL2","SPAM_SYMBOL3","SPAM_SYMBOL8"];groups_enabled=["neural"];symbols_disabled = ["NEURAL_LEARN"]} + Do Not Expect Symbol NEURAL_SPAM_SHORT + Do Not Expect Symbol NEURAL_HAM_SHORT + +ANN-Train trains a frozen model on demand + # Freeze stops auto-learn, NOT operator-driven corpus retrains. Pushing a + # balanced corpus with the ANN-Train header stores vectors, sets the retrain + # marker and lets the controller train once. Inference must then fire. + FOR ${INDEX} IN RANGE 4 14 + Scan File ${MESSAGE} ANN-Train=spam Settings={symbols_enabled = ["SPAM_SYMBOL1", "SPAM_SYMBOL2", "SPAM_SYMBOL3", "SPAM_SYMBOL${INDEX}"]} + Scan File ${MESSAGE} ANN-Train=ham Settings={symbols_enabled = ["HAM_SYMBOL1", "HAM_SYMBOL2", "HAM_SYMBOL3", "HAM_SYMBOL${INDEX}"]} + END + ${nkeys} = Count Neural Train Set Keys + Should Be True ${nkeys} >= 1 msg=ANN-Train did not store vectors on a frozen model + Sleep 6s Wait for training to complete and ANN to be reloaded + Scan File ${MESSAGE} Settings={symbols_enabled = ["SPAM_SYMBOL1","SPAM_SYMBOL2","SPAM_SYMBOL3","SPAM_SYMBOL8"];groups_enabled=["neural"];symbols_disabled = ["NEURAL_LEARN"]} + Expect Symbol NEURAL_SPAM_SHORT + Do Not Expect Symbol NEURAL_HAM_SHORT + +Check Neural HAM after frozen ANN-Train + Scan File ${MESSAGE} Settings={symbols_enabled = ["HAM_SYMBOL1","HAM_SYMBOL2","HAM_SYMBOL3","HAM_SYMBOL8"];groups_enabled=["neural"];symbols_disabled = ["NEURAL_LEARN"]} + Expect Symbol NEURAL_HAM_SHORT + Do Not Expect Symbol NEURAL_SPAM_SHORT + +*** Keywords *** +Count Neural Train Set Keys + # Number of rn_SHORT_*_set training keys (spam_set / ham_set). The rn3_ profile + # zset is registered regardless; only training-set keys signal accrued vectors. + ${res} = Run Process redis-cli -h ${RSPAMD_REDIS_ADDR} -p ${RSPAMD_REDIS_PORT} + ... 
KEYS rn_SHORT_*_set + ${count} = Evaluate len([k for k in $res.stdout.strip().split('\\n') if k]) + [Return] ${count} diff --git a/test/functional/configs/neural_forced_learn.conf b/test/functional/configs/neural_forced_learn.conf new file mode 100644 index 0000000000..4019f4b1b1 --- /dev/null +++ b/test/functional/configs/neural_forced_learn.conf @@ -0,0 +1,85 @@ +options = { + url_tld = "{= env.URL_TLD =}" + pidfile = "{= env.TMPDIR =}/rspamd.pid" + lua_path = "{= env.INSTALLROOT =}/share/rspamd/lib/?.lua" + filters = []; + explicit_modules = ["settings"]; +} + +logging = { + type = "file", + level = "debug" + filename = "{= env.TMPDIR =}/rspamd.log" + log_usec = true; +} +metric = { + name = "default", + actions = { + reject = 100500, + add_header = 50500, + } + unknown_weight = 1 +} +worker { + type = normal + bind_socket = "{= env.LOCAL_ADDR =}:{= env.PORT_NORMAL =}" + count = 1 + task_timeout = 10s; +} +worker { + type = controller + bind_socket = "{= env.LOCAL_ADDR =}:{= env.PORT_CONTROLLER =}" + count = 1 + secure_ip = ["127.0.0.1", "::1"]; + stats_path = "{= env.TMPDIR =}/stats.ucl" +} + +modules { + path = "{= env.TESTDIR =}/../../src/plugins/lua/" +} + +lua = "{= env.TESTDIR =}/lua/test_coverage.lua"; + +neural { + rules { + SHORT { + train { + learning_rate = 0.001; + max_usages = 2; + spam_score = 1; + ham_score = -1; + # metatokens-only vectors deduplicate per message in Redis SADD, so + # a single sample per class is enough; balanced-mode training fires + # only once both a spam and a ham vector exist (one spam scan on its + # own never rotates the key, keeping the equivalence asserts stable). + max_trains = 1; + max_iterations = 250; + classes_bias = 0.0; + # default would already be true for disable_symbols_input; set it + # explicitly so the intent of the suite is obvious. 
+ forced_learn_minimal_scan = true; + } + symbol_spam = "NEURAL_SPAM_SHORT"; + symbol_ham = "NEURAL_HAM_SHORT"; + ann_expire = 86400; + watch_interval = 0.5; + # Symbols-independent vector: input is metatokens only, so the stored + # vector never depends on which rule symbols fired. + providers = [ + { type = "metatokens"; } + ]; + disable_symbols_input = true; + fusion { + include_meta = false; + normalization = "none"; + } + } + } + allow_local = true; +} +redis { + servers = "{= env.REDIS_ADDR =}:{= env.REDIS_PORT =}"; + expand_keys = true; +} + +lua = "{= env.TESTDIR =}/lua/neural_rotation.lua"; diff --git a/test/functional/configs/neural_frozen.conf b/test/functional/configs/neural_frozen.conf new file mode 100644 index 0000000000..36d39cc1c0 --- /dev/null +++ b/test/functional/configs/neural_frozen.conf @@ -0,0 +1,71 @@ +options = { + url_tld = "{= env.URL_TLD =}" + pidfile = "{= env.TMPDIR =}/rspamd.pid" + lua_path = "{= env.INSTALLROOT =}/share/rspamd/lib/?.lua" + filters = []; + explicit_modules = ["settings"]; +} + +logging = { + type = "file", + level = "debug" + filename = "{= env.TMPDIR =}/rspamd.log" + log_usec = true; +} +metric = { + name = "default", + actions = { + reject = 100500, + add_header = 50500, + } + unknown_weight = 1 +} +worker { + type = normal + bind_socket = "{= env.LOCAL_ADDR =}:{= env.PORT_NORMAL =}" + count = 1 + task_timeout = 10s; +} +worker { + type = controller + bind_socket = "{= env.LOCAL_ADDR =}:{= env.PORT_CONTROLLER =}" + count = 1 + secure_ip = ["127.0.0.1", "::1"]; + stats_path = "{= env.TMPDIR =}/stats.ucl" +} + +modules { + path = "{= env.TESTDIR =}/../../src/plugins/lua/" +} + +lua = "{= env.TESTDIR =}/lua/test_coverage.lua"; + +neural { + rules { + SHORT { + train { + learning_rate = 0.001; + max_usages = 2; + spam_score = 1; + ham_score = -1; + max_trains = 10; + max_iterations = 250; + # Frozen: live (verdict-based) traffic must NOT accrue vectors and + # the controller must NOT auto-train; only an explicit 
ANN-Train + # corpus push may store and retrain. + frozen = true; + } + symbol_spam = "NEURAL_SPAM_SHORT"; + symbol_ham = "NEURAL_HAM_SHORT"; + ann_expire = 86400; + watch_interval = 0.5; + } + } + allow_local = true; +} +redis { + servers = "{= env.REDIS_ADDR =}:{= env.REDIS_PORT =}"; + expand_keys = true; +} + +lua = "{= env.TESTDIR =}/lua/neural.lua";