if not profile_elt.symbols or not set.symbols then
return false, math.huge
end
+ -- Accept profiles whose symbol list has drifted from the current one by
+ -- strictly less than 50% of #set.symbols, as measured by
+ -- lua_util.distance_sorted (a drift of exactly 50% is rejected). The previous
+ -- 30% threshold rejected the old profile on every modest config change
+ -- and inference went completely dark until a new ANN trained from scratch
+ -- (weeks under realistic class imbalance). With this looser cap the worker
+ -- keeps using the old profile's redis_key -- and crucially its OWN symbol
+ -- list, since result_to_vector uses profile.symbols -- so the trained
+ -- weights stay correctly indexed against the features that produced them.
local dist = lua_util.distance_sorted(profile_elt.symbols, set.symbols)
- if dist >= #set.symbols * 0.3 then
+ if dist >= #set.symbols * 0.5 then
return false, dist
end
return true, dist
else
rspamd_logger.infox(task, 'created new ANN profile for %s:%s, data stored at prefix %s',
rule.prefix, set.name, profile.redis_key)
- -- If a prior profile with the same providers_digest holds trained
- -- weights, carry them over into the fresh profile key. This prevents
- -- a symcache-driven profile rotation from abandoning a still-valid
- -- ANN whenever the input vector schema is decided by providers
- -- (rather than the symbol list).
- if providers_digest then
+ -- Carry weights from a prior profile (same providers_digest, different
+ -- symbol-list digest) into the fresh profile key ONLY when the input
+ -- vector schema is decided entirely by providers -- i.e. when
+ -- disable_symbols_input is set. In hybrid mode (providers + symbols)
+ -- the symbol portion of the vector reshapes with symbol drift, and
+ -- load_new_ann then sets set.ann.symbols = profile.symbols (= current
+ -- symbol list), so copied weights would be indexed against features
+ -- they were never trained against -- silent garbage at inference.
+ -- For hybrid mode, is_profile_compatible already routes inference to
+ -- the prior profile entry, which carries its own (older) symbol list
+ -- and therefore keeps weights correctly aligned at inference time;
+ -- skipping carryover is the right behaviour.
+ if providers_digest and rule.disable_symbols_input then
maybe_carryover_ann(task, rule, set, ann_key, providers_digest)
end
end