[Fix] Bayes: Try to be bug-to-bug compatible

author Vsevolod Stakhov <vsevolod@rspamd.com>

Fri, 15 Aug 2025 09:11:18 +0000 (10:11 +0100)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Fri, 15 Aug 2025 09:11:18 +0000 (10:11 +0100)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 15 Aug 2025 09:11:18 +0000 (10:11 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 15 Aug 2025 09:11:18 +0000 (10:11 +0100)
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c

index 1d5bb2a6fd7a07fe92f48ac89fcc94ec9389faf4..2e983f2e68e47f494ade2751b1071674de4dfa91 100644 (file)
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -101,7 +101,10 @@ inv_chi_square(struct rspamd_task *task, double value, int freedom_deg)
          * prob is e ^ x (small value since x is normally less than zero
          * So we integrate over degrees of freedom and produce the total result
          * from 1.0 (no confidence) to 0.0 (full confidence)
-        * Use logarithmic arithmetic to prevent overflow
+        *
+        * Historical note: older versions multiplied terms directly which could
+        * underflow/overflow for extreme inputs. This implementation uses
+        * logarithmic arithmetic to mitigate those numerical issues.
          */
         for (i = 1; i < freedom_deg; i++) {
                 /* Calculate next term using logarithms to prevent overflow */
@@ -133,6 +136,54 @@ inv_chi_square(struct rspamd_task *task, double value, int freedom_deg)
         return MIN(1.0, sum);
  }
  
+/*
+ * Legacy implementation kept for binary compatibility with 3.12.1.
+ * This mirrors the historical behaviour to ensure identical scoring.
+ */
+static double
+inv_chi_square_legacy(struct rspamd_task *task, double value, int freedom_deg)
+{
+       double prob, sum, m;
+       int i;
+
+       errno = 0;
+       m = -value;
+       prob = exp(value);
+
+       if (errno == ERANGE) {
+               /*
+                * e^x where x is large NEGATIVE number is OK, so we have a very strong
+                * confidence that inv-chi-square is close to zero
+                */
+               msg_debug_bayes("exp overflow");
+
+               if (value < 0) {
+                       return 0;
+               }
+               else {
+                       return 1.0;
+               }
+       }
+
+       sum = prob;
+
+       msg_debug_bayes("m: %f, probability: %g", m, prob);
+
+       /*
+        * Historical behaviour (pre-3.13): direct multiplicative series
+        * accretion. This is intentionally kept to preserve binary scoring
+        * compatibility with 3.12.1, despite known numerical fragility on
+        * extreme inputs (possible underflow/overflow of `prob`).
+        */
+       for (i = 1; i < freedom_deg; i++) {
+               prob *= m / (double) i;
+               sum += prob;
+               msg_debug_bayes("i=%d, probability: %g, sum: %g", i, prob, sum);
+       }
+
+       return MIN(1.0, sum);
+}
+
  struct bayes_task_closure {
         double ham_prob;  /* Kept for binary compatibility */
         double spam_prob; /* Kept for binary compatibility */
@@ -164,6 +215,11 @@ struct bayes_multiclass_closure {
  static const double feature_weight[] = {0, 3125, 256, 27, 1, 0, 0, 0};
  
  #define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt)))
+/*
+ * Historical note: alternative weighting schemes were proposed in older
+ * versions, but this exact form is retained for backward compatibility.
+ * Changing it would shift token posteriors and alter legacy scores.
+ */
  /*
   * In this callback we calculate local probabilities for tokens
   */
@@ -503,6 +559,12 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
         }
         else {
                 cl.meta_skip_prob = 1.0 - (double) text_tokens / tokens->len;
+               /*
+                * Historical bug: integer division (text_tokens / tokens->len) caused
+                * meta skip probability to be 0 or 1 in some builds. We keep the
+                * double cast here, but do not change the binary classifier behaviour
+                * elsewhere to preserve legacy scoring.
+                */
         }
  
         /* Process all tokens */
@@ -798,9 +860,14 @@ bayes_classify(struct rspamd_classifier *ctx,
         }
  
         if (cl.spam_prob > -300 && cl.ham_prob > -300) {
-               /* Fisher value is low enough to apply inv_chi_square */
-               h = 1 - inv_chi_square(task, cl.spam_prob, cl.processed_tokens);
-               s = 1 - inv_chi_square(task, cl.ham_prob, cl.processed_tokens);
+               /*
+                * Fisher value is low enough to apply inv_chi_square.
+                * Use legacy variant to preserve binary (spam/ham) scoring
+                * compatibility with tag 3.12.1. The multiclass path keeps
+                * the newer, numerically-stable implementation.
+                */
+               h = 1 - inv_chi_square_legacy(task, cl.spam_prob, cl.processed_tokens);
+               s = 1 - inv_chi_square_legacy(task, cl.ham_prob, cl.processed_tokens);
         }
         else {
                 /* Use naive method */
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Fri, 15 Aug 2025 09:11:18 +0000 (10:11 +0100)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Fri, 15 Aug 2025 09:11:18 +0000 (10:11 +0100)