From: Vsevolod Stakhov <vsevolod@rspamd.com>
Date: Fri, 15 Aug 2025 09:11:18 +0000 (+0100)
Subject: [Fix] Bayes: Try to be bug-to-bug compatible
X-Git-Tag: 3.13.0~30
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=65b52ce843383274d3ae7407449e8037e936739c;p=thirdparty%2Frspamd.git

[Fix] Bayes: Try to be bug-to-bug compatible
---

diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 1d5bb2a6fd..2e983f2e68 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -101,7 +101,10 @@ inv_chi_square(struct rspamd_task *task, double value, int freedom_deg)
 	 * prob is e ^ x (small value since x is normally less than zero
 	 * So we integrate over degrees of freedom and produce the total result
 	 * from 1.0 (no confidence) to 0.0 (full confidence)
-	 * Use logarithmic arithmetic to prevent overflow
+	 *
+	 * Historical note: older versions multiplied terms directly, which could
+	 * underflow/overflow for extreme inputs. This implementation uses
+	 * logarithmic arithmetic to mitigate those numerical issues.
 	 */
 	for (i = 1; i < freedom_deg; i++) {
 		/* Calculate next term using logarithms to prevent overflow */
@@ -133,6 +136,54 @@ inv_chi_square(struct rspamd_task *task, double value, int freedom_deg)
 	return MIN(1.0, sum);
 }
 
+/*
+ * Legacy inverse chi-square, kept so that the binary (spam/ham) classifier
+ * scores exactly as 3.12.1 did. Returns
+ * exp(value) * (1 + sum_{i=1}^{freedom_deg-1} (-value)^i / i!), capped at 1.0;
+ * if exp() sets ERANGE, returns 0 for value < 0 and 1.0 otherwise.
+ */
+static double
+inv_chi_square_legacy(struct rspamd_task *task, double value, int freedom_deg)
+{
+	double prob, sum, m;
+	int i;
+
+	errno = 0; /* reset so the ERANGE test below reflects only exp() */
+	m = -value;
+	prob = exp(value);
+
+	if (errno == ERANGE) {
+		/*
+		 * Range error in exp(): a large NEGATIVE value means e^x underflowed,
+		 * i.e. inv-chi-square is close to zero; otherwise e^x overflowed
+		 */
+		msg_debug_bayes("exp overflow");
+
+		if (value < 0) {
+			return 0;
+		}
+		else {
+			return 1.0;
+		}
+	}
+
+	sum = prob;
+
+	msg_debug_bayes("m: %f, probability: %g", m, prob);
+
+	/*
+	 * Pre-3.13 behaviour, intentionally preserved: terms are accumulated by
+	 * direct multiplication, so `prob` may underflow/overflow on extreme inputs
+	 */
+	for (i = 1; i < freedom_deg; i++) {
+		prob *= m / (double) i; /* next term = previous term * (-value) / i */
+		sum += prob;
+		msg_debug_bayes("i=%d, probability: %g, sum: %g", i, prob, sum);
+	}
+
+	return MIN(1.0, sum); /* result is capped at 1.0 */
+}
+
 struct bayes_task_closure {
 	double ham_prob;  /* Kept for binary compatibility */
 	double spam_prob; /* Kept for binary compatibility */
@@ -164,6 +215,11 @@ struct bayes_multiclass_closure {
 static const double feature_weight[] = {0, 3125, 256, 27, 1, 0, 0, 0};
 
 #define PROB_COMBINE(prob, cnt, weight, assumed) (((weight) * (assumed) + (cnt) * (prob)) / ((weight) + (cnt)))
+/*
+ * Historical note on PROB_COMBINE: alternative weighting schemes were
+ * proposed in older versions, but this exact form is retained for backward
+ * compatibility. Changing it would shift token posteriors and alter legacy scores.
+ */
 /*
  * In this callback we calculate local probabilities for tokens
  */
@@ -503,6 +559,12 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
 	}
 	else {
 		cl.meta_skip_prob = 1.0 - (double) text_tokens / tokens->len;
+		/*
+		 * Historical bug: without the cast above, integer division
+		 * (text_tokens / tokens->len) caused the meta skip probability to be
+		 * 0 or 1 in some builds. The double cast is kept here; the binary
+		 * classifier is deliberately left unchanged to preserve legacy scoring.
+		 */
 	}
 
 	/* Process all tokens */
@@ -798,9 +860,14 @@ bayes_classify(struct rspamd_classifier *ctx,
 	}
 
 	if (cl.spam_prob > -300 && cl.ham_prob > -300) {
-		/* Fisher value is low enough to apply inv_chi_square */
-		h = 1 - inv_chi_square(task, cl.spam_prob, cl.processed_tokens);
-		s = 1 - inv_chi_square(task, cl.ham_prob, cl.processed_tokens);
+		/*
+		 * Fisher value is low enough to apply inv_chi_square.
+		 * Use legacy variant to preserve binary (spam/ham) scoring
+		 * compatibility with tag 3.12.1. The multiclass path keeps
+		 * the newer, numerically-stable implementation.
+		 */
+		h = 1 - inv_chi_square_legacy(task, cl.spam_prob, cl.processed_tokens);
+		s = 1 - inv_chi_square_legacy(task, cl.ham_prob, cl.processed_tokens);
 	}
 	else {
 		/* Use naive method */