From: Vsevolod Stakhov
Date: Mon, 28 Jul 2025 14:06:06 +0000 (+0100)
Subject: [Project] Fix unlearn stuff
X-Git-Tag: 3.13.0~38^2~7
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=bfdd04e6534f3c79bef38c8b72336aaf56ccfbea;p=thirdparty%2Frspamd.git

[Project] Fix unlearn stuff
---

diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index ffe96237ce..f851fbb369 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -333,6 +333,11 @@ bayes_classify_token_multiclass(struct rspamd_classifier *ctx,
 
 	/* Apply multinomial model for each class */
 	for (j = 0; j < cl->num_classes; j++) {
+		/* Skip classes with insufficient learns */
+		if (ctx->cfg->min_learns > 0 && cl->class_learns[j] < ctx->cfg->min_learns) {
+			continue;
+		}
+
 		double class_freq = (double) class_counts[j] / MAX(1.0, (double) cl->class_learns[j]);
 		double class_prob = PROB_COMBINE(class_freq, total_count, w, 1.0 / cl->num_classes);
 
@@ -431,16 +436,26 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
 		}
 	}
 
-	/* Check minimum learns requirement */
+	/* Check minimum learns requirement - count viable classes */
+	unsigned int viable_classes = 0;
 	if (ctx->cfg->min_learns > 0) {
 		for (i = 0; i < cl.num_classes; i++) {
-			if (cl.class_learns[i] < ctx->cfg->min_learns) {
-				msg_info_task("not classified as %s. The class needs more "
-							  "training samples. Currently: %uL; minimum %ud required",
+			if (cl.class_learns[i] >= ctx->cfg->min_learns) {
+				viable_classes++;
+			}
+			else {
+				msg_info_task("class %s excluded from classification: %uL learns < %ud minimum",
 							  cl.class_names[i], cl.class_learns[i], ctx->cfg->min_learns);
-				return TRUE;
 			}
 		}
+
+		if (viable_classes == 0) {
+			msg_info_task("no classes have sufficient training samples for classification");
+			return TRUE;
+		}
+
+		msg_info_bayes("multiclass classification: %ud/%ud classes have sufficient learns",
+					   viable_classes, cl.num_classes);
 	}
 
 	/* Count text tokens */
@@ -580,6 +595,10 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
 
 	rspamd_task_set_multiclass_result(task, result);
 
+	msg_info_bayes("MULTICLASS_RESULT: winning_class='%s', confidence=%.3f, normalized_prob=%.3f, tokens=%uL",
+				   cl.class_names[winning_class_idx], confidence,
+				   normalized_probs[winning_class_idx], cl.processed_tokens);
+
 	/* Insert symbol for winning class if confidence is significant */
 	if (confidence > 0.05) {
 		char sumbuf[32];
@@ -594,6 +613,8 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
 
 			if (st->stcf->class_name &&
 				strcmp(st->stcf->class_name, cl.class_names[winning_class_idx]) == 0) {
+				msg_info_bayes("SYMBOL_INSERT: symbol='%s', final_prob=%.3f, confidence_display='%s'",
+							   st->stcf->symbol, final_prob, sumbuf);
 				rspamd_task_insert_result(task, st->stcf->symbol, final_prob, sumbuf);
 				break;
 			}
@@ -605,6 +626,9 @@ bayes_classify_multiclass(struct rspamd_classifier *ctx,
 						normalized_probs[winning_class_idx], confidence,
 						cl.processed_tokens);
 	}
+	else {
+		msg_info_bayes("SYMBOL_SKIPPED: confidence=%.3f <= 0.05, no symbol inserted", confidence);
+	}
 
 	return TRUE;
 }
@@ -963,6 +987,9 @@ bayes_learn_class(struct rspamd_classifier *ctx,
 	g_assert(tokens != NULL);
 	g_assert(class_name != NULL);
 
+	msg_info_bayes("LEARN_CLASS: class='%s', unlearn=%s, tokens=%ud",
+				   class_name, unlearn ? "true" : "false", tokens->len);
+
 	incrementing = ctx->cfg->flags & RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND;
 
 	/* Count classes and prepare arrays for multi-class learning */