From: Vsevolod Stakhov
Date: Fri, 20 Jun 2025 14:24:49 +0000 (+0100)
Subject: [Minor] Do not apply snowball to something that has already been stemmed
X-Git-Tag: 3.13.0~56^2~4
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=885721e71226122e2a2c0bde36ddc0db380512a7;p=thirdparty%2Frspamd.git

[Minor] Do not apply snowball to something that has already been stemmed
---

diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index f220011f76..8a9f429924 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -980,6 +980,16 @@ void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool,
 	for (i = 0; i < kv_size(*words); i++) {
 		tok = &kv_A(*words, i);
 
+		/* Skip stemming if token has already been stemmed by custom tokenizer */
+		if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_STEMMED) {
+			/* Already stemmed, just check for stop words */
+			if (tok->stemmed.len > 0 && lang_detector != NULL &&
+				rspamd_language_detector_is_stop_word(lang_detector, tok->stemmed.begin, tok->stemmed.len)) {
+				tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STOP_WORD;
+			}
+			continue;
+		}
+
 		if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
 			if (stem) {
 				const char *stemmed = NULL;