From: Vsevolod Stakhov
Date: Mon, 26 Nov 2018 17:42:43 +0000 (+0000)
Subject: [Feature] Ignore bogus whitespaces in the words
X-Git-Tag: 1.8.3~43
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=05f17c6cd61546712e5f213dc624b693e2b0dfa4;p=thirdparty%2Frspamd.git

[Feature] Ignore bogus whitespaces in the words

Issue: #2649
---

diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h
index ee8db8af2e..9dcd6f8e87 100644
--- a/src/libstat/stat_api.h
+++ b/src/libstat/stat_api.h
@@ -38,6 +38,7 @@
 #define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
 #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 9)
 #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 10)
+#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 11)
 
 typedef struct rspamd_stat_token_s {
 	rspamd_ftok_t original; /* utf8 raw */
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 19a5dba98c..c62718278b 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -608,7 +608,14 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen,
 
 	while (i < srclen) {
 		U16_NEXT_UNSAFE (src, i, t);
-		*d++ = u_tolower (t);
+
+		if (u_isgraph (t)) {
+			*d++ = u_tolower (t);
+		}
+		else {
+			/* Invisible spaces ! */
+			tok->flags |= RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES;
+		}
 	}
 
 	tok->unicode.begin = dest;