From: Vsevolod Stakhov Date: Tue, 27 Aug 2019 17:20:59 +0000 (+0100) Subject: [Fix] Fix normalization of non-alphabet based languages X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=44776f99b8d31fc26540f5bbb9281361df1a3c63;p=thirdparty%2Frspamd.git [Fix] Fix normalization of non-alphabet based languages --- diff --git a/src/libstat/backends/redis_backend.c b/src/libstat/backends/redis_backend.c index d54767c129..baeb2308d8 100644 --- a/src/libstat/backends/redis_backend.c +++ b/src/libstat/backends/redis_backend.c @@ -527,7 +527,8 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task, "HSET %b_tokens %b %b", prefix, (size_t) prefix_len, n0, (size_t) l0, - tok->t1->stemmed.begin, tok->t1->stemmed.len); + tok->t1->stemmed.begin, + tok->t1->stemmed.len); } } else { diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index f69378f9b9..ffa1af9db1 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -627,14 +627,10 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen, } #endif - if (cat == U_UPPERCASE_LETTER || - cat == U_LOWERCASE_LETTER || - cat == U_DECIMAL_DIGIT_NUMBER || + if ((cat >= U_UPPERCASE_LETTER && cat <= U_OTHER_NUMBER) || cat == U_CONNECTOR_PUNCTUATION || cat == U_MATH_SYMBOL || - cat == U_CURRENCY_SYMBOL || - cat == U_INITIAL_PUNCTUATION || - cat == U_FINAL_PUNCTUATION) { + cat == U_CURRENCY_SYMBOL) { *d++ = u_tolower (t); } }