From ab43e080ebc5fea5a2c54bcad9180202b1a38711 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 25 Feb 2019 18:19:51 +0000 Subject: [PATCH] [Feature] Try to filter bad unicode types during normalisation --- src/libstat/stat_api.h | 1 + src/libstat/tokenizers/tokenizers.c | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/libstat/stat_api.h b/src/libstat/stat_api.h index 533c429486..f9d1aab5a9 100644 --- a/src/libstat/stat_api.h +++ b/src/libstat/stat_api.h @@ -39,6 +39,7 @@ #define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10) #define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11) #define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12) +#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13) typedef struct rspamd_stat_token_s { rspamd_ftok_t original; /* utf8 raw */ diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index acbbcf2f01..caa4a48a5d 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -610,7 +610,25 @@ rspamd_uchars_to_ucs32 (const UChar *src, gsize srclen, U16_NEXT_UNSAFE (src, i, t); if (u_isgraph (t)) { - *d++ = u_tolower (t); + UCharCategory cat; + + cat = u_charType (t); +#if U_ICU_VERSION_MAJOR_NUM >= 57 + if (u_hasBinaryProperty (t, UCHAR_EMOJI)) { + tok->flags |= RSPAMD_STAT_TOKEN_FLAG_EMOJI; + } +#endif + + if (cat == U_UPPERCASE_LETTER || + cat == U_LOWERCASE_LETTER || + cat == U_DECIMAL_DIGIT_NUMBER || + cat == U_CONNECTOR_PUNCTUATION || + cat == U_MATH_SYMBOL || + cat == U_CURRENCY_SYMBOL || + cat == U_INITIAL_PUNCTUATION || + cat == U_FINAL_PUNCTUATION) { + *d++ = u_tolower (t); + } } else { /* Invisible spaces ! */ -- 2.47.3