From dfc9cfd5b80c8a4240841e12425eb23636ce674e Mon Sep 17 00:00:00 2001
From: Timo Sirainen
Date: Mon, 1 Jun 2015 21:28:42 +0300
Subject: [PATCH] lib-fts: simple tokenizer optimization - don't check unicode word breaks for ASCII chars.

---
 src/lib-fts/fts-tokenizer-generic.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c
index 45d21cf679..4d29cb6fe8 100644
--- a/src/lib-fts/fts-tokenizer-generic.c
+++ b/src/lib-fts/fts-tokenizer-generic.c
@@ -129,21 +129,10 @@ static bool uint32_find(const uint32_t *data, unsigned int count,
 	BINARY_NUMBER_SEARCH(data, count, value, idx_r);
 }
 
-static bool fts_ascii_word_break(unsigned char c)
-{
-	if (c < 0x80)
-		return fts_ascii_word_breaks[c] != 0;
-	return FALSE;
-}
-
 static bool fts_uni_word_break(unichar_t c)
 {
 	unsigned int idx;
 
-	/* Override some apostrophes, which get special treatment. */
-	if (IS_APOSTROPHE(c))
-		return FALSE;
-
 	/* Unicode General Punctuation, including deprecated characters. */
 	if (c >= 0x2000 && c <= 0x206f)
 		return TRUE;
@@ -169,8 +158,10 @@ fts_simple_is_word_break(struct generic_fts_tokenizer *tok,
 {
 	if (apostrophe)
 		return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
+	else if (c < 0x80)
+		return fts_ascii_word_breaks[c] != 0;
 	else
-		return fts_ascii_word_break(c) || fts_uni_word_break(c);
+		return fts_uni_word_break(c);
 }
 
 static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
-- 
2.47.3