From: Timo Sirainen Date: Mon, 1 Jun 2015 18:58:30 +0000 (+0300) Subject: lib-fts: tokenizers - Fixed removal of trailing character in truncated tokens. X-Git-Tag: 2.2.19.rc1~413 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b6b06530d654f0436bfbaefc1e988d53fff0cbee;p=thirdparty%2Fdovecot%2Fcore.git lib-fts: tokenizers - Fixed removal of trailing character in truncated tokens. If the token is truncated, we don't want to remove the trailing character since it's not actually there. Also we don't want to remove trailing apostrophes from a truncated word, because they are not actually at the end of the original (untruncated) token. This doesn't make a big difference, but it's slightly more correct. --- diff --git a/src/lib-fts/fts-tokenizer-generic-private.h b/src/lib-fts/fts-tokenizer-generic-private.h index 363d1a730a..c2b8a1aadc 100644 --- a/src/lib-fts/fts-tokenizer-generic-private.h +++ b/src/lib-fts/fts-tokenizer-generic-private.h @@ -43,6 +43,7 @@ struct generic_fts_tokenizer { enum boundary_algorithm algorithm; enum letter_type prev_letter; enum letter_type prev_prev_letter; + size_t untruncated_length; buffer_t *token; }; diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c index ad52ebcc5a..e792610b02 100644 --- a/src/lib-fts/fts-tokenizer-generic.c +++ b/src/lib-fts/fts-tokenizer-generic.c @@ -107,7 +107,7 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok, const unsigned char *data; size_t len = tok->token->used; - if (len > 0) { + if (len > 0 && tok->untruncated_length <= tok->max_length) { /* Remove the trailing apostrophe - it was made into U+0027 earlier. There can be only a single such apostrophe, because otherwise the token would have already @@ -124,6 +124,7 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok, *token_r = len == 0 ? 
"" : fts_uni_strndup(tok->token->data, len); buffer_set_used_size(tok->token, 0); + tok->untruncated_length = 0; tok->prev_letter = LETTER_TYPE_NONE; return (*token_r)[0] != '\0'; } @@ -176,6 +177,7 @@ static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok) tok->prev_letter = LETTER_TYPE_NONE; tok->prev_prev_letter = LETTER_TYPE_NONE; + tok->untruncated_length = 0; buffer_set_used_size(tok->token, 0); } @@ -184,6 +186,7 @@ static void tok_append_truncated(struct generic_fts_tokenizer *tok, { buffer_append(tok->token, data, I_MIN(size, tok->max_length - tok->token->used)); + tok->untruncated_length += size; } static int @@ -541,7 +544,8 @@ fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok, const unsigned char *data = tok->token->data; ssize_t len = tok->token->used; - if (is_one_past_end(tok)) { + if (is_one_past_end(tok) && + tok->untruncated_length <= tok->max_length) { /* delete the last character */ while ((data[len-1] & 0x80) != 0) len--; @@ -558,6 +562,7 @@ fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok, *token_r = fts_uni_strndup(data, len); buffer_set_used_size(tok->token, 0); + tok->untruncated_length = 0; } struct letter_fn { diff --git a/src/lib-fts/test-fts-tokenizer.c b/src/lib-fts/test-fts-tokenizer.c index 8ee4484b70..1f355f4abe 100644 --- a/src/lib-fts/test-fts-tokenizer.c +++ b/src/lib-fts/test-fts-tokenizer.c @@ -32,6 +32,14 @@ static const char *test_inputs[] = { "' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''", "'1234567890123456789012345678ä," + "123456789012345678901234567x'ä," + "1234567890123456789012345678x're," + "1234567890123456789012345678x'," + "1234567890123456789012345678x''," + "12345678901234567890123456789x'," + "12345678901234567890123456789x''," + "123456789012345678901234567890x'," + "123456789012345678901234567890x''," /* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and U+205A(e2 81 9a) and U+205F(e2 81 9f) */ @@ 
-139,6 +147,14 @@ static void test_fts_tokenizer_generic_only(void) "word", "pre", "post", NULL, "1234567890123456789012345678ä", + "123456789012345678901234567x'", + "1234567890123456789012345678x'", + "1234567890123456789012345678x", + "1234567890123456789012345678x", + "12345678901234567890123456789x", + "12345678901234567890123456789x", + "123456789012345678901234567890", + "123456789012345678901234567890", "hello", "world", "And", "there", "was", "text", "galore", @@ -183,6 +199,14 @@ static void test_fts_tokenizer_generic_tr29_only(void) "word", "pre", "post", NULL, "1234567890123456789012345678ä", + "123456789012345678901234567x'", + "1234567890123456789012345678x'", + "1234567890123456789012345678x", + "1234567890123456789012345678x", + "12345678901234567890123456789x", + "12345678901234567890123456789x", + "123456789012345678901234567890", + "123456789012345678901234567890", "hello", "world", "And", "there", "was", "text", "galore",