git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: tokenizers - Fixed removal of trailing character in truncated tokens.
author Timo Sirainen <tss@iki.fi>
Mon, 1 Jun 2015 18:58:30 +0000 (21:58 +0300)
committer Timo Sirainen <tss@iki.fi>
Mon, 1 Jun 2015 18:58:30 +0000 (21:58 +0300)
If the token was truncated, we don't want to remove the trailing character,
since that character was cut off by the truncation and is not actually
present in the stored token.

Also, we don't want to remove a trailing apostrophe from a truncated word,
because it is not actually at the end of the original (untruncated) token.
This doesn't make a big difference, but it's slightly more correct.

src/lib-fts/fts-tokenizer-generic-private.h
src/lib-fts/fts-tokenizer-generic.c
src/lib-fts/test-fts-tokenizer.c

index 363d1a730abc1f7828283e52938ae31894c97f9a..c2b8a1aadc92dc521606de77f8fc88e51d8c1cb8 100644 (file)
@@ -43,6 +43,7 @@ struct generic_fts_tokenizer {
        enum boundary_algorithm algorithm;
        enum letter_type prev_letter;
        enum letter_type prev_prev_letter;
+       size_t untruncated_length;
        buffer_t *token;
 };
 
index ad52ebcc5ac0bef5d040e15035cee8f8f387b5e2..e792610b02f345c91e5aa8dbf3e586ea698ad7b9 100644 (file)
@@ -107,7 +107,7 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
        const unsigned char *data;
        size_t len = tok->token->used;
 
-       if (len > 0) {
+       if (len > 0 && tok->untruncated_length <= tok->max_length) {
                /* Remove the trailing apostrophe - it was made
                   into U+0027 earlier. There can be only a single such
                   apostrophe, because otherwise the token would have already
@@ -124,6 +124,7 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
        *token_r = len == 0 ? "" :
                fts_uni_strndup(tok->token->data, len);
        buffer_set_used_size(tok->token, 0);
+       tok->untruncated_length = 0;
        tok->prev_letter = LETTER_TYPE_NONE;
        return (*token_r)[0] != '\0';
 }
@@ -176,6 +177,7 @@ static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
 
        tok->prev_letter = LETTER_TYPE_NONE;
        tok->prev_prev_letter = LETTER_TYPE_NONE;
+       tok->untruncated_length = 0;
        buffer_set_used_size(tok->token, 0);
 }
 
@@ -184,6 +186,7 @@ static void tok_append_truncated(struct generic_fts_tokenizer *tok,
 {
        buffer_append(tok->token, data,
                      I_MIN(size, tok->max_length - tok->token->used));
+       tok->untruncated_length += size;
 }
 
 static int
@@ -541,7 +544,8 @@ fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
        const unsigned char *data = tok->token->data;
        ssize_t len = tok->token->used;
 
-       if (is_one_past_end(tok)) {
+       if (is_one_past_end(tok) &&
+           tok->untruncated_length <= tok->max_length) {
                /* delete the last character */
                while ((data[len-1] & 0x80) != 0)
                        len--;
@@ -558,6 +562,7 @@ fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
 
        *token_r = fts_uni_strndup(data, len);
        buffer_set_used_size(tok->token, 0);
+       tok->untruncated_length = 0;
 }
 
 struct letter_fn {
index 8ee4484b706b8517d0302731d5ed628d17bd5508..1f355f4abeb3a99e7cc895cf2789501c088592c4 100644 (file)
@@ -32,6 +32,14 @@ static const char *test_inputs[] = {
        "' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
 
        "'1234567890123456789012345678ä,"
+       "123456789012345678901234567x'ä,"
+       "1234567890123456789012345678x're,"
+       "1234567890123456789012345678x',"
+       "1234567890123456789012345678x'',"
+       "12345678901234567890123456789x',"
+       "12345678901234567890123456789x'',"
+       "123456789012345678901234567890x',"
+       "123456789012345678901234567890x'',"
 
        /* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
           U+205A(e2 81 9a) and U+205F(e2 81 9f) */
@@ -139,6 +147,14 @@ static void test_fts_tokenizer_generic_only(void)
                "word", "pre", "post", NULL,
 
                "1234567890123456789012345678ä",
+               "123456789012345678901234567x'",
+               "1234567890123456789012345678x'",
+               "1234567890123456789012345678x",
+               "1234567890123456789012345678x",
+               "12345678901234567890123456789x",
+               "12345678901234567890123456789x",
+               "123456789012345678901234567890",
+               "123456789012345678901234567890",
 
                "hello", "world", "And",
                "there", "was", "text", "galore",
@@ -183,6 +199,14 @@ static void test_fts_tokenizer_generic_tr29_only(void)
                "word", "pre", "post", NULL,
 
                "1234567890123456789012345678ä",
+               "123456789012345678901234567x'",
+               "1234567890123456789012345678x'",
+               "1234567890123456789012345678x",
+               "1234567890123456789012345678x",
+               "12345678901234567890123456789x",
+               "12345678901234567890123456789x",
+               "123456789012345678901234567890",
+               "123456789012345678901234567890",
 
                "hello", "world", "And",
                "there", "was", "text", "galore",