git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: tokenizer-generic - tr29 explicit-prefix parsing
author Phil Carmody <phil@dovecot.fi>
Wed, 16 May 2018 15:08:01 +0000 (18:08 +0300)
committer Aki Tuomi <aki.tuomi@open-xchange.com>
Wed, 10 Oct 2018 04:47:26 +0000 (04:47 +0000)
Similar logic to before: any word-like sequence that ends with a '*' is
treated as a prefix search, and the '*' ends that token so a new one begins immediately after it.

Signed-off-by: Phil Carmody <phil@dovecot.fi>
src/lib-fts/fts-tokenizer-generic.c
src/lib-fts/test-fts-tokenizer.c

index 9bbf31a755cd8d1783c55b182a0afdd5b008fa3b..4284f37abaeb22290da2ec93910bfe241c9917eb 100644 (file)
@@ -356,6 +356,8 @@ static enum letter_type letter_type(unichar_t c)
                return LETTER_TYPE_NUMERIC;
        if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
                return LETTER_TYPE_EXTENDNUMLET;
+       if (IS_PREFIX_SPLAT(c)) /* prioritise appropriately */
+               return LETTER_TYPE_PREFIXSPLAT;
        return LETTER_TYPE_OTHER;
 }
 
@@ -554,7 +556,11 @@ static bool letter_apostrophe(struct generic_fts_tokenizer *tok)
 
        return TRUE; /* Any / Any */
 }
-
+static bool letter_prefixsplat(struct generic_fts_tokenizer *tok ATTR_UNUSED)
+{
+       /* Dovecot explicit-prefix specific */
+       return TRUE; /* Always induces a word break - but with special handling */
+}
 static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
 {
        return TRUE; /* Any / Any */
@@ -653,7 +659,7 @@ static struct letter_fn letter_fns[] = {
        {letter_single_quote}, {letter_double_quote},
        {letter_midnumlet}, {letter_midletter}, {letter_midnum},
        {letter_numeric}, {letter_extendnumlet}, {letter_panic},
-       {letter_panic}, {letter_apostrophe}, {letter_panic},
+       {letter_panic}, {letter_apostrophe}, {letter_prefixsplat},
        {letter_other}
 };
 
@@ -733,6 +739,10 @@ fts_tokenizer_generic_tr29_next(struct fts_tokenizer *_tok,
                        i_assert(char_start_i >= start_pos && size >= start_pos);
                        tok_append_truncated(tok, data + start_pos,
                                             char_start_i - start_pos);
+                       if (lt == LETTER_TYPE_PREFIXSPLAT && tok->prefixsplat) {
+                               const unsigned char prefix_char = FTS_PREFIX_SPLAT_CHAR;
+                               tok_append_truncated(tok, &prefix_char, 1);
+                       }
                        *skip_r = i;
                        fts_tokenizer_generic_tr29_current_token(tok, token_r);
                        return 1;
index 09bc7560d65efb2911a535d8baa9424bdbad57f3..5d5260285e810e9480f2d55b3b8c6b0b9a8afb0c 100644 (file)
@@ -552,7 +552,7 @@ test_fts_tokenizer_explicit_prefix(void)
                                                     &tok, &error);
                                test_tokenizer_inputs(
                                        tok, &input, 1,
-                                       (search!=0) && (explicitprefix!=0) && (algo==0)
+                                       (search!=0) && (explicitprefix!=0)
                                        ? expected_star : expected_nostar);
 
                                fts_tokenizer_unref(&tok);