From: Phil Carmody
Date: Wed, 16 May 2018 15:08:01 +0000 (+0300)
Subject: lib-fts: tokenizer-generic - tr29 explicit-prefix parsing
X-Git-Tag: 2.3.9~1276
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=001bcbdabcd9112f913167ba3b900e628cb87247;p=thirdparty%2Fdovecot%2Fcore.git

lib-fts: tokenizer-generic - tr29 explicit-prefix parsing

Similar logic to before - any wordlike sequence that ends with a * is
considered a prefix search, and immediately begins a new token.

Signed-off-by: Phil Carmody
---

diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c
index 9bbf31a755..4284f37aba 100644
--- a/src/lib-fts/fts-tokenizer-generic.c
+++ b/src/lib-fts/fts-tokenizer-generic.c
@@ -356,6 +356,8 @@ static enum letter_type letter_type(unichar_t c)
 		return LETTER_TYPE_NUMERIC;
 	if (uint32_find(ExtendNumLet, N_ELEMENTS(ExtendNumLet), c, &idx))
 		return LETTER_TYPE_EXTENDNUMLET;
+	if (IS_PREFIX_SPLAT(c)) /* prioritise appropriately */
+		return LETTER_TYPE_PREFIXSPLAT;
 	return LETTER_TYPE_OTHER;
 }
 
@@ -554,7 +556,11 @@ static bool letter_apostrophe(struct generic_fts_tokenizer *tok)
 
 	return TRUE; /* Any / Any */
 }
-
+static bool letter_prefixsplat(struct generic_fts_tokenizer *tok ATTR_UNUSED)
+{
+	/* Dovecot explicit-prefix specific */
+	return TRUE; /* Always induces a word break - but with special handling */
+}
 static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
 {
 	return TRUE; /* Any / Any */
@@ -653,7 +659,7 @@ static struct letter_fn letter_fns[] = {
 	{letter_single_quote}, {letter_double_quote},
 	{letter_midnumlet}, {letter_midletter}, {letter_midnum},
 	{letter_numeric}, {letter_extendnumlet}, {letter_panic},
-	{letter_panic}, {letter_apostrophe}, {letter_panic},
+	{letter_panic}, {letter_apostrophe}, {letter_prefixsplat},
 	{letter_other}
 };
 
@@ -733,6 +739,10 @@ fts_tokenizer_generic_tr29_next(struct fts_tokenizer *_tok,
 			i_assert(char_start_i >= start_pos && size >= start_pos);
 			tok_append_truncated(tok, data + start_pos,
 					     char_start_i - start_pos);
+			if (lt == LETTER_TYPE_PREFIXSPLAT && tok->prefixsplat) {
+				const unsigned char prefix_char = FTS_PREFIX_SPLAT_CHAR;
+				tok_append_truncated(tok, &prefix_char, 1);
+			}
 			*skip_r = i;
 			fts_tokenizer_generic_tr29_current_token(tok, token_r);
 			return 1;
diff --git a/src/lib-fts/test-fts-tokenizer.c b/src/lib-fts/test-fts-tokenizer.c
index 09bc7560d6..5d5260285e 100644
--- a/src/lib-fts/test-fts-tokenizer.c
+++ b/src/lib-fts/test-fts-tokenizer.c
@@ -552,7 +552,7 @@ test_fts_tokenizer_explicit_prefix(void)
 				&tok, &error);
 		test_tokenizer_inputs(
 			tok, &input, 1,
-			(search!=0) && (explicitprefix!=0) && (algo==0)
+			(search!=0) && (explicitprefix!=0)
 			? expected_star : expected_nostar);
 
 		fts_tokenizer_unref(&tok);