From: Phil Carmody Date: Wed, 16 May 2018 14:57:14 +0000 (+0300) Subject: lib-fts: tokenizer-generic - simple explicit prefix search logic X-Git-Tag: 2.3.9~1278 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e6a17422de488f4b21bad5b16cb50421119378ee;p=thirdparty%2Fdovecot%2Fcore.git lib-fts: tokenizer-generic - simple explicit prefix search logic Logic is that words followed by a '*' create a prefix search token. A new token is begun immediately after that. So "foo*bar" is 2 tokens "foo*" and "bar", when in explicit prefix search tokenisation mode. Only active in 'simple', not 'tr29'. Signed-off-by: Phil Carmody --- diff --git a/src/lib-fts/fts-common.h b/src/lib-fts/fts-common.h index df21dbd2de..1a1446390a 100644 --- a/src/lib-fts/fts-common.h +++ b/src/lib-fts/fts-common.h @@ -8,6 +8,9 @@ ((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c)) #define IS_WB5A_APOSTROPHE(c) \ ((c) == 0x0027 || (c) == 0x2019) +#define FTS_PREFIX_SPLAT_CHAR 0x002A /* '*' */ +#define IS_PREFIX_SPLAT(c) \ + ((c) == FTS_PREFIX_SPLAT_CHAR) /* The h letters are included because it is an exception in French. A, E, H, I, O, U, Y, a, e, h, i, o, u, y */ #define IS_ASCII_VOWEL(c) \ diff --git a/src/lib-fts/fts-tokenizer-generic-private.h b/src/lib-fts/fts-tokenizer-generic-private.h index 2669023bc5..87f4d48fa1 100644 --- a/src/lib-fts/fts-tokenizer-generic-private.h +++ b/src/lib-fts/fts-tokenizer-generic-private.h @@ -26,6 +26,7 @@ enum letter_type { LETTER_TYPE_SOT, LETTER_TYPE_EOT, LETTER_TYPE_APOSTROPHE, /* Own modification to TR29 */ + LETTER_TYPE_PREFIXSPLAT, /* Dovecot '*' for glob-like explicit prefix searching */ LETTER_TYPE_OTHER /* WB14 "any" */ }; diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c index 79b878d25b..9bbf31a755 100644 --- a/src/lib-fts/fts-tokenizer-generic.c +++ b/src/lib-fts/fts-tokenizer-generic.c @@ -152,6 +152,10 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok, len--; i_assert(len > 0 && data[len-1] != '\''); } + if (len > 0 && data[len-1] == '*' && !tok->prefixsplat) { + len--; + i_assert(len > 0 && data[len-1] != '*'); + } } else { fts_tokenizer_delete_trailing_partial_char(data, &len); } @@ -161,7 +165,6 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok, t_strndup(tok->token->data, len); buffer_set_used_size(tok->token, 0); tok->untruncated_length = 0; - shift_prev_type(tok, LETTER_TYPE_NONE); return len > 0; } @@ -260,19 +263,23 @@ fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok, i_assert(char_size > 0); apostrophe = IS_APOSTROPHE(c); - break_type = fts_simple_is_word_break(tok, c, apostrophe); - if (break_type != FTS_WORD_TO_WORD && break_type != FTS_STOP_TO_WORD) { + if ((tok->prefixsplat && IS_PREFIX_SPLAT(c)) && + (tok->prev_type == LETTER_TYPE_ALETTER)) { + /* this might be a prefix-mathing query */ + shift_prev_type(tok, LETTER_TYPE_PREFIXSPLAT); + } else if ((break_type = fts_simple_is_word_break(tok, c, apostrophe)) + != FTS_WORD_TO_WORD) { tok_append_truncated(tok, data + start, i - start); + shift_prev_type(tok, (break_type & FTS_TO_WORD) != 0 + ? LETTER_TYPE_ALETTER : LETTER_TYPE_NONE); if (fts_tokenizer_generic_simple_current_token(tok, token_r)) { - *skip_r = i + char_size; + *skip_r = i; + if (break_type != FTS_STOP_TO_WORD) /* therefore *_TO_STOP */ + *skip_r += char_size; return 1; } - start = i + char_size; - /* it doesn't actually matter at this point how whether - subsequent apostrophes are handled by prefix - skipping or by ignoring empty tokens - they will be - dropped in any case. */ - shift_prev_type(tok, LETTER_TYPE_NONE); + if ((break_type & FTS_TO_WORD) == 0) + start = i + char_size; } else if (apostrophe) { /* all apostrophes require special handling */ const unsigned char apostrophe_char = '\''; @@ -295,6 +302,7 @@ fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok, /* return the last token */ if (size == 0) { + shift_prev_type(tok, LETTER_TYPE_NONE); if (fts_tokenizer_generic_simple_current_token(tok, token_r)) return 1; } @@ -645,7 +653,8 @@ static struct letter_fn letter_fns[] = { {letter_single_quote}, {letter_double_quote}, {letter_midnumlet}, {letter_midletter}, {letter_midnum}, {letter_numeric}, {letter_extendnumlet}, {letter_panic}, - {letter_panic}, {letter_apostrophe}, {letter_other} + {letter_panic}, {letter_apostrophe}, {letter_panic}, + {letter_other} }; /*