]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: tokenizer-generic - simple explicit prefix search logic
authorPhil Carmody <phil@dovecot.fi>
Wed, 16 May 2018 14:57:14 +0000 (17:57 +0300)
committerAki Tuomi <aki.tuomi@open-xchange.com>
Wed, 10 Oct 2018 04:47:26 +0000 (04:47 +0000)
The logic is that a word followed by a '*' creates a prefix search token,
and a new token begins immediately after it. So "foo*bar" yields two tokens,
"foo*" and "bar", when in explicit prefix search tokenisation mode.

This is only active in the 'simple' tokenizer, not in 'tr29'.

Signed-off-by: Phil Carmody <phil@dovecot.fi>
src/lib-fts/fts-common.h
src/lib-fts/fts-tokenizer-generic-private.h
src/lib-fts/fts-tokenizer-generic.c

index df21dbd2de3509c490728b6b021a4f13d2164a6a..1a1446390a6386f2e88bf759152def1d58387bfb 100644 (file)
@@ -8,6 +8,9 @@
        ((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
 #define IS_WB5A_APOSTROPHE(c) \
        ((c) == 0x0027 || (c) == 0x2019)
+#define FTS_PREFIX_SPLAT_CHAR 0x002A /* '*' */
+#define IS_PREFIX_SPLAT(c) \
+       ((c) == FTS_PREFIX_SPLAT_CHAR)
 /* The h letters are included because it is an exception in French.
    A, E, H, I, O, U, Y, a, e, h, i, o, u, y */
 #define IS_ASCII_VOWEL(c) \
index 2669023bc5a665d92a7ae6d6396c5aa8bc620067..87f4d48fa16dc5016f38c7c5e786906f90f68968 100644 (file)
@@ -26,6 +26,7 @@ enum letter_type {
        LETTER_TYPE_SOT,
        LETTER_TYPE_EOT,
        LETTER_TYPE_APOSTROPHE, /* Own modification to TR29 */
+       LETTER_TYPE_PREFIXSPLAT, /* Dovecot '*' for glob-like explicit prefix searching */
        LETTER_TYPE_OTHER /* WB14 "any" */
 };
 
index 79b878d25b9bf465a944969f99799d07f8103ee3..9bbf31a755cd8d1783c55b182a0afdd5b008fa3b 100644 (file)
@@ -152,6 +152,10 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                        len--;
                        i_assert(len > 0 && data[len-1] != '\'');
                }
+               if (len > 0 && data[len-1] == '*' && !tok->prefixsplat) {
+                       len--;
+                       i_assert(len > 0 && data[len-1] != '*');
+               }
        } else {
                fts_tokenizer_delete_trailing_partial_char(data, &len);
        }
@@ -161,7 +165,6 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                t_strndup(tok->token->data, len);
        buffer_set_used_size(tok->token, 0);
        tok->untruncated_length = 0;
-       shift_prev_type(tok, LETTER_TYPE_NONE);
        return len > 0;
 }
 
@@ -260,19 +263,23 @@ fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
                i_assert(char_size > 0);
 
                apostrophe = IS_APOSTROPHE(c);
-               break_type = fts_simple_is_word_break(tok, c, apostrophe);
-               if (break_type != FTS_WORD_TO_WORD && break_type != FTS_STOP_TO_WORD) {
+               if ((tok->prefixsplat && IS_PREFIX_SPLAT(c)) &&
+                   (tok->prev_type == LETTER_TYPE_ALETTER)) {
+                       /* this might be a prefix-matching query */
+                       shift_prev_type(tok, LETTER_TYPE_PREFIXSPLAT);
+               } else if ((break_type = fts_simple_is_word_break(tok, c, apostrophe))
+                          != FTS_WORD_TO_WORD) {
                        tok_append_truncated(tok, data + start, i - start);
+                       shift_prev_type(tok, (break_type & FTS_TO_WORD) != 0
+                                       ? LETTER_TYPE_ALETTER : LETTER_TYPE_NONE);
                        if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
-                               *skip_r = i + char_size;
+                               *skip_r = i;
+                               if (break_type != FTS_STOP_TO_WORD) /* therefore *_TO_STOP */
+                                       *skip_r += char_size;
                                return 1;
                        }
-                       start = i + char_size;
-                       /* it doesn't actually matter at this point how whether
-                          subsequent apostrophes are handled by prefix
-                          skipping or by ignoring empty tokens - they will be
-                          dropped in any case. */
-                       shift_prev_type(tok, LETTER_TYPE_NONE);
+                       if ((break_type & FTS_TO_WORD) == 0)
+                               start = i + char_size;
                } else if (apostrophe) {
                        /* all apostrophes require special handling */
                        const unsigned char apostrophe_char = '\'';
@@ -295,6 +302,7 @@ fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
 
        /* return the last token */
        if (size == 0) {
+               shift_prev_type(tok, LETTER_TYPE_NONE);
                if (fts_tokenizer_generic_simple_current_token(tok, token_r))
                        return 1;
        }
@@ -645,7 +653,8 @@ static struct letter_fn letter_fns[] = {
        {letter_single_quote}, {letter_double_quote},
        {letter_midnumlet}, {letter_midletter}, {letter_midnum},
        {letter_numeric}, {letter_extendnumlet}, {letter_panic},
-       {letter_panic}, {letter_apostrophe}, {letter_other}
+       {letter_panic}, {letter_apostrophe}, {letter_panic},
+       {letter_other}
 };
 
 /*