lib-fts: tokenizer-generic - simple explicit prefix search logic

author Phil Carmody <phil@dovecot.fi>

Wed, 16 May 2018 14:57:14 +0000 (17:57 +0300)

committer Aki Tuomi <aki.tuomi@open-xchange.com>

Wed, 10 Oct 2018 04:47:26 +0000 (04:47 +0000)
author Phil Carmody <phil@dovecot.fi>
Wed, 16 May 2018 14:57:14 +0000 (17:57 +0300)
committer Aki Tuomi <aki.tuomi@open-xchange.com>
Wed, 10 Oct 2018 04:47:26 +0000 (04:47 +0000)
diff --git a/src/lib-fts/fts-common.h b/src/lib-fts/fts-common.h

index df21dbd2de3509c490728b6b021a4f13d2164a6a..1a1446390a6386f2e88bf759152def1d58387bfb 100644 (file)
--- a/src/lib-fts/fts-common.h
+++ b/src/lib-fts/fts-common.h
@@ -8,6 +8,9 @@
         ((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
  #define IS_WB5A_APOSTROPHE(c) \
         ((c) == 0x0027 || (c) == 0x2019)
+#define FTS_PREFIX_SPLAT_CHAR 0x002A /* '*' */
+#define IS_PREFIX_SPLAT(c) \
+       ((c) == FTS_PREFIX_SPLAT_CHAR)
  /* The h letters are included because it is an exception in French.
     A, E, H, I, O, U, Y, a, e, h, i, o, u, y */
  #define IS_ASCII_VOWEL(c) \
diff --git a/src/lib-fts/fts-tokenizer-generic-private.h b/src/lib-fts/fts-tokenizer-generic-private.h

index 2669023bc5a665d92a7ae6d6396c5aa8bc620067..87f4d48fa16dc5016f38c7c5e786906f90f68968 100644 (file)
--- a/src/lib-fts/fts-tokenizer-generic-private.h
+++ b/src/lib-fts/fts-tokenizer-generic-private.h
@@ -26,6 +26,7 @@ enum letter_type {
         LETTER_TYPE_SOT,
         LETTER_TYPE_EOT,
         LETTER_TYPE_APOSTROPHE, /* Own modification to TR29 */
+       LETTER_TYPE_PREFIXSPLAT, /* Dovecot '*' for glob-like explicit prefix searching */
         LETTER_TYPE_OTHER /* WB14 "any" */
  };
  
diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c

index 79b878d25b9bf465a944969f99799d07f8103ee3..9bbf31a755cd8d1783c55b182a0afdd5b008fa3b 100644 (file)
--- a/src/lib-fts/fts-tokenizer-generic.c
+++ b/src/lib-fts/fts-tokenizer-generic.c
@@ -152,6 +152,10 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                         len--;
                         i_assert(len > 0 && data[len-1] != '\'');
                 }
+               if (len > 0 && data[len-1] == '*' && !tok->prefixsplat) {
+                       len--;
+                       i_assert(len > 0 && data[len-1] != '*');
+               }
         } else {
                 fts_tokenizer_delete_trailing_partial_char(data, &len);
         }
@@ -161,7 +165,6 @@ fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                 t_strndup(tok->token->data, len);
         buffer_set_used_size(tok->token, 0);
         tok->untruncated_length = 0;
-       shift_prev_type(tok, LETTER_TYPE_NONE);
         return len > 0;
  }
  
@@ -260,19 +263,23 @@ fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
                 i_assert(char_size > 0);
  
                 apostrophe = IS_APOSTROPHE(c);
-               break_type = fts_simple_is_word_break(tok, c, apostrophe);
-               if (break_type != FTS_WORD_TO_WORD && break_type != FTS_STOP_TO_WORD) {
+               if ((tok->prefixsplat && IS_PREFIX_SPLAT(c)) &&
+                   (tok->prev_type == LETTER_TYPE_ALETTER)) {
+                       /* this might be a prefix-matching query */
+                       shift_prev_type(tok, LETTER_TYPE_PREFIXSPLAT);
+               } else if ((break_type = fts_simple_is_word_break(tok, c, apostrophe))
+                          != FTS_WORD_TO_WORD) {
                         tok_append_truncated(tok, data + start, i - start);
+                       shift_prev_type(tok, (break_type & FTS_TO_WORD) != 0
+                                       ? LETTER_TYPE_ALETTER : LETTER_TYPE_NONE);
                         if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
-                               *skip_r = i + char_size;
+                               *skip_r = i;
+                               if (break_type != FTS_STOP_TO_WORD) /* therefore *_TO_STOP */
+                                       *skip_r += char_size;
                                 return 1;
                         }
-                       start = i + char_size;
-                       /* it doesn't actually matter at this point how whether
-                          subsequent apostrophes are handled by prefix
-                          skipping or by ignoring empty tokens - they will be
-                          dropped in any case. */
-                       shift_prev_type(tok, LETTER_TYPE_NONE);
+                       if ((break_type & FTS_TO_WORD) == 0)
+                               start = i + char_size;
                 } else if (apostrophe) {
                         /* all apostrophes require special handling */
                         const unsigned char apostrophe_char = '\'';
@@ -295,6 +302,7 @@ fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
  
         /* return the last token */
         if (size == 0) {
+               shift_prev_type(tok, LETTER_TYPE_NONE);
                 if (fts_tokenizer_generic_simple_current_token(tok, token_r))
                         return 1;
         }
@@ -645,7 +653,8 @@ static struct letter_fn letter_fns[] = {
         {letter_single_quote}, {letter_double_quote},
         {letter_midnumlet}, {letter_midletter}, {letter_midnum},
         {letter_numeric}, {letter_extendnumlet}, {letter_panic},
-       {letter_panic}, {letter_apostrophe}, {letter_other}
+       {letter_panic}, {letter_apostrophe}, {letter_panic},
+       {letter_other}
  };
  
  /*
author	Phil Carmody <phil@dovecot.fi>
	Wed, 16 May 2018 14:57:14 +0000 (17:57 +0300)
committer	Aki Tuomi <aki.tuomi@open-xchange.com>
	Wed, 10 Oct 2018 04:47:26 +0000 (04:47 +0000)
src/lib-fts/fts-common.h		patch \| blob \| blame \| history
src/lib-fts/fts-tokenizer-generic-private.h		patch \| blob \| blame \| history
src/lib-fts/fts-tokenizer-generic.c		patch \| blob \| blame \| history