git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: simple tokenizer cleanup - make prev_letter updating more explicit.
author: Timo Sirainen <tss@iki.fi>
Mon, 1 Jun 2015 18:27:09 +0000 (21:27 +0300)
committer: Timo Sirainen <tss@iki.fi>
Mon, 1 Jun 2015 18:27:09 +0000 (21:27 +0300)
Until now the update was hidden inside one of the functions, which left
prev_letter in an inconsistent state when a word break was found. It didn't
actually matter what prev_letter was at that point, but the behavior is now
more consistent.

src/lib-fts/fts-tokenizer-generic.c

index 182dc7280218d0af3b67824c1b864490bcd2ca08..45d21cf679d73f003eec445a42db0db916ca0374 100644 (file)
@@ -163,18 +163,14 @@ static bool fts_uni_word_break(unichar_t c)
        return FALSE;
 }
 
-static bool
-fts_apostrophe_word_break(struct generic_fts_tokenizer *tok, unichar_t c)
-{
-       if (IS_APOSTROPHE(c)) {
-               if (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
-                       return TRUE;
-               else
-                       tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
-       } else {
-               tok->prev_letter = LETTER_TYPE_NONE;
-       }
-       return FALSE;
+static inline bool
+fts_simple_is_word_break(struct generic_fts_tokenizer *tok,
+                        unichar_t c, bool apostrophe)
+{
+       if (apostrophe)
+               return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
+       else
+               return fts_ascii_word_break(c) || fts_uni_word_break(c);
 }
 
 static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
@@ -223,13 +219,15 @@ fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
        size_t i, start = 0;
        unsigned int char_size;
        unichar_t c;
+       bool apostrophe;
 
        for (i = 0; i < size; i += char_size) {
                if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
                        i_unreached();
                char_size = uni_utf8_char_bytes(data[i]);
-               if (fts_ascii_word_break(data[i]) || fts_uni_word_break(c) ||
-                   fts_apostrophe_word_break(tok, c)) {
+
+               apostrophe = IS_APOSTROPHE(c);
+               if (fts_simple_is_word_break(tok, c, apostrophe)) {
                        tok_append_truncated(tok, data + start, i - start);
                        if (tok->token->used > 0 &&
                            fts_tokenizer_generic_simple_current_token(tok, token_r)) {
@@ -237,6 +235,14 @@ fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
                                return 1;
                        }
                        start = i + char_size;
+                       /* it doesn't actually matter at this point whether
+                          subsequent apostrophes are handled by prefix
+                          skipping or by ignoring empty tokens - they will be
+                          dropped in any case. */
+                       tok->prev_letter = LETTER_TYPE_NONE;
+               } else {
+                       tok->prev_letter = apostrophe ?
+                               LETTER_TYPE_SINGLE_QUOTE : LETTER_TYPE_NONE;
                }
        }
        /* word boundary not found yet */