lib-fts: simple tokenizer cleanup - make prev_letter updating more explicit.

author Timo Sirainen <tss@iki.fi>

Mon, 1 Jun 2015 18:27:09 +0000 (21:27 +0300)

committer Timo Sirainen <tss@iki.fi>

Mon, 1 Jun 2015 18:27:09 +0000 (21:27 +0300)
author Timo Sirainen <tss@iki.fi>
Mon, 1 Jun 2015 18:27:09 +0000 (21:27 +0300)
committer Timo Sirainen <tss@iki.fi>
Mon, 1 Jun 2015 18:27:09 +0000 (21:27 +0300)
diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c

index 182dc7280218d0af3b67824c1b864490bcd2ca08..45d21cf679d73f003eec445a42db0db916ca0374 100644 (file)
--- a/src/lib-fts/fts-tokenizer-generic.c
+++ b/src/lib-fts/fts-tokenizer-generic.c
@@ -163,18 +163,14 @@ static bool fts_uni_word_break(unichar_t c)
         return FALSE;
  }
  
-static bool
-fts_apostrophe_word_break(struct generic_fts_tokenizer *tok, unichar_t c)
-{
-       if (IS_APOSTROPHE(c)) {
-               if (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
-                       return TRUE;
-               else
-                       tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
-       } else {
-               tok->prev_letter = LETTER_TYPE_NONE;
-       }
-       return FALSE;
+static inline bool
+fts_simple_is_word_break(struct generic_fts_tokenizer *tok,
+                        unichar_t c, bool apostrophe)
+{
+       if (apostrophe)
+               return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
+       else
+               return fts_ascii_word_break(c) || fts_uni_word_break(c);
  }
  
  static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
@@ -223,13 +219,15 @@ fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
         size_t i, start = 0;
         unsigned int char_size;
         unichar_t c;
+       bool apostrophe;
  
         for (i = 0; i < size; i += char_size) {
                 if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
                         i_unreached();
                 char_size = uni_utf8_char_bytes(data[i]);
-               if (fts_ascii_word_break(data[i]) || fts_uni_word_break(c) ||
-                   fts_apostrophe_word_break(tok, c)) {
+
+               apostrophe = IS_APOSTROPHE(c);
+               if (fts_simple_is_word_break(tok, c, apostrophe)) {
                         tok_append_truncated(tok, data + start, i - start);
                         if (tok->token->used > 0 &&
                             fts_tokenizer_generic_simple_current_token(tok, token_r)) {
@@ -237,6 +235,14 @@ fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
                                 return 1;
                         }
                         start = i + char_size;
+                       /* it doesn't actually matter at this point how whether
+                          subsequent apostrophes are handled by prefix
+                          skipping or by ignoring empty tokens - they will be
+                          dropped in any case. */
+                       tok->prev_letter = LETTER_TYPE_NONE;
+               } else {
+                       tok->prev_letter = apostrophe ?
+                               LETTER_TYPE_SINGLE_QUOTE : LETTER_TYPE_NONE;
                 }
         }
         /* word boundary not found yet */
author	Timo Sirainen <tss@iki.fi>
	Mon, 1 Jun 2015 18:27:09 +0000 (21:27 +0300)
committer	Timo Sirainen <tss@iki.fi>
	Mon, 1 Jun 2015 18:27:09 +0000 (21:27 +0300)