From: Phil Carmody <phil@dovecot.fi>
Date: Wed, 16 May 2018 14:37:29 +0000 (+0300)
Subject: lib-fts: tokenizer-generic - add more history to break detection
X-Git-Tag: 2.3.9~1279
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c59424ddcb5447c9b8f1709d10fd0d3419e73aaf;p=thirdparty%2Fdovecot%2Fcore.git

lib-fts: tokenizer-generic - add more history to break detection

For example, going from non-word to non-word is a different kind of
transition (not really a break at all) than going from a word to a
non-word. At present that distinction isn't needed, but it will be
for explicit prefix searches.

Make the tok parameter const too, whilst there.

Signed-off-by: Phil Carmody <phil@dovecot.fi>
---

diff --git a/src/lib-fts/fts-tokenizer-generic.c b/src/lib-fts/fts-tokenizer-generic.c
index 797e431616..79b878d25b 100644
--- a/src/lib-fts/fts-tokenizer-generic.c
+++ b/src/lib-fts/fts-tokenizer-generic.c
@@ -194,16 +194,32 @@ static bool fts_uni_word_break(unichar_t c)
 	return FALSE;
 }
 
-static inline bool
-fts_simple_is_word_break(struct generic_fts_tokenizer *tok,
+enum fts_break_type {
+	FTS_FROM_STOP = 0,
+	FTS_FROM_WORD = 2,
+	FTS_TO_STOP= 0,
+	FTS_TO_WORD = 1,
+#define FROM_TO(f,t) FTS_##f##_TO_##t = FTS_FROM_##f | FTS_TO_##t
+	FROM_TO(STOP,STOP),
+	FROM_TO(STOP,WORD),
+	FROM_TO(WORD,STOP),
+	FROM_TO(WORD,WORD),
+};
+static inline enum fts_break_type
+fts_simple_is_word_break(const struct generic_fts_tokenizer *tok,
 			 unichar_t c, bool apostrophe)
 {
+	/* Until we know better, a letter followed by an apostrophe is a continuation of the word.
+	   However, if we see non-word letters afterwards, we'll reverse that decision. */
 	if (apostrophe)
-		return tok->prev_type == LETTER_TYPE_SINGLE_QUOTE;
-	else if (c < 0x80)
-		return fts_ascii_word_breaks[c] != 0;
-	else
-		return fts_uni_word_break(c);
+		return tok->prev_type == LETTER_TYPE_ALETTER ? FTS_WORD_TO_WORD : FTS_STOP_TO_STOP;
+
+	bool new_breakiness = (c < 0x80) ? (fts_ascii_word_breaks[c] != 0) : fts_uni_word_break(c);
+
+	return (new_breakiness ? FTS_TO_STOP : FTS_TO_WORD)
+		+ (tok->prev_type == LETTER_TYPE_ALETTER ||
+		   tok->prev_type == LETTER_TYPE_SINGLE_QUOTE
+		   ? FTS_FROM_WORD : FTS_FROM_STOP);
 }
 
 static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
@@ -237,13 +253,15 @@ fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
 	int char_size;
 	unichar_t c;
 	bool apostrophe;
+	enum fts_break_type break_type;
 
 	for (i = 0; i < size; i += char_size) {
 		char_size = uni_utf8_get_char_n(data + i, size - i, &c);
 		i_assert(char_size > 0);
 
 		apostrophe = IS_APOSTROPHE(c);
-		if (fts_simple_is_word_break(tok, c, apostrophe)) {
+		break_type = fts_simple_is_word_break(tok, c, apostrophe);
+		if (break_type != FTS_WORD_TO_WORD && break_type != FTS_STOP_TO_WORD) {
 			tok_append_truncated(tok, data + start, i - start);
 			if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
 				*skip_r = i + char_size;