return FALSE;
}
-static bool
-fts_apostrophe_word_break(struct generic_fts_tokenizer *tok, unichar_t c)
-{
- if (IS_APOSTROPHE(c)) {
- if (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
- return TRUE;
- else
- tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
- } else {
- tok->prev_letter = LETTER_TYPE_NONE;
- }
- return FALSE;
+static inline bool
+fts_simple_is_word_break(struct generic_fts_tokenizer *tok,
+ unichar_t c, bool apostrophe)
+{
+ if (apostrophe)
+ return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
+ else
+ return fts_ascii_word_break(c) || fts_uni_word_break(c);
}
static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
size_t i, start = 0;
unsigned int char_size;
unichar_t c;
+ bool apostrophe;
for (i = 0; i < size; i += char_size) {
if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
i_unreached();
char_size = uni_utf8_char_bytes(data[i]);
- if (fts_ascii_word_break(data[i]) || fts_uni_word_break(c) ||
- fts_apostrophe_word_break(tok, c)) {
+
+ apostrophe = IS_APOSTROPHE(c);
+ if (fts_simple_is_word_break(tok, c, apostrophe)) {
tok_append_truncated(tok, data + start, i - start);
if (tok->token->used > 0 &&
fts_tokenizer_generic_simple_current_token(tok, token_r)) {
return 1;
}
start = i + char_size;
+ /* it doesn't actually matter at this point how whether
+ subsequent apostrophes are handled by prefix
+ skipping or by ignoring empty tokens - they will be
+ dropped in any case. */
+ tok->prev_letter = LETTER_TYPE_NONE;
+ } else {
+ tok->prev_letter = apostrophe ?
+ LETTER_TYPE_SINGLE_QUOTE : LETTER_TYPE_NONE;
}
}
/* word boundary not found yet */