t_strndup(tok->token->data, len);
buffer_set_used_size(tok->token, 0);
tok->untruncated_length = 0;
- tok->prev_letter = LETTER_TYPE_NONE;
+ tok->prev_type = LETTER_TYPE_NONE;
return len > 0;
}
unichar_t c, bool apostrophe)
{
if (apostrophe)
- return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
+ return tok->prev_type == LETTER_TYPE_SINGLE_QUOTE;
else if (c < 0x80)
return fts_ascii_word_breaks[c] != 0;
else
struct generic_fts_tokenizer *tok =
container_of(_tok, struct generic_fts_tokenizer, tokenizer);
- tok->prev_letter = LETTER_TYPE_NONE;
- tok->prev_prev_letter = LETTER_TYPE_NONE;
+ tok->prev_type = LETTER_TYPE_NONE;
+ tok->prev_prev_type = LETTER_TYPE_NONE;
tok->untruncated_length = 0;
buffer_set_used_size(tok->token, 0);
}
subsequent apostrophes are handled by prefix
skipping or by ignoring empty tokens - they will be
dropped in any case. */
- tok->prev_letter = LETTER_TYPE_NONE;
+ tok->prev_type = LETTER_TYPE_NONE;
} else if (apostrophe) {
/* all apostrophes require special handling */
const unsigned char apostrophe_char = '\'';
if (tok->token->used > 0)
tok_append_truncated(tok, &apostrophe_char, 1);
start = i + char_size;
- tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
+ tok->prev_type = LETTER_TYPE_SINGLE_QUOTE;
} else {
- tok->prev_letter = LETTER_TYPE_NONE;
+ tok->prev_type = LETTER_TYPE_NONE;
}
}
/* word boundary not found yet */
/* Break decision for a letter of the regional-indicator type, based on the
   type recorded for the previous letter.
   Returns FALSE when the previous type is also a regional indicator (the
   rule tagged WB13c below) and TRUE in every other case.
   NOTE(review): TRUE appears to mean "report a word boundary here" (the
   "Any / Any" default) - confirm against the caller. */
static bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
{
/* WB13c */
- if (tok->prev_letter == LETTER_TYPE_REGIONAL_INDICATOR)
+ if (tok->prev_type == LETTER_TYPE_REGIONAL_INDICATOR)
return FALSE;
return TRUE; /* Any / Any */
/* Break decision for a katakana letter, based on the type recorded for the
   previous letter.
   Returns FALSE when the previous type is katakana (WB13) or ExtendNumLet
   (WB13b); TRUE otherwise.
   NOTE(review): TRUE appears to mean "report a word boundary here" -
   confirm against the caller. */
static bool letter_katakana(struct generic_fts_tokenizer *tok)
{
/* WB13 */
- if (tok->prev_letter == LETTER_TYPE_KATAKANA)
+ if (tok->prev_type == LETTER_TYPE_KATAKANA)
return FALSE;
/* WB13b */
- if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
+ if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
return FALSE;
return TRUE; /* Any / Any */
/* Break decision for a Hebrew letter, based on the types recorded for the
   previous letter and the one before it.
   Returns FALSE (no break) when:
     * the previous type is a Hebrew letter (WB5),
     * the type before the previous one is a Hebrew letter and the previous
       one is a single quote, apostrophe, MidLetter or double quote
       (WB7 / WB7c),
     * the previous type is numeric (WB10), or
     * the previous type is ExtendNumLet (WB13b).
   Returns TRUE in every other case.
   NOTE(review): TRUE appears to mean "report a word boundary here" -
   confirm against the caller. */
static bool letter_hebrew(struct generic_fts_tokenizer *tok)
{
/* WB5 */
- if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
+ if (tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
return FALSE;
/* WB7 WB7c, except MidNumLet */
- if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
- (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
- tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
- tok->prev_letter == LETTER_TYPE_MIDLETTER ||
- tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
+ if (tok->prev_prev_type == LETTER_TYPE_HEBREW_LETTER &&
+ (tok->prev_type == LETTER_TYPE_SINGLE_QUOTE ||
+ tok->prev_type == LETTER_TYPE_APOSTROPHE ||
+ tok->prev_type == LETTER_TYPE_MIDLETTER ||
+ tok->prev_type == LETTER_TYPE_DOUBLE_QUOTE))
return FALSE;
/* WB10 */
- if (tok->prev_letter == LETTER_TYPE_NUMERIC)
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
return FALSE;
/* WB13b */
- if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
+ if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
return FALSE;
return TRUE; /* Any / Any */
}
/* WB5 */
- if (tok->prev_letter == LETTER_TYPE_ALETTER)
+ if (tok->prev_type == LETTER_TYPE_ALETTER)
return FALSE;
/* WB7, except MidNumLet */
- if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
- (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
- tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
- tok->prev_letter == LETTER_TYPE_MIDLETTER))
+ if (tok->prev_prev_type == LETTER_TYPE_ALETTER &&
+ (tok->prev_type == LETTER_TYPE_SINGLE_QUOTE ||
+ tok->prev_type == LETTER_TYPE_APOSTROPHE ||
+ tok->prev_type == LETTER_TYPE_MIDLETTER))
return FALSE;
/* WB10 */
- if (tok->prev_letter == LETTER_TYPE_NUMERIC)
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
return FALSE;
/* WB13b */
- if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
+ if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
return FALSE;
/* Break decision for a single quote, based on the type recorded for the
   previous letter.
   Returns FALSE when the previous type is ALetter or a Hebrew letter (WB6)
   or numeric (WB12); TRUE otherwise.
   NOTE(review): TRUE appears to mean "report a word boundary here" -
   confirm against the caller. */
static bool letter_single_quote(struct generic_fts_tokenizer *tok)
{
/* WB6 */
- if (tok->prev_letter == LETTER_TYPE_ALETTER ||
- tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
return FALSE;
/* WB12 */
- if (tok->prev_letter == LETTER_TYPE_NUMERIC)
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
return FALSE;
return TRUE; /* Any / Any */
/* Break decision for a double quote, based on the type recorded for the
   previous letter.
   Returns FALSE when the previous type is also a double quote; TRUE
   otherwise.
   NOTE(review): unlike its sibling functions, this check cites no WB rule.
   The UAX #29 rule that mentions a double quote (WB7b) is about a double
   quote that follows a Hebrew letter - confirm that comparing against
   LETTER_TYPE_DOUBLE_QUOTE here is intended. */
static bool letter_double_quote(struct generic_fts_tokenizer *tok)
{
- if (tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE)
+ if (tok->prev_type == LETTER_TYPE_DOUBLE_QUOTE)
return FALSE;
return TRUE; /* Any / Any */
/* Break decision for a MidLetter character, based on the type recorded for
   the previous letter.
   Returns FALSE when the previous type is ALetter or a Hebrew letter (WB6);
   TRUE otherwise.
   NOTE(review): TRUE appears to mean "report a word boundary here" -
   confirm against the caller. */
static bool letter_midletter(struct generic_fts_tokenizer *tok)
{
/* WB6 */
- if (tok->prev_letter == LETTER_TYPE_ALETTER ||
- tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
return FALSE;
return TRUE; /* Any / Any */
/* Break decision for a MidNum character, based on the type recorded for the
   previous letter.
   Returns FALSE when the previous type is numeric (WB12); TRUE otherwise.
   NOTE(review): TRUE appears to mean "report a word boundary here" -
   confirm against the caller. */
static bool letter_midnum(struct generic_fts_tokenizer *tok)
{
/* WB12 */
- if (tok->prev_letter == LETTER_TYPE_NUMERIC)
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
return FALSE;
return TRUE; /* Any / Any */
/* Break decision for a numeric character, based on the types recorded for
   the previous letter and the one before it.
   Returns FALSE (no break) when:
     * the previous type is numeric (WB8),
     * the previous type is ALetter or a Hebrew letter (WB9),
     * the type before the previous one is numeric and the previous one is
       MidNum, MidNumLet or a single quote (WB11), or
     * the previous type is ExtendNumLet (WB13b).
   Returns TRUE in every other case.
   NOTE(review): TRUE appears to mean "report a word boundary here" -
   confirm against the caller. */
static bool letter_numeric(struct generic_fts_tokenizer *tok)
{
/* WB8 */
- if (tok->prev_letter == LETTER_TYPE_NUMERIC)
+ if (tok->prev_type == LETTER_TYPE_NUMERIC)
return FALSE;
/* WB9 */
- if (tok->prev_letter == LETTER_TYPE_ALETTER ||
- tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
return FALSE;
/* WB11 */
- if(tok->prev_prev_letter == LETTER_TYPE_NUMERIC &&
- (tok->prev_letter == LETTER_TYPE_MIDNUM ||
- tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
- tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE))
+ if(tok->prev_prev_type == LETTER_TYPE_NUMERIC &&
+ (tok->prev_type == LETTER_TYPE_MIDNUM ||
+ tok->prev_type == LETTER_TYPE_MIDNUMLET ||
+ tok->prev_type == LETTER_TYPE_SINGLE_QUOTE))
return FALSE;
/* WB13b */
- if (tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
+ if (tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
return FALSE;
return TRUE; /* Any / Any */
{
/* WB13a */
- if (tok->prev_letter == LETTER_TYPE_ALETTER ||
- tok->prev_letter == LETTER_TYPE_HEBREW_LETTER ||
- tok->prev_letter == LETTER_TYPE_NUMERIC ||
- tok->prev_letter == LETTER_TYPE_KATAKANA ||
- tok->prev_letter == LETTER_TYPE_EXTENDNUMLET)
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER ||
+ tok->prev_type == LETTER_TYPE_NUMERIC ||
+ tok->prev_type == LETTER_TYPE_KATAKANA ||
+ tok->prev_type == LETTER_TYPE_EXTENDNUMLET)
return FALSE;
return TRUE; /* Any / Any */
/* Break decision for an apostrophe, based on the type recorded for the
   previous letter.
   Returns FALSE when the previous type is ALetter or a Hebrew letter; TRUE
   otherwise.
   NOTE(review): the check has the same shape as the WB6 test used for
   single quotes and MidLetter, but no rule is cited here - confirm. */
static bool letter_apostrophe(struct generic_fts_tokenizer *tok)
{
- if (tok->prev_letter == LETTER_TYPE_ALETTER ||
- tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
+ if (tok->prev_type == LETTER_TYPE_ALETTER ||
+ tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
return FALSE;
return TRUE; /* Any / Any */
return TRUE; /* Any / Any */
}
/* Record lt as the most recent letter type of the tokenizer.
   The current "previous" type is first shifted into the "previous previous"
   slot, but only when it is not LETTER_TYPE_NONE, so a NONE value never
   overwrites an older recorded type. lt is then stored unconditionally as
   the new previous type.
   This change renames the helper and the two fields it touches
   (prev_letter to prev_type, prev_prev_letter to prev_prev_type) and marks
   the helper inline. */
-static void
-add_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
+static inline void
+add_prev_type(struct generic_fts_tokenizer *tok, enum letter_type lt)
{
- if(tok->prev_letter != LETTER_TYPE_NONE)
- tok->prev_prev_letter = tok->prev_letter;
- tok->prev_letter = lt;
+ if(tok->prev_type != LETTER_TYPE_NONE)
+ tok->prev_prev_type = tok->prev_type;
+ tok->prev_type = lt;
}
static void
/* Tell whether the previously recorded letter type is one of the "mid"
   types that the comments below describe as a false positive detected one
   position past the real end.
   Returns TRUE when the previous type is MidLetter, MidNumLet, apostrophe,
   single quote or MidNum; FALSE otherwise.
   NOTE(review): the second test repeats MidNumLet, apostrophe and single
   quote, which the first test has already returned TRUE for; only the
   MidNum comparison can still match there. */
static bool is_one_past_end(struct generic_fts_tokenizer *tok)
{
/* WB6/7 false positive detected at one past end. */
- if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
- tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
- tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
- tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
+ if (tok->prev_type == LETTER_TYPE_MIDLETTER ||
+ tok->prev_type == LETTER_TYPE_MIDNUMLET ||
+ tok->prev_type == LETTER_TYPE_APOSTROPHE ||
+ tok->prev_type == LETTER_TYPE_SINGLE_QUOTE )
return TRUE;
/* WB11/12 false positive detected at one past end. */
- if (tok->prev_letter == LETTER_TYPE_MIDNUM ||
- tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
- tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
- tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
+ if (tok->prev_type == LETTER_TYPE_MIDNUM ||
+ tok->prev_type == LETTER_TYPE_MIDNUMLET ||
+ tok->prev_type == LETTER_TYPE_APOSTROPHE ||
+ tok->prev_type == LETTER_TYPE_SINGLE_QUOTE)
return TRUE;
return FALSE;
i_assert(len > 0);
i_assert(len <= tok->max_length);
- tok->prev_prev_letter = LETTER_TYPE_NONE;
- tok->prev_letter = LETTER_TYPE_NONE;
+ tok->prev_prev_type = LETTER_TYPE_NONE;
+ tok->prev_type = LETTER_TYPE_NONE;
*token_r = t_strndup(data, len);
buffer_set_used_size(tok->token, 0);
tok->untruncated_length = 0;
uni_ucs4_to_utf8_c(tok->letter_c, utf8_str);
buffer_insert(tok->token, 0, str_data(utf8_str), str_len(utf8_str));
- tok->prev_letter = letter_type(tok->letter_c);
+ tok->prev_type = letter_type(tok->letter_c);
tok->letter_c = 0;
tok->prev_letter_c = 0;
tok->seen_wb5a = FALSE;
{
/* No rule knows what to do with just one char, except the linebreaks
we eat away (above) anyway. */
- if (tok->prev_letter != LETTER_TYPE_NONE) {
+ if (tok->prev_type != LETTER_TYPE_NONE) {
if (letter_fns[lt].fn(tok))
return TRUE;
}
if (lt == LETTER_TYPE_EXTEND || lt == LETTER_TYPE_FORMAT) {
/* These types are completely ignored. */
} else {
- add_prev_letter(tok,lt);
+ add_prev_type(tok,lt);
}
return FALSE;
}
if (tok->seen_wb5a)
wb5a_reinsert(tok);
- if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
+ if (tok->prev_type == LETTER_TYPE_NONE && is_nontoken(lt)) {
/* Skip non-token chars at the beginning of token */
i_assert(tok->token->used == 0);
start_pos = i;