From: Timo Sirainen Date: Thu, 3 Dec 2015 12:24:06 +0000 (+0200) Subject: fts: Added support for per-language tokenizer settings. X-Git-Tag: 2.2.20.rc1~1 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=dbf26a3ea43cd79fe88f01ec99c7d9440679b996;p=thirdparty%2Fdovecot%2Fcore.git fts: Added support for per-language tokenizer settings. fts_tokenizers_{lang} now overrides fts_tokenizers setting. fts_tokenizer_{name}_{lang} now overrides fts_tokenizer_{name} setting. --- diff --git a/src/plugins/fts/fts-build-mail.c b/src/plugins/fts/fts-build-mail.c index fbb0c4c649..00f43c6680 100644 --- a/src/plugins/fts/fts-build-mail.c +++ b/src/plugins/fts/fts-build-mail.c @@ -135,6 +135,18 @@ static bool data_has_8bit(const unsigned char *data, size_t size) return FALSE; } +static void fts_mail_build_ctx_set_lang(struct fts_mail_build_context *ctx, + struct fts_user_language *user_lang) +{ + i_assert(user_lang != NULL); + + ctx->cur_user_lang = user_lang; + /* reset tokenizer between fields - just to be sure no state + leaks between fields (especially if previous indexing had + failed) */ + fts_tokenizer_reset(user_lang->index_tokenizer); +} + static void fts_build_tokenized_hdr_update_lang(struct fts_mail_build_context *ctx, const struct message_header_line *hdr) @@ -148,8 +160,10 @@ fts_build_tokenized_hdr_update_lang(struct fts_mail_build_context *ctx, if (header_has_language(hdr->name) || data_has_8bit(hdr->full_value, hdr->full_value_len)) ctx->cur_user_lang = NULL; - else - ctx->cur_user_lang = fts_user_get_data_lang(ctx->update_ctx->backend->ns->user); + else { + fts_mail_build_ctx_set_lang(ctx, + fts_user_get_data_lang(ctx->update_ctx->backend->ns->user)); + } } static int fts_build_mail_header(struct fts_mail_build_context *ctx, @@ -268,12 +282,11 @@ static int fts_build_add_tokens_with_filter(struct fts_mail_build_context *ctx, const unsigned char *data, size_t size) { - struct fts_tokenizer *tokenizer; + struct fts_tokenizer *tokenizer = ctx->cur_user_lang->index_tokenizer; struct 
fts_filter *filter = ctx->cur_user_lang->filter; const char *token, *error; int ret = 1, ret2; - tokenizer = fts_user_get_index_tokenizer(ctx->update_ctx->backend->ns->user); while (ret > 0) T_BEGIN { ret = ret2 = fts_tokenizer_next(tokenizer, data, size, &token, &error); if (ret2 > 0 && filter != NULL) @@ -341,8 +354,7 @@ fts_build_tokenized(struct fts_mail_build_context *ctx, /* wait for more data */ return 0; } else { - ctx->cur_user_lang = fts_user_language_find(user, lang); - i_assert(ctx->cur_user_lang != NULL); + fts_mail_build_ctx_set_lang(ctx, fts_user_language_find(user, lang)); if (ctx->pending_input->used > 0) { if (fts_build_add_tokens_with_filter(ctx, @@ -480,16 +492,8 @@ fts_build_mail_real(struct fts_backend_update_context *update_ctx, memset(&ctx, 0, sizeof(ctx)); ctx.update_ctx = update_ctx; ctx.mail = mail; - if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) { + if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) ctx.pending_input = buffer_create_dynamic(default_pool, 128); - /* reset tokenizer between mails - just to be sure no state - leaks between mails (especially if previous indexing had - failed) */ - struct fts_tokenizer *tokenizer; - - tokenizer = fts_user_get_index_tokenizer(update_ctx->backend->ns->user); - fts_tokenizer_reset(tokenizer); - } prev_part = NULL; parser = message_parser_init(pool_datastack_create(), input, diff --git a/src/plugins/fts/fts-search-args.c b/src/plugins/fts/fts-search-args.c index 1de11380f4..f506fa329f 100644 --- a/src/plugins/fts/fts-search-args.c +++ b/src/plugins/fts/fts-search-args.c @@ -54,14 +54,14 @@ fts_search_arg_create_or(const struct mail_search_arg *orig_arg, pool_t pool, } static int -fts_backend_dovecot_expand_lang_tokens(const ARRAY_TYPE(fts_user_language) *languages, - pool_t pool, - struct mail_search_arg *parent_arg, - const struct mail_search_arg *orig_arg, - const char *orig_token, const char *token) +fts_backend_dovecot_expand_tokens(struct 
fts_filter *filter, + pool_t pool, + struct mail_search_arg *parent_arg, + const struct mail_search_arg *orig_arg, + const char *orig_token, const char *token, + const char **error_r) { struct mail_search_arg *arg; - struct fts_user_language *const *langp; ARRAY_TYPE(const_string) tokens; const char *token2, *error; int ret; @@ -73,15 +73,14 @@ fts_backend_dovecot_expand_lang_tokens(const ARRAY_TYPE(fts_user_language) *lang array_append(&tokens, &token, 1); /* add the word filtered */ - array_foreach(languages, langp) { + if (filter != NULL) { token2 = t_strdup(token); - ret = (*langp)->filter == NULL ? 1 : - fts_filter_filter((*langp)->filter, &token2, &error); + ret = fts_filter_filter(filter, &token2, &error); if (ret > 0) { token2 = t_strdup(token2); array_append(&tokens, &token2, 1); } else if (ret < 0) { - i_error("fts: Couldn't filter search tokens: %s", error); + *error_r = t_strdup_printf("Couldn't filter search token: %s", error); return -1; } } @@ -94,18 +93,50 @@ fts_backend_dovecot_expand_lang_tokens(const ARRAY_TYPE(fts_user_language) *lang return 0; } +static int +fts_backend_dovecot_tokenize_lang(struct fts_user_language *user_lang, + pool_t pool, struct mail_search_arg *and_arg, + struct mail_search_arg *orig_arg, + const char *orig_token, const char **error_r) +{ + unsigned int orig_token_len = strlen(orig_token); + const char *token, *error; + int ret; + + /* reset tokenizer between search args in case there's any state left + from some previous failure */ + fts_tokenizer_reset(user_lang->search_tokenizer); + while ((ret = fts_tokenizer_next(user_lang->search_tokenizer, + (const void *)orig_token, + orig_token_len, &token, error_r)) > 0) { + if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool, + and_arg, orig_arg, orig_token, + token, error_r) < 0) + return -1; + } + while (ret >= 0 && + (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) { + if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool, + 
and_arg, orig_arg, orig_token, + token, error_r) < 0) + return -1; + } + if (ret < 0) { + *error_r = t_strdup_printf("Couldn't tokenize search args: %s", error); + return -1; + } + return 0; +} + static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool, struct mail_search_arg **argp) { const ARRAY_TYPE(fts_user_language) *languages; + struct fts_user_language *const *langp; struct mail_search_arg *and_arg, *orig_arg = *argp; - const char *error, *token, *orig_token = orig_arg->value.str; - unsigned int orig_token_len = strlen(orig_token); - struct fts_tokenizer *tokenizer; - int ret; + const char *error, *orig_token = orig_arg->value.str; languages = fts_user_get_all_languages(backend->ns->user); - tokenizer = fts_user_get_search_tokenizer(backend->ns->user); /* we want all the tokens found from the string to be found, so create a parent AND and place all the filtered token alternatives under @@ -115,27 +146,12 @@ static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool, and_arg->match_not = orig_arg->match_not; and_arg->next = orig_arg->next; - /* reset tokenizer between search args in case there's any state left - from some previous failure */ - fts_tokenizer_reset(tokenizer); - while ((ret = fts_tokenizer_next(tokenizer, - (const void *)orig_token, - orig_token_len, &token, &error)) > 0) { - if (fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg, - orig_arg, orig_token, - token) < 0) - return -1; - } - while (ret >= 0 && - (ret = fts_tokenizer_final(tokenizer, &token, &error)) > 0) { - if (fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg, - orig_arg, orig_token, - token) < 0) + array_foreach(languages, langp) { + if (fts_backend_dovecot_tokenize_lang(*langp, pool, and_arg, + orig_arg, orig_token, &error) < 0) { + i_error("fts: %s", error); return -1; - } - if (ret < 0) { - i_error("fts: Couldn't tokenize search args: %s", error); - return -1; + } } if (and_arg->value.subargs == NULL) { diff --git 
a/src/plugins/fts/fts-user.c b/src/plugins/fts/fts-user.c index a73b546768..24e021e528 100644 --- a/src/plugins/fts/fts-user.c +++ b/src/plugins/fts/fts-user.c @@ -16,7 +16,6 @@ struct fts_user { int refcount; struct fts_language_list *lang_list; - struct fts_tokenizer *index_tokenizer, *search_tokenizer; struct fts_user_language *data_lang; ARRAY_TYPE(fts_user_language) languages; }; @@ -148,6 +147,7 @@ fts_user_create_filters(struct mail_user *user, const struct fts_language *lang, static int fts_user_create_tokenizer(struct mail_user *user, + const struct fts_language *lang, struct fts_tokenizer **tokenizer_r, bool search, const char **error_r) { @@ -158,11 +158,15 @@ fts_user_create_tokenizer(struct mail_user *user, unsigned int i; int ret = 0; - tokenizers_key = "fts_tokenizers"; + tokenizers_key = t_strconcat("fts_tokenizers_", lang->name, NULL); str = mail_user_plugin_getenv(user, tokenizers_key); if (str == NULL) { - *error_r = "fts_tokenizers setting is missing"; - return -1; + str = mail_user_plugin_getenv(user, "fts_tokenizers"); + if (str == NULL) { + *error_r = t_strdup_printf("%s or fts_tokenizers setting must exist", tokenizers_key); + return -1; + } + tokenizers_key = "fts_tokenizers"; } tokenizers = t_strsplit_spaces(str, " "); @@ -177,8 +181,12 @@ fts_user_create_tokenizer(struct mail_user *user, } tokenizer_set_name = t_str_replace(tokenizers[i], '-', '_'); - set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name); + set_key = t_strdup_printf("fts_tokenizer_%s_%s", tokenizer_set_name, lang->name); str = mail_user_plugin_getenv(user, set_key); + if (str == NULL) { + set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name); + str = mail_user_plugin_getenv(user, set_key); + } /* tell the tokenizers that we're tokenizing a search string (instead of tokenizing indexed data) */ @@ -205,18 +213,20 @@ fts_user_create_tokenizer(struct mail_user *user, return 0; } -static int fts_user_init_tokenizers(struct mail_user *user, - struct 
fts_user *fuser, - const char **error_r) +static int +fts_user_language_init_tokenizers(struct mail_user *user, + struct fts_user_language *user_lang, + const char **error_r) { - if (fts_user_create_tokenizer(user, &fuser->index_tokenizer, FALSE, + if (fts_user_create_tokenizer(user, user_lang->lang, + &user_lang->index_tokenizer, FALSE, error_r) < 0) return -1; - if (fts_user_create_tokenizer(user, &fuser->search_tokenizer, TRUE, + if (fts_user_create_tokenizer(user, user_lang->lang, + &user_lang->search_tokenizer, TRUE, error_r) < 0) return -1; - return 0; } @@ -234,35 +244,21 @@ fts_user_language_find(struct mail_user *user, return NULL; } -struct fts_tokenizer *fts_user_get_index_tokenizer(struct mail_user *user) -{ - struct fts_user *fuser = FTS_USER_CONTEXT(user); - - return fuser->index_tokenizer; -} - -struct fts_tokenizer *fts_user_get_search_tokenizer(struct mail_user *user) -{ - struct fts_user *fuser = FTS_USER_CONTEXT(user); - - return fuser->search_tokenizer; -} - static int fts_user_language_create(struct mail_user *user, struct fts_user *fuser, const struct fts_language *lang, const char **error_r) { - struct fts_filter *filter; struct fts_user_language *user_lang; - if (fts_user_create_filters(user, lang, &filter, error_r) < 0) - return -1; user_lang = p_new(user->pool, struct fts_user_language, 1); user_lang->lang = lang; - user_lang->filter = filter; array_append(&fuser->languages, &user_lang, 1); + if (fts_user_language_init_tokenizers(user, user_lang, error_r) < 0) + return -1; + if (fts_user_create_filters(user, lang, &user_lang->filter, error_r) < 0) + return -1; return 0; } @@ -279,6 +275,27 @@ static int fts_user_languages_fill_all(struct mail_user *user, return 0; } +static int +fts_user_init_data_language(struct mail_user *user, struct fts_user *fuser, + const char **error_r) +{ + struct fts_user_language *user_lang; + const char *error; + + user_lang = p_new(user->pool, struct fts_user_language, 1); + user_lang->lang = 
&fts_language_data; + + if (fts_user_language_init_tokenizers(user, user_lang, error_r) < 0) + return -1; + + if (fts_filter_create(fts_filter_lowercase, NULL, user_lang->lang, NULL, + &user_lang->filter, &error) < 0) + i_unreached(); + i_assert(user_lang->filter != NULL); + fuser->data_lang = user_lang; + return 0; +} + struct fts_language_list *fts_user_get_language_list(struct mail_user *user) { struct fts_user *fuser = FTS_USER_CONTEXT(user); @@ -297,23 +314,20 @@ fts_user_get_all_languages(struct mail_user *user) struct fts_user_language *fts_user_get_data_lang(struct mail_user *user) { struct fts_user *fuser = FTS_USER_CONTEXT(user); - struct fts_user_language *lang; - const char *error; - - if (fuser->data_lang != NULL) - return fuser->data_lang; - lang = p_new(user->pool, struct fts_user_language, 1); - lang->lang = &fts_language_data; - - if (fts_filter_create(fts_filter_lowercase, NULL, lang->lang, NULL, - &lang->filter, &error) < 0) - i_unreached(); - i_assert(lang->filter != NULL); - fuser->data_lang = lang; return fuser->data_lang; } +static void fts_user_language_free(struct fts_user_language *user_lang) +{ + if (user_lang->filter != NULL) + fts_filter_unref(&user_lang->filter); + if (user_lang->index_tokenizer != NULL) + fts_tokenizer_unref(&user_lang->index_tokenizer); + if (user_lang->search_tokenizer != NULL) + fts_tokenizer_unref(&user_lang->search_tokenizer); +} + static void fts_user_free(struct fts_user *fuser) { struct fts_user_language *const *user_langp; @@ -321,17 +335,10 @@ static void fts_user_free(struct fts_user *fuser) if (fuser->lang_list != NULL) fts_language_list_deinit(&fuser->lang_list); - array_foreach(&fuser->languages, user_langp) { - if ((*user_langp)->filter != NULL) - fts_filter_unref(&(*user_langp)->filter); - } - if (fuser->data_lang != NULL && fuser->data_lang->filter != NULL) - fts_filter_unref(&fuser->data_lang->filter); - - if (fuser->index_tokenizer != NULL) - fts_tokenizer_unref(&fuser->index_tokenizer); - if 
(fuser->search_tokenizer != NULL) - fts_tokenizer_unref(&fuser->search_tokenizer); + array_foreach(&fuser->languages, user_langp) + fts_user_language_free(*user_langp); + if (fuser->data_lang != NULL) + fts_user_language_free(fuser->data_lang); } int fts_mail_user_init(struct mail_user *user, const char **error_r) @@ -348,12 +355,12 @@ int fts_mail_user_init(struct mail_user *user, const char **error_r) fuser->refcount = 1; p_array_init(&fuser->languages, user->pool, 4); - if (fts_user_init_languages(user, fuser, error_r) < 0) { + if (fts_user_init_languages(user, fuser, error_r) < 0 || + fts_user_init_data_language(user, fuser, error_r)) { fts_user_free(fuser); return -1; } - if (fts_user_languages_fill_all(user, fuser, error_r) < 0 || - fts_user_init_tokenizers(user, fuser, error_r) < 0) { + if (fts_user_languages_fill_all(user, fuser, error_r) < 0) { fts_user_free(fuser); return -1; } diff --git a/src/plugins/fts/fts-user.h b/src/plugins/fts/fts-user.h index 156ea99918..1c2159d599 100644 --- a/src/plugins/fts/fts-user.h +++ b/src/plugins/fts/fts-user.h @@ -4,14 +4,13 @@ struct fts_user_language { const struct fts_language *lang; struct fts_filter *filter; + struct fts_tokenizer *index_tokenizer, *search_tokenizer; }; ARRAY_DEFINE_TYPE(fts_user_language, struct fts_user_language *); struct fts_user_language * fts_user_language_find(struct mail_user *user, const struct fts_language *lang); -struct fts_tokenizer *fts_user_get_index_tokenizer(struct mail_user *user); -struct fts_tokenizer *fts_user_get_search_tokenizer(struct mail_user *user); struct fts_language_list *fts_user_get_language_list(struct mail_user *user); const ARRAY_TYPE(fts_user_language) * fts_user_get_all_languages(struct mail_user *user);