]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
fts: Added support for per-language tokenizer settings.
authorTimo Sirainen <tss@iki.fi>
Thu, 3 Dec 2015 12:24:06 +0000 (14:24 +0200)
committerTimo Sirainen <tss@iki.fi>
Thu, 3 Dec 2015 12:24:06 +0000 (14:24 +0200)
fts_tokenizers_<lang> now overrides fts_tokenizers setting.
fts_tokenizer_<name>_<lang> now overrides fts_tokenizer_<name> setting.

src/plugins/fts/fts-build-mail.c
src/plugins/fts/fts-search-args.c
src/plugins/fts/fts-user.c
src/plugins/fts/fts-user.h

index fbb0c4c649b11c3ab9c3415cc8f6d95190fabb77..00f43c668064f71144a551731842d34575b7ea8f 100644 (file)
@@ -135,6 +135,18 @@ static bool data_has_8bit(const unsigned char *data, size_t size)
        return FALSE;
 }
 
+/* Make user_lang the current language of the build context and reset its
+   index tokenizer. user_lang must not be NULL (asserted). */
+static void fts_mail_build_ctx_set_lang(struct fts_mail_build_context *ctx,
+                                       struct fts_user_language *user_lang)
+{
+       i_assert(user_lang != NULL);
+
+       ctx->cur_user_lang = user_lang;
+       /* reset tokenizer between fields - just to be sure no state
+          leaks between fields (especially if previous indexing had
+          failed) */
+       fts_tokenizer_reset(user_lang->index_tokenizer);
+}
+
 static void
 fts_build_tokenized_hdr_update_lang(struct fts_mail_build_context *ctx,
                                    const struct message_header_line *hdr)
@@ -148,8 +160,10 @@ fts_build_tokenized_hdr_update_lang(struct fts_mail_build_context *ctx,
        if (header_has_language(hdr->name) ||
            data_has_8bit(hdr->full_value, hdr->full_value_len))
                ctx->cur_user_lang = NULL;
-       else
-               ctx->cur_user_lang = fts_user_get_data_lang(ctx->update_ctx->backend->ns->user);
+       else {
+               fts_mail_build_ctx_set_lang(ctx,
+                       fts_user_get_data_lang(ctx->update_ctx->backend->ns->user));
+       }
 }
 
 static int fts_build_mail_header(struct fts_mail_build_context *ctx,
@@ -268,12 +282,11 @@ static int
 fts_build_add_tokens_with_filter(struct fts_mail_build_context *ctx,
                                 const unsigned char *data, size_t size)
 {
-       struct fts_tokenizer *tokenizer;
+       struct fts_tokenizer *tokenizer = ctx->cur_user_lang->index_tokenizer;
        struct fts_filter *filter = ctx->cur_user_lang->filter;
        const char *token, *error;
        int ret = 1, ret2;
 
-       tokenizer = fts_user_get_index_tokenizer(ctx->update_ctx->backend->ns->user);
        while (ret > 0) T_BEGIN {
                ret = ret2 = fts_tokenizer_next(tokenizer, data, size, &token, &error);
                if (ret2 > 0 && filter != NULL)
@@ -341,8 +354,7 @@ fts_build_tokenized(struct fts_mail_build_context *ctx,
                /* wait for more data */
                return 0;
        } else {
-               ctx->cur_user_lang = fts_user_language_find(user, lang);
-               i_assert(ctx->cur_user_lang != NULL);
+               fts_mail_build_ctx_set_lang(ctx, fts_user_language_find(user, lang));
 
                if (ctx->pending_input->used > 0) {
                        if (fts_build_add_tokens_with_filter(ctx,
@@ -480,16 +492,8 @@ fts_build_mail_real(struct fts_backend_update_context *update_ctx,
        memset(&ctx, 0, sizeof(ctx));
        ctx.update_ctx = update_ctx;
        ctx.mail = mail;
-       if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) {
+       if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0)
                ctx.pending_input = buffer_create_dynamic(default_pool, 128);
-               /* reset tokenizer between mails - just to be sure no state
-                  leaks between mails (especially if previous indexing had
-                  failed) */
-               struct fts_tokenizer *tokenizer;
-
-               tokenizer = fts_user_get_index_tokenizer(update_ctx->backend->ns->user);
-               fts_tokenizer_reset(tokenizer);
-       }
 
        prev_part = NULL;
        parser = message_parser_init(pool_datastack_create(), input,
index 1de11380f44aca6e0c40758db3a4680d82b5a122..f506fa329f50c07bf002ab96400f7defc44e2466 100644 (file)
@@ -54,14 +54,14 @@ fts_search_arg_create_or(const struct mail_search_arg *orig_arg, pool_t pool,
 }
 
 static int
-fts_backend_dovecot_expand_lang_tokens(const ARRAY_TYPE(fts_user_language) *languages,
-                                      pool_t pool,
-                                      struct mail_search_arg *parent_arg,
-                                      const struct mail_search_arg *orig_arg,
-                                      const char *orig_token, const char *token)
+fts_backend_dovecot_expand_tokens(struct fts_filter *filter,
+                                 pool_t pool,
+                                 struct mail_search_arg *parent_arg,
+                                 const struct mail_search_arg *orig_arg,
+                                 const char *orig_token, const char *token,
+                                 const char **error_r)
 {
        struct mail_search_arg *arg;
-       struct fts_user_language *const *langp;
        ARRAY_TYPE(const_string) tokens;
        const char *token2, *error;
        int ret;
@@ -73,15 +73,14 @@ fts_backend_dovecot_expand_lang_tokens(const ARRAY_TYPE(fts_user_language) *lang
        array_append(&tokens, &token, 1);
 
        /* add the word filtered */
-       array_foreach(languages, langp) {
+       if (filter != NULL) {
                token2 = t_strdup(token);
-               ret = (*langp)->filter == NULL ? 1 :
-                       fts_filter_filter((*langp)->filter, &token2, &error);
+               ret = fts_filter_filter(filter, &token2, &error);
                if (ret > 0) {
                        token2 = t_strdup(token2);
                        array_append(&tokens, &token2, 1);
                } else if (ret < 0) {
-                       i_error("fts: Couldn't filter search tokens: %s", error);
+                       *error_r = t_strdup_printf("Couldn't filter search token: %s", error);
                        return -1;
                }
        }
@@ -94,18 +93,50 @@ fts_backend_dovecot_expand_lang_tokens(const ARRAY_TYPE(fts_user_language) *lang
        return 0;
 }
 
+/* Tokenize orig_token with user_lang's search tokenizer and pass every
+   produced token, together with user_lang->filter, to
+   fts_backend_dovecot_expand_tokens(), which adds the alternatives under
+   and_arg.
+
+   Returns 0 on success. Returns -1 if tokenizing fails (*error_r is set
+   to a "Couldn't tokenize search args" message) or if expanding a token
+   fails (the error is reported through error_r by the callee). */
+static int
+fts_backend_dovecot_tokenize_lang(struct fts_user_language *user_lang,
+                                 pool_t pool, struct mail_search_arg *and_arg,
+                                 struct mail_search_arg *orig_arg,
+                                 const char *orig_token, const char **error_r)
+{
+       size_t orig_token_len = strlen(orig_token);
+       const char *token, *error;
+       int ret;
+
+       /* reset tokenizer between search args in case there's any state left
+          from some previous failure */
+       fts_tokenizer_reset(user_lang->search_tokenizer);
+       /* tokenizer errors are collected in the local "error", so that the
+          ret < 0 branch below never formats an uninitialized pointer */
+       while ((ret = fts_tokenizer_next(user_lang->search_tokenizer,
+                                        (const void *)orig_token,
+                                        orig_token_len, &token, &error)) > 0) {
+               if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
+                                                     and_arg, orig_arg, orig_token,
+                                                     token, error_r) < 0)
+                       return -1;
+       }
+       /* flush whatever the tokenizer still has buffered, unless it
+          already failed above */
+       while (ret >= 0 &&
+              (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) {
+               if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
+                                                     and_arg, orig_arg, orig_token,
+                                                     token, error_r) < 0)
+                       return -1;
+       }
+       if (ret < 0) {
+               *error_r = t_strdup_printf("Couldn't tokenize search args: %s", error);
+               return -1;
+       }
+       return 0;
+}
+
 static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool,
                                 struct mail_search_arg **argp)
 {
        const ARRAY_TYPE(fts_user_language) *languages;
+       struct fts_user_language *const *langp;
        struct mail_search_arg *and_arg, *orig_arg = *argp;
-       const char *error, *token, *orig_token = orig_arg->value.str;
-       unsigned int orig_token_len = strlen(orig_token);
-       struct fts_tokenizer *tokenizer;
-       int ret;
+       const char *error, *orig_token = orig_arg->value.str;
 
        languages = fts_user_get_all_languages(backend->ns->user);
-       tokenizer = fts_user_get_search_tokenizer(backend->ns->user);
 
        /* we want all the tokens found from the string to be found, so create
           a parent AND and place all the filtered token alternatives under
@@ -115,27 +146,12 @@ static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool,
        and_arg->match_not = orig_arg->match_not;
        and_arg->next = orig_arg->next;
 
-       /* reset tokenizer between search args in case there's any state left
-          from some previous failure */
-       fts_tokenizer_reset(tokenizer);
-       while ((ret = fts_tokenizer_next(tokenizer,
-                                        (const void *)orig_token,
-                                        orig_token_len, &token, &error)) > 0) {
-               if (fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg,
-                                                          orig_arg, orig_token,
-                                                          token) < 0)
-                       return -1;
-       }
-       while (ret >= 0 &&
-              (ret = fts_tokenizer_final(tokenizer, &token, &error)) > 0) {
-               if (fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg,
-                                                          orig_arg, orig_token,
-                                                          token) < 0)
+       array_foreach(languages, langp) {
+               if (fts_backend_dovecot_tokenize_lang(*langp, pool, and_arg,
+                                                     orig_arg, orig_token, &error) < 0) {
+                       i_error("fts: %s", error);
                        return -1;
-       }
-       if (ret < 0) {
-               i_error("fts: Couldn't tokenize search args: %s", error);
-               return -1;
+               }
        }
 
        if (and_arg->value.subargs == NULL) {
index a73b5467683b48dbf50bfb7cb744902d66ddcc7c..24e021e5286f55bf8e7078edb34df1c7f66b0220 100644 (file)
@@ -16,7 +16,6 @@ struct fts_user {
        int refcount;
 
        struct fts_language_list *lang_list;
-       struct fts_tokenizer *index_tokenizer, *search_tokenizer;
        struct fts_user_language *data_lang;
        ARRAY_TYPE(fts_user_language) languages;
 };
@@ -148,6 +147,7 @@ fts_user_create_filters(struct mail_user *user, const struct fts_language *lang,
 
 static int
 fts_user_create_tokenizer(struct mail_user *user,
+                         const struct fts_language *lang,
                          struct fts_tokenizer **tokenizer_r, bool search,
                          const char **error_r)
 {
@@ -158,11 +158,15 @@ fts_user_create_tokenizer(struct mail_user *user,
        unsigned int i;
        int ret = 0;
 
-       tokenizers_key = "fts_tokenizers";
+       tokenizers_key = t_strconcat("fts_tokenizers_", lang->name, NULL);
        str = mail_user_plugin_getenv(user, tokenizers_key);
        if (str == NULL) {
-               *error_r = "fts_tokenizers setting is missing";
-               return -1;
+               str = mail_user_plugin_getenv(user, "fts_tokenizers");
+               if (str == NULL) {
+                       *error_r = t_strdup_printf("%s or fts_tokenizers setting must exist", tokenizers_key);
+                       return -1;
+               }
+               tokenizers_key = "fts_tokenizers";
        }
 
        tokenizers = t_strsplit_spaces(str, " ");
@@ -177,8 +181,12 @@ fts_user_create_tokenizer(struct mail_user *user,
                }
 
                tokenizer_set_name = t_str_replace(tokenizers[i], '-', '_');
-               set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name);
+               set_key = t_strdup_printf("fts_tokenizer_%s_%s", tokenizer_set_name, lang->name);
                str = mail_user_plugin_getenv(user, set_key);
+               if (str == NULL) {
+                       set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name);
+                       str = mail_user_plugin_getenv(user, set_key);
+               }
 
                /* tell the tokenizers that we're tokenizing a search string
                   (instead of tokenizing indexed data) */
@@ -205,18 +213,20 @@ fts_user_create_tokenizer(struct mail_user *user,
        return 0;
 }
 
-static int fts_user_init_tokenizers(struct mail_user *user,
-                                   struct fts_user *fuser,
-                                   const char **error_r)
+static int
+fts_user_language_init_tokenizers(struct mail_user *user,
+                                 struct fts_user_language *user_lang,
+                                 const char **error_r)
 {
-       if (fts_user_create_tokenizer(user, &fuser->index_tokenizer, FALSE,
+       if (fts_user_create_tokenizer(user, user_lang->lang,
+                                     &user_lang->index_tokenizer, FALSE,
                                      error_r) < 0)
                return -1;
 
-       if (fts_user_create_tokenizer(user, &fuser->search_tokenizer, TRUE,
+       if (fts_user_create_tokenizer(user, user_lang->lang,
+                                     &user_lang->search_tokenizer, TRUE,
                                      error_r) < 0)
                return -1;
-
        return 0;
 }
 
@@ -234,35 +244,21 @@ fts_user_language_find(struct mail_user *user,
        return NULL;
 }
 
-struct fts_tokenizer *fts_user_get_index_tokenizer(struct mail_user *user)
-{
-       struct fts_user *fuser = FTS_USER_CONTEXT(user);
-
-       return fuser->index_tokenizer;
-}
-
-struct fts_tokenizer *fts_user_get_search_tokenizer(struct mail_user *user)
-{
-       struct fts_user *fuser = FTS_USER_CONTEXT(user);
-
-       return fuser->search_tokenizer;
-}
-
 static int fts_user_language_create(struct mail_user *user,
                                     struct fts_user *fuser,
                                    const struct fts_language *lang,
                                    const char **error_r)
 {
-       struct fts_filter *filter;
        struct fts_user_language *user_lang;
-       if (fts_user_create_filters(user, lang, &filter, error_r) < 0)
-               return -1;
 
        user_lang = p_new(user->pool, struct fts_user_language, 1);
        user_lang->lang = lang;
-       user_lang->filter = filter;
        array_append(&fuser->languages, &user_lang, 1);
 
+       if (fts_user_language_init_tokenizers(user, user_lang, error_r) < 0)
+               return -1;
+       if (fts_user_create_filters(user, lang, &user_lang->filter, error_r) < 0)
+               return -1;
        return 0;
 }
 
@@ -279,6 +275,27 @@ static int fts_user_languages_fill_all(struct mail_user *user,
        return 0;
 }
 
+/* Create the fts_user_language for fts_language_data: its index and search
+   tokenizers plus a lowercase filter. The result is stored in
+   fuser->data_lang.
+
+   Returns 0 on success, or -1 if the tokenizers can't be created (the
+   error is reported through error_r). */
+static int
+fts_user_init_data_language(struct mail_user *user, struct fts_user *fuser,
+                           const char **error_r)
+{
+       struct fts_user_language *user_lang;
+       const char *error;
+
+       user_lang = p_new(user->pool, struct fts_user_language, 1);
+       user_lang->lang = &fts_language_data;
+       /* register the language before initializing it, so that if the
+          tokenizer setup fails half-way, fts_user_free() can still
+          release whatever was already created */
+       fuser->data_lang = user_lang;
+
+       if (fts_user_language_init_tokenizers(user, user_lang, error_r) < 0)
+               return -1;
+
+       /* creating the lowercase filter without settings is not expected
+          to fail */
+       if (fts_filter_create(fts_filter_lowercase, NULL, user_lang->lang, NULL,
+                             &user_lang->filter, &error) < 0)
+               i_unreached();
+       i_assert(user_lang->filter != NULL);
+       return 0;
+}
+
 struct fts_language_list *fts_user_get_language_list(struct mail_user *user)
 {
        struct fts_user *fuser = FTS_USER_CONTEXT(user);
@@ -297,23 +314,20 @@ fts_user_get_all_languages(struct mail_user *user)
 struct fts_user_language *fts_user_get_data_lang(struct mail_user *user)
 {
        struct fts_user *fuser = FTS_USER_CONTEXT(user);
-       struct fts_user_language *lang;
-       const char *error;
-
-       if (fuser->data_lang != NULL)
-               return fuser->data_lang;
 
-       lang = p_new(user->pool, struct fts_user_language, 1);
-       lang->lang = &fts_language_data;
-
-       if (fts_filter_create(fts_filter_lowercase, NULL, lang->lang, NULL,
-                             &lang->filter, &error) < 0)
-               i_unreached();
-       i_assert(lang->filter != NULL);
-       fuser->data_lang = lang;
        return fuser->data_lang;
 }
 
+/* Drop the references user_lang holds to its filter, index tokenizer and
+   search tokenizer. Members that are NULL are skipped. The user_lang
+   structure itself is not freed here. */
+static void fts_user_language_free(struct fts_user_language *user_lang)
+{
+       if (user_lang->filter != NULL)
+               fts_filter_unref(&user_lang->filter);
+       if (user_lang->index_tokenizer != NULL)
+               fts_tokenizer_unref(&user_lang->index_tokenizer);
+       if (user_lang->search_tokenizer != NULL)
+               fts_tokenizer_unref(&user_lang->search_tokenizer);
+}
+
 static void fts_user_free(struct fts_user *fuser)
 {
        struct fts_user_language *const *user_langp;
@@ -321,17 +335,10 @@ static void fts_user_free(struct fts_user *fuser)
        if (fuser->lang_list != NULL)
                fts_language_list_deinit(&fuser->lang_list);
 
-       array_foreach(&fuser->languages, user_langp) {
-               if ((*user_langp)->filter != NULL)
-                       fts_filter_unref(&(*user_langp)->filter);
-       }
-       if (fuser->data_lang != NULL && fuser->data_lang->filter != NULL)
-               fts_filter_unref(&fuser->data_lang->filter);
-
-       if (fuser->index_tokenizer != NULL)
-               fts_tokenizer_unref(&fuser->index_tokenizer);
-       if (fuser->search_tokenizer != NULL)
-               fts_tokenizer_unref(&fuser->search_tokenizer);
+       array_foreach(&fuser->languages, user_langp)
+               fts_user_language_free(*user_langp);
+       if (fuser->data_lang != NULL)
+               fts_user_language_free(fuser->data_lang);
 }
 
 int fts_mail_user_init(struct mail_user *user, const char **error_r)
@@ -348,12 +355,12 @@ int fts_mail_user_init(struct mail_user *user, const char **error_r)
        fuser->refcount = 1;
        p_array_init(&fuser->languages, user->pool, 4);
 
-       if (fts_user_init_languages(user, fuser, error_r) < 0) {
+       if (fts_user_init_languages(user, fuser, error_r) < 0 ||
+           fts_user_init_data_language(user, fuser, error_r)) {
                fts_user_free(fuser);
                return -1;
        }
-       if (fts_user_languages_fill_all(user, fuser, error_r) < 0 ||
-           fts_user_init_tokenizers(user, fuser, error_r) < 0) {
+       if (fts_user_languages_fill_all(user, fuser, error_r) < 0) {
                fts_user_free(fuser);
                return -1;
        }
index 156ea99918be1025808c2d358543ecaf3c16f9cf..1c2159d599cba2c62cbbf2b177f000a112d7bc2f 100644 (file)
@@ -4,14 +4,13 @@
 struct fts_user_language {
        const struct fts_language *lang;
        struct fts_filter *filter;
+       struct fts_tokenizer *index_tokenizer, *search_tokenizer;
 };
 ARRAY_DEFINE_TYPE(fts_user_language, struct fts_user_language *);
 
 struct fts_user_language *
 fts_user_language_find(struct mail_user *user,
                        const struct fts_language *lang);
-struct fts_tokenizer *fts_user_get_index_tokenizer(struct mail_user *user);
-struct fts_tokenizer *fts_user_get_search_tokenizer(struct mail_user *user);
 struct fts_language_list *fts_user_get_language_list(struct mail_user *user);
 const ARRAY_TYPE(fts_user_language) *
 fts_user_get_all_languages(struct mail_user *user);