From 72ba0cb853f36078a978b91796ea6471fdaa3c73 Mon Sep 17 00:00:00 2001 From: Marco Bettini Date: Tue, 30 Jan 2024 09:01:24 +0000 Subject: [PATCH] lib-language: tokenizers - Use new settings --- src/lib-language/lang-tokenizer-address.c | 30 ++----- src/lib-language/lang-tokenizer-generic.c | 94 ++++++++----------- src/lib-language/lang-tokenizer-private.h | 6 +- src/lib-language/lang-tokenizer.c | 11 +-- src/lib-language/lang-tokenizer.h | 4 +- src/lib-language/test-lang-tokenizer.c | 104 ++++++++++++++-------- src/lib-storage/lang-user.c | 84 +++++------------ 7 files changed, 140 insertions(+), 193 deletions(-) diff --git a/src/lib-language/lang-tokenizer-address.c b/src/lib-language/lang-tokenizer-address.c index f084d786d5..fa191d3af3 100644 --- a/src/lib-language/lang-tokenizer-address.c +++ b/src/lib-language/lang-tokenizer-address.c @@ -6,12 +6,11 @@ #include "rfc822-parser.h" #include "lang-tokenizer-private.h" #include "lang-tokenizer-common.h" +#include "lang-settings.h" #define IS_DTEXT(c) \ (rfc822_atext_chars[(int)(unsigned char)(c)] == 2) -#define LANG_DEFAULT_ADDRESS_MAX_LENGTH 254 - enum email_address_parser_state { EMAIL_ADDRESS_PARSER_STATE_NONE = 0, EMAIL_ADDRESS_PARSER_STATE_LOCALPART, @@ -30,37 +29,18 @@ struct email_address_lang_tokenizer { }; static int -lang_tokenizer_email_address_create(const char *const *settings, +lang_tokenizer_email_address_create(const struct lang_settings *set, enum lang_tokenizer_flags flags, struct lang_tokenizer **tokenizer_r, - const char **error_r) + const char **error_r ATTR_UNUSED) { struct email_address_lang_tokenizer *tok; - bool search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH); - unsigned int max_length = LANG_DEFAULT_ADDRESS_MAX_LENGTH; - unsigned int i; - - for (i = 0; settings[i] != NULL; i += 2) { - const char *key = settings[i], *value = settings[i+1]; - - if (strcmp(key, "maxlen") == 0) { - if (str_to_uint(value, &max_length) < 0 || - max_length == 0) { - *error_r = t_strdup_printf("Invalid maxlen setting: %s", value); - return -1; - } - } else { - *error_r = t_strdup_printf("Unknown setting: %s", key); - return -1; - } - } - tok = i_new(struct email_address_lang_tokenizer, 1); tok->tokenizer = *lang_tokenizer_email_address; tok->last_word = str_new(default_pool, 128); tok->parent_data = str_new(default_pool, 128); - tok->max_length = max_length; - tok->search = search; + tok->max_length = set->tokenizer_address_token_maxlen; + tok->search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH); *tokenizer_r = &tok->tokenizer; return 0; } diff --git a/src/lib-language/lang-tokenizer-generic.c b/src/lib-language/lang-tokenizer-generic.c index f2199026b2..2a7cf6d7ff 100644 --- a/src/lib-language/lang-tokenizer-generic.c +++ b/src/lib-language/lang-tokenizer-generic.c @@ -10,14 +10,13 @@ #include "lang-tokenizer-private.h" #include "lang-tokenizer-generic-private.h" #include "lang-tokenizer-common.h" +#include "lang-settings.h" #include "word-boundary-data.c" #include "word-break-data.c" /* see comments below between is_base64() and skip_base64() */ #define LANG_SKIP_BASE64_MIN_SEQUENCES 1 #define LANG_SKIP_BASE64_MIN_CHARS 50 - -#define LANG_DEFAULT_TOKEN_MAX_LENGTH 30 #define LANG_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */ static unsigned char lang_ascii_word_breaks[128] = { @@ -32,71 +31,56 @@ static unsigned char lang_ascii_word_breaks[128] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0 /* 112-127: {|}~ */ }; +struct algorithm { + const char *name; + enum boundary_algorithm id; + const struct lang_tokenizer_vfuncs *v; +}; + +static const struct algorithm algorithms[] = { + { ALGORITHM_SIMPLE_NAME, BOUNDARY_ALGORITHM_SIMPLE, &generic_tokenizer_vfuncs_simple }, + { ALGORITHM_TR29_NAME, BOUNDARY_ALGORITHM_TR29, &generic_tokenizer_vfuncs_tr29 }, + { NULL, 0, NULL } +}; + +static const struct algorithm *parse_algorithm(const char *name) +{ + for (const struct algorithm *entry = algorithms; entry->name != NULL; entry++) + if (strcmp(name, entry->name) == 0) + return entry; + return NULL; +} + static int -lang_tokenizer_generic_create(const char *const *settings, +lang_tokenizer_generic_create(const struct lang_settings *set, enum lang_tokenizer_flags flags, struct lang_tokenizer **tokenizer_r, const char **error_r) { - struct generic_lang_tokenizer *tok; - unsigned int max_length = LANG_DEFAULT_TOKEN_MAX_LENGTH; - enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE; - bool wb5a = FALSE; - bool search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH); - bool explicitprefix = FALSE; - unsigned int i; - - for (i = 0; settings[i] != NULL; i += 2) { - const char *key = settings[i], *value = settings[i+1]; - - if (strcmp(key, "maxlen") == 0) { - if (str_to_uint(value, &max_length) < 0 || - max_length == 0) { - *error_r = t_strdup_printf( - "Invalid maxlen setting: %s", value); - return -1; - } - } else if (strcmp(key, "algorithm") == 0) { - if (strcmp(value, ALGORITHM_TR29_NAME) == 0) - algo = BOUNDARY_ALGORITHM_TR29; - else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0) - ; - else { - *error_r = t_strdup_printf( - "Invalid algorithm: %s", value); - return -1; - } - } else if (strcasecmp(key, "wb5a") == 0) { - if (strcasecmp(value, "no") == 0) - wb5a = FALSE; - else - wb5a = TRUE; - } else if (strcasecmp(key, "explicitprefix") == 0) { - explicitprefix = TRUE; - } else { - *error_r = t_strdup_printf("Unknown setting: %s", key); - return -1; - } + const struct algorithm *algo = parse_algorithm(set->tokenizer_generic_algorithm); + if (algo == NULL) { + *error_r = t_strdup_printf( + "Unknown language_tokenizer_generic_algorithm: %s", + set->tokenizer_generic_algorithm); + return -1; } - /* Tokenise normally unless tokenising an explicit prefix query */ - if (!search) - explicitprefix = FALSE; - - if (wb5a && algo != BOUNDARY_ALGORITHM_TR29) { - *error_r = "Can not use WB5a for algorithms other than TR29."; + bool wb5a = set->tokenizer_generic_wb5a; + if (wb5a && algo->id != BOUNDARY_ALGORITHM_TR29) { + *error_r = "Can not use language_tokenizer_generic_wb5a for " + "algorithms other than language_tokenizer_generic_algorithm = tr29"; return -1; } + bool search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH); + + struct generic_lang_tokenizer *tok; tok = i_new(struct generic_lang_tokenizer, 1); - if (algo == BOUNDARY_ALGORITHM_TR29) - tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29; - else - tok->tokenizer.v = &generic_tokenizer_vfuncs_simple; - tok->max_length = max_length; - tok->algorithm = algo; + tok->tokenizer.v = algo->v; + tok->max_length = set->tokenizer_generic_token_maxlen; + tok->algorithm = algo->id; tok->wb5a = wb5a; - tok->prefixsplat = explicitprefix; + tok->prefixsplat = search && set->tokenizer_generic_explicit_prefix; tok->token = buffer_create_dynamic(default_pool, 64); *tokenizer_r = &tok->tokenizer; diff --git a/src/lib-language/lang-tokenizer-private.h b/src/lib-language/lang-tokenizer-private.h index 73ff5b73f6..1154613439 100644 --- a/src/lib-language/lang-tokenizer-private.h +++ b/src/lib-language/lang-tokenizer-private.h @@ -6,8 +6,10 @@ #define LANG_TOKENIZER_CLASSES_NR 2 struct lang_tokenizer_vfuncs { - int (*create)(const char *const *settings, unsigned int flags, - struct lang_tokenizer **tokenizer_r, const char **error_r); + int (*create)(const struct lang_settings *set, + enum lang_tokenizer_flags flags, + struct lang_tokenizer **tokenizer_r, + const char **error_r); void (*destroy)(struct lang_tokenizer *tok); void (*reset)(struct lang_tokenizer *tok); diff --git a/src/lib-language/lang-tokenizer.c b/src/lib-language/lang-tokenizer.c index ebc91bab8b..e2df851ff3 100644 --- a/src/lib-language/lang-tokenizer.c +++ b/src/lib-language/lang-tokenizer.c @@ -76,20 +76,13 @@ static void lang_tokenizer_self_reset(struct lang_tokenizer *tok) int lang_tokenizer_create(const struct lang_tokenizer *tok_class, struct lang_tokenizer *parent, - const char *const *settings, + const struct lang_settings *set, enum lang_tokenizer_flags flags, struct lang_tokenizer **tokenizer_r, const char **error_r) { struct lang_tokenizer *tok; - const char *empty_settings = NULL; - - i_assert(settings == NULL || str_array_length(settings) % 2 == 0); - - if (settings == NULL) - settings = &empty_settings; - - if (tok_class->v->create(settings, flags, &tok, error_r) < 0) { + if (tok_class->v->create(set, flags, &tok, error_r) < 0) { *tokenizer_r = NULL; return -1; } diff --git a/src/lib-language/lang-tokenizer.h b/src/lib-language/lang-tokenizer.h index 589b3b3d43..bae02c9eb6 100644 --- a/src/lib-language/lang-tokenizer.h +++ b/src/lib-language/lang-tokenizer.h @@ -1,6 +1,8 @@ #ifndef LANG_TOKENIZER_H #define LANG_TOKENIZER_H +struct lang_settings; + /* Settings are given in the form of a const char * const *settings = {"key, "value", "key2", "value2", NULL} array of string pairs. Some @@ -53,7 +55,7 @@ const struct lang_tokenizer *lang_tokenizer_find(const char *name); /* Create a new tokenizer. The settings are described above. */ int lang_tokenizer_create(const struct lang_tokenizer *tok_class, struct lang_tokenizer *parent, - const char *const *settings, + const struct lang_settings *set, enum lang_tokenizer_flags flags, struct lang_tokenizer **tokenizer_r, const char **error_r); diff --git a/src/lib-language/test-lang-tokenizer.c b/src/lib-language/test-lang-tokenizer.c index 19f975ccdd..690f90cb42 100644 --- a/src/lib-language/test-lang-tokenizer.c +++ b/src/lib-language/test-lang-tokenizer.c @@ -8,6 +8,24 @@ #include "lang-tokenizer-common.h" #include "lang-tokenizer-private.h" #include "lang-tokenizer-generic-private.h" +#include "lang-settings.h" + +static struct lang_settings simple_settings; +static struct lang_settings tr29_settings; +static struct lang_settings tr29_wb5a_settings; + +static void init_lang_settings(void) +{ + simple_settings = lang_default_settings; + simple_settings.tokenizer_generic_algorithm = "simple"; + + tr29_settings = lang_default_settings; + tr29_settings.tokenizer_generic_algorithm = "tr29"; + + tr29_wb5a_settings = lang_default_settings; + tr29_wb5a_settings.tokenizer_generic_algorithm = "tr29"; + tr29_wb5a_settings.tokenizer_generic_wb5a = TRUE; +} /*there should be a trailing space ' ' at the end of each string except the last one*/ #define TEST_INPUT_ADDRESS \ @@ -192,7 +210,7 @@ static void test_lang_tokenizer_generic_only(void) const char *error; test_begin("lang tokenizer generic simple"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, 0, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &lang_default_settings, 0, &tok, &error) == 0); test_assert(((struct generic_lang_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE); test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output); @@ -200,8 +218,6 @@ static void test_lang_tokenizer_generic_only(void) test_end(); } -const char *const tr29_settings[] = {"algorithm", "tr29", NULL}; - /* TODO: U+206F is in "Format" and therefore currently not word break. This definitely needs to be remapped. */ static void test_lang_tokenizer_generic_tr29_only(void) @@ -250,14 +266,12 @@ static void test_lang_tokenizer_generic_tr29_only(void) const char *error; test_begin("lang tokenizer generic TR29"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, 0, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_settings, 0, &tok, &error) == 0); test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output); lang_tokenizer_unref(&tok); test_end(); } -const char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL}; - /* TODO: U+206F is in "Format" and therefore currently not word break. This definitely needs to be remapped. */ static void test_lang_tokenizer_generic_tr29_wb5a(void) @@ -307,7 +321,7 @@ static void test_lang_tokenizer_generic_tr29_wb5a(void) const char *error; test_begin("lang tokenizer generic TR29 with WB5a"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings_wb5a, 0, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_wb5a_settings, 0, &tok, &error) == 0); test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output); lang_tokenizer_unref(&tok); test_end(); @@ -330,13 +344,13 @@ static void test_lang_tokenizer_address_only(void) const char *error; test_begin("lang tokenizer email address only"); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, NULL, 0, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, &lang_default_settings, 0, &tok, &error) == 0); test_tokenizer_inputoutput(tok, input, expected_output, 0); lang_tokenizer_unref(&tok); test_end(); } -static void test_lang_tokenizer_address_parent(const char *name, const char * const *settings, unsigned int flags) +static void test_lang_tokenizer_address_parent(const char *name, struct lang_settings *set, enum lang_tokenizer_flags flags) { static const char input[] = TEST_INPUT_ADDRESS; static const char *const expected_output[] = { @@ -366,23 +380,22 @@ static void test_lang_tokenizer_address_parent(const char *name, const char * co const char *error; test_begin(t_strdup_printf("lang tokenizer email address + parent %s", name)); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, flags, &gen_tok, &error) == 0); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, NULL, 0, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, set, flags, &gen_tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &lang_default_settings, 0, &tok, &error) == 0); test_tokenizer_inputoutput(tok, input, expected_output, 0); lang_tokenizer_unref(&tok); lang_tokenizer_unref(&gen_tok); test_end(); } -const char *const simple_settings[] = {"algorithm", "simple", NULL}; static void test_lang_tokenizer_address_parent_simple(void) { - test_lang_tokenizer_address_parent("simple", simple_settings, 0); + test_lang_tokenizer_address_parent("simple", &simple_settings, 0); } static void test_lang_tokenizer_address_parent_tr29(void) { - test_lang_tokenizer_address_parent("tr29", tr29_settings, 0); + test_lang_tokenizer_address_parent("tr29", &tr29_settings, 0); } static void test_lang_tokenizer_address_search(void) @@ -415,8 +428,8 @@ static void test_lang_tokenizer_address_search(void) const char *token, *error; test_begin("lang tokenizer search email address + parent"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, 0, &gen_tok, &error) == 0); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, NULL, LANG_TOKENIZER_FLAG_SEARCH, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &lang_default_settings, 0, &gen_tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &lang_default_settings, LANG_TOKENIZER_FLAG_SEARCH, &tok, &error) == 0); test_tokenizer_inputoutput(tok, input, expected_output, 0); /* make sure state is forgotten at EOF */ @@ -478,13 +491,15 @@ static void test_lang_tokenizer_delete_trailing_partial_char(void) static void test_lang_tokenizer_address_maxlen(void) { - const char *const settings[] = {"maxlen", "5", NULL}; + struct lang_settings set = lang_default_settings; + set.tokenizer_address_token_maxlen = 5; + const char *input = "...\357\277\275@a"; struct lang_tokenizer *tok; const char *token, *error; test_begin("lang tokenizer address maxlen"); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, settings, 0, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, &set, 0, &tok, &error) == 0); while (lang_tokenizer_next(tok, (const unsigned char *)input, strlen(input), &token, &error) > 0) ; @@ -496,8 +511,13 @@ static void test_lang_tokenizer_address_maxlen(void) static void test_lang_tokenizer_random(void) { const unsigned char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' }; - const char *const settings[] = {"algorithm", "simple", NULL}; - const char *const email_settings[] = {"maxlen", "9", NULL}; + + struct lang_settings set = lang_default_settings; + set.tokenizer_generic_algorithm = "simple"; + + struct lang_settings email_set = lang_default_settings; + email_set.tokenizer_address_token_maxlen = 9; + unsigned int i; unsigned char addr[10] = { 0 }; string_t *str = t_str_new(20); @@ -505,8 +525,8 @@ static void test_lang_tokenizer_random(void) const char *token, *error; test_begin("lang tokenizer random"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, 0, &gen_tok, &error) == 0); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, email_settings, 0, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &set, 0, &gen_tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &email_set, 0, &tok, &error) == 0); for (i = 0; i < 10000; i++) T_BEGIN { for (unsigned int j = 0; j < sizeof(addr); j++) @@ -539,32 +559,37 @@ test_lang_tokenizer_explicit_prefix(void) "twopre", "twoboth", "twopost", NULL, NULL }; - const char *settings[9] = { "algorithm", "tr29", "wb5a", "yes" }; - const char **setptr; + const struct algo { + const char *name; + bool wb5a; + } algos[] = { + { ALGORITHM_SIMPLE_NAME, FALSE }, + { ALGORITHM_TR29_NAME, FALSE }, + { ALGORITHM_TR29_NAME, TRUE }, + }; - const char *algos[] = { ALGORITHM_SIMPLE_NAME, - ALGORITHM_TR29_NAME, - ALGORITHM_TR29_NAME "+wb5a" }; - const char *searches[] = { "indexing", "searching" }; - const char *prefixes[] = { "fixed", "prefix" }; + struct lang_settings set = lang_default_settings; + for (unsigned int algo_index = 0; algo_index < N_ELEMENTS(algos); algo_index++) { + const struct algo *algo = &algos[algo_index]; + set.tokenizer_generic_wb5a = algo->wb5a; + set.tokenizer_generic_algorithm = algo->name; + const char *algo_str = t_strdup_printf("%s%s", algo->name, algo->wb5a ? "+wb5a" : ""); - for (int algo = 2; algo >= 0; algo--) { /* We overwrite the settings over time */ for (int search = 0; search < 2; search++) { enum lang_tokenizer_flags flags = search > 0 ? LANG_TOKENIZER_FLAG_SEARCH : 0; + const char *search_str = search > 0 ? "searching" : "indexing"; + for (int explicitprefix = 0; explicitprefix < 2; explicitprefix++) { - setptr = &settings[algo*2]; /* 4, 2, or 0 settings strings preserved */ - if (explicitprefix > 0) { *setptr++ = "explicitprefix"; *setptr++ = "y"; } - *setptr++ = NULL; + set.tokenizer_generic_explicit_prefix = explicitprefix > 0; + const char *prefix_str = explicitprefix > 0 ? "prefix" : "fixed"; test_begin(t_strdup_printf("prefix search %s:%s:%s", - algos[algo], - searches[search], - prefixes[explicitprefix])); + algo_str, search_str, prefix_str)); struct lang_tokenizer *tok; const char *error; - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, - flags, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, + &set, flags, &tok, &error) == 0); test_tokenizer_inputs( tok, &input, 1, (search!=0) && (explicitprefix!=0) @@ -635,7 +660,7 @@ static void test_lang_tokenizer_skip_base64(void) }; test_begin("lang tokenizer skip base64"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, 0, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_settings, 0, &tok, &error) == 0); size_t index = 0; while (lang_tokenizer_next(tok, (const unsigned char *) input, strlen(input), &token, &error) > 0) { @@ -657,6 +682,7 @@ static void test_lang_tokenizer_skip_base64(void) int main(void) { + init_lang_settings(); static void (*const test_functions[])(void) = { test_lang_tokenizer_skip_base64, test_lang_tokenizer_find, diff --git a/src/lib-storage/lang-user.c b/src/lib-storage/lang-user.c index dee48bf8e5..507fb8bea7 100644 --- a/src/lib-storage/lang-user.c +++ b/src/lib-storage/lang-user.c @@ -29,36 +29,10 @@ struct lang_user { static MODULE_CONTEXT_DEFINE_INIT(lang_user_module, &mail_user_module_register); -static const char *const *str_keyvalues_to_array(const char *str) -{ - const char *key, *value, *const *keyvalues; - ARRAY_TYPE(const_string) arr; - unsigned int i; - - if (str == NULL) - return NULL; - - t_array_init(&arr, 8); - keyvalues = t_strsplit_spaces(str, " "); - for (i = 0; keyvalues[i] != NULL; i++) { - value = strchr(keyvalues[i], '='); - if (value != NULL) - key = t_strdup_until(keyvalues[i], value++); - else { - key = keyvalues[i]; - value = ""; - } - array_push_back(&arr, &key); - array_push_back(&arr, &value); - } - array_append_zero(&arr); - return array_front(&arr); -} - /* Returns the setting for the given language, or, if the langauge is not defined, the settings for the default language (which is always the first in the array) */ -const struct lang_settings * +static const struct lang_settings * lang_user_settings_get(struct mail_user *user, const char *lang) { struct lang_settings *set; @@ -150,48 +124,34 @@ lang_user_create_tokenizer(struct mail_user *user, struct lang_tokenizer **tokenizer_r, bool search, const char **error_r) { - const struct lang_tokenizer *tokenizer_class; - struct lang_tokenizer *tokenizer = NULL, *parent = NULL; - const char *tokenizers_key, *const *tokenizers, *tokenizer_set_name; - const char *str, *error, *set_key; - unsigned int i; - int ret = 0; - - tokenizers_key = t_strconcat("fts_tokenizers_", lang->name, NULL); - str = mail_user_plugin_getenv(user, tokenizers_key); - if (str == NULL) { - str = mail_user_plugin_getenv(user, "fts_tokenizers"); - if (str == NULL) { - *error_r = t_strdup_printf("%s or fts_tokenizers setting must exist", tokenizers_key); - return -1; - } - tokenizers_key = "fts_tokenizers"; + const struct lang_settings *set = lang_user_settings_get(user, lang->name); + if (array_is_empty(&set->tokenizers)) { + /* No tokenizers */ + *error_r = "Empty language_tokenizers { .. } list"; + return -1; } - tokenizers = t_strsplit_spaces(str, " "); + int ret = 0; + struct lang_tokenizer *tokenizer = NULL, *parent = NULL; + const char *entry_name; + array_foreach_elem(&set->tokenizers, entry_name) { + const struct lang_tokenizer *entry_class = + lang_tokenizer_find(entry_name); - for (i = 0; tokenizers[i] != NULL; i++) { - tokenizer_class = lang_tokenizer_find(tokenizers[i]); - if (tokenizer_class == NULL) { - *error_r = t_strdup_printf("%s: Unknown tokenizer '%s'", - tokenizers_key, tokenizers[i]); + if (entry_class == NULL) { + *error_r = t_strdup_printf( + "%s: Unknown tokenizer '%s'", + set->name, entry_name); ret = -1; break; } - tokenizer_set_name = t_str_replace(tokenizers[i], '-', '_'); - set_key = t_strdup_printf("fts_tokenizer_%s_%s", tokenizer_set_name, lang->name); - str = mail_user_plugin_getenv(user, set_key); - if (str == NULL) { - set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name); - str = mail_user_plugin_getenv(user, set_key); - } - - if (lang_tokenizer_create(tokenizer_class, parent, - str_keyvalues_to_array(str), - search ? LANG_TOKENIZER_FLAG_SEARCH : 0, - &tokenizer, &error) < 0) { - *error_r = t_strdup_printf("%s: %s", set_key, error); + const char *error; + if (lang_tokenizer_create(entry_class, parent, set, + search ? LANG_TOKENIZER_FLAG_SEARCH : 0, + &tokenizer, &error) < 0) { + *error_r = t_strdup_printf( + "%s:%s %s", set->name, entry_name, error); ret = -1; break; } -- 2.47.3