From: Marco Bettini Date: Fri, 26 Jan 2024 16:14:08 +0000 (+0000) Subject: lib-language: filters - Use new settings X-Git-Tag: 2.4.1~998 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a4d227301928179a99f191def4d841c6f96cc66c;p=thirdparty%2Fdovecot%2Fcore.git lib-language: filters - Use new settings --- diff --git a/src/lib-language/lang-filter-contractions.c b/src/lib-language/lang-filter-contractions.c index a225718351..c33940989e 100644 --- a/src/lib-language/lang-filter-contractions.c +++ b/src/lib-language/lang-filter-contractions.c @@ -5,22 +5,18 @@ #include "language.h" #include "lang-filter-private.h" #include "lang-common.h" +#include "lang-settings.h" #include "unichar.h" static int -lang_filter_contractions_create(const struct language *lang, - const char *const *settings, - struct lang_filter **filter_r, - const char **error_r) +lang_filter_contractions_create(const struct lang_settings *set, + struct lang_filter **filter_r, + const char **error_r) { struct lang_filter *filter; - if (settings[0] != NULL) { - *error_r = t_strdup_printf("Unknown setting: %s", settings[0]); - return -1; - } - if (strcmp(lang->name, "fr") != 0) { - *error_r = t_strdup_printf("Unsupported language: %s", lang->name); + if (strcmp(set->name, "fr") != 0) { + *error_r = t_strdup_printf("Unsupported language: %s", set->name); return -1; } diff --git a/src/lib-language/lang-filter-lowercase.c b/src/lib-language/lang-filter-lowercase.c index 15775ff725..1278d2fe0e 100644 --- a/src/lib-language/lang-filter-lowercase.c +++ b/src/lib-language/lang-filter-lowercase.c @@ -3,6 +3,7 @@ #include "lib.h" #include "str.h" #include "language.h" +#include "lang-settings.h" #include "lang-filter-private.h" #ifdef HAVE_LIBICU @@ -11,33 +12,15 @@ #endif static int -lang_filter_lowercase_create(const struct language *lang ATTR_UNUSED, - const char *const *settings, +lang_filter_lowercase_create(const struct lang_settings *set, struct lang_filter **filter_r, - const char **error_r) + const char **error_r ATTR_UNUSED) { struct lang_filter *filter; - unsigned int i, max_length = 250; - - for (i = 0; settings[i] != NULL; i += 2) { - const char *key = settings[i], *value = settings[i+1]; - - if (strcmp(key, "maxlen") == 0) { - if (str_to_uint(value, &max_length) < 0 || - max_length == 0) { - *error_r = t_strdup_printf("Invalid lowercase filter maxlen setting: %s", value); - return -1; - } - } - else { - *error_r = t_strdup_printf("Unknown setting: %s", key); - return -1; - } - } filter = i_new(struct lang_filter, 1); *filter = *lang_filter_lowercase; filter->token = str_new(default_pool, 64); - filter->max_length = max_length; + filter->max_length = set->filter_lowercase_token_maxlen; *filter_r = filter; return 0; diff --git a/src/lib-language/lang-filter-normalizer-icu.c b/src/lib-language/lang-filter-normalizer-icu.c index d6f8339e8c..fde2a0ac4a 100644 --- a/src/lib-language/lang-filter-normalizer-icu.c +++ b/src/lib-language/lang-filter-normalizer-icu.c @@ -6,6 +6,7 @@ #include "unichar.h" /* unicode replacement char */ #include "lang-filter-common.h" #include "lang-filter-private.h" +#include "lang-settings.h" #include "language.h" #ifdef HAVE_LIBICU @@ -32,43 +33,23 @@ static void lang_filter_normalizer_icu_destroy(struct lang_filter *filter) } static int -lang_filter_normalizer_icu_create(const struct language *lang ATTR_UNUSED, - const char *const *settings, +lang_filter_normalizer_icu_create(const struct lang_settings *set, struct lang_filter **filter_r, - const char **error_r) + const char **error_r ATTR_UNUSED) { struct lang_filter_normalizer_icu *np; pool_t pp; - unsigned int i, max_length = 250; - const char *id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC; [\\x20] Remove"; - - for (i = 0; settings[i] != NULL; i += 2) { - const char *key = settings[i], *value = settings[i+1]; - - if (strcmp(key, "id") == 0) { - id = value; - } else if (strcmp(key, "maxlen") == 0) { - if (str_to_uint(value, &max_length) < 0 || - max_length == 0) { - *error_r = t_strdup_printf("Invalid icu maxlen setting: %s", value); - return -1; - } - } else { - *error_r = t_strdup_printf("Unknown setting: %s", key); - return -1; - } - } pp = pool_alloconly_create(MEMPOOL_GROWING"lang_filter_normalizer_icu", sizeof(struct lang_filter_normalizer_icu)); np = p_new(pp, struct lang_filter_normalizer_icu, 1); np->pool = pp; np->filter = *lang_filter_normalizer_icu; - np->transliterator_id = p_strdup(pp, id); + np->transliterator_id = set->filter_normalizer_icu_id; p_array_init(&np->utf16_token, pp, 64); p_array_init(&np->trans_token, pp, 64); np->utf8_token = buffer_create_dynamic(pp, 128); - np->filter.max_length = max_length; + np->filter.max_length = set->filter_normalizer_token_maxlen; *filter_r = &np->filter; return 0; } diff --git a/src/lib-language/lang-filter-private.h b/src/lib-language/lang-filter-private.h index d2e755cfda..dce8ae076d 100644 --- a/src/lib-language/lang-filter-private.h +++ b/src/lib-language/lang-filter-private.h @@ -5,6 +5,8 @@ #define LANG_FILTER_CLASSES_NR 6 +struct lang_settings; + /* API that stemming providers (classes) must provide: The create() function is called to get an instance of a registered filter class. @@ -13,8 +15,7 @@ */ struct lang_filter_vfuncs { - int (*create)(const struct language *lang, - const char *const *settings, + int (*create)(const struct lang_settings *set, struct lang_filter **filter_r, const char **error_r); int (*filter)(struct lang_filter *filter, const char **token, diff --git a/src/lib-language/lang-filter-stemmer-snowball.c b/src/lib-language/lang-filter-stemmer-snowball.c index 5231899b17..5449496d09 100644 --- a/src/lib-language/lang-filter-stemmer-snowball.c +++ b/src/lib-language/lang-filter-stemmer-snowball.c @@ -3,6 +3,7 @@ #include "lib.h" #include "language.h" #include "lang-filter-private.h" +#include "lang-settings.h" #ifdef HAVE_LANG_STEMMER @@ -26,27 +27,20 @@ static void lang_filter_stemmer_snowball_destroy(struct lang_filter *filter) } static int -lang_filter_stemmer_snowball_create(const struct language *lang, - const char *const *settings, +lang_filter_stemmer_snowball_create(const struct lang_settings *set, struct lang_filter **filter_r, - const char **error_r) + const char **error_r ATTR_UNUSED) { struct lang_filter_stemmer_snowball *sp; pool_t pp; - *filter_r = NULL; - - if (settings[0] != NULL) { - *error_r = t_strdup_printf("Unknown setting: %s", settings[0]); - return -1; - } pp = pool_alloconly_create(MEMPOOL_GROWING"lang_filter_stemmer_snowball", sizeof(struct lang_filter)); sp = p_new(pp, struct lang_filter_stemmer_snowball, 1); sp->pool = pp; sp->filter = *lang_filter_stemmer_snowball; sp->lang = p_malloc(sp->pool, sizeof(struct language)); - sp->lang->name = p_strdup(sp->pool, lang->name); + sp->lang->name = p_strdup(sp->pool, set->name); *filter_r = &sp->filter; return 0; } @@ -106,8 +100,7 @@ lang_filter_stemmer_snowball_filter(struct lang_filter *filter, #else static int -lang_filter_stemmer_snowball_create(const struct language *lang ATTR_UNUSED, - const char *const *settings ATTR_UNUSED, +lang_filter_stemmer_snowball_create(const struct lang_settings *set ATTR_UNUSED, struct lang_filter **filter_r ATTR_UNUSED, const char **error_r) { diff --git a/src/lib-language/lang-filter-stopwords.c b/src/lib-language/lang-filter-stopwords.c index 8f7ce00631..852f3c13da 100644 --- a/src/lib-language/lang-filter-stopwords.c +++ b/src/lib-language/lang-filter-stopwords.c @@ -8,6 +8,7 @@ #include "unichar.h" #include "language.h" #include "lang-filter-private.h" +#include "lang-settings.h" #define STOPWORDS_FILE_FORMAT "%s/stopwords_%s.txt" @@ -70,37 +71,21 @@ static void lang_filter_stopwords_destroy(struct lang_filter *filter) } static int -lang_filter_stopwords_create(const struct language *lang, - const char *const *settings, +lang_filter_stopwords_create(const struct lang_settings *set, struct lang_filter **filter_r, - const char **error_r) + const char **error_r ATTR_UNUSED) { struct lang_filter_stopwords *sp; pool_t pp; - const char *dir = NULL; - unsigned int i; - for (i = 0; settings[i] != NULL; i += 2) { - const char *key = settings[i], *value = settings[i+1]; - - if (strcmp(key, "stopwords_dir") == 0) { - dir = value; - } else { - *error_r = t_strdup_printf("Unknown setting: %s", key); - return -1; - } - } pp = pool_alloconly_create(MEMPOOL_GROWING"lang_filter_stopwords", sizeof(struct lang_filter)); sp = p_new(pp, struct lang_filter_stopwords, 1); sp->filter = *lang_filter_stopwords; sp->pool = pp; sp->lang = p_malloc(sp->pool, sizeof(struct language)); - sp->lang->name = p_strdup(sp->pool, lang->name); - if (dir != NULL) - sp->stopwords_dir = p_strdup(pp, dir); - else - sp->stopwords_dir = DATADIR"/stopwords"; + sp->lang->name = set->name; + sp->stopwords_dir = set->filter_stopwords_dir; *filter_r = &sp->filter; return 0; } diff --git a/src/lib-language/lang-filter.c b/src/lib-language/lang-filter.c index b4021ba627..9b33a8fba3 100644 --- a/src/lib-language/lang-filter.c +++ b/src/lib-language/lang-filter.c @@ -52,30 +52,17 @@ const struct lang_filter *lang_filter_find(const char *name) int lang_filter_create(const struct lang_filter *filter_class, struct lang_filter *parent, - const struct language *lang, - const char *const *settings, + const struct lang_settings *set, struct lang_filter **filter_r, const char **error_r) { struct lang_filter *fp; - const char *empty_settings = NULL; - - i_assert(settings == NULL || str_array_length(settings) % 2 == 0); - - if (settings == NULL) - settings = &empty_settings; - if (filter_class->v.create != NULL) { - if (filter_class->v.create(lang, settings, &fp, error_r) < 0) { + if (filter_class->v.create(set, &fp, error_r) < 0) { *filter_r = NULL; return -1; } } else { - /* default implementation */ - if (settings[0] != NULL) { - *error_r = t_strdup_printf("Unknown setting: %s", settings[0]); - return -1; - } fp = i_new(struct lang_filter, 1); *fp = *filter_class; } diff --git a/src/lib-language/lang-filter.h b/src/lib-language/lang-filter.h index 6c2532a24e..d6f2fbc94c 100644 --- a/src/lib-language/lang-filter.h +++ b/src/lib-language/lang-filter.h @@ -3,6 +3,8 @@ struct language; struct lang_filter; +struct lang_settings; + /* Settings are given in the form of a const char * const *settings = {"key, "value", "key2", "value2", NULL} array of string pairs. @@ -54,8 +56,7 @@ void lang_filter_register(const struct lang_filter *filter_class); const struct lang_filter *lang_filter_find(const char *name); int lang_filter_create(const struct lang_filter *filter_class, struct lang_filter *parent, - const struct language *lang, - const char *const *settings, + const struct lang_settings *set, struct lang_filter **filter_r, const char **error_r); void lang_filter_ref(struct lang_filter *filter); diff --git a/src/lib-language/test-lang-filter.c b/src/lib-language/test-lang-filter.c index cf50383917..a9daf45eba 100644 --- a/src/lib-language/test-lang-filter.c +++ b/src/lib-language/test-lang-filter.c @@ -7,16 +7,34 @@ #include "test-common.h" #include "language.h" #include "lang-filter.h" +#include "settings.h" +#include "lang-settings.h" #include -static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL}; -static struct language english_language = { .name = "en" }; -static struct language french_language = { .name = "fr" }; -static struct language norwegian_language = { .name = "no" }; -#if defined(HAVE_LIBICU) && defined(HAVE_LANG_STEMMER) -static struct language swedish_language = { .name = "sv" }; -#endif +#define MALFORMED "malformed" +#define UNKNOWN "bebobidoop" +#define LANG_EN "en" +#define LANG_FI "fi" +#define LANG_FR "fr" +#define LANG_NO "no" +#define LANG_SV "sv" + +static struct lang_settings stopword_settings; +static void init_lang_settings(void) +{ + stopword_settings = lang_default_settings; + stopword_settings.filter_stopwords_dir = TEST_STOPWORDS_DIR; +} + +static struct lang_settings *make_settings(const char *lang, + const struct lang_settings *template) +{ + struct lang_settings *set = t_new(struct lang_settings, 1); + *set = template != NULL ? *template : lang_default_settings; + set->name = lang; + return set; +} static void test_lang_filter_find(void) { @@ -29,7 +47,6 @@ static void test_lang_filter_find(void) test_end(); } - static void test_lang_filter_contractions_fail(void) { @@ -37,7 +54,7 @@ static void test_lang_filter_contractions_fail(void) const char *error; test_begin("lang filter contractions, unsupported language"); - test_assert(lang_filter_create(lang_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0); + test_assert(lang_filter_create(lang_filter_contractions, NULL, make_settings(LANG_EN, NULL), &filter, &error) != 0); test_assert(error != NULL); test_end(); } @@ -75,7 +92,7 @@ static void test_lang_filter_contractions_fr(void) int ret; test_begin("lang filter contractions, French"); - test_assert(lang_filter_create(lang_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_contractions, NULL, make_settings(LANG_FR, NULL), &filter, &error) == 0); for (i = 0; i < N_ELEMENTS(tests); i++) { token = tests[i].input; @@ -106,7 +123,7 @@ static void test_lang_filter_lowercase(void) unsigned int i; test_begin("lang filter lowercase"); - test_assert(lang_filter_create(lang_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, NULL), &filter, &error) == 0); for (i = 0; i < N_ELEMENTS(tests); i++) { token = tests[i].input; @@ -134,7 +151,7 @@ static void test_lang_filter_lowercase_utf8(void) unsigned int i; test_begin("lang filter lowercase, UTF8"); - test_assert(lang_filter_create(lang_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, NULL), &filter, &error) == 0); for (i = 0; i < N_ELEMENTS(tests); i++) { token = tests[i].input; @@ -159,11 +176,12 @@ static void test_lang_filter_lowercase_too_long_utf8(void) struct lang_filter *filter; const char *error; const char *token; - const char * const settings[] = {"maxlen", "25", NULL}; + struct lang_settings set = lang_default_settings; + set.filter_lowercase_token_maxlen = 25; unsigned int i; test_begin("lang filter lowercase, too long UTF8"); - test_assert(lang_filter_create(lang_filter_lowercase, NULL, &english_language, settings, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, &set), &filter, &error) == 0); for (i = 0; i < N_ELEMENTS(tests); i++) { token = tests[i].input; @@ -190,7 +208,7 @@ static void test_lang_filter_stopwords_eng(void) const char *token; test_begin("lang filter stopwords, English"); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_EN, &stopword_settings), &filter, &error) == 0); ip = input; op = output; @@ -215,7 +233,6 @@ static void test_lang_filter_stopwords_eng(void) static void test_lang_filter_stopwords_fin(void) { - const struct language finnish = { .name = "fi" }; struct lang_filter *filter; const char *error; int ret; @@ -231,7 +248,7 @@ static void test_lang_filter_stopwords_fin(void) const char *token; test_begin("lang filter stopwords, Finnish"); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FI, &stopword_settings), &filter, &error) == 0); ip = input; op = output; @@ -252,7 +269,7 @@ static void test_lang_filter_stopwords_fin(void) lang_filter_unref(&filter); test_assert(filter == NULL); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FI, &stopword_settings), &filter, &error) == 0); ip = input2; op = output2; while (*ip != NULL) { @@ -291,7 +308,7 @@ static void test_lang_filter_stopwords_fra(void) const char *token; test_begin("lang filter stopwords, French"); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FR, &stopword_settings), &filter, &error) == 0); ip = input; op = output; @@ -339,7 +356,7 @@ static void test_lang_filter_stopwords_no(void) const char *token; test_begin("lang filter stopwords, Norwegian"); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_NO, &stopword_settings), &filter, &error) == 0); ip = input; op = output; @@ -364,12 +381,11 @@ static void test_lang_filter_stopwords_no(void) static void test_lang_filter_stopwords_fail_lazy_init(void) { - const struct language unknown = { .name = "bebobidoop" }; struct lang_filter *filter = NULL; const char *error = NULL, *token = "foobar"; test_begin("lang filter stopwords, fail filter() (lazy init)"); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &unknown, stopword_settings, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(UNKNOWN, &stopword_settings), &filter, &error) == 0); test_assert(filter != NULL && error == NULL); test_assert(lang_filter(filter, &token, &error) < 0 && error != NULL); lang_filter_unref(&filter); @@ -379,12 +395,11 @@ static void test_lang_filter_stopwords_fail_lazy_init(void) static void test_lang_filter_stopwords_malformed(void) { - const struct language malformed = { .name = "malformed" }; struct lang_filter *filter = NULL; const char *error = NULL, *token = "foobar"; test_begin("lang filter stopwords, malformed list"); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &malformed, stopword_settings, &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(MALFORMED, &stopword_settings), &filter, &error) == 0); test_assert(lang_filter(filter, &token, &error) < 0); test_assert(strstr(error, "seems empty. Is the file correctly formatted?") != NULL); test_expect_no_more_errors(); @@ -415,7 +430,7 @@ static void test_lang_filter_stemmer_snowball_stem_english(void) const char * const *bpp; test_begin("lang filter stem English"); - test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, &english_language, NULL, &stemmer, &error) == 0); + test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, make_settings(LANG_EN, NULL), &stemmer, &error) == 0); bpp = bases; for (tpp=tokens; *tpp != NULL; tpp++) { token = *tpp; @@ -445,7 +460,7 @@ static void test_lang_filter_stemmer_snowball_stem_french(void) const char * const *bpp; test_begin("lang filter stem French"); - test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0); + test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, make_settings(LANG_FR, NULL), &stemmer, &error) == 0); bpp = bases; for (tpp=tokens; *tpp != NULL; tpp++) { token = *tpp; @@ -483,8 +498,8 @@ static void test_lang_filter_stopwords_stemmer_eng(void) test_begin("lang filters stopwords and stemming chained, English"); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0); - test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_EN, &stopword_settings), &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, make_settings(LANG_EN, NULL), &stemmer, &error) == 0); bpp = bases; for (tpp=tokens; *tpp != NULL; tpp++) { @@ -524,15 +539,15 @@ static void test_lang_filter_normalizer_swedish_short(void) "aao", "vem kan segla forutan vind?\naaooaa" }; - const char * const settings[] = - {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL}; + struct lang_settings set = lang_default_settings; + set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC"; const char *error = NULL; const char *token = NULL; unsigned int i; test_begin("lang filter normalizer Swedish short text"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0); for (i = 0; i < N_ELEMENTS(input); i++) { token = input[i]; test_assert_idx(lang_filter(norm, &token, &error) == 1, i); @@ -565,7 +580,7 @@ static void test_lang_filter_normalizer_swedish_short_default_id(void) test_begin("lang filter normalizer Swedish short text using default ID"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, NULL, &norm, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, NULL), &norm, &error) == 0); for (i = 0; i < N_ELEMENTS(input); i++) { token = input[i]; test_assert_idx(lang_filter(norm, &token, &error) == 1, i); @@ -582,8 +597,8 @@ static void test_lang_filter_normalizer_french(void) { struct lang_filter *norm = NULL; FILE *input; - const char * const settings[] = - {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL}; + struct lang_settings set = lang_default_settings; + set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove"; char buf[250] = {0}; const char *error = NULL; const char *tokens; @@ -603,7 +618,7 @@ static void test_lang_filter_normalizer_french(void) test_begin("lang filter normalizer French UDHR"); udhr_path = t_strconcat(UDHRDIR, UDHR_FRA_NAME, NULL); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0); input = fopen(udhr_path, "r"); test_assert(input != NULL); sha512_init(&ctx); @@ -632,14 +647,14 @@ static void test_lang_filter_normalizer_empty(void) "\xF3\xA0\x87\xAF", /* U+E01EF */ "\xCC\x80\xF3\xA0\x87\xAF" /* U+0300 U+E01EF */ }; - const char * const settings[] = - {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; [\\x20] Remove", NULL}; + struct lang_settings set = lang_default_settings; + set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; [\\x20] Remove"; struct lang_filter *norm; const char *error; unsigned int i; test_begin("lang filter normalizer empty tokens"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0); for (i = 0; i < N_ELEMENTS(empty_tokens); i++) { const char *token = empty_tokens[i]; test_assert_idx(lang_filter(norm, &token, &error) == 0, i); @@ -650,8 +665,8 @@ static void test_lang_filter_normalizer_empty(void) static void test_lang_filter_normalizer_baddata(void) { - const char * const settings[] = - {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL}; + struct lang_settings set = lang_default_settings; + set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove"; struct lang_filter *norm; const char *token, *error; string_t *str; @@ -659,7 +674,7 @@ static void test_lang_filter_normalizer_baddata(void) test_begin("lang filter normalizer bad data"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0); str = t_str_new(128); for (i = 1; i < 0x1ffff; i++) { if (!uni_is_valid_ucs4(i)) continue; @@ -683,13 +698,12 @@ static void test_lang_filter_normalizer_baddata(void) static void test_lang_filter_normalizer_invalid_id(void) { struct lang_filter *norm = NULL; - const char *settings[] = - {"id", "Any-One-Out-There; DKFN; [: Nonspacing Mark :] Remove", - NULL}; + struct lang_settings set = lang_default_settings; + set.filter_normalizer_icu_id = "Any-One-Out-There; DKFN; [: Nonspacing Mark :] Remove"; const char *error = NULL, *token = "foo"; test_begin("lang filter normalizer invalid id"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0); test_assert(error == NULL); test_assert(lang_filter(norm, &token, &error) < 0 && error != NULL); lang_filter_unref(&norm); @@ -699,9 +713,9 @@ static void test_lang_filter_normalizer_invalid_id(void) static void test_lang_filter_normalizer_oversized(void) { struct lang_filter *norm = NULL; - const char *settings[] = - {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", "maxlen", "250", - NULL}; + struct lang_settings set = lang_default_settings; + set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove"; + set.filter_normalizer_token_maxlen = 250; const char *error = NULL; const char *token = "\xe4\x95\x91\x25\xe2\x94\xad\xe1\x90\xad\xee\x94\x81\xe2\x8e\x9e" "\xe7\x9a\xb7\xea\xbf\x97\xe3\xb2\x8f\xe4\x9c\xbe\xee\xb4\x98\xe1" @@ -721,7 +735,7 @@ static void test_lang_filter_normalizer_oversized(void) "\x9c\xe5\xa6\xae\xe9\x93\x91\xe8\x87\xa1"; test_begin("lang filter normalizer over-sized token"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0); test_assert(error == NULL); test_assert(lang_filter(norm, &token, &error) >= 0); test_assert(strlen(token) <= 250); @@ -732,15 +746,14 @@ static void test_lang_filter_normalizer_oversized(void) static void test_lang_filter_normalizer_truncation(void) { struct lang_filter *norm = NULL; - const char *settings[] = - {"id", "Any-Lower;", "maxlen", "10", - NULL}; + struct lang_settings set = lang_default_settings; + set.filter_normalizer_icu_id = "Any-Lower;"; + set.filter_normalizer_token_maxlen = 10; const char *error = NULL; const char *token = "abcdefghi\xC3\x85"; test_begin("lang filter normalizer token truncated mid letter"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, - settings, &norm, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0); test_assert(error == NULL); test_assert(lang_filter(norm, &token, &error) >= 0); test_assert(strcmp(token, "abcdefghi") == 0); @@ -756,9 +769,9 @@ static void test_lang_filter_normalizer_stopwords_stemmer_eng(void) struct lang_filter *stemmer; struct lang_filter *filter; const char *error; - const char * const id_settings[] = - //{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL}; - {"id", "Lower", NULL}; + struct lang_settings set = lang_default_settings; + // set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC" + set.filter_normalizer_icu_id = "Lower"; const char *token = NULL; const char * const tokens[] = { "dries" ,"friendlies", "All", "human", "beings", "are", @@ -778,9 +791,9 @@ static void test_lang_filter_normalizer_stopwords_stemmer_eng(void) test_begin("lang filters normalizer, stopwords and stemming chained, English"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, id_settings, &normalizer, &error) == 0); - test_assert(lang_filter_create(lang_filter_stopwords, normalizer, &english_language, stopword_settings, &filter, &error) == 0); - test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &normalizer, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, normalizer, make_settings(LANG_EN, &stopword_settings), &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, make_settings(LANG_EN, NULL), &stemmer, &error) == 0); bpp = bases; for (tpp = tokens; *tpp != NULL; tpp++) { @@ -840,9 +853,9 @@ static void test_lang_filter_stopwords_normalizer_stemmer_no(void) test_begin("lang filters with stopwords, default normalizer and stemming chained, Norwegian"); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0); - test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0); - test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, &norwegian_language, NULL, &stemmer, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_NO, &stopword_settings), &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, make_settings(NULL, NULL), &normalizer, &error) == 0); + test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, make_settings(LANG_NO, NULL), &stemmer, &error) == 0); bpp = bases; for (tpp = tokens; *tpp != NULL; tpp++) { @@ -891,9 +904,9 @@ static void test_lang_filter_stopwords_normalizer_stemmer_sv(void) test_begin("lang filters with stopwords, default normalizer and stemming chained, Swedish"); - test_assert(lang_filter_create(lang_filter_stopwords, NULL, &swedish_language, stopword_settings, &filter, &error) == 0); - test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0); - test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, &swedish_language, NULL, &stemmer, &error) == 0); + test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_SV, &stopword_settings), &filter, &error) == 0); + test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, make_settings(NULL, NULL), &normalizer, &error) == 0); + test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, make_settings(LANG_SV, NULL), &stemmer, &error) == 0); bpp = bases; for (tpp = tokens; *tpp != NULL; tpp++) { @@ -962,7 +975,7 @@ static void test_lang_filter_english_possessive(void) test_begin("lang filter english possessive"); - test_assert(lang_filter_create(lang_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0); + test_assert(lang_filter_create(lang_filter_english_possessive, NULL, make_settings(NULL, NULL), &norm, &error) == 0); for (i = 0; i < N_ELEMENTS(input); i++) { token = input[i]; test_assert_idx(lang_filter(norm, &token, &error) == 1, i); @@ -978,6 +991,7 @@ static void test_lang_filter_english_possessive(void) int main(void) { + init_lang_settings(); static void (*const test_functions[])(void) = { test_lang_filter_find, test_lang_filter_contractions_fail, diff --git a/src/lib-storage/lang-user.c b/src/lib-storage/lang-user.c index 670cb17143..dee48bf8e5 100644 --- a/src/lib-storage/lang-user.c +++ b/src/lib-storage/lang-user.c @@ -55,6 +55,25 @@ static const char *const *str_keyvalues_to_array(const char *str) return array_front(&arr); } +/* Returns the setting for the given language, or, if the langauge is not + defined, the settings for the default language (which is always the first + in the array) */ +const struct lang_settings * +lang_user_settings_get(struct mail_user *user, const char *lang) +{ + struct lang_settings *set; + struct lang_user *luser = LANG_USER_CONTEXT_REQUIRE(user); + const ARRAY_TYPE(lang_settings) *langs = &luser->set->parsed_languages; + + array_foreach_elem(langs, set) { + if (strcmp(set->name, lang) == 0) + return set; + } + + i_assert(!(array_is_empty(langs))); + return array_idx_elem(langs, 0); +} + static int lang_user_init_languages(struct lang_user *luser, const char **error_r) { @@ -81,51 +100,34 @@ static int lang_user_create_filters(struct mail_user *user, const struct language *lang, struct lang_filter **filter_r, const char **error_r) { - const struct lang_filter *filter_class; - struct lang_filter *filter = NULL, *parent = NULL; - const char *filters_key, *const *filters, *filter_set_name; - const char *str, *error, *set_key; - unsigned int i; - int ret = 0; - /* try to get the language-specific filters first */ - filters_key = t_strconcat("fts_filters_", lang->name, NULL); - str = mail_user_plugin_getenv(user, filters_key); - if (str == NULL) { - /* fallback to global filters */ - filters_key = "fts_filters"; - str = mail_user_plugin_getenv(user, filters_key); - if (str == NULL) { - /* No filters */ - *filter_r = NULL; - return 0; - } + const struct lang_settings *set = lang_user_settings_get(user, lang->name); + if (array_is_empty(&set->filters)) { + /* No filters */ + *filter_r = NULL; + return 0; } - filters = t_strsplit_spaces(str, " "); - for (i = 0; filters[i] != NULL; i++) { - filter_class = lang_filter_find(filters[i]); - if (filter_class == NULL) { - *error_r = t_strdup_printf("%s: Unknown filter '%s'", - filters_key, filters[i]); + int ret = 0; + struct lang_filter *filter = NULL, *parent = NULL; + const char *entry_name; + array_foreach_elem(&set->filters, entry_name) { + const struct lang_filter *entry_class = + lang_filter_find(entry_name); + + if (entry_class == NULL) { + *error_r = t_strdup_printf( + "%s: Unknown filter '%s'", + set->name, entry_name); ret = -1; break; } - /* try the language-specific setting first */ - filter_set_name = t_str_replace(filters[i], '-', '_'); - set_key = t_strdup_printf("fts_filter_%s_%s", - lang->name, filter_set_name); - str = mail_user_plugin_getenv(user, set_key); - if (str == NULL) { - set_key = t_strdup_printf("fts_filter_%s", filter_set_name); - str = mail_user_plugin_getenv(user, set_key); - } - - if (lang_filter_create(filter_class, parent, lang, - str_keyvalues_to_array(str), - &filter, &error) < 0) { - *error_r = t_strdup_printf("%s: %s", set_key, error); + const char *error; + if (lang_filter_create(entry_class, parent, set, + &filter, &error) < 0) { + *error_r = t_strdup_printf( + "%s:%s %s", set->name, entry_name, error); ret = -1; break; } @@ -281,11 +283,12 @@ lang_user_init_data_language(struct mail_user *user, struct lang_user *luser, user_lang = p_new(user->pool, struct language_user, 1); user_lang->lang = &language_data; + const struct lang_settings *set = lang_user_settings_get(user, language_data.name); if (lang_user_language_init_tokenizers(user, user_lang, error_r) < 0) return -1; - if (lang_filter_create(lang_filter_lowercase, NULL, user_lang->lang, NULL, + if (lang_filter_create(lang_filter_lowercase, NULL, set, &user_lang->filter, &error) < 0) i_unreached(); i_assert(user_lang->filter != NULL);