#include "test-common.h"
#include "language.h"
#include "lang-filter.h"
+#include "settings.h"
+#include "lang-settings.h"
#include <stdio.h>
-static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
-static struct language english_language = { .name = "en" };
-static struct language french_language = { .name = "fr" };
-static struct language norwegian_language = { .name = "no" };
-#if defined(HAVE_LIBICU) && defined(HAVE_LANG_STEMMER)
-static struct language swedish_language = { .name = "sv" };
-#endif
+#define MALFORMED "malformed"
+#define UNKNOWN "bebobidoop"
+#define LANG_EN "en"
+#define LANG_FI "fi"
+#define LANG_FR "fr"
+#define LANG_NO "no"
+#define LANG_SV "sv"
+
+static struct lang_settings stopword_settings;
+/* Initialize the shared stopword settings template used by the stopword
+   tests: start from lang_default_settings and point the filter at the
+   test stopword directory. Must be called (from main()) before any test
+   that passes &stopword_settings to make_settings(). */
+static void init_lang_settings(void)
+{
+ stopword_settings = lang_default_settings;
+ stopword_settings.filter_stopwords_dir = TEST_STOPWORDS_DIR;
+}
+
+/* Build a lang_settings instance for one test case: copy @template (or
+   lang_default_settings when @template is NULL) and override its name
+   with @lang. Allocated with t_new(), so the pointer is presumably only
+   valid for the current data-stack frame — callers use it immediately
+   in lang_filter_create() and never store it. */
+static struct lang_settings *make_settings(const char *lang,
+ const struct lang_settings *template)
+{
+ struct lang_settings *set = t_new(struct lang_settings, 1);
+ *set = template != NULL ? *template : lang_default_settings;
+ set->name = lang;
+ return set;
+}
static void test_lang_filter_find(void)
{
test_end();
}
-
static void test_lang_filter_contractions_fail(void)
{
const char *error;
test_begin("lang filter contractions, unsupported language");
- test_assert(lang_filter_create(lang_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0);
+ test_assert(lang_filter_create(lang_filter_contractions, NULL, make_settings(LANG_EN, NULL), &filter, &error) != 0);
test_assert(error != NULL);
test_end();
}
int ret;
test_begin("lang filter contractions, French");
- test_assert(lang_filter_create(lang_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_contractions, NULL, make_settings(LANG_FR, NULL), &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
unsigned int i;
test_begin("lang filter lowercase");
- test_assert(lang_filter_create(lang_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, NULL), &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
unsigned int i;
test_begin("lang filter lowercase, UTF8");
- test_assert(lang_filter_create(lang_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, NULL), &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
struct lang_filter *filter;
const char *error;
const char *token;
- const char * const settings[] = {"maxlen", "25", NULL};
+ struct lang_settings set = lang_default_settings;
+ set.filter_lowercase_token_maxlen = 25;
unsigned int i;
test_begin("lang filter lowercase, too long UTF8");
- test_assert(lang_filter_create(lang_filter_lowercase, NULL, &english_language, settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, &set), &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
const char *token;
test_begin("lang filter stopwords, English");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_EN, &stopword_settings), &filter, &error) == 0);
ip = input;
op = output;
static void test_lang_filter_stopwords_fin(void)
{
- const struct language finnish = { .name = "fi" };
struct lang_filter *filter;
const char *error;
int ret;
const char *token;
test_begin("lang filter stopwords, Finnish");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FI, &stopword_settings), &filter, &error) == 0);
ip = input;
op = output;
lang_filter_unref(&filter);
test_assert(filter == NULL);
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FI, &stopword_settings), &filter, &error) == 0);
ip = input2;
op = output2;
while (*ip != NULL) {
const char *token;
test_begin("lang filter stopwords, French");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FR, &stopword_settings), &filter, &error) == 0);
ip = input;
op = output;
const char *token;
test_begin("lang filter stopwords, Norwegian");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_NO, &stopword_settings), &filter, &error) == 0);
ip = input;
op = output;
static void test_lang_filter_stopwords_fail_lazy_init(void)
{
- const struct language unknown = { .name = "bebobidoop" };
struct lang_filter *filter = NULL;
const char *error = NULL, *token = "foobar";
test_begin("lang filter stopwords, fail filter() (lazy init)");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &unknown, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(UNKNOWN, &stopword_settings), &filter, &error) == 0);
test_assert(filter != NULL && error == NULL);
test_assert(lang_filter(filter, &token, &error) < 0 && error != NULL);
lang_filter_unref(&filter);
static void test_lang_filter_stopwords_malformed(void)
{
- const struct language malformed = { .name = "malformed" };
struct lang_filter *filter = NULL;
const char *error = NULL, *token = "foobar";
test_begin("lang filter stopwords, malformed list");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &malformed, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(MALFORMED, &stopword_settings), &filter, &error) == 0);
test_assert(lang_filter(filter, &token, &error) < 0);
test_assert(strstr(error, "seems empty. Is the file correctly formatted?") != NULL);
test_expect_no_more_errors();
const char * const *bpp;
test_begin("lang filter stem English");
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, &english_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, make_settings(LANG_EN, NULL), &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
token = *tpp;
const char * const *bpp;
test_begin("lang filter stem French");
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, make_settings(LANG_FR, NULL), &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
token = *tpp;
test_begin("lang filters stopwords and stemming chained, English");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_EN, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, make_settings(LANG_EN, NULL), &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
"aao",
"vem kan segla forutan vind?\naaooaa"
};
- const char * const settings[] =
- {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
+ struct lang_settings set = lang_default_settings;
+ set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC";
const char *error = NULL;
const char *token = NULL;
unsigned int i;
test_begin("lang filter normalizer Swedish short text");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(input); i++) {
token = input[i];
test_assert_idx(lang_filter(norm, &token, &error) == 1, i);
test_begin("lang filter normalizer Swedish short text using default ID");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, NULL, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, NULL), &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(input); i++) {
token = input[i];
test_assert_idx(lang_filter(norm, &token, &error) == 1, i);
{
struct lang_filter *norm = NULL;
FILE *input;
- const char * const settings[] =
- {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
+ struct lang_settings set = lang_default_settings;
+ set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove";
char buf[250] = {0};
const char *error = NULL;
const char *tokens;
test_begin("lang filter normalizer French UDHR");
udhr_path = t_strconcat(UDHRDIR, UDHR_FRA_NAME, NULL);
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
input = fopen(udhr_path, "r");
test_assert(input != NULL);
sha512_init(&ctx);
"\xF3\xA0\x87\xAF", /* U+E01EF */
"\xCC\x80\xF3\xA0\x87\xAF" /* U+0300 U+E01EF */
};
- const char * const settings[] =
- {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; [\\x20] Remove", NULL};
+ struct lang_settings set = lang_default_settings;
+ set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; [\\x20] Remove";
struct lang_filter *norm;
const char *error;
unsigned int i;
test_begin("lang filter normalizer empty tokens");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(empty_tokens); i++) {
const char *token = empty_tokens[i];
test_assert_idx(lang_filter(norm, &token, &error) == 0, i);
static void test_lang_filter_normalizer_baddata(void)
{
- const char * const settings[] =
- {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
+ struct lang_settings set = lang_default_settings;
+ set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove";
struct lang_filter *norm;
const char *token, *error;
string_t *str;
test_begin("lang filter normalizer bad data");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
str = t_str_new(128);
for (i = 1; i < 0x1ffff; i++) {
if (!uni_is_valid_ucs4(i)) continue;
static void test_lang_filter_normalizer_invalid_id(void)
{
struct lang_filter *norm = NULL;
- const char *settings[] =
- {"id", "Any-One-Out-There; DKFN; [: Nonspacing Mark :] Remove",
- NULL};
+ struct lang_settings set = lang_default_settings;
+ set.filter_normalizer_icu_id = "Any-One-Out-There; DKFN; [: Nonspacing Mark :] Remove";
const char *error = NULL, *token = "foo";
test_begin("lang filter normalizer invalid id");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
test_assert(error == NULL);
test_assert(lang_filter(norm, &token, &error) < 0 && error != NULL);
lang_filter_unref(&norm);
static void test_lang_filter_normalizer_oversized(void)
{
struct lang_filter *norm = NULL;
- const char *settings[] =
- {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", "maxlen", "250",
- NULL};
+ struct lang_settings set = lang_default_settings;
+ set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove";
+ set.filter_normalizer_token_maxlen = 250;
const char *error = NULL;
const char *token = "\xe4\x95\x91\x25\xe2\x94\xad\xe1\x90\xad\xee\x94\x81\xe2\x8e\x9e"
"\xe7\x9a\xb7\xea\xbf\x97\xe3\xb2\x8f\xe4\x9c\xbe\xee\xb4\x98\xe1"
"\x9c\xe5\xa6\xae\xe9\x93\x91\xe8\x87\xa1";
test_begin("lang filter normalizer over-sized token");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
test_assert(error == NULL);
test_assert(lang_filter(norm, &token, &error) >= 0);
test_assert(strlen(token) <= 250);
static void test_lang_filter_normalizer_truncation(void)
{
struct lang_filter *norm = NULL;
- const char *settings[] =
- {"id", "Any-Lower;", "maxlen", "10",
- NULL};
+ struct lang_settings set = lang_default_settings;
+ set.filter_normalizer_icu_id = "Any-Lower;";
+ set.filter_normalizer_token_maxlen = 10;
const char *error = NULL;
const char *token = "abcdefghi\xC3\x85";
test_begin("lang filter normalizer token truncated mid letter");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL,
- settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
test_assert(error == NULL);
test_assert(lang_filter(norm, &token, &error) >= 0);
test_assert(strcmp(token, "abcdefghi") == 0);
struct lang_filter *stemmer;
struct lang_filter *filter;
const char *error;
- const char * const id_settings[] =
- //{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
- {"id", "Lower", NULL};
+ struct lang_settings set = lang_default_settings;
+ // set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC"
+ set.filter_normalizer_icu_id = "Lower";
const char *token = NULL;
const char * const tokens[] = {
"dries" ,"friendlies", "All", "human", "beings", "are",
test_begin("lang filters normalizer, stopwords and stemming chained, English");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, id_settings, &normalizer, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stopwords, normalizer, &english_language, stopword_settings, &filter, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &normalizer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, normalizer, make_settings(LANG_EN, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, make_settings(LANG_EN, NULL), &stemmer, &error) == 0);
bpp = bases;
for (tpp = tokens; *tpp != NULL; tpp++) {
test_begin("lang filters with stopwords, default normalizer and stemming chained, Norwegian");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0);
- test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, &norwegian_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_NO, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, make_settings(NULL, NULL), &normalizer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, make_settings(LANG_NO, NULL), &stemmer, &error) == 0);
bpp = bases;
for (tpp = tokens; *tpp != NULL; tpp++) {
test_begin("lang filters with stopwords, default normalizer and stemming chained, Swedish");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, &swedish_language, stopword_settings, &filter, &error) == 0);
- test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, &swedish_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_SV, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, make_settings(NULL, NULL), &normalizer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, make_settings(LANG_SV, NULL), &stemmer, &error) == 0);
bpp = bases;
for (tpp = tokens; *tpp != NULL; tpp++) {
test_begin("lang filter english possessive");
- test_assert(lang_filter_create(lang_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_english_possessive, NULL, make_settings(NULL, NULL), &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(input); i++) {
token = input[i];
test_assert_idx(lang_filter(norm, &token, &error) == 1, i);
int main(void)
{
+ init_lang_settings();
static void (*const test_functions[])(void) = {
test_lang_filter_find,
test_lang_filter_contractions_fail,
return array_front(&arr);
}
+/* Returns the settings for the given language, or, if the language is not
+   defined, the settings for the default language (which is always the first
+   in the array). The parsed_languages array is asserted non-empty before
+   falling back to the first element. */
+const struct lang_settings *
+lang_user_settings_get(struct mail_user *user, const char *lang)
+{
+ struct lang_settings *set;
+ struct lang_user *luser = LANG_USER_CONTEXT_REQUIRE(user);
+ const ARRAY_TYPE(lang_settings) *langs = &luser->set->parsed_languages;
+
+ /* linear scan by exact name match */
+ array_foreach_elem(langs, set) {
+ if (strcmp(set->name, lang) == 0)
+ return set;
+ }
+
+ i_assert(!(array_is_empty(langs)));
+ return array_idx_elem(langs, 0);
+}
+
static int
lang_user_init_languages(struct lang_user *luser, const char **error_r)
{
lang_user_create_filters(struct mail_user *user, const struct language *lang,
struct lang_filter **filter_r, const char **error_r)
{
- const struct lang_filter *filter_class;
- struct lang_filter *filter = NULL, *parent = NULL;
- const char *filters_key, *const *filters, *filter_set_name;
- const char *str, *error, *set_key;
- unsigned int i;
- int ret = 0;
- /* try to get the language-specific filters first */
- filters_key = t_strconcat("fts_filters_", lang->name, NULL);
- str = mail_user_plugin_getenv(user, filters_key);
- if (str == NULL) {
- /* fallback to global filters */
- filters_key = "fts_filters";
- str = mail_user_plugin_getenv(user, filters_key);
- if (str == NULL) {
- /* No filters */
- *filter_r = NULL;
- return 0;
- }
+ const struct lang_settings *set = lang_user_settings_get(user, lang->name);
+ if (array_is_empty(&set->filters)) {
+ /* No filters */
+ *filter_r = NULL;
+ return 0;
}
- filters = t_strsplit_spaces(str, " ");
- for (i = 0; filters[i] != NULL; i++) {
- filter_class = lang_filter_find(filters[i]);
- if (filter_class == NULL) {
- *error_r = t_strdup_printf("%s: Unknown filter '%s'",
- filters_key, filters[i]);
+ int ret = 0;
+ struct lang_filter *filter = NULL, *parent = NULL;
+ const char *entry_name;
+ array_foreach_elem(&set->filters, entry_name) {
+ const struct lang_filter *entry_class =
+ lang_filter_find(entry_name);
+
+ if (entry_class == NULL) {
+ *error_r = t_strdup_printf(
+ "%s: Unknown filter '%s'",
+ set->name, entry_name);
ret = -1;
break;
}
- /* try the language-specific setting first */
- filter_set_name = t_str_replace(filters[i], '-', '_');
- set_key = t_strdup_printf("fts_filter_%s_%s",
- lang->name, filter_set_name);
- str = mail_user_plugin_getenv(user, set_key);
- if (str == NULL) {
- set_key = t_strdup_printf("fts_filter_%s", filter_set_name);
- str = mail_user_plugin_getenv(user, set_key);
- }
-
- if (lang_filter_create(filter_class, parent, lang,
- str_keyvalues_to_array(str),
- &filter, &error) < 0) {
- *error_r = t_strdup_printf("%s: %s", set_key, error);
+ const char *error;
+ if (lang_filter_create(entry_class, parent, set,
+ &filter, &error) < 0) {
+ *error_r = t_strdup_printf(
+ "%s:%s %s", set->name, entry_name, error);
ret = -1;
break;
}
user_lang = p_new(user->pool, struct language_user, 1);
user_lang->lang = &language_data;
+ const struct lang_settings *set = lang_user_settings_get(user, language_data.name);
if (lang_user_language_init_tokenizers(user, user_lang, error_r) < 0)
return -1;
- if (lang_filter_create(lang_filter_lowercase, NULL, user_lang->lang, NULL,
+ if (lang_filter_create(lang_filter_lowercase, NULL, set,
&user_lang->filter, &error) < 0)
i_unreached();
i_assert(user_lang->filter != NULL);