From: Teemu Huovila Date: Mon, 31 Aug 2015 10:33:26 +0000 (+0300) Subject: lib-fts: Add prefixing contraction filter. X-Git-Tag: 2.2.19.rc1~151 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=440b625484f3cc9d3ec0a7ba36fe3583aa90172d;p=thirdparty%2Fdovecot%2Fcore.git lib-fts: Add prefixing contraction filter. Filters away prefixing contracted words, e.g. "l'homme" -> "homme". Tokens to be filtered must be lower case. Only supports French in this initial version. --- diff --git a/src/lib-fts/Makefile.am b/src/lib-fts/Makefile.am index b7a72bc9bc..b530c63e6a 100644 --- a/src/lib-fts/Makefile.am +++ b/src/lib-fts/Makefile.am @@ -62,6 +62,7 @@ libfts_la_LIBADD = \ libfts_la_SOURCES = \ fts-filter.c \ + fts-filter-contractions.c \ fts-filter-english-possessive.c \ fts-filter-lowercase.c \ fts-filter-normalizer-icu.c \ diff --git a/src/lib-fts/fts-filter-contractions.c b/src/lib-fts/fts-filter-contractions.c new file mode 100644 index 0000000000..9ca5078706 --- /dev/null +++ b/src/lib-fts/fts-filter-contractions.c @@ -0,0 +1,84 @@ +/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "str.h" +#include "fts-language.h" +#include "fts-filter-private.h" +#include "fts-common.h" +#include "unichar.h" + +static int /* returns 0 and stores the new filter in *filter_r; returns -1 with *error_r set if any setting is given or lang->name is not fr */ +fts_filter_contractions_create(const struct fts_language *lang, + const char *const *settings, + struct fts_filter **filter_r, + const char **error_r) +{ + struct fts_filter *filter; + + if (settings[0] != NULL) { + *error_r = t_strdup_printf("Unknown setting: %s", settings[0]); + return -1; + } + if (strcmp(lang->name, "fr") != 0) { + *error_r = t_strdup_printf("Unsupported language: %s", lang->name); + return -1; + } + + filter = i_new(struct fts_filter, 1); + *filter = *fts_filter_contractions; /* start from a copy of the class descriptor that fts_filter_contractions points to */ + filter->token = str_new(default_pool, 64); + *filter_r = filter; + return 0; +} + +static int /* returns 1 when the (possibly shortened) token is kept, 0 when nothing remains after the stripped prefix */ +fts_filter_contractions_filter(struct fts_filter *filter ATTR_UNUSED, + const char **_token, + const char **error_r 
ATTR_UNUSED) +{ + int char_size, pos = 0; + unichar_t apostrophe; + const char *token = *_token; + + switch (token[pos]) { + case 'q': + pos++; + if (token[pos] == '\0' || token[pos] != 'u') + break; + /* qu prefix matched so far: fall through to the shared apostrophe check below */ + case 'c': + case 'd': + case 'l': + case 'm': + case 'n': + case 's': + case 't': + pos++; + if (token[pos] == '\0') + break; + char_size = uni_utf8_get_char(token + pos, &apostrophe); /* NOTE(review): the result is used below as the byte length of the decoded character and is never checked for failure - confirm against unichar.h */ + if (IS_APOSTROPHE(apostrophe)) { + pos += char_size; + *_token = token + pos; /* hand back the text that follows the apostrophe */ + } + if (token[pos] == '\0') /* nothing left after the prefix and apostrophe */ + return 0; + break; + default: + /* first letter cannot start a handled contraction: leave the token untouched */ + break; + } + + return 1; +} + +static const struct fts_filter fts_filter_contractions_real = { + .class_name = "contractions", + .v = { + fts_filter_contractions_create, + fts_filter_contractions_filter, + NULL /* NOTE(review): third slot left unset - presumably an optional vfunc with a default; confirm in fts-filter-private.h */ + } +}; + +const struct fts_filter *fts_filter_contractions = &fts_filter_contractions_real; diff --git a/src/lib-fts/fts-filter-private.h b/src/lib-fts/fts-filter-private.h index 0ac8b95f02..bfe8c58df8 100644 --- a/src/lib-fts/fts-filter-private.h +++ b/src/lib-fts/fts-filter-private.h @@ -3,7 +3,7 @@ #include "fts-filter.h" -#define FTS_FILTER_CLASSES_NR 3 +#define FTS_FILTER_CLASSES_NR 6 /* API that stemming providers (classes) must provide: The create() diff --git a/src/lib-fts/fts-filter.c b/src/lib-fts/fts-filter.c index 39b6f6dbd3..6c5352e484 100644 --- a/src/lib-fts/fts-filter.c +++ b/src/lib-fts/fts-filter.c @@ -21,6 +21,7 @@ void fts_filters_init(void) fts_filter_register(fts_filter_normalizer_icu); fts_filter_register(fts_filter_lowercase); fts_filter_register(fts_filter_english_possessive); + fts_filter_register(fts_filter_contractions); } void fts_filters_deinit(void) diff --git a/src/lib-fts/fts-filter.h b/src/lib-fts/fts-filter.h index 749b79e6ef..e37bdff665 100644 --- a/src/lib-fts/fts-filter.h +++ b/src/lib-fts/fts-filter.h @@ -35,6 +35,9 @@ extern const struct fts_filter *fts_filter_lowercase; /* Removes <'s> suffix from words. 
*/ extern const struct fts_filter *fts_filter_english_possessive; +/* Removes prefixing contractions from words (French only; tokens must already be lower case). */ +extern const struct fts_filter *fts_filter_contractions; + /* Register all built-in filters. */ void fts_filters_init(void); void fts_filters_deinit(void); diff --git a/src/lib-fts/test-fts-filter.c b/src/lib-fts/test-fts-filter.c index 0b865be6c6..236fba686f 100644 --- a/src/lib-fts/test-fts-filter.c +++ b/src/lib-fts/test-fts-filter.c @@ -12,6 +12,7 @@ static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL}; static struct fts_language english_language = { .name = "en" }; +static struct fts_language french_language = { .name = "fr" }; static void test_fts_filter_find(void) { @@ -20,6 +21,65 @@ static void test_fts_filter_find(void) test_assert(fts_filter_find("snowball") == fts_filter_stemmer_snowball); test_assert(fts_filter_find("normalizer-icu") == fts_filter_normalizer_icu); test_assert(fts_filter_find("lowercase") == fts_filter_lowercase); + test_assert(fts_filter_find("contractions") == fts_filter_contractions); + test_end(); +} + + +static void test_fts_filter_contractions_fail(void) +{ + + struct fts_filter *filter; + const char *error; + + test_begin("fts filter contractions, unsupported language"); + test_assert(fts_filter_create(fts_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0); /* the contractions create function rejects every language name other than fr */ + test_assert(error != NULL); + test_end(); +} + +static void test_fts_filter_contractions_fr(void) +{ + struct { + const char *input; + const char *output; + } tests[] = { + { "foo", "foo" }, + { "you're", "you're" }, + { "l'homme", "homme" }, + { "l\xE2\x80\x99homme", "homme" }, /* E2 80 99 is the UTF-8 encoding of U+2019 */ + { "aujourd'hui", "aujourd'hui" }, + { "qu\xE2\x80\x99il", "il" }, + { "qu'il", "il" }, + { "du'il", "du'il" }, /* d is a prefix letter but is not followed directly by an apostrophe */ + { "que", "que" }, + { "'foobar'", "'foobar'" }, + { "foo'bar", "foo'bar" }, + { "a'foo", "a'foo" }, + { "cu'", "cu'" }, + { "qu", "qu" }, + { "d", "d" }, + { "qu'", NULL } /* nothing follows the apostrophe: the loop below expects ret == 0 and a NULL token */ + }; + struct fts_filter *filter; + 
const char *error; + const char *token; + unsigned int i; + int ret; + + test_begin("fts filter contractions, French"); + test_assert(fts_filter_create(fts_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0); + + for (i = 0; i < N_ELEMENTS(tests); i++) { + token = tests[i].input; + ret = fts_filter_filter(filter, &token, &error); + test_assert(ret >= 0); + if (ret > 0) /* token kept: compare it with the expected output */ + test_assert_idx(strcmp(token, tests[i].output) == 0, i); + else if (ret == 0) /* nothing left: the table entry must expect NULL as well */ + test_assert_idx(token == NULL && tests[i].output == NULL, i); + } + fts_filter_unref(&filter); test_end(); } @@ -151,7 +211,6 @@ static void test_fts_filter_stopwords_fin(void) static void test_fts_filter_stopwords_fra(void) { - const struct fts_language french = { .name = "fr" }; struct fts_filter *filter; const char *error; int ret; @@ -167,7 +226,7 @@ static void test_fts_filter_stopwords_fra(void) const char *token; test_begin("fts filter stopwords, French"); - test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french, stopword_settings, &filter, &error) == 0); + test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0); ip = input; op = output; @@ -245,7 +304,6 @@ static void test_fts_filter_stemmer_snowball_stem_french(void) { struct fts_filter *stemmer; const char *error; - struct fts_language language = { .name = "fr" }; const char *token = NULL; const char * const tokens[] = { "Tous", "les", "\xC3\xAAtres", "humains", "naissent", @@ -258,7 +316,7 @@ static void test_fts_filter_stemmer_snowball_stem_french(void) const char * const *bpp; test_begin("fts filter stem French"); - test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &language, NULL, &stemmer, &error) == 0); + test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0); bpp = bases; for (tpp=tokens; *tpp != NULL; tpp++) { token = *tpp; @@ -627,6 +685,8 @@ int main(void) { static void 
(*test_functions[])(void) = { test_fts_filter_find, + test_fts_filter_contractions_fail, + test_fts_filter_contractions_fr, test_fts_filter_lowercase, test_fts_filter_stopwords_eng, test_fts_filter_stopwords_fin,