]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: Add prefixing contraction filter.
authorTeemu Huovila <teemu.huovila@dovecot.fi>
Mon, 31 Aug 2015 10:33:26 +0000 (13:33 +0300)
committerTeemu Huovila <teemu.huovila@dovecot.fi>
Mon, 31 Aug 2015 10:33:26 +0000 (13:33 +0300)
Strips prefixing contractions from words, e.g. "l'homme" -> "homme".
Tokens to be filtered must already be lowercase. Only French is supported
in this initial version.

src/lib-fts/Makefile.am
src/lib-fts/fts-filter-contractions.c [new file with mode: 0644]
src/lib-fts/fts-filter-private.h
src/lib-fts/fts-filter.c
src/lib-fts/fts-filter.h
src/lib-fts/test-fts-filter.c

index b7a72bc9bcf8b9012cc7b4992f916bebd4ab8db9..b530c63e6aae322460613020c43b266025db5843 100644 (file)
@@ -62,6 +62,7 @@ libfts_la_LIBADD = \
 
 libfts_la_SOURCES = \
        fts-filter.c \
+       fts-filter-contractions.c \
        fts-filter-english-possessive.c \
        fts-filter-lowercase.c \
        fts-filter-normalizer-icu.c \
diff --git a/src/lib-fts/fts-filter-contractions.c b/src/lib-fts/fts-filter-contractions.c
new file mode 100644 (file)
index 0000000..9ca5078
--- /dev/null
@@ -0,0 +1,84 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "fts-language.h"
+#include "fts-filter-private.h"
+#include "fts-common.h"
+#include "unichar.h"
+
+/* Creates the filter. No settings are accepted and only "fr" is supported. */
+static int
+fts_filter_contractions_create(const struct fts_language *lang,
+                              const char *const *settings,
+                              struct fts_filter **filter_r,
+                              const char **error_r)
+{
+       struct fts_filter *created;
+
+       if (*settings != NULL) {
+               *error_r = t_strdup_printf("Unknown setting: %s", *settings);
+               return -1;
+       }
+       if (strcmp(lang->name, "fr") != 0) {
+               *error_r = t_strdup_printf("Unsupported language: %s", lang->name);
+               return -1;
+       }
+       created = i_new(struct fts_filter, 1);
+       *created = *fts_filter_contractions; /* start from the class template */
+       created->token = str_new(default_pool, 64);
+       *filter_r = created;
+       return 0;
+}
+
+/* Strips a leading c' d' l' m' n' s' t' or qu' prefix from *_token.
+   Returns 0 if nothing is left after the apostrophe, otherwise 1. */
+static int
+fts_filter_contractions_filter(struct fts_filter *filter ATTR_UNUSED,
+                              const char **_token,
+                              const char **error_r ATTR_UNUSED)
+{
+       int char_size, pos = 0;
+       unichar_t apostrophe = 0;
+       const char *token = *_token;
+
+       switch (token[pos]) {
+       case 'q':
+               pos++;
+               if (token[pos] != 'u') /* also covers the end of the token */
+                       break;
+               /* fall through */
+       case 'c':
+       case 'd':
+       case 'l':
+       case 'm':
+       case 'n':
+       case 's':
+       case 't':
+               pos++;
+               if (token[pos] == '\0')
+                       break;
+               char_size = uni_utf8_get_char(token + pos, &apostrophe);
+               if (char_size <= 0 || !IS_APOSTROPHE(apostrophe))
+                       break; /* undecodable or no apostrophe: keep token as is */
+               pos += char_size;
+               *_token = token + pos;
+               if (token[pos] == '\0') /* nothing left */
+                       return 0;
+               break;
+       default:
+               break;
+       }
+       return 1;
+}
+
+static const struct fts_filter fts_filter_contractions_real = {
+       .class_name = "contractions", /* fts_filter_find("contractions") returns this class */
+       .v = {
+               fts_filter_contractions_create,
+               fts_filter_contractions_filter,
+               NULL
+       }
+};
+/* Class handle declared in fts-filter.h and registered by fts_filters_init(). */
+const struct fts_filter *fts_filter_contractions = &fts_filter_contractions_real;
index 0ac8b95f02dfc44bf731bdbf8c4f6157d8ec50df..bfe8c58df8ce0f23f99688bc4a46251cea96bbed 100644 (file)
@@ -3,7 +3,7 @@
 
 #include "fts-filter.h"
 
-#define FTS_FILTER_CLASSES_NR 3
+#define FTS_FILTER_CLASSES_NR 6
 
 /*
  API that stemming providers (classes) must provide: The create()
index 39b6f6dbd32961174d393b954c1bc9c7404c9838..6c5352e484e56f3726618a87581559da857d16a8 100644 (file)
@@ -21,6 +21,7 @@ void fts_filters_init(void)
        fts_filter_register(fts_filter_normalizer_icu);
        fts_filter_register(fts_filter_lowercase);
        fts_filter_register(fts_filter_english_possessive);
+       fts_filter_register(fts_filter_contractions);
 }
 
 void fts_filters_deinit(void)
index 749b79e6ef6c1686683f7b1bc9ff2e8c1601e5ce..e37bdff66569daf1ecb7d813ef9570fbad1ccefb 100644 (file)
@@ -35,6 +35,9 @@ extern const struct fts_filter *fts_filter_lowercase;
 /* Removes <'s> suffix from words. */
 extern const struct fts_filter *fts_filter_english_possessive;
 
+/* Removes prefixing contractions from lowercase French words, e.g. l'homme -> homme. */
+extern const struct fts_filter *fts_filter_contractions;
+
 /* Register all built-in filters. */
 void fts_filters_init(void);
 void fts_filters_deinit(void);
index 0b865be6c6d5789a0602d10f67415687c02b3a8e..236fba686fd907e2daa51f893c6c6fc94461a0e5 100644 (file)
@@ -12,6 +12,7 @@
 
 static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
 static struct fts_language english_language = { .name = "en" };
+static struct fts_language french_language = { .name = "fr" };
 
 static void test_fts_filter_find(void)
 {
@@ -20,6 +21,65 @@ static void test_fts_filter_find(void)
        test_assert(fts_filter_find("snowball") == fts_filter_stemmer_snowball);
        test_assert(fts_filter_find("normalizer-icu") == fts_filter_normalizer_icu);
        test_assert(fts_filter_find("lowercase") == fts_filter_lowercase);
+       test_assert(fts_filter_find("contractions") == fts_filter_contractions);
+       test_end();
+}
+
+
+/* Creating the filter for a non-French language must fail and set an error. */
+static void test_fts_filter_contractions_fail(void)
+{
+       struct fts_filter *filter;
+       const char *error = NULL; /* so the error check cannot pass on garbage */
+
+       test_begin("fts filter contractions, unsupported language");
+       test_assert(fts_filter_create(fts_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0);
+       test_assert(error != NULL);
+       test_end();
+}
+
+/* Runs the French filter over a table; a NULL output means the token is dropped. */
+static void test_fts_filter_contractions_fr(void)
+{
+       static const struct {
+               const char *input;
+               const char *output;
+       } tests[] = {
+               { "foo", "foo" },
+               { "you're", "you're" },
+               { "l'homme", "homme" },
+               { "l\xE2\x80\x99homme", "homme" }, /* U+2019 as the apostrophe */
+               { "aujourd'hui", "aujourd'hui" },
+               { "qu\xE2\x80\x99il", "il" },
+               { "qu'il", "il" },
+               { "du'il", "du'il" },
+               { "que", "que" },
+               { "'foobar'", "'foobar'" },
+               { "foo'bar", "foo'bar" },
+               { "a'foo", "a'foo" },
+               { "cu'", "cu'" },
+               { "qu", "qu" },
+               { "d", "d" },
+               { "qu'", NULL } /* nothing left after the prefix */
+       };
+       struct fts_filter *filter;
+       const char *error = NULL, *token;
+       unsigned int i;
+       int ret;
+
+       test_begin("fts filter contractions, French");
+       test_assert(fts_filter_create(fts_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0);
+
+       for (i = 0; i < N_ELEMENTS(tests); i++) {
+               token = tests[i].input;
+               ret = fts_filter_filter(filter, &token, &error);
+               test_assert_idx(ret >= 0, i);
+               if (ret > 0) /* strcmp() must never be handed a NULL expectation */
+                       test_assert_idx(tests[i].output != NULL && strcmp(token, tests[i].output) == 0, i);
+               else if (ret == 0)
+                       test_assert_idx(token == NULL && tests[i].output == NULL, i);
+       }
+       fts_filter_unref(&filter);
        test_end();
 }
 
@@ -151,7 +211,6 @@ static void test_fts_filter_stopwords_fin(void)
 
 static void test_fts_filter_stopwords_fra(void)
 {
-       const struct fts_language french = { .name = "fr" };
        struct fts_filter *filter;
        const char *error;
        int ret;
@@ -167,7 +226,7 @@ static void test_fts_filter_stopwords_fra(void)
        const char *token;
 
        test_begin("fts filter stopwords, French");
-       test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french, stopword_settings, &filter, &error) == 0);
+       test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0);
 
        ip = input;
        op = output;
@@ -245,7 +304,6 @@ static void test_fts_filter_stemmer_snowball_stem_french(void)
 {
        struct fts_filter *stemmer;
        const char *error;
-       struct fts_language language = { .name = "fr" };
        const char *token = NULL;
        const char * const tokens[] = {
                "Tous", "les", "\xC3\xAAtres", "humains", "naissent",
@@ -258,7 +316,7 @@ static void test_fts_filter_stemmer_snowball_stem_french(void)
        const char * const *bpp;
 
        test_begin("fts filter stem French");
-       test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &language, NULL, &stemmer, &error) == 0);
+       test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0);
        bpp = bases;
        for (tpp=tokens; *tpp != NULL; tpp++) {
                token = *tpp;
@@ -627,6 +685,8 @@ int main(void)
 {
        static void (*test_functions[])(void) = {
                test_fts_filter_find,
+               test_fts_filter_contractions_fail,
+               test_fts_filter_contractions_fr,
                test_fts_filter_lowercase,
                test_fts_filter_stopwords_eng,
                test_fts_filter_stopwords_fin,