lib-fts: Add prefixing contraction filter.

author Teemu Huovila <teemu.huovila@dovecot.fi>

Mon, 31 Aug 2015 10:33:26 +0000 (13:33 +0300)

committer Teemu Huovila <teemu.huovila@dovecot.fi>

Mon, 31 Aug 2015 10:33:26 +0000 (13:33 +0300)
author Teemu Huovila <teemu.huovila@dovecot.fi>
Mon, 31 Aug 2015 10:33:26 +0000 (13:33 +0300)
committer Teemu Huovila <teemu.huovila@dovecot.fi>
Mon, 31 Aug 2015 10:33:26 +0000 (13:33 +0300)
diff --git a/src/lib-fts/Makefile.am b/src/lib-fts/Makefile.am

index b7a72bc9bcf8b9012cc7b4992f916bebd4ab8db9..b530c63e6aae322460613020c43b266025db5843 100644 (file)
--- a/src/lib-fts/Makefile.am
+++ b/src/lib-fts/Makefile.am
@@ -62,6 +62,7 @@ libfts_la_LIBADD = \
  
  libfts_la_SOURCES = \
         fts-filter.c \
+       fts-filter-contractions.c \
         fts-filter-english-possessive.c \
         fts-filter-lowercase.c \
         fts-filter-normalizer-icu.c \
diff --git a/src/lib-fts/fts-filter-contractions.c b/src/lib-fts/fts-filter-contractions.c

new file mode 100644 (file)

index 0000000..9ca5078
--- /dev/null
+++ b/src/lib-fts/fts-filter-contractions.c
@@ -0,0 +1,84 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "fts-language.h"
+#include "fts-filter-private.h"
+#include "fts-common.h"
+#include "unichar.h"
+
+/* Creates a contractions filter instance.
+
+   No settings are accepted, and the only supported language is "fr".
+   On success the new filter is stored in *filter_r and 0 is returned.
+   On failure *error_r is set to a message and -1 is returned. */
+static int
+fts_filter_contractions_create(const struct fts_language *lang,
+                              const char *const *settings,
+                              struct fts_filter **filter_r,
+                              const char **error_r)
+{
+       const char *unknown_setting = settings[0];
+       struct fts_filter *new_filter;
+
+       /* Any setting at all is an error: this filter has none. */
+       if (unknown_setting != NULL) {
+               *error_r = t_strdup_printf("Unknown setting: %s", unknown_setting);
+               return -1;
+       }
+       if (strcmp(lang->name, "fr") != 0) {
+               *error_r = t_strdup_printf("Unsupported language: %s", lang->name);
+               return -1;
+       }
+
+       /* Copy the shared filter definition, then give the instance its own
+          token string. */
+       new_filter = i_new(struct fts_filter, 1);
+       *new_filter = *fts_filter_contractions;
+       new_filter->token = str_new(default_pool, 64);
+
+       *filter_r = new_filter;
+       return 0;
+}
+
+/* Strips a leading contraction prefix from *_token.
+
+   A prefix is a single letter c, d, l, m, n, s or t, or the pair "qu",
+   immediately followed by a character accepted by IS_APOSTROPHE(). When
+   such a prefix is found, *_token is advanced past the apostrophe.
+
+   Returns 0 if nothing is left of the token after stripping, otherwise 1
+   (also when the token is left untouched). */
+static int
+fts_filter_contractions_filter(struct fts_filter *filter ATTR_UNUSED,
+                           const char **_token,
+                           const char **error_r ATTR_UNUSED)
+{
+       const char *token = *_token;
+       /* Initialized so a failed decode can never look like an apostrophe. */
+       unichar_t apostrophe = 0;
+       int char_size, pos = 0;
+
+       switch (token[pos]) {
+       case 'q':
+               /* 'q' only counts as a prefix when it is part of "qu";
+                  this test also rejects the end of the string */
+               pos++;
+               if (token[pos] != 'u')
+                       break;
+               /* fall through */
+       case 'c':
+       case 'd':
+       case 'l':
+       case 'm':
+       case 'n':
+       case 's':
+       case 't':
+               pos++;
+               if (token[pos] == '\0')
+                       break;
+               char_size = uni_utf8_get_char(token + pos, &apostrophe);
+               /* NOTE(review): presumably a non-positive return value means
+                  incomplete or invalid UTF-8 -- confirm in unichar.h. In
+                  that case the decoded value must not be trusted and pos
+                  must not be moved. */
+               if (char_size <= 0 || !IS_APOSTROPHE(apostrophe))
+                       break;
+               pos += char_size;
+               *_token = token + pos;
+               if (token[pos] == '\0') /* nothing left */
+                       return 0;
+               break;
+       default:
+               /* not a contraction prefix, leave the token alone */
+               break;
+       }
+
+       return 1;
+}
+
+/* Definition of the filter class named "contractions". The callbacks in .v
+   are given positionally: the create function, then the filter function. */
+static const struct fts_filter fts_filter_contractions_real = {
+       .class_name = "contractions",
+       .v = {
+               fts_filter_contractions_create,
+               fts_filter_contractions_filter,
+               /* NOTE(review): third callback is left unset; presumably
+                  the destroy hook -- confirm against struct fts_filter */
+               NULL
+       }
+};
+
+/* Public handle for the class; declared extern in fts-filter.h and
+   registered by fts_filters_init(). */
+const struct fts_filter *fts_filter_contractions = &fts_filter_contractions_real;
diff --git a/src/lib-fts/fts-filter-private.h b/src/lib-fts/fts-filter-private.h

index 0ac8b95f02dfc44bf731bdbf8c4f6157d8ec50df..bfe8c58df8ce0f23f99688bc4a46251cea96bbed 100644 (file)
--- a/src/lib-fts/fts-filter-private.h
+++ b/src/lib-fts/fts-filter-private.h
@@ -3,7 +3,7 @@
  
  #include "fts-filter.h"
  
-#define FTS_FILTER_CLASSES_NR 3
+#define FTS_FILTER_CLASSES_NR 6
  
  /*
   API that stemming providers (classes) must provide: The create()
diff --git a/src/lib-fts/fts-filter.c b/src/lib-fts/fts-filter.c

index 39b6f6dbd32961174d393b954c1bc9c7404c9838..6c5352e484e56f3726618a87581559da857d16a8 100644 (file)
--- a/src/lib-fts/fts-filter.c
+++ b/src/lib-fts/fts-filter.c
@@ -21,6 +21,7 @@ void fts_filters_init(void)
         fts_filter_register(fts_filter_normalizer_icu);
         fts_filter_register(fts_filter_lowercase);
         fts_filter_register(fts_filter_english_possessive);
+       fts_filter_register(fts_filter_contractions);
  }
  
  void fts_filters_deinit(void)
diff --git a/src/lib-fts/fts-filter.h b/src/lib-fts/fts-filter.h

index 749b79e6ef6c1686683f7b1bc9ff2e8c1601e5ce..e37bdff66569daf1ecb7d813ef9570fbad1ccefb 100644 (file)
--- a/src/lib-fts/fts-filter.h
+++ b/src/lib-fts/fts-filter.h
@@ -35,6 +35,9 @@ extern const struct fts_filter *fts_filter_lowercase;
  /* Removes <'s> suffix from words. */
  extern const struct fts_filter *fts_filter_english_possessive;
  
+/* Removes prefixing contractions from words. */
+extern const struct fts_filter *fts_filter_contractions;
+
  /* Register all built-in filters. */
  void fts_filters_init(void);
  void fts_filters_deinit(void);
diff --git a/src/lib-fts/test-fts-filter.c b/src/lib-fts/test-fts-filter.c

index 0b865be6c6d5789a0602d10f67415687c02b3a8e..236fba686fd907e2daa51f893c6c6fc94461a0e5 100644 (file)
--- a/src/lib-fts/test-fts-filter.c
+++ b/src/lib-fts/test-fts-filter.c
@@ -12,6 +12,7 @@
  
  static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
  static struct fts_language english_language = { .name = "en" };
+static struct fts_language french_language = { .name = "fr" };
  
  static void test_fts_filter_find(void)
  {
@@ -20,6 +21,65 @@ static void test_fts_filter_find(void)
         test_assert(fts_filter_find("snowball") == fts_filter_stemmer_snowball);
         test_assert(fts_filter_find("normalizer-icu") == fts_filter_normalizer_icu);
         test_assert(fts_filter_find("lowercase") == fts_filter_lowercase);
+       test_assert(fts_filter_find("contractions") == fts_filter_contractions);
+       test_end();
+}
+
+
+/* Creating the contractions filter for a language other than French must
+   fail and must report an error message. */
+static void test_fts_filter_contractions_fail(void)
+{
+       struct fts_filter *filter;
+       /* Start from NULL so the check below fails cleanly, instead of
+          reading an indeterminate pointer, if no message gets set. */
+       const char *error = NULL;
+
+       test_begin("fts filter contractions, unsupported language");
+       test_assert(fts_filter_create(fts_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0);
+       test_assert(error != NULL);
+       test_end();
+}
+
+/* Runs the French contractions filter over a table of input tokens and
+   compares each result with the expected output. An expected output of
+   NULL means the filter should report that nothing is left of the token. */
+static void test_fts_filter_contractions_fr(void)
+{
+       struct {
+               const char *input;
+               const char *output;
+       } tests[] = {
+               { "foo", "foo" },
+               { "you're", "you're" },
+               { "l'homme", "homme" },
+               /* \xE2\x80\x99 is the UTF-8 encoding of U+2019 */
+               { "l\xE2\x80\x99homme", "homme" },
+               { "aujourd'hui", "aujourd'hui" },
+               { "qu\xE2\x80\x99il", "il" },
+               { "qu'il", "il" },
+               { "du'il", "du'il" },
+               { "que", "que" },
+               { "'foobar'", "'foobar'" },
+               { "foo'bar", "foo'bar" },
+               { "a'foo", "a'foo" },
+               { "cu'", "cu'" },
+               { "qu", "qu" },
+               { "d", "d" },
+               /* the whole token is a prefix, so nothing remains */
+               { "qu'", NULL }
+       };
+       struct fts_filter *filter;
+       const char *error;
+       const char *token;
+       unsigned int i;
+       int ret;
+
+       test_begin("fts filter contractions, French");
+       test_assert(fts_filter_create(fts_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0);
+
+       for (i = 0; i < N_ELEMENTS(tests); i++) {
+               token = tests[i].input;
+               ret = fts_filter_filter(filter, &token, &error);
+               test_assert(ret >= 0);
+               /* ret > 0: a token was produced, compare it with the table */
+               if (ret > 0)
+                       test_assert_idx(strcmp(token, tests[i].output) == 0, i);
+               /* ret == 0: the token was consumed; this expects token to
+                  be NULL afterwards and the table entry to be NULL too */
+               else if (ret == 0)
+                       test_assert_idx(token == NULL && tests[i].output == NULL, i);
+       }
+       fts_filter_unref(&filter);
       test_end();
  }
  
@@ -151,7 +211,6 @@ static void test_fts_filter_stopwords_fin(void)
  
  static void test_fts_filter_stopwords_fra(void)
  {
-       const struct fts_language french = { .name = "fr" };
         struct fts_filter *filter;
         const char *error;
         int ret;
@@ -167,7 +226,7 @@ static void test_fts_filter_stopwords_fra(void)
         const char *token;
  
         test_begin("fts filter stopwords, French");
-       test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french, stopword_settings, &filter, &error) == 0);
+       test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0);
  
         ip = input;
         op = output;
@@ -245,7 +304,6 @@ static void test_fts_filter_stemmer_snowball_stem_french(void)
  {
         struct fts_filter *stemmer;
         const char *error;
-       struct fts_language language = { .name = "fr" };
         const char *token = NULL;
         const char * const tokens[] = {
                 "Tous", "les", "\xC3\xAAtres", "humains", "naissent",
@@ -258,7 +316,7 @@ static void test_fts_filter_stemmer_snowball_stem_french(void)
         const char * const *bpp;
  
         test_begin("fts filter stem French");
-       test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &language, NULL, &stemmer, &error) == 0);
+       test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0);
         bpp = bases;
         for (tpp=tokens; *tpp != NULL; tpp++) {
                 token = *tpp;
@@ -627,6 +685,8 @@ int main(void)
  {
         static void (*test_functions[])(void) = {
                 test_fts_filter_find,
+               test_fts_filter_contractions_fail,
+               test_fts_filter_contractions_fr,
                 test_fts_filter_lowercase,
                 test_fts_filter_stopwords_eng,
                 test_fts_filter_stopwords_fin,
author	Teemu Huovila <teemu.huovila@dovecot.fi>
	Mon, 31 Aug 2015 10:33:26 +0000 (13:33 +0300)
committer	Teemu Huovila <teemu.huovila@dovecot.fi>
	Mon, 31 Aug 2015 10:33:26 +0000 (13:33 +0300)
src/lib-fts/Makefile.am		patch \| blob \| blame \| history
src/lib-fts/fts-filter-contractions.c	[new file with mode: 0644]	patch \| blob
src/lib-fts/fts-filter-private.h		patch \| blob \| blame \| history
src/lib-fts/fts-filter.c		patch \| blob \| blame \| history
src/lib-fts/fts-filter.h		patch \| blob \| blame \| history
src/lib-fts/test-fts-filter.c		patch \| blob \| blame \| history