]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: Added "english-possessive" filter.
authorTimo Sirainen <tss@iki.fi>
Tue, 2 Jun 2015 22:04:49 +0000 (01:04 +0300)
committerTimo Sirainen <tss@iki.fi>
Tue, 2 Jun 2015 22:04:49 +0000 (01:04 +0300)
src/lib-fts/Makefile.am
src/lib-fts/fts-filter-english-possessive.c [new file with mode: 0644]
src/lib-fts/fts-filter.c
src/lib-fts/fts-filter.h
src/lib-fts/test-fts-filter.c

index 0b0fc5d2af9a32af331ba504c7f4d6e3561ff936..b7b2d30cabcc6741debb79e527d2be7fa724c928 100644 (file)
@@ -62,6 +62,7 @@ libfts_la_LIBADD = \
 
 libfts_la_SOURCES = \
        fts-filter.c \
+       fts-filter-english-possessive.c \
        fts-filter-lowercase.c \
        fts-filter-normalizer-icu.c \
        fts-filter-stopwords.c \
diff --git a/src/lib-fts/fts-filter-english-possessive.c b/src/lib-fts/fts-filter-english-possessive.c
new file mode 100644 (file)
index 0000000..e12eca3
--- /dev/null
@@ -0,0 +1,47 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h"
+#include "fts-common.h"
+#include "fts-filter-private.h"
+
+static unichar_t get_ending_utf8_char(const char *str, unsigned int *end_pos) /* Returns the UTF-8 character covering byte offset *end_pos of str; *end_pos is moved back to that character's first byte. */
+{
+       unichar_t c;
+
+       while (!UTF8_IS_START_SEQ(str[*end_pos])) { /* walk backwards until UTF8_IS_START_SEQ() accepts the byte */
+               i_assert(*end_pos > 0); /* asserts instead of stepping before the start of str */
+               *end_pos -= 1;
+       }
+       if (uni_utf8_get_char(str + *end_pos, &c) <= 0) /* decode the character that begins at the offset found above */
+               i_unreached(); /* a non-positive decode result is treated as impossible here */
+       return c;
+}
+
+static int /* Filter callback: when *token ends in <apostrophe>s or <apostrophe>S, points *token at a t_strndup() copy without that suffix; always returns 1. */
+fts_filter_english_possessive_filter(struct fts_filter *filter ATTR_UNUSED,
+                                    const char **token,
+                                    const char **error_r ATTR_UNUSED)
+{
+       unsigned int len = strlen(*token);
+       unichar_t c;
+
+       if (len > 1 && ((*token)[len-1] == 's' || (*token)[len-1] == 'S')) { /* needs at least two bytes and a final s/S */
+               len -= 2; /* offset of the byte just before the s; it may be the last byte of a multi-byte character */
+               c = get_ending_utf8_char(*token, &len); /* len is moved back to the first byte of that character */
+               if (IS_APOSTROPHE(c))
+                       *token = t_strndup(*token, len); /* keep only the bytes before the apostrophe; NOTE(review): a token that is just <apostrophe>s appears to become an empty string - confirm callers accept that */
+       }
+       return 1;
+}
+
+static const struct fts_filter fts_filter_english_possessive_real = { /* filter definition with the class name "english-possessive" */
+       .class_name = "english-possessive",
+       .v = {
+               NULL, /* NOTE(review): the first and third callbacks are left unset; their roles are defined by the vfuncs struct, which is not visible here - confirm */
+               fts_filter_english_possessive_filter,
+               NULL
+       }
+};
+
+const struct fts_filter *fts_filter_english_possessive = &fts_filter_english_possessive_real; /* exported handle; this is the pointer passed to fts_filter_register() and fts_filter_create() */
index db904ff6b70d3b0ab73e7b84e6fe723cd136b5f3..39b6f6dbd32961174d393b954c1bc9c7404c9838 100644 (file)
@@ -20,6 +20,7 @@ void fts_filters_init(void)
        fts_filter_register(fts_filter_stemmer_snowball);
        fts_filter_register(fts_filter_normalizer_icu);
        fts_filter_register(fts_filter_lowercase);
+       fts_filter_register(fts_filter_english_possessive);
 }
 
 void fts_filters_deinit(void)
index 6a15ec7a36c07137dde2530bc4a95de71197ff38..88733b85baa3d0c7a7169317a260c328d2761db0 100644 (file)
@@ -32,6 +32,9 @@ extern const struct fts_filter *fts_filter_normalizer_icu;
 /* Lowercases the input. Currently only ASCII data is lowercased. */
 extern const struct fts_filter *fts_filter_lowercase;
 
+/* Removes <'s> suffix from words. */
+extern const struct fts_filter *fts_filter_english_possessive;
+
 /* Register all built-in filters. */
 void fts_filters_init(void);
 void fts_filters_deinit(void);
index 93d511bbf40a26ad9e2619bc8ee5affa267a97a7..d4220f2b72e30bd4766670f3ce8e054e43fb4e44 100644 (file)
@@ -572,6 +572,62 @@ static void test_fts_filter_normalizer_stopwords_stemmer_eng(void)
 #endif
 #endif
 
+static void test_fts_filter_english_possessive(void) /* Unit test: runs every input[] token through the english-possessive filter and compares the result with expected_output[] at the same index. */
+{
+       struct fts_filter *norm = NULL;
+       const char *input[] = {
+               "foo'", /* apostrophe with no s after it */
+
+               "foo's", /* this group uses the ASCII apostrophe */
+               "fooä's", /* multi-byte character directly before the apostrophe */
+               "foo'S",
+               "foos'S",
+               "foo's's", /* two suffixes in a row */
+               "foo'ss", /* apostrophe is not directly before the final s */
+
+               "foo\xE2\x80\x99s", /* same cases again with \xE2\x80\x99, the UTF-8 encoding of U+2019 */
+               "fooä\xE2\x80\x99s",
+               "foo\xE2\x80\x99S",
+               "foos\xE2\x80\x99S",
+               "foo\xE2\x80\x99s\xE2\x80\x99s",
+               "foo\xE2\x80\x99ss"
+       };
+       const char *expected_output[] = {
+               "foo'", /* expected unchanged */
+
+               "foo",
+               "fooä",
+               "foo",
+               "foos",
+               "foo's", /* only the last suffix is expected to be removed */
+               "foo'ss", /* expected unchanged */
+
+               "foo",
+               "fooä",
+               "foo",
+               "foos",
+               "foo\xE2\x80\x99s", /* only the last suffix is expected to be removed */
+               "foo\xE2\x80\x99ss" /* expected unchanged */
+       };
+       const char *error = NULL;
+       const char *token = NULL;
+       unsigned int i;
+
+       test_begin("fts filter english possessive");
+
+       T_BEGIN {
+               test_assert(fts_filter_create(fts_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0); /* creation is expected to return 0 */
+               for (i = 0; i < N_ELEMENTS(input); i++) {
+                       token = input[i];
+                       test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i); /* the filter is expected to return 1 for every input */
+                       test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i); /* token now holds the filter's output */
+               }
+               fts_filter_unref(&norm);
+       } T_END;
+       test_assert(norm == NULL); /* the test expects fts_filter_unref() to have reset norm to NULL */
+       test_end();
+}
+
 /* TODO: Functions to test 1. ref-unref pairs 2. multiple registers +
   an unregister + find */
 
@@ -600,6 +656,7 @@ int main(void)
                test_fts_filter_normalizer_stopwords_stemmer_eng,
 #endif
 #endif
+               test_fts_filter_english_possessive,
                NULL
        };
        int ret;