From 471167b9701fcc99b66f7a8bcae07bc4ac0dbbd4 Mon Sep 17 00:00:00 2001
From: Timo Sirainen
Date: Wed, 3 Jun 2015 01:04:49 +0300
Subject: [PATCH] lib-fts: Added "english-possessive" filter.

---
Review notes (free text in this section is not part of the commit):

 * What the filter does: when a token ends in 's' or 'S' and the character
   just before it satisfies IS_APOSTROPHE(), the token is replaced by a
   t_strndup() copy that stops before the apostrophe. The test table covers
   both the ASCII apostrophe and the 3-byte sequence \xE2\x80\x99 (U+2019).
 * Only one suffix is removed per call ("foo's's" -> "foo's"), and a token
   that merely ends in an apostrophe ("foo'") is left untouched.
 * get_ending_utf8_char() walks backwards until UTF8_IS_START_SEQ() accepts
   a byte and asserts if it reaches index 0 without finding one.
   NOTE(review): this presumes tokens are valid UTF-8 -- confirm callers.
 * NOTE(review): a token that is exactly <apostrophe>s appears to come back
   as an empty string while the filter still returns 1 -- confirm that
   callers accept empty tokens.
 * NOTE(review): strlen() returns size_t but the length is kept in an
   unsigned int; size_t for "len" and "end_pos" would avoid the narrowing.
 * The test loop is bounded by N_ELEMENTS(input) but also indexes
   expected_output[], so both tables must stay the same length.

 src/lib-fts/Makefile.am                     |  1 +
 src/lib-fts/fts-filter-english-possessive.c | 47 +++++++++++++++++
 src/lib-fts/fts-filter.c                    |  1 +
 src/lib-fts/fts-filter.h                    |  3 ++
 src/lib-fts/test-fts-filter.c               | 57 +++++++++++++++++++++
 5 files changed, 109 insertions(+)
 create mode 100644 src/lib-fts/fts-filter-english-possessive.c

diff --git a/src/lib-fts/Makefile.am b/src/lib-fts/Makefile.am
index 0b0fc5d2af..b7b2d30cab 100644
--- a/src/lib-fts/Makefile.am
+++ b/src/lib-fts/Makefile.am
@@ -62,6 +62,7 @@ libfts_la_LIBADD = \
 
 libfts_la_SOURCES = \
 	fts-filter.c \
+	fts-filter-english-possessive.c \
 	fts-filter-lowercase.c \
 	fts-filter-normalizer-icu.c \
 	fts-filter-stopwords.c \
diff --git a/src/lib-fts/fts-filter-english-possessive.c b/src/lib-fts/fts-filter-english-possessive.c
new file mode 100644
index 0000000000..e12eca300d
--- /dev/null
+++ b/src/lib-fts/fts-filter-english-possessive.c
@@ -0,0 +1,47 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h"
+#include "fts-common.h"
+#include "fts-filter-private.h"
+
+static unichar_t get_ending_utf8_char(const char *str, unsigned int *end_pos)
+{
+	unichar_t c;
+
+	while (!UTF8_IS_START_SEQ(str[*end_pos])) {
+		i_assert(*end_pos > 0);
+		*end_pos -= 1;
+	}
+	if (uni_utf8_get_char(str + *end_pos, &c) <= 0)
+		i_unreached();
+	return c;
+}
+
+static int
+fts_filter_english_possessive_filter(struct fts_filter *filter ATTR_UNUSED,
+				     const char **token,
+				     const char **error_r ATTR_UNUSED)
+{
+	unsigned int len = strlen(*token);
+	unichar_t c;
+
+	if (len > 1 && ((*token)[len-1] == 's' || (*token)[len-1] == 'S')) {
+		len -= 2;
+		c = get_ending_utf8_char(*token, &len);
+		if (IS_APOSTROPHE(c))
+			*token = t_strndup(*token, len);
+	}
+	return 1;
+}
+
+static const struct fts_filter fts_filter_english_possessive_real = {
+	.class_name = "english-possessive",
+	.v = {
+		NULL,
+		fts_filter_english_possessive_filter,
+		NULL
+	}
+};
+
+const struct fts_filter *fts_filter_english_possessive = &fts_filter_english_possessive_real;
diff --git a/src/lib-fts/fts-filter.c b/src/lib-fts/fts-filter.c
index db904ff6b7..39b6f6dbd3 100644
--- a/src/lib-fts/fts-filter.c
+++ b/src/lib-fts/fts-filter.c
@@ -20,6 +20,7 @@ void fts_filters_init(void)
 	fts_filter_register(fts_filter_stemmer_snowball);
 	fts_filter_register(fts_filter_normalizer_icu);
 	fts_filter_register(fts_filter_lowercase);
+	fts_filter_register(fts_filter_english_possessive);
 }
 
 void fts_filters_deinit(void)
diff --git a/src/lib-fts/fts-filter.h b/src/lib-fts/fts-filter.h
index 6a15ec7a36..88733b85ba 100644
--- a/src/lib-fts/fts-filter.h
+++ b/src/lib-fts/fts-filter.h
@@ -32,6 +32,9 @@ extern const struct fts_filter *fts_filter_normalizer_icu;
 /* Lowecases the input. Currently only ASCII data is lowercased. */
 extern const struct fts_filter *fts_filter_lowercase;
 
+/* Removes <'s> suffix from words. */
+extern const struct fts_filter *fts_filter_english_possessive;
+
 /* Register all built-in filters. */
 void fts_filters_init(void);
 void fts_filters_deinit(void);
diff --git a/src/lib-fts/test-fts-filter.c b/src/lib-fts/test-fts-filter.c
index 93d511bbf4..d4220f2b72 100644
--- a/src/lib-fts/test-fts-filter.c
+++ b/src/lib-fts/test-fts-filter.c
@@ -572,6 +572,62 @@ static void test_fts_filter_normalizer_stopwords_stemmer_eng(void)
 #endif
 #endif
 
+static void test_fts_filter_english_possessive(void)
+{
+	struct fts_filter *norm = NULL;
+	const char *input[] = {
+		"foo'",
+
+		"foo's",
+		"fooä's",
+		"foo'S",
+		"foos'S",
+		"foo's's",
+		"foo'ss",
+
+		"foo\xE2\x80\x99s",
+		"fooä\xE2\x80\x99s",
+		"foo\xE2\x80\x99S",
+		"foos\xE2\x80\x99S",
+		"foo\xE2\x80\x99s\xE2\x80\x99s",
+		"foo\xE2\x80\x99ss"
+	};
+	const char *expected_output[] = {
+		"foo'",
+
+		"foo",
+		"fooä",
+		"foo",
+		"foos",
+		"foo's",
+		"foo'ss",
+
+		"foo",
+		"fooä",
+		"foo",
+		"foos",
+		"foo\xE2\x80\x99s",
+		"foo\xE2\x80\x99ss"
+	};
+	const char *error = NULL;
+	const char *token = NULL;
+	unsigned int i;
+
+	test_begin("fts filter english possessive");
+
+	T_BEGIN {
+		test_assert(fts_filter_create(fts_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0);
+		for (i = 0; i < N_ELEMENTS(input); i++) {
+			token = input[i];
+			test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i);
+			test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
+		}
+		fts_filter_unref(&norm);
+	} T_END;
+	test_assert(norm == NULL);
+	test_end();
+}
+
 /* TODO: Functions to test
     1. ref-unref pairs
     2. multiple registers + an unregister + find */
@@ -600,6 +656,7 @@ int main(void)
 		test_fts_filter_normalizer_stopwords_stemmer_eng,
 #endif
 #endif
+		test_fts_filter_english_possessive,
 		NULL
 	};
 	int ret;
-- 
2.47.3