libfts_la_SOURCES = \
fts-filter.c \
+ fts-filter-english-possessive.c \
fts-filter-lowercase.c \
fts-filter-normalizer-icu.c \
fts-filter-stopwords.c \
--- /dev/null
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h"
+#include "fts-common.h"
+#include "fts-filter-private.h"
+
+/*
+ * Find and decode the UTF-8 character that occupies byte offset *end_pos
+ * of str.
+ *
+ * Starting at *end_pos, the offset is moved backwards until it points at a
+ * byte for which UTF8_IS_START_SEQ() is true; *end_pos is left at that byte.
+ * The character starting there is decoded with uni_utf8_get_char() and
+ * returned.
+ *
+ * The caller must pass valid UTF-8: stepping back past offset 0 trips
+ * i_assert(), and a decode result <= 0 is treated as unreachable.
+ *
+ * The offset is a size_t so it can be fed directly from strlen() without a
+ * narrowing conversion.
+ */
+static unichar_t get_ending_utf8_char(const char *str, size_t *end_pos)
+{
+	unichar_t c;
+
+	/* Walk back to the first byte of the sequence covering *end_pos. */
+	while (!UTF8_IS_START_SEQ(str[*end_pos])) {
+		i_assert(*end_pos > 0);
+		*end_pos -= 1;
+	}
+	if (uni_utf8_get_char(str + *end_pos, &c) <= 0)
+		i_unreached();
+	return c;
+}
+
+/*
+ * Filter callback: strip a trailing possessive suffix from *token.
+ *
+ * If the token is longer than one byte, ends in 's' or 'S', and the
+ * character immediately before that letter satisfies IS_APOSTROPHE(),
+ * *token is replaced with a t_strndup() copy holding only the bytes that
+ * precede the apostrophe. Otherwise *token is left untouched.
+ *
+ * filter and error_r are unused. Always returns 1.
+ */
+static int
+fts_filter_english_possessive_filter(struct fts_filter *filter ATTR_UNUSED,
+				     const char **token,
+				     const char **error_r ATTR_UNUSED)
+{
+	/* size_t, not unsigned int: strlen()'s result must not be truncated. */
+	size_t len = strlen(*token);
+	unichar_t c;
+
+	if (len > 1 && ((*token)[len-1] == 's' || (*token)[len-1] == 'S')) {
+		/* Point at the last byte before the trailing s/S; the helper
+		   then rewinds len to the start of that (possibly multi-byte)
+		   character, which is also the length to keep. */
+		len -= 2;
+		c = get_ending_utf8_char(*token, &len);
+		if (IS_APOSTROPHE(c))
+			*token = t_strndup(*token, len);
+	}
+	return 1;
+}
+
+static const struct fts_filter fts_filter_english_possessive_real = { /* static descriptor of the "english-possessive" filter */
+ .class_name = "english-possessive",
+ .v = { /* positional initialiser: order must match the vfuncs struct declared elsewhere */
+ NULL, /* first slot: left unset */
+ fts_filter_english_possessive_filter, /* second slot: the possessive-stripping callback */
+ NULL /* third slot: left unset */
+ }
+};
+
+const struct fts_filter *fts_filter_english_possessive = &fts_filter_english_possessive_real; /* exported handle pointing at the descriptor */
fts_filter_register(fts_filter_stemmer_snowball);
fts_filter_register(fts_filter_normalizer_icu);
fts_filter_register(fts_filter_lowercase);
+ fts_filter_register(fts_filter_english_possessive);
}
void fts_filters_deinit(void)
/* Lowercases the input. Currently only ASCII data is lowercased. */
extern const struct fts_filter *fts_filter_lowercase;
+/* Removes a trailing possessive <'s> suffix (either letter case) from words. */
+extern const struct fts_filter *fts_filter_english_possessive;
+
/* Register all built-in filters. */
void fts_filters_init(void);
void fts_filters_deinit(void);
#endif
#endif
+static void test_fts_filter_english_possessive(void) /* runs each input token through the english-possessive filter and compares with the expected token */
+{
+ struct fts_filter *norm = NULL;
+ const char *input[] = {
+ "foo'", /* apostrophe with no following s */
+ /* cases using the ASCII apostrophe */
+ "foo's",
+ "fooä's", /* multi-byte character right before the apostrophe */
+ "foo'S",
+ "foos'S",
+ "foo's's", /* two suffixes in a row */
+ "foo'ss", /* apostrophe is not directly before the final s */
+ /* the same cases using the bytes E2 80 99 (UTF-8 encoding of U+2019) */
+ "foo\xE2\x80\x99s",
+ "fooä\xE2\x80\x99s",
+ "foo\xE2\x80\x99S",
+ "foos\xE2\x80\x99S",
+ "foo\xE2\x80\x99s\xE2\x80\x99s",
+ "foo\xE2\x80\x99ss"
+ };
+ const char *expected_output[] = { /* expected_output[i] is the token expected for input[i] */
+ "foo'", /* unchanged */
+ /* ASCII apostrophe results */
+ "foo",
+ "fooä",
+ "foo",
+ "foos",
+ "foo's", /* only the last suffix is removed */
+ "foo'ss", /* unchanged */
+ /* U+2019 results */
+ "foo",
+ "fooä",
+ "foo",
+ "foos",
+ "foo\xE2\x80\x99s", /* only the last suffix is removed */
+ "foo\xE2\x80\x99ss" /* unchanged */
+ };
+ const char *error = NULL;
+ const char *token = NULL;
+ unsigned int i;
+
+ test_begin("fts filter english possessive");
+
+ T_BEGIN {
+ test_assert(fts_filter_create(fts_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0); /* creation is expected to return 0 */
+ for (i = 0; i < N_ELEMENTS(input); i++) {
+ token = input[i];
+ test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i); /* the filter is expected to return 1 for every input */
+ test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i); /* token must now equal the expected string */
+ }
+ fts_filter_unref(&norm);
+ } T_END;
+ test_assert(norm == NULL); /* the unref call is expected to have cleared the pointer */
+ test_end();
+}
+
/* TODO: Functions to test 1. ref-unref pairs 2. multiple registers +
an unregister + find */
test_fts_filter_normalizer_stopwords_stemmer_eng,
#endif
#endif
+ test_fts_filter_english_possessive,
NULL
};
int ret;