--- /dev/null
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "fts-language.h"
+#include "fts-filter-private.h"
+#include "fts-common.h"
+#include "unichar.h"
+
+/*
+ * Create a contractions filter instance.
+ *
+ * No settings are accepted, and only the language named "fr" is supported.
+ * Either violation stores a message in *error_r and returns -1. On success
+ * the newly allocated filter is stored in *filter_r and 0 is returned.
+ */
+static int
+fts_filter_contractions_create(const struct fts_language *lang,
+			       const char *const *settings,
+			       struct fts_filter **filter_r,
+			       const char **error_r)
+{
+	struct fts_filter *contractions;
+	const char *first_setting = settings[0];
+
+	/* this filter has no configurable settings */
+	if (first_setting != NULL) {
+		*error_r = t_strdup_printf("Unknown setting: %s", first_setting);
+		return -1;
+	}
+	/* only French is accepted */
+	if (strcmp(lang->name, "fr") != 0) {
+		*error_r = t_strdup_printf("Unsupported language: %s", lang->name);
+		return -1;
+	}
+
+	/* start from a copy of the class template, then give the instance
+	   its own token string */
+	contractions = i_new(struct fts_filter, 1);
+	*contractions = *fts_filter_contractions;
+	contractions->token = str_new(default_pool, 64);
+
+	*filter_r = contractions;
+	return 0;
+}
+
+/*
+ * Strip a leading contraction prefix from *_token.
+ *
+ * A prefix is one of the letters c, d, l, m, n, s, t, or the pair "qu",
+ * immediately followed by a character accepted by IS_APOSTROPHE(). When a
+ * prefix matches, *_token is advanced to the first byte after the
+ * apostrophe; otherwise *_token is left untouched.
+ *
+ * Returns 0 when nothing remains after the prefix has been removed, and 1
+ * in every other case. The filter and error_r parameters are unused.
+ */
+static int
+fts_filter_contractions_filter(struct fts_filter *filter ATTR_UNUSED,
+			       const char **_token,
+			       const char **error_r ATTR_UNUSED)
+{
+	int char_size, pos = 0;
+	unichar_t apostrophe;
+	const char *token = *_token;
+
+	switch (token[pos]) {
+	case 'q':
+		pos++;
+		/* a NUL here also fails this test, so no separate
+		   end-of-string check is needed */
+		if (token[pos] != 'u')
+			break;
+		/* "qu": fall through */
+	case 'c':
+	case 'd':
+	case 'l':
+	case 'm':
+	case 'n':
+	case 's':
+	case 't':
+		pos++;
+		if (token[pos] == '\0')
+			break;
+		char_size = uni_utf8_get_char(token + pos, &apostrophe);
+		/* NOTE(review): a non-positive return is assumed to mean
+		   malformed or truncated UTF-8, in which case apostrophe
+		   may not have been written - confirm against unichar.h.
+		   Such tokens are passed through unchanged. */
+		if (char_size <= 0 || !IS_APOSTROPHE(apostrophe))
+			break;
+		pos += char_size;
+		*_token = token + pos;
+		if (token[pos] == '\0') /* nothing left */
+			return 0;
+		break;
+	default:
+		/* do nothing */
+		break;
+	}
+
+	return 1;
+}
+
+/* Class template for the "contractions" filter. The create function copies
+   it (through the fts_filter_contractions pointer) into each new instance. */
+static const struct fts_filter fts_filter_contractions_real = {
+ .class_name = "contractions",
+ .v = {
+ fts_filter_contractions_create,
+ fts_filter_contractions_filter,
+ /* NOTE(review): third slot is presumably the destroy hook, left
+    unset here - confirm against fts-filter-private.h */
+ NULL
+ }
+};
+
+/* Non-static handle to the contractions filter class defined above. */
+const struct fts_filter *fts_filter_contractions = &fts_filter_contractions_real;
static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
static struct fts_language english_language = { .name = "en" };
+/* French language descriptor shared by the French filter tests in this file. */
+static struct fts_language french_language = { .name = "fr" };
+/* Checks that fts_filter_find() resolves each filter class name used below
+   to the corresponding exported filter object. */
static void test_fts_filter_find(void)
{
test_assert(fts_filter_find("snowball") == fts_filter_stemmer_snowball);
test_assert(fts_filter_find("normalizer-icu") == fts_filter_normalizer_icu);
test_assert(fts_filter_find("lowercase") == fts_filter_lowercase);
+ test_assert(fts_filter_find("contractions") == fts_filter_contractions);
+ test_end();
+}
+
+
+/* Creating the contractions filter for a language other than French must
+   fail and must report an error message. */
+static void test_fts_filter_contractions_fail(void)
+{
+	struct fts_filter *filter;
+	/* start from NULL so the assertion below proves that the create
+	   call really stored a message, not that the stack held garbage */
+	const char *error = NULL;
+
+	test_begin("fts filter contractions, unsupported language");
+	test_assert(fts_filter_create(fts_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0);
+	test_assert(error != NULL);
+	test_end();
+}
+
+/* Runs a table of tokens through a French contractions filter. Each entry
+   gives the input token and the expected output; a NULL output means the
+   filter is expected to return 0 and leave no token. */
+static void test_fts_filter_contractions_fr(void)
+{
+	struct {
+		const char *input;
+		const char *output;
+	} tests[] = {
+		{ "foo", "foo" },
+		{ "you're", "you're" },
+		{ "l'homme", "homme" },
+		{ "l\xE2\x80\x99homme", "homme" },
+		{ "aujourd'hui", "aujourd'hui" },
+		{ "qu\xE2\x80\x99il", "il" },
+		{ "qu'il", "il" },
+		{ "du'il", "du'il" },
+		{ "que", "que" },
+		{ "'foobar'", "'foobar'" },
+		{ "foo'bar", "foo'bar" },
+		{ "a'foo", "a'foo" },
+		{ "cu'", "cu'" },
+		{ "qu", "qu" },
+		{ "d", "d" },
+		{ "qu'", NULL }
+	};
+	struct fts_filter *filter;
+	const char *error;
+	const char *token;
+	unsigned int i;
+	int ret;
+
+	test_begin("fts filter contractions, French");
+	test_assert(fts_filter_create(fts_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0);
+
+	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		token = tests[i].input;
+		ret = fts_filter_filter(filter, &token, &error);
+		/* report the table index so a failing entry can be found */
+		test_assert_idx(ret >= 0, i);
+		if (ret > 0) {
+			/* check the expectation first: a token returned where
+			   none was expected must fail the test, not crash
+			   inside strcmp() */
+			test_assert_idx(tests[i].output != NULL &&
+					strcmp(token, tests[i].output) == 0, i);
+		} else if (ret == 0) {
+			test_assert_idx(token == NULL && tests[i].output == NULL, i);
+		}
+	}
+	fts_filter_unref(&filter);
test_end();
}
static void test_fts_filter_stopwords_fra(void)
{
- const struct fts_language french = { .name = "fr" };
struct fts_filter *filter;
const char *error;
int ret;
const char *token;
test_begin("fts filter stopwords, French");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french, stopword_settings, &filter, &error) == 0);
+ test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0);
ip = input;
op = output;
{
struct fts_filter *stemmer;
const char *error;
- struct fts_language language = { .name = "fr" };
const char *token = NULL;
const char * const tokens[] = {
"Tous", "les", "\xC3\xAAtres", "humains", "naissent",
const char * const *bpp;
test_begin("fts filter stem French");
- test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &language, NULL, &stemmer, &error) == 0);
+ test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
token = *tpp;
{
static void (*test_functions[])(void) = {
test_fts_filter_find,
+ test_fts_filter_contractions_fail,
+ test_fts_filter_contractions_fr,
test_fts_filter_lowercase,
test_fts_filter_stopwords_eng,
test_fts_filter_stopwords_fin,