libfts_la_SOURCES = \
fts-filter.c \
fts-filter-contractions.c \
+ fts-filter-common.c \
fts-filter-english-possessive.c \
fts-filter-lowercase.c \
fts-filter-normalizer-icu.c \
headers = \
fts-common.h \
fts-filter.h \
+ fts-filter-common.h \
fts-filter-private.h \
fts-icu.h \
fts-language.h \
--- /dev/null
+/* Copyright (c) 2016 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "unichar.h"
+#include "fts-filter-private.h"
+#include "fts-filter-common.h"
+#include "fts-tokenizer-common.h"
+
+/* Shorten token so that it is at most max_length bytes long. A token that
+   already fits is left untouched. Before cutting, the target length is
+   passed through fts_tokenizer_delete_trailing_partial_char(), presumably
+   so that the result doesn't end in the middle of a multi-byte UTF-8
+   character. */
+void fts_filter_truncate_token(string_t *token, size_t max_length)
+{
+	size_t new_len = max_length;
+
+	if (str_len(token) > max_length) {
+		fts_tokenizer_delete_trailing_partial_char(token->data, &new_len);
+		str_truncate(token, new_len);
+		/* the helper is only expected to shrink the length */
+		i_assert(new_len <= max_length);
+	}
+}
--- /dev/null
+#ifndef FTS_FILTER_COMMON_H
+#define FTS_FILTER_COMMON_H
+/* Shorten token to <= max_length bytes; shorter tokens are left as they are. */
+void fts_filter_truncate_token(string_t *token, size_t max_length);
+
+#endif
#ifdef HAVE_LIBICU
# include "fts-icu.h"
+# include "fts-filter-common.h"
#endif
static int
fts_filter_lowercase_create(const struct fts_language *lang ATTR_UNUSED,
- const char *const *settings,
- struct fts_filter **filter_r,
- const char **error_r)
+ const char *const *settings,
+ struct fts_filter **filter_r,
+ const char **error_r)
{
struct fts_filter *filter;
+ unsigned int i, max_length = 250;
- if (settings[0] != NULL) {
- *error_r = t_strdup_printf("Unknown setting: %s", settings[0]);
- return -1;
+ for (i = 0; settings[i] != NULL; i += 2) {
+ const char *key = settings[i], *value = settings[i+1];
+
+ if (strcmp(key, "maxlen") == 0) {
+ if (str_to_uint(value, &max_length) < 0 ||
+ max_length == 0) {
+ *error_r = t_strdup_printf("Invalid lowercase filter maxlen setting: %s", value);
+ return -1;
+ }
+ }
+ else {
+ *error_r = t_strdup_printf("Unknown setting: %s", key);
+ return -1;
+ }
}
filter = i_new(struct fts_filter, 1);
*filter = *fts_filter_lowercase;
filter->token = str_new(default_pool, 64);
+ filter->max_length = max_length;
*filter_r = filter;
return 0;
static int
fts_filter_lowercase_filter(struct fts_filter *filter ATTR_UNUSED,
- const char **token,
- const char **error_r ATTR_UNUSED)
+ const char **token,
+ const char **error_r ATTR_UNUSED)
{
#ifdef HAVE_LIBICU
str_truncate(filter->token, 0);
fts_icu_lcase(filter->token, *token);
+ fts_filter_truncate_token(filter->token, filter->max_length);
*token = str_c(filter->token);
#else
*token = t_str_lcase(*token);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
- strcmp(token, tests[i].output) == 0, 0);
+ strcmp(token, tests[i].output) == 0, 0);
+ }
+ fts_filter_unref(&filter);
+ test_end();
+}
+
+static void test_fts_filter_lowercase_too_long_utf8(void)
+{
+ struct {
+ const char *input;
+ const char *output;
+ } tests[] = {
+ { "f\xC3\x85\xC3\x85", "f\xC3\xA5\xC3\xA5" },
+ { "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxy" },
+ { "abc\xC3\x85""defghijklmnopqrstuvwxyz", "abc\xC3\xA5""defghijklmnopqrstuvw" },
+ { "abcdefghijklmnopqrstuvwx\xC3\x85", "abcdefghijklmnopqrstuvwx" }
+ };
+ struct fts_filter *filter;
+ const char *error;
+ const char *token;
+ const char * const settings[] = {"maxlen", "25", NULL};
+ unsigned int i;
+
+ test_begin("fts filter lowercase, too long UTF8");
+ test_assert(fts_filter_create(fts_filter_lowercase, NULL, &english_language, settings, &filter, &error) == 0);
+
+ for (i = 0; i < N_ELEMENTS(tests); i++) {
+ token = tests[i].input;
+ test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
+ strcmp(token, tests[i].output) == 0, 0);
}
fts_filter_unref(&filter);
test_end();
test_fts_filter_lowercase,
#ifdef HAVE_LIBICU
test_fts_filter_lowercase_utf8,
+ test_fts_filter_lowercase_too_long_utf8,
#endif
test_fts_filter_stopwords_eng,
test_fts_filter_stopwords_fin,