From: Teemu Huovila Date: Sun, 21 Aug 2016 19:12:33 +0000 (+0300) Subject: lib-fts: Cut overlong strings in lowercase filter. X-Git-Tag: 2.3.0.rc1~3117 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=5fcd30add8dcf4d883978cce3e39f3a89184f1e5;p=thirdparty%2Fdovecot%2Fcore.git lib-fts: Cut overlong strings in lowercase filter. Added new common truncate function for filters. It also removes any partial characters that would remain from plain truncation. --- diff --git a/src/lib-fts/Makefile.am b/src/lib-fts/Makefile.am index 1219ddf82a..91f905fa49 100644 --- a/src/lib-fts/Makefile.am +++ b/src/lib-fts/Makefile.am @@ -78,6 +78,7 @@ libfts_la_LIBADD = \ libfts_la_SOURCES = \ fts-filter.c \ fts-filter-contractions.c \ + fts-filter-common.c \ fts-filter-english-possessive.c \ fts-filter-lowercase.c \ fts-filter-normalizer-icu.c \ @@ -94,6 +95,7 @@ libfts_la_SOURCES = \ headers = \ fts-common.h \ fts-filter.h \ + fts-filter-common.h \ fts-filter-private.h \ fts-icu.h \ fts-language.h \ diff --git a/src/lib-fts/fts-filter-common.c b/src/lib-fts/fts-filter-common.c new file mode 100644 index 0000000000..63358108f0 --- /dev/null +++ b/src/lib-fts/fts-filter-common.c @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 Dovecot authors, see the included COPYING file */ + +#include "lib.h" +#include "str.h" +#include "unichar.h" +#include "fts-filter-private.h" +#include "fts-filter-common.h" +#include "fts-tokenizer-common.h" + +void fts_filter_truncate_token(string_t *token, size_t max_length) +{ + if (str_len(token) <= max_length) + return; + + size_t len = max_length; + fts_tokenizer_delete_trailing_partial_char(token->data, &len); + str_truncate(token, len); + i_assert(len <= max_length); +} diff --git a/src/lib-fts/fts-filter-common.h b/src/lib-fts/fts-filter-common.h new file mode 100644 index 0000000000..7b6552cf5d --- /dev/null +++ b/src/lib-fts/fts-filter-common.h @@ -0,0 +1,6 @@ +#ifndef FTS_FILTER_COMMON_H +#define FTS_FILTER_COMMON_H + +void 
fts_filter_truncate_token(string_t *token, size_t max_length); + +#endif diff --git a/src/lib-fts/fts-filter-lowercase.c b/src/lib-fts/fts-filter-lowercase.c index e0edcf8056..8aa1cd1d7c 100644 --- a/src/lib-fts/fts-filter-lowercase.c +++ b/src/lib-fts/fts-filter-lowercase.c @@ -7,23 +7,37 @@ #ifdef HAVE_LIBICU # include "fts-icu.h" +# include "fts-filter-common.h" #endif static int fts_filter_lowercase_create(const struct fts_language *lang ATTR_UNUSED, - const char *const *settings, - struct fts_filter **filter_r, - const char **error_r) + const char *const *settings, + struct fts_filter **filter_r, + const char **error_r) { struct fts_filter *filter; + unsigned int i, max_length = 250; - if (settings[0] != NULL) { - *error_r = t_strdup_printf("Unknown setting: %s", settings[0]); - return -1; + for (i = 0; settings[i] != NULL; i += 2) { + const char *key = settings[i], *value = settings[i+1]; + + if (strcmp(key, "maxlen") == 0) { + if (str_to_uint(value, &max_length) < 0 || + max_length == 0) { + *error_r = t_strdup_printf("Invalid lowercase filter maxlen setting: %s", value); + return -1; + } + } + else { + *error_r = t_strdup_printf("Unknown setting: %s", key); + return -1; + } } filter = i_new(struct fts_filter, 1); *filter = *fts_filter_lowercase; filter->token = str_new(default_pool, 64); + filter->max_length = max_length; *filter_r = filter; return 0; @@ -31,12 +45,13 @@ fts_filter_lowercase_create(const struct fts_language *lang ATTR_UNUSED, static int fts_filter_lowercase_filter(struct fts_filter *filter ATTR_UNUSED, - const char **token, - const char **error_r ATTR_UNUSED) + const char **token, + const char **error_r ATTR_UNUSED) { #ifdef HAVE_LIBICU str_truncate(filter->token, 0); fts_icu_lcase(filter->token, *token); + fts_filter_truncate_token(filter->token, filter->max_length); *token = str_c(filter->token); #else *token = t_str_lcase(*token); diff --git a/src/lib-fts/test-fts-filter.c b/src/lib-fts/test-fts-filter.c index 6c5f1a7b2c..8f8ff48a69 
100644 --- a/src/lib-fts/test-fts-filter.c +++ b/src/lib-fts/test-fts-filter.c @@ -139,7 +139,36 @@ static void test_fts_filter_lowercase_utf8(void) for (i = 0; i < N_ELEMENTS(tests); i++) { token = tests[i].input; test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 && - strcmp(token, tests[i].output) == 0, 0); + strcmp(token, tests[i].output) == 0, 0); + } + fts_filter_unref(&filter); + test_end(); +} + +static void test_fts_filter_lowercase_too_long_utf8(void) +{ + struct { + const char *input; + const char *output; + } tests[] = { + { "f\xC3\x85\xC3\x85", "f\xC3\xA5\xC3\xA5" }, + { "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxy" }, + { "abc\xC3\x85""defghijklmnopqrstuvwxyz", "abc\xC3\xA5""defghijklmnopqrstuvw" }, + { "abcdefghijklmnopqrstuvwx\xC3\x85", "abcdefghijklmnopqrstuvwx" } + }; + struct fts_filter *filter; + const char *error; + const char *token; + const char * const settings[] = {"maxlen", "25", NULL}; + unsigned int i; + + test_begin("fts filter lowercase, too long UTF8"); + test_assert(fts_filter_create(fts_filter_lowercase, NULL, &english_language, settings, &filter, &error) == 0); + + for (i = 0; i < N_ELEMENTS(tests); i++) { + token = tests[i].input; + test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 && + strcmp(token, tests[i].output) == 0, 0); } fts_filter_unref(&filter); test_end(); @@ -936,6 +965,7 @@ int main(void) test_fts_filter_lowercase, #ifdef HAVE_LIBICU test_fts_filter_lowercase_utf8, + test_fts_filter_lowercase_too_long_utf8, #endif test_fts_filter_stopwords_eng, test_fts_filter_stopwords_fin,