]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: Cut overlong strings in lowercase filter.
author Teemu Huovila <teemu.huovila@dovecot.fi>
Sun, 21 Aug 2016 19:12:33 +0000 (22:12 +0300)
committer Timo Sirainen <timo.sirainen@dovecot.fi>
Mon, 22 Aug 2016 22:03:27 +0000 (01:03 +0300)
Added a new common truncate function for filters. It also removes any partial
characters that would remain from plain truncation.

src/lib-fts/Makefile.am
src/lib-fts/fts-filter-common.c [new file with mode: 0644]
src/lib-fts/fts-filter-common.h [new file with mode: 0644]
src/lib-fts/fts-filter-lowercase.c
src/lib-fts/test-fts-filter.c

index 1219ddf82a9c39c1abfff4ebd9ab64899492509e..91f905fa496f603831e212a99dd01674a07f920b 100644 (file)
@@ -78,6 +78,7 @@ libfts_la_LIBADD = \
 libfts_la_SOURCES = \
        fts-filter.c \
        fts-filter-contractions.c \
+       fts-filter-common.c \
        fts-filter-english-possessive.c \
        fts-filter-lowercase.c \
        fts-filter-normalizer-icu.c \
@@ -94,6 +95,7 @@ libfts_la_SOURCES = \
 headers = \
        fts-common.h \
        fts-filter.h \
+       fts-filter-common.h \
        fts-filter-private.h \
        fts-icu.h \
        fts-language.h \
diff --git a/src/lib-fts/fts-filter-common.c b/src/lib-fts/fts-filter-common.c
new file mode 100644 (file)
index 0000000..6335810
--- /dev/null
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "unichar.h"
+#include "fts-filter-private.h"
+#include "fts-filter-common.h"
+#include "fts-tokenizer-common.h"
+
+void fts_filter_truncate_token(string_t *token, size_t max_length)
+{
+       if (str_len(token) <= max_length)
+               return;
+
+       size_t len = max_length;
+       fts_tokenizer_delete_trailing_partial_char(token->data, &len);
+       str_truncate(token, len);
+       i_assert(len <= max_length);
+}
diff --git a/src/lib-fts/fts-filter-common.h b/src/lib-fts/fts-filter-common.h
new file mode 100644 (file)
index 0000000..7b6552c
--- /dev/null
@@ -0,0 +1,6 @@
+#ifndef FTS_FILTER_COMMON_H
+#define FTS_FILTER_COMMON_H
+
+void fts_filter_truncate_token(string_t *token, size_t max_length);
+
+#endif
index e0edcf8056570baedc04be68f459c931edcfb689..8aa1cd1d7ce26c38e85829d97baf33bf325ecb02 100644 (file)
@@ -7,23 +7,37 @@
 
 #ifdef HAVE_LIBICU
 #  include "fts-icu.h"
+#  include "fts-filter-common.h"
 #endif
 
 static int
 fts_filter_lowercase_create(const struct fts_language *lang ATTR_UNUSED,
-                           const char *const *settings,
-                           struct fts_filter **filter_r,
-                           const char **error_r)
+                            const char *const *settings,
+                            struct fts_filter **filter_r,
+                            const char **error_r)
 {
        struct fts_filter *filter;
+       unsigned int i, max_length = 250;
 
-       if (settings[0] != NULL) {
-               *error_r = t_strdup_printf("Unknown setting: %s", settings[0]);
-               return -1;
+       for (i = 0; settings[i] != NULL; i += 2) {
+               const char *key = settings[i], *value = settings[i+1];
+
+               if (strcmp(key, "maxlen") == 0) {
+                       if (str_to_uint(value, &max_length) < 0 ||
+                           max_length == 0) {
+                               *error_r = t_strdup_printf("Invalid lowercase filter maxlen setting: %s", value);
+                               return -1;
+                       }
+               }
+               else {
+                       *error_r = t_strdup_printf("Unknown setting: %s", key);
+                       return -1;
+               }
        }
        filter = i_new(struct fts_filter, 1);
        *filter = *fts_filter_lowercase;
        filter->token = str_new(default_pool, 64);
+       filter->max_length = max_length;
 
        *filter_r = filter;
        return 0;
@@ -31,12 +45,13 @@ fts_filter_lowercase_create(const struct fts_language *lang ATTR_UNUSED,
 
 static int
 fts_filter_lowercase_filter(struct fts_filter *filter ATTR_UNUSED,
-                           const char **token,
-                           const char **error_r ATTR_UNUSED)
+                            const char **token,
+                            const char **error_r ATTR_UNUSED)
 {
 #ifdef HAVE_LIBICU
        str_truncate(filter->token, 0);
        fts_icu_lcase(filter->token, *token);
+       fts_filter_truncate_token(filter->token, filter->max_length);
        *token = str_c(filter->token);
 #else
        *token = t_str_lcase(*token);
index 6c5f1a7b2c8f83b6c59908a3831bb8b9381f99ca..8f8ff48a69693a46b8bc6ab7e8aac2666ae08bc9 100644 (file)
@@ -139,7 +139,36 @@ static void test_fts_filter_lowercase_utf8(void)
        for (i = 0; i < N_ELEMENTS(tests); i++) {
                token = tests[i].input;
                test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
-                               strcmp(token, tests[i].output) == 0, 0);
+                               strcmp(token, tests[i].output) == 0, 0);
+       }
+       fts_filter_unref(&filter);
+       test_end();
+}
+
+static void test_fts_filter_lowercase_too_long_utf8(void)
+{
+       struct {
+               const char *input;
+               const char *output;
+       } tests[] = {
+               { "f\xC3\x85\xC3\x85", "f\xC3\xA5\xC3\xA5" },
+               { "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxy" },
+               { "abc\xC3\x85""defghijklmnopqrstuvwxyz", "abc\xC3\xA5""defghijklmnopqrstuvw" },
+               { "abcdefghijklmnopqrstuvwx\xC3\x85", "abcdefghijklmnopqrstuvwx" }
+       };
+       struct fts_filter *filter;
+       const char *error;
+       const char *token;
+       const char * const settings[] = {"maxlen", "25", NULL};
+       unsigned int i;
+
+       test_begin("fts filter lowercase, too long UTF8");
+       test_assert(fts_filter_create(fts_filter_lowercase, NULL, &english_language, settings, &filter, &error) == 0);
+
+       for (i = 0; i < N_ELEMENTS(tests); i++) {
+               token = tests[i].input;
+               test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
+                               strcmp(token, tests[i].output) == 0, 0);
        }
        fts_filter_unref(&filter);
        test_end();
@@ -936,6 +965,7 @@ int main(void)
                test_fts_filter_lowercase,
 #ifdef HAVE_LIBICU
                test_fts_filter_lowercase_utf8,
+               test_fts_filter_lowercase_too_long_utf8,
 #endif
                test_fts_filter_stopwords_eng,
                test_fts_filter_stopwords_fin,