git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: ICU normalization changes some characters to spaces - remove them.
author Timo Sirainen <tss@iki.fi>
Fri, 22 May 2015 02:03:10 +0000 (22:03 -0400)
committer Timo Sirainen <tss@iki.fi>
Fri, 22 May 2015 02:03:10 +0000 (22:03 -0400)
We don't really want to add spaces to our index. It would be nice if the
words between spaces were actually split into different tokens, but that's
more of the fts-tokenizer's job, and at the filter stage that's probably not
wanted anymore.

src/lib-fts/fts-filter-normalizer-icu.c
src/lib-fts/test-fts-filter.c

index e1704ad928813ddf1f342dbbb4cc5573625207ea..11eb54f16c232b1e35c9ced26f6d6244a538efbe 100644 (file)
@@ -41,7 +41,7 @@ fts_filter_normalizer_icu_create(const struct fts_language *lang ATTR_UNUSED,
        struct fts_filter_normalizer_icu *np;
        pool_t pp;
        unsigned int i;
-       const char *id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC";
+       const char *id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC; [\\x20] Remove";
 
        for (i = 0; settings[i] != NULL; i += 2) {
                const char *key = settings[i], *value = settings[i+1];
index 95b1c07daac5a886826e09dbf8829c508ff2bea1..93d511bbf40a26ad9e2619bc8ee5affa267a97a7 100644 (file)
@@ -372,7 +372,7 @@ static void test_fts_filter_normalizer_swedish_short_default_id(void)
                "vem",
                "a",
                "aao",
-               "vem kan segla forutan vind?\naaooaa"
+               "vemkanseglaforutanvind?\naaooaa"
        };
        const char *error = NULL;
        const char *token = NULL;
@@ -446,12 +446,13 @@ static void test_fts_filter_normalizer_empty(void)
 {
        /* test just a couple of these */
        static const char *empty_tokens[] = {
+               "\xC2\xAF", /* U+00AF */
                "\xCC\x80", /* U+0300 */
                "\xF3\xA0\x87\xAF", /* U+E01EF */
                "\xCC\x80\xF3\xA0\x87\xAF" /* U+0300 U+E01EF */
        };
        const char * const settings[] =
-               {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
+               {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; [\\x20] Remove", NULL};
        struct fts_filter *norm;
        const char *error;
        unsigned int i;