]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: fts_filter_stemmer_snowball_filter() - Handle cases where the stemmer return...
author Marco Bettini <marco.bettini@open-xchange.com>
Tue, 22 Nov 2022 14:28:53 +0000 (14:28 +0000)
committer aki.tuomi <aki.tuomi@open-xchange.com>
Mon, 28 Nov 2022 18:28:50 +0000 (18:28 +0000)
Fixes an issue originally raised against flatcurve in GitHub Issue #37,
where, for some combinations of languages and filters, indexing crashes.
The root cause was the incorrect assumption that snowball ALWAYS returns
a token, which is not true.

src/lib-fts/fts-filter-stemmer-snowball.c

index 96d91fdf1462d293be8a9c00626215ea5fd12705..d4450329dbf98ecf720f5898603002b193d858c0 100644 (file)
@@ -87,7 +87,19 @@ fts_filter_stemmer_snowball_filter(struct fts_filter *filter,
                               "sb_stemmer_stem(len=%zu) failed: Out of memory",
                               strlen(*token));
        }
-       *token = t_strndup(base, sb_stemmer_length(sp->stemmer));
+       int len = sb_stemmer_length(sp->stemmer);
+       if (len > 0)
+               *token = t_strndup(base, len);
+       else {
+               /* If the stemmer returns an empty token, the return value
+                * should be 0 instead of 1 (otherwise it causes an assertion
+                * fault in fts_filter_filter() ).
+                * However, removing tokens may bring the same kind of issues
+                * and inconsistencies that stopwords cause when used with
+                * multiple languages and negations.
+                * So, when the stemmer asks to remove a token,
+                * keep the original token unchanged instead. */
+       }
        return 1;
 }