git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
fts-lucene: Support normalize setting also without snowball. Added no_snowball setting.
author Timo Sirainen <tss@iki.fi>
Sun, 9 Jun 2013 00:10:43 +0000 (03:10 +0300)
committer Timo Sirainen <tss@iki.fi>
Sun, 9 Jun 2013 00:10:43 +0000 (03:10 +0300)
Snowball seems to be converting / breaking words down rather annoyingly.

src/plugins/fts-lucene/fts-lucene-plugin.c
src/plugins/fts-lucene/fts-lucene-plugin.h
src/plugins/fts-lucene/lucene-wrapper.cc

index d4f078b124f9380b937ddf83a9e740747a2f2a88..be5fcc9632de4586a4e7b804250badf34bb89e87 100644 (file)
@@ -30,6 +30,8 @@ fts_lucene_plugin_init_settings(struct mail_user *user,
                        set->whitespace_chars = p_strdup(user->pool, *tmp + 17);
                } else if (strcmp(*tmp, "normalize") == 0) {
                        set->normalize = TRUE;
+               } else if (strcmp(*tmp, "no_snowball") == 0) {
+                       set->no_snowball = TRUE;
                } else {
                        i_error("fts_lucene: Invalid setting: %s", *tmp);
                        return -1;
@@ -51,11 +53,6 @@ fts_lucene_plugin_init_settings(struct mail_user *user,
                        "but Dovecot built without stemmer support");
                return -1;
        }
-       if (set->normalize) {
-               i_error("fts_lucene: normalize not currently supported "
-                       "without stemmer support");
-               return -1;
-       }
 #else
        if (set->default_language == NULL)
                set->default_language = "english";
@@ -80,6 +77,8 @@ uint32_t fts_lucene_settings_checksum(const struct fts_lucene_settings *set)
        crc = crc32_str_more(crc, set->whitespace_chars);
        if (set->normalize)
                crc = crc32_str_more(crc, "n");
+       if (set->no_snowball)
+               crc = crc32_str_more(crc, "s");
        return crc;
 }
 
index 42587f39e02eaaa7d3bf2a5f2df826aa8d4d6e87..c5be44c552d4697b84aa39a10a862c40c0943188 100644 (file)
@@ -13,6 +13,7 @@ struct fts_lucene_settings {
        const char *textcat_conf, *textcat_dir;
        const char *whitespace_chars;
        bool normalize;
+       bool no_snowball;
 };
 
 struct fts_lucene_user {
index c57b67674c37a6ec9c3ab79773cd1df22916793f..3eea52e1c3d0fe7e606e6177cfbe3856d3538383 100644 (file)
@@ -67,6 +67,7 @@ struct lucene_index {
        IndexWriter *writer;
        IndexSearcher *searcher;
 
+       buffer_t *normalizer_buf;
        Analyzer *default_analyzer, *cur_analyzer;
        ARRAY(struct lucene_analyzer) analyzers;
 
@@ -118,13 +119,20 @@ struct lucene_index *lucene_index_init(const char *path,
                index->set.default_language = "";
        }
 #ifdef HAVE_LUCENE_STEMMER
-       index->default_analyzer =
-               _CLNEW snowball::SnowballAnalyzer(index->normalizer,
-                                                 index->set.default_language);
-#else
-       index->default_analyzer = _CLNEW standard::StandardAnalyzer();
-       i_assert(index->normalizer == NULL);
+       if (!set->no_snowball) {
+               index->default_analyzer =
+                       _CLNEW snowball::SnowballAnalyzer(index->normalizer,
+                                                         index->set.default_language);
+       }
 #endif
+       else {
+               index->default_analyzer = _CLNEW standard::StandardAnalyzer();
+               if (index->normalizer != NULL) {
+                       index->normalizer_buf =
+                               buffer_create_dynamic(default_pool, 1024);
+               }
+       }
+
        i_array_init(&index->analyzers, 32);
        textcat_refcount++;
 
@@ -155,6 +163,8 @@ void lucene_index_deinit(struct lucene_index *index)
                textcat = NULL;
        }
        _CLDELETE(index->default_analyzer);
+       if (index->normalizer_buf != NULL)
+               buffer_free(&index->normalizer_buf);
        i_free(index->path);
        i_free(index);
 }
@@ -517,6 +527,13 @@ int lucene_index_build_more(struct lucene_index *index, uint32_t uid,
                index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
        }
 
+       if (index->normalizer_buf != NULL) {
+               buffer_set_used_size(index->normalizer_buf, 0);
+               index->normalizer(data, size, index->normalizer_buf);
+               data = (const unsigned char *)index->normalizer_buf->data;
+               size = index->normalizer_buf->used;
+       }
+
        datasize = uni_utf8_strlen_n(data, size) + 1;
        wchar_t dest[datasize];
        lucene_utf8_n_to_tchar(data, size, dest, datasize);
@@ -1055,8 +1072,18 @@ static Query *
 lucene_get_query_str(struct lucene_index *index,
                     const TCHAR *key, const char *str, bool fuzzy)
 {
-       const TCHAR *wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
-       Analyzer *analyzer = guess_analyzer(index, str, strlen(str));
+       const TCHAR *wvalue;
+       Analyzer *analyzer;
+
+       if (index->normalizer_buf != NULL) {
+               buffer_set_used_size(index->normalizer_buf, 0);
+               index->normalizer(str, strlen(str), index->normalizer_buf);
+               buffer_append_c(index->normalizer_buf, '\0');
+               str = (const char *)index->normalizer_buf->data;
+       }
+
+       wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
+       analyzer = guess_analyzer(index, str, strlen(str));
        if (analyzer == NULL)
                analyzer = index->default_analyzer;