#include "buffer.h"
#include "str.h"
#include "unichar.h" /* unicode replacement char */
-#include "fts-tokenizer-common.h"
+#include "fts-filter-common.h"
#include "fts-filter-private.h"
#include "fts-language.h"
UTransliterator *transliterator;
buffer_t *utf16_token, *trans_token;
string_t *utf8_token;
- unsigned int maxlen;
};
static void fts_filter_normalizer_icu_destroy(struct fts_filter *filter)
np->utf16_token = buffer_create_dynamic(pp, 128);
np->trans_token = buffer_create_dynamic(pp, 128);
np->utf8_token = buffer_create_dynamic(pp, 128);
- np->maxlen = max_length;
+ np->filter.max_length = max_length;
*filter_r = &np->filter;
return 0;
}
fts_icu_utf16_to_utf8(np->utf8_token, np->trans_token->data,
np->trans_token->used / sizeof(UChar));
- if (str_len(np->utf8_token) > np->maxlen) {
- size_t len = np->maxlen;
- fts_tokenizer_delete_trailing_partial_char(np->utf8_token->data, &len);
- str_truncate(np->utf8_token, len);
- }
+ fts_filter_truncate_token(np->utf8_token, np->filter.max_length);
*token = str_c(np->utf8_token);
return 1;
}
test_end();
}
+static void test_fts_filter_normalizer_truncation(void) /* checks that a token cut at "maxlen" does not end in a partial UTF-8 character */
+{
+ struct fts_filter *norm = NULL;
+ const char *settings[] =
+ {"id", "Any-Lower;", "maxlen", "10", /* limit of 10 (presumably bytes) against an 11-byte input token */
+ NULL};
+ const char *error = NULL;
+ const char *token = "abcdefghi\xC3\x85"; /* 9 ASCII bytes followed by the 2-byte UTF-8 sequence for U+00C5 */
+
+ test_begin("fts filter normalizer token truncated mid letter");
+ test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL,
+ settings, &norm, &error) == 0);
+ test_assert(error == NULL);
+ test_assert(fts_filter_filter(norm, &token, &error) >= 0); /* on return, token points at the filtered result */
+ test_assert(strcmp(token, "abcdefghi") == 0); /* the whole multi-byte character must be dropped, not just its last byte */
+ fts_filter_unref(&norm);
+ test_end();
+}
+
#ifdef HAVE_FTS_STEMMER
static void test_fts_filter_normalizer_stopwords_stemmer_eng(void)
{
test_fts_filter_normalizer_baddata,
test_fts_filter_normalizer_invalid_id,
test_fts_filter_normalizer_oversized,
+ test_fts_filter_normalizer_truncation,
#ifdef HAVE_FTS_STEMMER
test_fts_filter_normalizer_stopwords_stemmer_eng,
test_fts_filter_stopwords_normalizer_stemmer_no,