endif
if BUILD_LIBICU
+ICU_SOURCES = fts-icu.c
NORMALIZER_LIBS = $(LIBICU_LIBS)
+ICU_TESTS = test-fts-icu
endif
libfts_la_LIBADD = \
fts-language.c \
fts-tokenizer.c \
fts-tokenizer-address.c \
- fts-tokenizer-generic.c
+ fts-tokenizer-generic.c \
+ $(ICU_SOURCES)
noinst_HEADERS = \
fts-filter.h \
fts-filter-private.h \
+ fts-icu.h \
fts-language.h \
fts-tokenizer.h \
fts-tokenizer-private.h \
fts-tokenizer-generic-private.h
test_programs = \
- test-fts-filter \
+ $(ICU_TESTS) \
$(TEST_FTS_LANGUAGE) \
+ test-fts-filter \
test-fts-tokenizer
noinst_PROGRAMS = $(test_programs)
../lib/liblib.la
test_deps = $(noinst_LTLIBRARIES) $(test_libs)
+test_fts_icu_SOURCES = test-fts-icu.c
+test_fts_icu_LDADD = fts-icu.lo $(LIBICU_LIBS) $(test_libs)
+test_fts_icu_DEPENDENCIES = fts-icu.lo $(test_deps)
+
test_fts_filter_SOURCES = test-fts-filter.c
test_fts_filter_LDADD = libfts.la $(test_libs)
test_fts_filter_DEPENDENCIES = libfts.la $(test_deps)
#include "fts-language.h"
#ifdef HAVE_LIBICU
-
-#include <unicode/utrans.h>
-#include <unicode/uenum.h>
-#include <unicode/ustring.h>
-#include <unicode/ucnv.h>
-#include <stdlib.h>
+#include "fts-icu.h"
struct fts_filter_normalizer_icu {
struct fts_filter filter;
pool_t pool;
const char *transliterator_id;
+ /* UTF-16 copy of transliterator_id, duplicated into the pool when the
+    filter is created; this is what gets passed to utrans_openU(). */
+ const UChar *transliterator_id_utf16;
+ /* Length of transliterator_id_utf16 in UChars (not bytes). */
+ unsigned int transliterator_id_utf16_len;
+
UTransliterator *transliterator;
+ /* Reusable per-filter work buffers: utf16_token holds the incoming
+    token as UTF-16, trans_token receives the transliterated UTF-16. */
+ buffer_t *utf16_token, *trans_token;
+ /* Holds the UTF-8 result; the token returned to the caller points
+    into this buffer. */
+ string_t *utf8_token;
};
-/* Helper to create UTF16, which libicu wants as input.
-
- On input, if *dst_uchars_r > 0, it indicates the number of UChar
- sized units that should be allocated for the text. However, the
- function will not use the number, if the text will not fit in that
- amount.
-
- On return *dst_uchars_r will contain the number of UChar sized units
- allocated for the dst. NOT the number of bytes nor the length of the
- text. */
-static void make_uchar(const char *src, UChar **dst, int32_t *dst_uchars_r)
-{
- UErrorCode err = U_ZERO_ERROR;
- int32_t len = strlen(src);
- int32_t ustr_len = 0;
- int32_t ustr_len_actual = 0;
- UChar *retp = NULL;
- int32_t alloc_uchars = 0;
-
- i_assert(dst_uchars_r != NULL);
-
- /* Check length required for encoded dst. */
- retp = u_strFromUTF8(NULL, 0, &ustr_len, src, len, &err);
-
- /* When preflighting a successful call returns a buffer overflow
- error. */
- if (U_BUFFER_OVERFLOW_ERROR != err && U_FAILURE(err)) {
- i_panic("Failed to estimate allocation size with lib ICU"
- " u_strFromUTF8(): %s",u_errorName(err));
- }
- i_assert(NULL == retp);
-
- err = U_ZERO_ERROR;
- if (*dst_uchars_r > 0 && *dst_uchars_r > ustr_len)
- alloc_uchars = *dst_uchars_r;
- else
- alloc_uchars = ustr_len;
- alloc_uchars++; /* room for null bytes(2) */
- *dst = t_malloc(alloc_uchars * sizeof(UChar));
- *dst_uchars_r = alloc_uchars;
- retp = u_strFromUTF8(*dst, alloc_uchars, &ustr_len_actual,
- src, len, &err);
-
- if (U_FAILURE(err))
- i_panic("Lib ICU u_strFromUTF8 failed: %s", u_errorName(err));
- i_assert(retp == *dst);
- i_assert(ustr_len == ustr_len_actual);
-}
-
-static void make_utf8(const UChar *src, const char **_dst)
-{
- char *dst;
- char *retp = NULL;
- int32_t dsize = 0;
- int32_t dsize_actual = 0;
- int32_t sub_num = 0;
- UErrorCode err = U_ZERO_ERROR;
- int32_t usrc_len = u_strlen(src); /* libicu selects different codepaths
- depending if srclen -1 or not */
-
- retp = u_strToUTF8WithSub(NULL, 0, &dsize, src, usrc_len,
- UNICODE_REPLACEMENT_CHAR, &sub_num, &err);
-
- /* Preflighting can cause buffer overflow to be reported */
- if (U_BUFFER_OVERFLOW_ERROR != err && U_FAILURE(err)) {
- i_panic("Failed to estimate allocation size with lib ICU"
- " u_strToUTF8(): %s",u_errorName(err));
- }
- i_assert(0 == sub_num);
- i_assert(NULL == retp);
-
- dsize++; /* room for '\0' byte */
- dst = t_malloc(dsize);
- err = U_ZERO_ERROR;
- retp = u_strToUTF8WithSub(dst, dsize, &dsize_actual, src, usrc_len,
- UNICODE_REPLACEMENT_CHAR, &sub_num, &err);
- if (U_FAILURE(err))
- i_panic("Lib ICU u_strToUTF8WithSub() failed: %s",
- u_errorName(err));
- if (dsize_actual >= dsize) {
- i_panic("Produced UTF8 string length (%d) does not fit in "
- "preflighted(%d). Buffer overflow?",
- dsize_actual, dsize);
- }
- if (0 != sub_num) {
- i_panic("UTF8 string not well formed. "
- "Substitutions (%d) were made.", sub_num);
- }
- i_assert(retp == dst);
- *_dst = dst;
-}
-
static void fts_filter_normalizer_icu_destroy(struct fts_filter *filter)
{
struct fts_filter_normalizer_icu *np =
np->pool = pp;
np->filter = *fts_filter_normalizer_icu;
np->transliterator_id = p_strdup(pp, id);
+ np->utf16_token = buffer_create_dynamic(pp, 128);
+ np->trans_token = buffer_create_dynamic(pp, 128);
+ np->utf8_token = buffer_create_dynamic(pp, 128);
+ fts_icu_utf8_to_utf16(np->utf16_token, id);
+ np->transliterator_id_utf16 =
+ p_memdup(pp, np->utf16_token->data, np->utf16_token->used);
+ np->transliterator_id_utf16_len = np->utf16_token->used / sizeof(UChar);
*filter_r = &np->filter;
return 0;
}
{
UErrorCode err = U_ZERO_ERROR;
UParseError perr;
- UChar *id_uchar = NULL;
- int32_t id_len_uchar = 0;
memset(&perr, 0, sizeof(perr));
- make_uchar(np->transliterator_id, &id_uchar, &id_len_uchar);
-
- np->transliterator = utrans_openU(id_uchar, u_strlen(id_uchar),
+ np->transliterator = utrans_openU(np->transliterator_id_utf16,
+ np->transliterator_id_utf16_len,
UTRANS_FORWARD, NULL, 0, &perr, &err);
if (U_FAILURE(err)) {
string_t *str = t_str_new(128);
{
struct fts_filter_normalizer_icu *np =
(struct fts_filter_normalizer_icu *)filter;
- UErrorCode err = U_ZERO_ERROR;
- UChar *utext = NULL;
- int32_t utext_cap = 0;
- int32_t utext_len = -1;
- int32_t utext_limit;
if (np->transliterator == NULL) {
if (fts_filter_normalizer_icu_create_trans(np, error_r) < 0)
return -1;
}
- make_uchar(*token, &utext, &utext_cap);
- utext_limit = u_strlen(utext);
- utrans_transUChars(np->transliterator, utext, &utext_len,
- utext_cap, 0, &utext_limit, &err);
-
- /* Data did not fit into utext. */
- if (utext_len > utext_cap || err == U_BUFFER_OVERFLOW_ERROR) {
- /* This is a crude retry fix... Make a new utext of the
- size utrans_transUChars indicated */
- utext_len++; /* room for '\0' bytes(2) */
- utext_cap = utext_len;
- make_uchar(*token, &utext, &utext_cap);
- i_assert(utext_cap == utext_len);
- utext_limit = u_strlen(utext);
- utext_len = -1;
- err = U_ZERO_ERROR;
- utrans_transUChars(np->transliterator, utext,
- &utext_len, utext_cap, 0,
- &utext_limit, &err);
- }
-
- if (U_FAILURE(err)) {
- *error_r = t_strdup_printf("utrans_transUChars() failed: %s\n",
- u_errorName(err));
+ fts_icu_utf8_to_utf16(np->utf16_token, *token);
+ buffer_append_zero(np->utf16_token, 2);
+ buffer_set_used_size(np->utf16_token, np->utf16_token->used-2);
+ buffer_set_used_size(np->trans_token, 0);
+ if (fts_icu_translate(np->trans_token, np->utf16_token->data,
+ np->utf16_token->used / sizeof(UChar),
+ np->transliterator, error_r) < 0)
return -1;
- }
- if (utext_len == 0)
+ if (np->trans_token->used == 0)
return 0;
- make_utf8(utext, token);
+ fts_icu_utf16_to_utf8(np->utf8_token, np->trans_token->data,
+ np->trans_token->used / sizeof(UChar));
+ *token = str_c(np->utf8_token);
return 1;
}
--- /dev/null
+/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "unichar.h"
+#include "fts-icu.h"
+
+/* Convert the NUL-terminated UTF-8 string src_utf8 into UTF-16 and store
+   it in dest_utf16 starting at offset 0. On return the buffer's used size
+   is the number of UChars produced times sizeof(UChar). Panics if ICU
+   reports any failure other than the buffer being too small. */
+void fts_icu_utf8_to_utf16(buffer_t *dest_utf16, const char *src_utf8)
+{
+	const unsigned int utf8_size = strlen(src_utf8);
+	const size_t writable = buffer_get_writable_size(dest_utf16);
+	UErrorCode status = U_ZERO_ERROR;
+	UChar *dest_data, *retp = NULL;
+	int32_t utf16_len;
+
+	/* first pass: use whatever space the buffer already has */
+	dest_data = buffer_get_space_unsafe(dest_utf16, 0, writable);
+	retp = u_strFromUTF8Lenient(dest_data, writable / sizeof(UChar),
+				    &utf16_len, src_utf8, utf8_size, &status);
+
+	if (status == U_BUFFER_OVERFLOW_ERROR) {
+		/* second pass: the retry relies on utf16_len now holding the
+		   length that is needed, so grow the buffer to exactly that */
+		status = U_ZERO_ERROR;
+		dest_data = buffer_get_space_unsafe(dest_utf16, 0,
+						    utf16_len * sizeof(UChar));
+		retp = u_strFromUTF8Lenient(dest_data, utf16_len,
+					    &utf16_len, src_utf8,
+					    utf8_size, &status);
+	}
+
+	if (U_FAILURE(status)) {
+		i_panic("LibICU u_strFromUTF8Lenient() failed: %s",
+			u_errorName(status));
+	}
+
+	buffer_set_used_size(dest_utf16, utf16_len * sizeof(UChar));
+	i_assert(retp == dest_data);
+}
+
+/* Convert src_len UChars of UTF-16 from src_utf16 into UTF-8 and store the
+   result in dest_utf8 starting at offset 0. UNICODE_REPLACEMENT_CHAR is
+   handed to ICU as the substitution character. On return the used size of
+   dest_utf8 is the number of UTF-8 bytes produced. Panics if ICU reports
+   any failure other than the buffer being too small. */
+void fts_icu_utf16_to_utf8(string_t *dest_utf8, const UChar *src_utf16,
+			   unsigned int src_len)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	int32_t utf8_len = 0;
+	int32_t substitutions = 0;
+	size_t capacity = buffer_get_writable_size(dest_utf8);
+	char *dest_data, *retp = NULL;
+
+	/* first pass: use whatever space the buffer already has */
+	dest_data = buffer_get_space_unsafe(dest_utf8, 0, capacity);
+	retp = u_strToUTF8WithSub(dest_data, capacity,
+				  &utf8_len, src_utf16, src_len,
+				  UNICODE_REPLACEMENT_CHAR, &substitutions,
+				  &status);
+
+	if (status == U_BUFFER_OVERFLOW_ERROR) {
+		/* second pass: grow to the length reported by the first
+		   call and offer ICU everything that is now writable */
+		status = U_ZERO_ERROR;
+		dest_data = buffer_get_space_unsafe(dest_utf8, 0, utf8_len);
+		capacity = buffer_get_writable_size(dest_utf8);
+		retp = u_strToUTF8WithSub(dest_data, capacity, &utf8_len,
+					  src_utf16, src_len,
+					  UNICODE_REPLACEMENT_CHAR,
+					  &substitutions, &status);
+	}
+
+	if (U_FAILURE(status)) {
+		i_panic("LibICU u_strToUTF8WithSub() failed: %s",
+			u_errorName(status));
+	}
+
+	buffer_set_used_size(dest_utf8, utf8_len);
+	i_assert(retp == dest_data);
+}
+
+/* Transliterate src_len UChars from src_utf16 with the given ICU
+   transliterator and append the result to dest_utf16.
+
+   The source is first copied to the end of dest_utf16 and transliterated
+   in place there. If ICU reports U_BUFFER_OVERFLOW_ERROR, the buffer is
+   grown to the length ICU asked for, the source is copied again and the
+   transliteration is retried once.
+
+   Returns 0 on success. On failure returns -1, sets *error_r and restores
+   dest_utf16 to the size it had on entry. */
+int fts_icu_translate(buffer_t *dest_utf16, const UChar *src_utf16,
+		      unsigned int src_len, UTransliterator *transliterator,
+		      const char **error_r)
+{
+	UErrorCode err = U_ZERO_ERROR;
+	int32_t utf16_len = src_len;
+	UChar *dest_data;
+	int32_t avail_uchars, limit = src_len;
+	/* everything before dest_pos belongs to the caller and is kept */
+	size_t dest_pos = dest_utf16->used;
+
+	/* translation is done in-place in the buffer. try first with the
+	   current buffer size. */
+	buffer_append(dest_utf16, src_utf16, src_len*sizeof(UChar));
+
+	avail_uchars = (buffer_get_writable_size(dest_utf16)-dest_pos) / sizeof(UChar);
+	dest_data = buffer_get_space_unsafe(dest_utf16, dest_pos,
+				buffer_get_writable_size(dest_utf16)-dest_pos);
+	utrans_transUChars(transliterator, dest_data, &utf16_len,
+			   avail_uchars, 0, &limit, &err);
+	if (err == U_BUFFER_OVERFLOW_ERROR) {
+		/* try again with a larger buffer. utf16_len now holds the
+		   length ICU needs; the first attempt worked in place, so
+		   start over from a pristine copy of the source. */
+		err = U_ZERO_ERROR;
+		avail_uchars = utf16_len;
+		limit = utf16_len = src_len;
+		buffer_write(dest_utf16, dest_pos,
+			     src_utf16, src_len*sizeof(UChar));
+		dest_data = buffer_get_space_unsafe(dest_utf16, dest_pos,
+					avail_uchars * sizeof(UChar));
+		utrans_transUChars(transliterator, dest_data, &utf16_len,
+				   avail_uchars, 0, &limit, &err);
+		i_assert(err != U_BUFFER_OVERFLOW_ERROR);
+	}
+	if (U_FAILURE(err)) {
+		*error_r = t_strdup_printf("LibICU utrans_transUChars() failed: %s",
+					   u_errorName(err));
+		buffer_set_used_size(dest_utf16, dest_pos);
+		return -1;
+	}
+	/* the result starts at dest_pos, so the final size has to include
+	   the data that was already in the buffer on entry */
+	buffer_set_used_size(dest_utf16,
+			     dest_pos + utf16_len * sizeof(UChar));
+	return 0;
+}
--- /dev/null
+#ifndef HAVE_FTS_ICU_H
+#define HAVE_FTS_ICU_H
+
+#include <unicode/ustring.h>
+#include <unicode/utrans.h>
+
+/* Convert NUL-terminated UTF-8 input to UTF-16 output. The dest_utf16
+   contains UChars: it is overwritten from offset 0 and its used size is
+   set to the number of UChars produced times sizeof(UChar). Panics if ICU
+   reports a conversion failure. */
+void fts_icu_utf8_to_utf16(buffer_t *dest_utf16, const char *src_utf8);
+/* Convert UTF-16 input to UTF-8 output. src_len is counted in UChars.
+   dest_utf8 is overwritten from offset 0 and its used size is set to the
+   number of bytes produced. Panics if ICU reports a conversion failure. */
+void fts_icu_utf16_to_utf8(string_t *dest_utf8, const UChar *src_utf16,
+ unsigned int src_len);
+/* Run ICU translation for the string. src_len is counted in UChars and
+   the result is stored in dest_utf16 as UChars. Returns 0 on success,
+   -1 on error, in which case *error_r is set.
+   NOTE(review): every caller in this change passes an empty dest_utf16;
+   confirm the behaviour before passing a buffer that already has data. */
+int fts_icu_translate(buffer_t *dest_utf16, const UChar *src_utf16,
+ unsigned int src_len, UTransliterator *transliterator,
+ const char **error_r);
+
+#endif
#include "lib.h"
#include "sha2.h"
+#include "str.h"
+#include "unichar.h"
#include "test-common.h"
#include "fts-language.h"
#include "fts-filter.h"
test_end();
}
+/* Feeds the ICU normalizer filter one code point at a time, encoded as
+   UTF-8, and checks that filtering never reports an error (< 0). */
+static void test_fts_filter_normalizer_baddata(void)
+{
+ const char * const settings[] =
+ {"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
+ struct fts_filter *norm;
+ const char *token, *error;
+ string_t *str;
+ unsigned int i;
+
+ test_begin("fts filter normalizer bad data");
+
+ test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ str = t_str_new(128);
+ /* every code point from 1 up to (but not including) 0x1ffff */
+ for (i = 1; i < 0x1ffff; i++) {
+ str_truncate(str, 0);
+ uni_ucs4_to_utf8_c(i, str);
+ token = str_c(str);
+ /* NOTE(review): T_BEGIN/T_END presumably scope the temporary
+    allocations of each iteration - confirm */
+ T_BEGIN {
+ test_assert_idx(fts_filter_filter(norm, &token, &error) >= 0, i);
+ } T_END;
+ }
+
+ /* one more value, far above the range covered by the loop */
+ str_truncate(str, 0);
+ uni_ucs4_to_utf8_c(0x7fffffff, str);
+ token = str_c(str);
+ test_assert(fts_filter_filter(norm, &token, &error) >= 0);
+
+ fts_filter_unref(&norm);
+ test_end();
+}
+
static void test_fts_filter_normalizer_invalid_id(void)
{
struct fts_filter *norm = NULL;
test_fts_filter_normalizer_swedish_short_default_id,
test_fts_filter_normalizer_french,
test_fts_filter_normalizer_empty,
+ test_fts_filter_normalizer_baddata,
test_fts_filter_normalizer_invalid_id,
#ifdef HAVE_FTS_STEMMER
test_fts_filter_normalizer_stopwords_stemmer_eng,
--- /dev/null
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "str.h"
+#include "unichar.h"
+#include "test-common.h"
+#include "fts-icu.h"
+
+/* Converts ASCII strings of growing length into one small dynamic buffer
+   and checks the used size (2 bytes per character) and, for the first two
+   conversions, the buffer size after each call. */
+static void test_fts_icu_utf8_to_utf16_ascii_resize(void)
+{
+ buffer_t *dest = buffer_create_dynamic(pool_datastack_create(), 5);
+
+ test_begin("fts_icu_utf8_to_utf16 ascii resize");
+ /* dynamic buffers reserve +1 for str_c()'s NUL, so presumably only
+    4 of the 5 bytes are writable - TODO confirm */
+ test_assert(buffer_get_size(dest) == 5);
+ /* two characters need 4 bytes: expected to fit without growing */
+ fts_icu_utf8_to_utf16(dest, "12");
+ test_assert(dest->used == 4);
+ test_assert(buffer_get_size(dest) == 5);
+
+ /* three characters need 6 bytes: the buffer is expected to grow */
+ fts_icu_utf8_to_utf16(dest, "123");
+ test_assert(dest->used == 6);
+ test_assert(buffer_get_size(dest) == 8);
+
+ /* five characters need 10 bytes */
+ fts_icu_utf8_to_utf16(dest, "12345");
+ test_assert(dest->used == 10);
+
+ test_end();
+}
+
+/* Converts a single 4-byte UTF-8 sequence into fresh buffers with initial
+   sizes 2..5 and checks that the result always uses 4 bytes of UTF-16. */
+static void test_fts_icu_utf8_to_utf16_32bit_resize(void)
+{
+ buffer_t *dest;
+ unsigned int i;
+
+ test_begin("fts_icu_utf8_to_utf16 32bit resize");
+ for (i = 2; i <= 5; i++) {
+ /* a new buffer per iteration so each initial size is exercised */
+ dest = buffer_create_dynamic(pool_datastack_create(), i);
+ test_assert(buffer_get_size(dest) == i);
+ fts_icu_utf8_to_utf16(dest, "\xF0\x90\x90\x80"); /* 0x10400 */
+ test_assert(dest->used == 4);
+ }
+
+ test_end();
+}
+
+/* Converts the first i UChars of a 3-element UTF-16 array, for i = 3..1,
+   into the same destination string and checks the resulting byte count. */
+static void test_fts_icu_utf16_to_utf8(void)
+{
+ string_t *dest = t_str_new(64);
+ const UChar src[] = { 0xbd, 'b', 'c' };
+ unsigned int i;
+
+ test_begin("fts_icu_utf16_to_utf8");
+ for (i = N_ELEMENTS(src); i > 0; i--) {
+ fts_icu_utf16_to_utf8(dest, src, i);
+ /* 0xbd takes two bytes in UTF-8, the ASCII letters one each */
+ test_assert(dest->used == i+1);
+ }
+ test_end();
+}
+
+/* Converts a single UNICODE_REPLACEMENT_CHAR into fresh strings with
+   initial sizes 2..6 and checks that the output is always the 3-byte
+   UNICODE_REPLACEMENT_CHAR_UTF8 sequence. */
+static void test_fts_icu_utf16_to_utf8_resize(void)
+{
+ string_t *dest;
+ const UChar src = UNICODE_REPLACEMENT_CHAR;
+ unsigned int i;
+
+ test_begin("fts_icu_utf16_to_utf8 resize");
+ for (i = 2; i <= 6; i++) {
+ /* a new string per iteration so each initial size is exercised */
+ dest = t_str_new(i);
+ test_assert(buffer_get_size(dest) == i);
+ fts_icu_utf16_to_utf8(dest, &src, 1);
+ test_assert(dest->used == 3);
+ test_assert(strcmp(str_c(dest), UNICODE_REPLACEMENT_CHAR_UTF8) == 0);
+ }
+
+ test_end();
+}
+
+/* Test helper: opens a forward ICU transliterator for the given ID. The ID
+   is converted to UTF-16 first because utrans_openU() takes UChars. A
+   failure to open is reported through test_assert(); the handle is
+   returned either way.
+   NOTE(review): the handle is not closed here, so ownership passes to the
+   caller. */
+static UTransliterator *get_translit(const char *id)
+{
+ UTransliterator *translit;
+ buffer_t *id_utf16;
+ UErrorCode err = U_ZERO_ERROR;
+ UParseError perr;
+
+ id_utf16 = buffer_create_dynamic(pool_datastack_create(), 16);
+ fts_icu_utf8_to_utf16(id_utf16, id);
+ /* the ID length is passed in UChars, hence the division */
+ translit = utrans_openU(id_utf16->data, id_utf16->used/sizeof(UChar),
+ UTRANS_FORWARD, NULL, 0, &perr, &err);
+ test_assert(!U_FAILURE(err));
+ return translit;
+}
+
+/* Runs the "Any-Lower" transliterator over the first i UChars of a
+   3-element array, for i = 3..1, and checks that the call succeeds and
+   that the output has the same number of UChars as the input. */
+static void test_fts_icu_translate(void)
+{
+	const char *translit_id = "Any-Lower";
+	UTransliterator *translit;
+	buffer_t *dest = buffer_create_dynamic(pool_datastack_create(), 64);
+	const UChar src[] = { 0xbd, 'B', 'C' };
+	const char *error;
+	unsigned int i;
+
+	test_begin("fts_icu_translate");
+	translit = get_translit(translit_id);
+	for (i = N_ELEMENTS(src); i > 0; i--) {
+		/* start each round from an empty destination */
+		buffer_set_used_size(dest, 0);
+		test_assert(fts_icu_translate(dest, src, i,
+					      translit, &error) == 0);
+		test_assert(dest->used == i * sizeof(UChar));
+	}
+	/* release the transliterator opened by get_translit() */
+	utrans_close(translit);
+	test_end();
+}
+
+/* Runs the "Any-Hex" transliterator on "FOO" into fresh destination
+   buffers with initial sizes 2..20 and checks that every call succeeds,
+   whatever the starting size of the destination. */
+static void test_fts_icu_translate_resize(void)
+{
+	const char *translit_id = "Any-Hex";
+	const char *src_utf8 = "FOO";
+	buffer_t *dest, *src_utf16;
+	UTransliterator *translit;
+	const char *error;
+	unsigned int i;
+
+	test_begin("fts_icu_translate_resize resize");
+
+	src_utf16 = buffer_create_dynamic(pool_datastack_create(), 16);
+	translit = get_translit(translit_id);
+	for (i = 2; i <= 20; i++) {
+		buffer_set_used_size(src_utf16, 0);
+		fts_icu_utf8_to_utf16(src_utf16, src_utf8);
+		/* a new destination per iteration so each initial size is
+		   exercised */
+		dest = buffer_create_dynamic(pool_datastack_create(), i);
+		test_assert(buffer_get_size(dest) == i);
+		test_assert(fts_icu_translate(dest, src_utf16->data,
+					      src_utf16->used/sizeof(UChar),
+					      translit, &error) == 0);
+	}
+
+	/* release the transliterator opened by get_translit() */
+	utrans_close(translit);
+	test_end();
+}
+
+/* Test program entry point: hands the NULL-terminated list of test cases
+   to test_run() and returns its result as the exit status. */
+int main(void)
+{
+	static void (*icu_tests[])(void) = {
+		/* UTF-8 -> UTF-16 */
+		test_fts_icu_utf8_to_utf16_ascii_resize,
+		test_fts_icu_utf8_to_utf16_32bit_resize,
+		/* UTF-16 -> UTF-8 */
+		test_fts_icu_utf16_to_utf8,
+		test_fts_icu_utf16_to_utf8_resize,
+		/* transliteration */
+		test_fts_icu_translate,
+		test_fts_icu_translate_resize,
+		NULL
+	};
+	int exit_code = test_run(icu_tests);
+
+	return exit_code;
+}