-#ifndef FTS_COMMON_H
-#define FTS_COMMON_H
+#ifndef LANG_COMMON_H
+#define LANG_COMMON_H
/* Some might consider 0x02BB an apostrophe also. */
/* True when code point c is one of the non-ASCII apostrophes:
   U+2019 (right single quotation mark) or U+FF07 (fullwidth apostrophe).
   The argument is evaluated more than once; do not pass expressions
   with side effects. */
#define IS_NONASCII_APOSTROPHE(c) \
((c) == 0x2019 || (c) == 0xFF07)
/* True when code point c is any apostrophe: the ASCII one (U+0027) or
   one of the non-ASCII forms above. The argument is evaluated more than
   once. */
#define IS_APOSTROPHE(c) \
((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
#define IS_WB5A_APOSTROPHE(c) \
((c) == 0x0027 || (c) == 0x2019)
-#define FTS_PREFIX_SPLAT_CHAR 0x002A /* '*' */
+#define LANG_PREFIX_SPLAT_CHAR 0x002A /* '*' */
#define IS_PREFIX_SPLAT(c) \
- ((c) == FTS_PREFIX_SPLAT_CHAR)
+ ((c) == LANG_PREFIX_SPLAT_CHAR)
/* The letter h is included because it is an exception in French.
A, E, H, I, O, U, Y, a, e, h, i, o, u, y */
#define IS_ASCII_VOWEL(c) \
#include "lang-filter-common.h"
#include "lang-tokenizer-common.h"
-void fts_filter_truncate_token(string_t *token, size_t max_length)
+void lang_filter_truncate_token(string_t *token, size_t max_length)
{
if (str_len(token) <= max_length)
return;
size_t len = max_length;
- fts_tokenizer_delete_trailing_partial_char(token->data, &len);
+ lang_tokenizer_delete_trailing_partial_char(token->data, &len);
str_truncate(token, len);
i_assert(len <= max_length);
}
-#ifndef FTS_FILTER_COMMON_H
-#define FTS_FILTER_COMMON_H
+#ifndef LANG_FILTER_COMMON_H
+#define LANG_FILTER_COMMON_H
-void fts_filter_truncate_token(string_t *token, size_t max_length);
+void lang_filter_truncate_token(string_t *token, size_t max_length);
#endif
#include "unichar.h"
static int
-fts_filter_contractions_create(const struct fts_language *lang,
+lang_filter_contractions_create(const struct language *lang,
const char *const *settings,
- struct fts_filter **filter_r,
+ struct lang_filter **filter_r,
const char **error_r)
{
- struct fts_filter *filter;
+ struct lang_filter *filter;
if (settings[0] != NULL) {
*error_r = t_strdup_printf("Unknown setting: %s", settings[0]);
return -1;
}
- filter = i_new(struct fts_filter, 1);
- *filter = *fts_filter_contractions;
+ filter = i_new(struct lang_filter, 1);
+ *filter = *lang_filter_contractions;
filter->token = str_new(default_pool, 64);
*filter_r = filter;
return 0;
}
static int
-fts_filter_contractions_filter(struct fts_filter *filter ATTR_UNUSED,
+lang_filter_contractions_filter(struct lang_filter *filter ATTR_UNUSED,
const char **_token,
const char **error_r ATTR_UNUSED)
{
return 1;
}
-static const struct fts_filter fts_filter_contractions_real = {
+static const struct lang_filter lang_filter_contractions_real = {
.class_name = "contractions",
.v = {
- fts_filter_contractions_create,
- fts_filter_contractions_filter,
+ lang_filter_contractions_create,
+ lang_filter_contractions_filter,
NULL
}
};
-const struct fts_filter *fts_filter_contractions = &fts_filter_contractions_real;
+const struct lang_filter *lang_filter_contractions = &lang_filter_contractions_real;
}
static int
-fts_filter_english_possessive_filter(struct fts_filter *filter ATTR_UNUSED,
+lang_filter_english_possessive_filter(struct lang_filter *filter ATTR_UNUSED,
const char **token,
const char **error_r ATTR_UNUSED)
{
return 1;
}
-static const struct fts_filter fts_filter_english_possessive_real = {
+static const struct lang_filter lang_filter_english_possessive_real = {
.class_name = "english-possessive",
.v = {
NULL,
- fts_filter_english_possessive_filter,
+ lang_filter_english_possessive_filter,
NULL
}
};
-const struct fts_filter *fts_filter_english_possessive = &fts_filter_english_possessive_real;
+const struct lang_filter *lang_filter_english_possessive = &lang_filter_english_possessive_real;
#endif
static int
-fts_filter_lowercase_create(const struct fts_language *lang ATTR_UNUSED,
- const char *const *settings,
- struct fts_filter **filter_r,
- const char **error_r)
+lang_filter_lowercase_create(const struct language *lang ATTR_UNUSED,
+ const char *const *settings,
+ struct lang_filter **filter_r,
+ const char **error_r)
{
- struct fts_filter *filter;
+ struct lang_filter *filter;
unsigned int i, max_length = 250;
for (i = 0; settings[i] != NULL; i += 2) {
return -1;
}
}
- filter = i_new(struct fts_filter, 1);
- *filter = *fts_filter_lowercase;
+ filter = i_new(struct lang_filter, 1);
+ *filter = *lang_filter_lowercase;
filter->token = str_new(default_pool, 64);
filter->max_length = max_length;
}
static int
-fts_filter_lowercase_filter(struct fts_filter *filter ATTR_UNUSED,
- const char **token,
- const char **error_r ATTR_UNUSED)
+lang_filter_lowercase_filter(struct lang_filter *filter ATTR_UNUSED,
+ const char **token,
+ const char **error_r ATTR_UNUSED)
{
#ifdef HAVE_LIBICU
str_truncate(filter->token, 0);
- fts_icu_lcase(filter->token, *token);
- fts_filter_truncate_token(filter->token, filter->max_length);
+ lang_icu_lcase(filter->token, *token);
+ lang_filter_truncate_token(filter->token, filter->max_length);
*token = str_c(filter->token);
#else
*token = t_str_lcase(*token);
return 1;
}
-static const struct fts_filter fts_filter_lowercase_real = {
+static const struct lang_filter lang_filter_lowercase_real = {
.class_name = "lowercase",
.v = {
- fts_filter_lowercase_create,
- fts_filter_lowercase_filter,
+ lang_filter_lowercase_create,
+ lang_filter_lowercase_filter,
NULL
}
};
-const struct fts_filter *fts_filter_lowercase = &fts_filter_lowercase_real;
+const struct lang_filter *lang_filter_lowercase = &lang_filter_lowercase_real;
#ifdef HAVE_LIBICU
#include "lang-icu.h"
-struct fts_filter_normalizer_icu {
- struct fts_filter filter;
+struct lang_filter_normalizer_icu {
+ struct lang_filter filter;
pool_t pool;
const char *transliterator_id;
string_t *utf8_token;
};
-static void fts_filter_normalizer_icu_destroy(struct fts_filter *filter)
+static void lang_filter_normalizer_icu_destroy(struct lang_filter *filter)
{
- struct fts_filter_normalizer_icu *np =
- (struct fts_filter_normalizer_icu *)filter;
+ struct lang_filter_normalizer_icu *np =
+ (struct lang_filter_normalizer_icu *)filter;
if (np->transliterator != NULL)
utrans_close(np->transliterator);
}
static int
-fts_filter_normalizer_icu_create(const struct fts_language *lang ATTR_UNUSED,
- const char *const *settings,
- struct fts_filter **filter_r,
- const char **error_r)
+lang_filter_normalizer_icu_create(const struct language *lang ATTR_UNUSED,
+ const char *const *settings,
+ struct lang_filter **filter_r,
+ const char **error_r)
{
- struct fts_filter_normalizer_icu *np;
+ struct lang_filter_normalizer_icu *np;
pool_t pp;
unsigned int i, max_length = 250;
const char *id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC; [\\x20] Remove";
}
}
- pp = pool_alloconly_create(MEMPOOL_GROWING"fts_filter_normalizer_icu",
- sizeof(struct fts_filter_normalizer_icu));
- np = p_new(pp, struct fts_filter_normalizer_icu, 1);
+ pp = pool_alloconly_create(MEMPOOL_GROWING"lang_filter_normalizer_icu",
+ sizeof(struct lang_filter_normalizer_icu));
+ np = p_new(pp, struct lang_filter_normalizer_icu, 1);
np->pool = pp;
- np->filter = *fts_filter_normalizer_icu;
+ np->filter = *lang_filter_normalizer_icu;
np->transliterator_id = p_strdup(pp, id);
p_array_init(&np->utf16_token, pp, 64);
p_array_init(&np->trans_token, pp, 64);
}
static int
-fts_filter_normalizer_icu_filter(struct fts_filter *filter, const char **token,
+lang_filter_normalizer_icu_filter(struct lang_filter *filter, const char **token,
const char **error_r)
{
- struct fts_filter_normalizer_icu *np =
- (struct fts_filter_normalizer_icu *)filter;
+ struct lang_filter_normalizer_icu *np =
+ (struct lang_filter_normalizer_icu *)filter;
if (np->transliterator == NULL)
- if (fts_icu_transliterator_create(np->transliterator_id,
- &np->transliterator,
- error_r) < 0)
+ if (lang_icu_transliterator_create(np->transliterator_id,
+ &np->transliterator,
+ error_r) < 0)
return -1;
- fts_icu_utf8_to_utf16(&np->utf16_token, *token);
+ lang_icu_utf8_to_utf16(&np->utf16_token, *token);
array_append_zero(&np->utf16_token);
array_pop_back(&np->utf16_token);
array_clear(&np->trans_token);
- if (fts_icu_translate(&np->trans_token, array_front(&np->utf16_token),
- array_count(&np->utf16_token),
- np->transliterator, error_r) < 0)
+ if (lang_icu_translate(&np->trans_token, array_front(&np->utf16_token),
+ array_count(&np->utf16_token),
+ np->transliterator, error_r) < 0)
return -1;
if (array_count(&np->trans_token) == 0)
return 0;
- fts_icu_utf16_to_utf8(np->utf8_token, array_front(&np->trans_token),
+ lang_icu_utf16_to_utf8(np->utf8_token, array_front(&np->trans_token),
array_count(&np->trans_token));
- fts_filter_truncate_token(np->utf8_token, np->filter.max_length);
+ lang_filter_truncate_token(np->utf8_token, np->filter.max_length);
*token = str_c(np->utf8_token);
return 1;
}
#else
static int
-fts_filter_normalizer_icu_create(const struct fts_language *lang ATTR_UNUSED,
- const char *const *settings ATTR_UNUSED,
- struct fts_filter **filter_r ATTR_UNUSED,
- const char **error_r)
+lang_filter_normalizer_icu_create(const struct language *lang ATTR_UNUSED,
+ const char *const *settings ATTR_UNUSED,
+ struct lang_filter **filter_r ATTR_UNUSED,
+ const char **error_r)
{
*error_r = "libicu support not built in";
return -1;
}
static int
-fts_filter_normalizer_icu_filter(struct fts_filter *filter ATTR_UNUSED,
- const char **token ATTR_UNUSED,
- const char **error_r ATTR_UNUSED)
+lang_filter_normalizer_icu_filter(struct lang_filter *filter ATTR_UNUSED,
+ const char **token ATTR_UNUSED,
+ const char **error_r ATTR_UNUSED)
{
return -1;
}
static void
-fts_filter_normalizer_icu_destroy(struct fts_filter *normalizer ATTR_UNUSED)
+lang_filter_normalizer_icu_destroy(struct lang_filter *normalizer ATTR_UNUSED)
{
}
#endif
-static const struct fts_filter fts_filter_normalizer_icu_real = {
+static const struct lang_filter lang_filter_normalizer_icu_real = {
.class_name = "normalizer-icu",
.v = {
- fts_filter_normalizer_icu_create,
- fts_filter_normalizer_icu_filter,
- fts_filter_normalizer_icu_destroy
+ lang_filter_normalizer_icu_create,
+ lang_filter_normalizer_icu_filter,
+ lang_filter_normalizer_icu_destroy
}
};
-const struct fts_filter *fts_filter_normalizer_icu =
- &fts_filter_normalizer_icu_real;
+const struct lang_filter *lang_filter_normalizer_icu =
+ &lang_filter_normalizer_icu_real;
-#ifndef FTS_FILTER_PRIVATE_H
-#define FTS_FILTER_PRIVATE_H
+#ifndef LANG_FILTER_PRIVATE_H
+#define LANG_FILTER_PRIVATE_H
#include "lang-filter.h"
-#define FTS_FILTER_CLASSES_NR 6
+#define LANG_FILTER_CLASSES_NR 6
/*
API that stemming providers (classes) must provide: The create()
The destroy function is called to destroy an instance of a filter.
*/
-struct fts_filter_vfuncs {
- int (*create)(const struct fts_language *lang,
+struct lang_filter_vfuncs {
+ int (*create)(const struct language *lang,
const char *const *settings,
- struct fts_filter **filter_r,
+ struct lang_filter **filter_r,
const char **error_r);
- int (*filter)(struct fts_filter *filter, const char **token,
+ int (*filter)(struct lang_filter *filter, const char **token,
const char **error_r);
- void (*destroy)(struct fts_filter *filter);
+ void (*destroy)(struct lang_filter *filter);
};
-struct fts_filter {
+struct lang_filter {
const char *class_name; /* name of the class this is based on */
- struct fts_filter_vfuncs v;
- struct fts_filter *parent;
+ struct lang_filter_vfuncs v;
+ struct lang_filter *parent;
string_t *token;
size_t max_length;
int refcount;
#include <libstemmer.h>
-struct fts_filter_stemmer_snowball {
- struct fts_filter filter;
+struct lang_filter_stemmer_snowball {
+ struct lang_filter filter;
pool_t pool;
- struct fts_language *lang;
+ struct language *lang;
struct sb_stemmer *stemmer;
};
-static void fts_filter_stemmer_snowball_destroy(struct fts_filter *filter)
+static void lang_filter_stemmer_snowball_destroy(struct lang_filter *filter)
{
- struct fts_filter_stemmer_snowball *sp =
- (struct fts_filter_stemmer_snowball *)filter;
+ struct lang_filter_stemmer_snowball *sp =
+ (struct lang_filter_stemmer_snowball *)filter;
if (sp->stemmer != NULL)
sb_stemmer_delete(sp->stemmer);
}
static int
-fts_filter_stemmer_snowball_create(const struct fts_language *lang,
- const char *const *settings,
- struct fts_filter **filter_r,
- const char **error_r)
+lang_filter_stemmer_snowball_create(const struct language *lang,
+ const char *const *settings,
+ struct lang_filter **filter_r,
+ const char **error_r)
{
- struct fts_filter_stemmer_snowball *sp;
+ struct lang_filter_stemmer_snowball *sp;
pool_t pp;
*filter_r = NULL;
*error_r = t_strdup_printf("Unknown setting: %s", settings[0]);
return -1;
}
- pp = pool_alloconly_create(MEMPOOL_GROWING"fts_filter_stemmer_snowball",
- sizeof(struct fts_filter));
- sp = p_new(pp, struct fts_filter_stemmer_snowball, 1);
+ pp = pool_alloconly_create(MEMPOOL_GROWING"lang_filter_stemmer_snowball",
+ sizeof(struct lang_filter));
+ sp = p_new(pp, struct lang_filter_stemmer_snowball, 1);
sp->pool = pp;
- sp->filter = *fts_filter_stemmer_snowball;
- sp->lang = p_malloc(sp->pool, sizeof(struct fts_language));
+ sp->filter = *lang_filter_stemmer_snowball;
+ sp->lang = p_malloc(sp->pool, sizeof(struct language));
sp->lang->name = p_strdup(sp->pool, lang->name);
*filter_r = &sp->filter;
return 0;
}
static int
-fts_filter_stemmer_snowball_create_stemmer(struct fts_filter_stemmer_snowball *sp,
- const char **error_r)
+lang_filter_stemmer_snowball_create_stemmer(struct lang_filter_stemmer_snowball *sp,
+ const char **error_r)
{
sp->stemmer = sb_stemmer_new(sp->lang->name, "UTF_8");
if (sp->stemmer == NULL) {
*error_r = t_strdup_printf(
"Creating a Snowball stemmer for language '%s' failed.",
sp->lang->name);
- fts_filter_stemmer_snowball_destroy(&sp->filter);
+ lang_filter_stemmer_snowball_destroy(&sp->filter);
return -1;
}
return 0;
}
static int
-fts_filter_stemmer_snowball_filter(struct fts_filter *filter,
- const char **token, const char **error_r)
+lang_filter_stemmer_snowball_filter(struct lang_filter *filter,
+ const char **token, const char **error_r)
{
- struct fts_filter_stemmer_snowball *sp =
- (struct fts_filter_stemmer_snowball *) filter;
+ struct lang_filter_stemmer_snowball *sp =
+ (struct lang_filter_stemmer_snowball *) filter;
const sb_symbol *base;
if (sp->stemmer == NULL) {
- if (fts_filter_stemmer_snowball_create_stemmer(sp, error_r) < 0)
+ if (lang_filter_stemmer_snowball_create_stemmer(sp, error_r) < 0)
return -1;
}
else {
/* If the stemmer returns an empty token, the return value
* should be 0 instead of 1 (otherwise it causes an assertion
- * fault in fts_filter_filter() ).
+ * fault in lang_filter() ).
* However, removing tokens may bring the same kind of issues
* and inconsistencies that stopwords cause when used with
* multiple languages and negations.
#else
static int
-fts_filter_stemmer_snowball_create(const struct fts_language *lang ATTR_UNUSED,
- const char *const *settings ATTR_UNUSED,
- struct fts_filter **filter_r ATTR_UNUSED,
- const char **error_r)
+lang_filter_stemmer_snowball_create(const struct language *lang ATTR_UNUSED,
+ const char *const *settings ATTR_UNUSED,
+ struct lang_filter **filter_r ATTR_UNUSED,
+ const char **error_r)
{
*error_r = "Snowball support not built in";
return -1;
}
static void
-fts_filter_stemmer_snowball_destroy(struct fts_filter *stemmer ATTR_UNUSED)
+lang_filter_stemmer_snowball_destroy(struct lang_filter *stemmer ATTR_UNUSED)
{
}
static int
-fts_filter_stemmer_snowball_filter(struct fts_filter *filter ATTR_UNUSED,
- const char **token ATTR_UNUSED,
- const char **error_r ATTR_UNUSED)
+lang_filter_stemmer_snowball_filter(struct lang_filter *filter ATTR_UNUSED,
+ const char **token ATTR_UNUSED,
+ const char **error_r ATTR_UNUSED)
{
return -1;
}
#endif
-static const struct fts_filter fts_filter_stemmer_snowball_real = {
+static const struct lang_filter lang_filter_stemmer_snowball_real = {
.class_name = "snowball",
.v = {
- fts_filter_stemmer_snowball_create,
- fts_filter_stemmer_snowball_filter,
- fts_filter_stemmer_snowball_destroy
+ lang_filter_stemmer_snowball_create,
+ lang_filter_stemmer_snowball_filter,
+ lang_filter_stemmer_snowball_destroy
}
};
-const struct fts_filter *fts_filter_stemmer_snowball = &fts_filter_stemmer_snowball_real;
+const struct lang_filter *lang_filter_stemmer_snowball = &lang_filter_stemmer_snowball_real;
#define STOPWORDS_CUTCHARS "|#\t "
#define STOPWORDS_DISALLOWED_CHARS "/\\<>.,\":()\t\n\r"
-struct fts_filter_stopwords {
- struct fts_filter filter;
- struct fts_language *lang;
+struct lang_filter_stopwords {
+ struct lang_filter filter;
+ struct language *lang;
pool_t pool;
HASH_TABLE(const char *, const char *) stopwords;
const char *stopwords_dir;
};
-static int fts_filter_stopwords_read_list(struct fts_filter_stopwords *filter,
- const char **error_r)
+static int lang_filter_stopwords_read_list(struct lang_filter_stopwords *filter,
+ const char **error_r)
{
struct istream *input;
const char *line, *word, *path;
return ret;
}
-static void fts_filter_stopwords_destroy(struct fts_filter *filter)
+static void lang_filter_stopwords_destroy(struct lang_filter *filter)
{
- struct fts_filter_stopwords *sp = (struct fts_filter_stopwords *)filter;
+ struct lang_filter_stopwords *sp = (struct lang_filter_stopwords *)filter;
hash_table_destroy(&sp->stopwords);
pool_unref(&sp->pool);
}
static int
-fts_filter_stopwords_create(const struct fts_language *lang,
- const char *const *settings,
- struct fts_filter **filter_r,
- const char **error_r)
+lang_filter_stopwords_create(const struct language *lang,
+ const char *const *settings,
+ struct lang_filter **filter_r,
+ const char **error_r)
{
- struct fts_filter_stopwords *sp;
+ struct lang_filter_stopwords *sp;
pool_t pp;
const char *dir = NULL;
unsigned int i;
return -1;
}
}
- pp = pool_alloconly_create(MEMPOOL_GROWING"fts_filter_stopwords",
- sizeof(struct fts_filter));
- sp = p_new(pp, struct fts_filter_stopwords, 1);
- sp->filter = *fts_filter_stopwords;
+ pp = pool_alloconly_create(MEMPOOL_GROWING"lang_filter_stopwords",
+ sizeof(struct lang_filter));
+ sp = p_new(pp, struct lang_filter_stopwords, 1);
+ sp->filter = *lang_filter_stopwords;
sp->pool = pp;
- sp->lang = p_malloc(sp->pool, sizeof(struct fts_language));
+ sp->lang = p_malloc(sp->pool, sizeof(struct language));
sp->lang->name = p_strdup(sp->pool, lang->name);
if (dir != NULL)
sp->stopwords_dir = p_strdup(pp, dir);
}
static int
-fts_filter_stopwords_filter(struct fts_filter *filter, const char **token,
- const char **error_r)
+lang_filter_stopwords_filter(struct lang_filter *filter, const char **token,
+ const char **error_r)
{
- struct fts_filter_stopwords *sp =
- (struct fts_filter_stopwords *) filter;
+ struct lang_filter_stopwords *sp =
+ (struct lang_filter_stopwords *) filter;
if (!hash_table_is_created(sp->stopwords)) {
hash_table_create(&sp->stopwords, sp->pool, 0, str_hash, strcmp);
- if (fts_filter_stopwords_read_list(sp, error_r) < 0)
+ if (lang_filter_stopwords_read_list(sp, error_r) < 0)
return -1;
}
return hash_table_lookup(sp->stopwords, *token) == NULL ? 1 : 0;
}
-const struct fts_filter fts_filter_stopwords_real = {
+const struct lang_filter lang_filter_stopwords_real = {
.class_name = "stopwords",
.v = {
- fts_filter_stopwords_create,
- fts_filter_stopwords_filter,
- fts_filter_stopwords_destroy
+ lang_filter_stopwords_create,
+ lang_filter_stopwords_filter,
+ lang_filter_stopwords_destroy
}
};
-const struct fts_filter *fts_filter_stopwords = &fts_filter_stopwords_real;
+const struct lang_filter *lang_filter_stopwords = &lang_filter_stopwords_real;
# include "lang-icu.h"
#endif
-static ARRAY(const struct fts_filter *) fts_filter_classes;
+static ARRAY(const struct lang_filter *) lang_filter_classes;
-void fts_filters_init(void)
+void lang_filters_init(void)
{
- i_array_init(&fts_filter_classes, FTS_FILTER_CLASSES_NR);
-
- fts_filter_register(fts_filter_stopwords);
- fts_filter_register(fts_filter_stemmer_snowball);
- fts_filter_register(fts_filter_normalizer_icu);
- fts_filter_register(fts_filter_lowercase);
- fts_filter_register(fts_filter_english_possessive);
- fts_filter_register(fts_filter_contractions);
+ i_array_init(&lang_filter_classes, LANG_FILTER_CLASSES_NR);
+
+ lang_filter_register(lang_filter_stopwords);
+ lang_filter_register(lang_filter_stemmer_snowball);
+ lang_filter_register(lang_filter_normalizer_icu);
+ lang_filter_register(lang_filter_lowercase);
+ lang_filter_register(lang_filter_english_possessive);
+ lang_filter_register(lang_filter_contractions);
}
-void fts_filters_deinit(void)
+void lang_filters_deinit(void)
{
#ifdef HAVE_LIBICU
- fts_icu_deinit();
+ lang_icu_deinit();
#endif
- array_free(&fts_filter_classes);
+ array_free(&lang_filter_classes);
}
-void fts_filter_register(const struct fts_filter *filter_class)
+void lang_filter_register(const struct lang_filter *filter_class)
{
- i_assert(fts_filter_find(filter_class->class_name) == NULL);
+ i_assert(lang_filter_find(filter_class->class_name) == NULL);
- array_push_back(&fts_filter_classes, &filter_class);
+ array_push_back(&lang_filter_classes, &filter_class);
}
-const struct fts_filter *fts_filter_find(const char *name)
+const struct lang_filter *lang_filter_find(const char *name)
{
- const struct fts_filter *filter;
+ const struct lang_filter *filter;
- array_foreach_elem(&fts_filter_classes, filter) {
+ array_foreach_elem(&lang_filter_classes, filter) {
if (strcmp(filter->class_name, name) == 0)
return filter;
}
return NULL;
}
-int fts_filter_create(const struct fts_filter *filter_class,
- struct fts_filter *parent,
- const struct fts_language *lang,
- const char *const *settings,
- struct fts_filter **filter_r,
- const char **error_r)
+int lang_filter_create(const struct lang_filter *filter_class,
+ struct lang_filter *parent,
+ const struct language *lang,
+ const char *const *settings,
+ struct lang_filter **filter_r,
+ const char **error_r)
{
- struct fts_filter *fp;
+ struct lang_filter *fp;
const char *empty_settings = NULL;
i_assert(settings == NULL || str_array_length(settings) % 2 == 0);
*error_r = t_strdup_printf("Unknown setting: %s", settings[0]);
return -1;
}
- fp = i_new(struct fts_filter, 1);
+ fp = i_new(struct lang_filter, 1);
*fp = *filter_class;
}
fp->refcount = 1;
fp->parent = parent;
if (parent != NULL) {
- fts_filter_ref(parent);
+ lang_filter_ref(parent);
}
*filter_r = fp;
return 0;
}
-void fts_filter_ref(struct fts_filter *fp)
+void lang_filter_ref(struct lang_filter *fp)
{
i_assert(fp->refcount > 0);
fp->refcount++;
}
-void fts_filter_unref(struct fts_filter **_fpp)
+void lang_filter_unref(struct lang_filter **_fpp)
{
- struct fts_filter *fp = *_fpp;
+ struct lang_filter *fp = *_fpp;
i_assert(fp->refcount > 0);
*_fpp = NULL;
return;
if (fp->parent != NULL)
- fts_filter_unref(&fp->parent);
+ lang_filter_unref(&fp->parent);
if (fp->v.destroy != NULL)
fp->v.destroy(fp);
else {
}
}
-int fts_filter_filter(struct fts_filter *filter, const char **token,
- const char **error_r)
+int lang_filter(struct lang_filter *filter, const char **token,
+ const char **error_r)
{
int ret = 0;
/* Recurse to parent. */
if (filter->parent != NULL)
- ret = fts_filter_filter(filter->parent, token, error_r);
+ ret = lang_filter(filter->parent, token, error_r);
/* Parent returned token or no parent. */
if (ret > 0 || filter->parent == NULL)
-#ifndef FTS_FILTER_H
-#define FTS_FILTER_H
+#ifndef LANG_FILTER_H
+#define LANG_FILTER_H
-struct fts_language;
-struct fts_filter;
+struct language;
+struct lang_filter;
/*
Settings are given in the form of a const char * const *settings =
{"key", "value", "key2", "value2", NULL} array of string pairs.
Stopword files are looked up in "<path>"/stopwords_<lang>.txt
*/
-extern const struct fts_filter *fts_filter_stopwords;
+extern const struct lang_filter *lang_filter_stopwords;
/*
Settings: "lang", language of the stemmed language.
*/
-extern const struct fts_filter *fts_filter_stemmer_snowball;
+extern const struct lang_filter *lang_filter_stemmer_snowball;
/*
Settings: "id", description of the normalizing/transliterating rules
"maxlen", maximum length of tokens that ICU normalizer will output.
Defaults to 250.
*/
-extern const struct fts_filter *fts_filter_normalizer_icu;
+extern const struct lang_filter *lang_filter_normalizer_icu;
/* Lowercases the input. Supports UTF8, if libicu is available. */
-extern const struct fts_filter *fts_filter_lowercase;
+extern const struct lang_filter *lang_filter_lowercase;
/* Removes <'s> suffix from words. */
-extern const struct fts_filter *fts_filter_english_possessive;
+extern const struct lang_filter *lang_filter_english_possessive;
/* Removes prefixing contractions from words. */
-extern const struct fts_filter *fts_filter_contractions;
+extern const struct lang_filter *lang_filter_contractions;
/* Register all built-in filters. */
-void fts_filters_init(void);
-void fts_filters_deinit(void);
+void lang_filters_init(void);
+void lang_filters_deinit(void);
/* Register a new class explicitly. Built-in classes are automatically
registered. */
-void fts_filter_register(const struct fts_filter *filter_class);
+void lang_filter_register(const struct lang_filter *filter_class);
/*
Filtering workflow, find --> create --> filter --> destroy.
*/
-const struct fts_filter *fts_filter_find(const char *name);
-int fts_filter_create(const struct fts_filter *filter_class,
- struct fts_filter *parent,
- const struct fts_language *lang,
- const char *const *settings,
- struct fts_filter **filter_r,
- const char **error_r);
-void fts_filter_ref(struct fts_filter *filter);
-void fts_filter_unref(struct fts_filter **filter);
+const struct lang_filter *lang_filter_find(const char *name);
+int lang_filter_create(const struct lang_filter *filter_class,
+ struct lang_filter *parent,
+ const struct language *lang,
+ const char *const *settings,
+ struct lang_filter **filter_r,
+ const char **error_r);
+void lang_filter_ref(struct lang_filter *filter);
+void lang_filter_unref(struct lang_filter **filter);
/* Returns 1 if token is returned in *token, 0 if token was filtered
out (*token is also set to NULL) and -1 on error.
Input is also given via *token.
*/
-int fts_filter_filter(struct fts_filter *filter, const char **token,
- const char **error_r);
+int lang_filter(struct lang_filter *filter, const char **token,
+ const char **error_r);
#endif
static struct UCaseMap *icu_csm = NULL;
-static struct UCaseMap *fts_icu_csm(void)
+static struct UCaseMap *lang_icu_csm(void)
{
UErrorCode err = U_ZERO_ERROR;
return icu_csm;
}
-void fts_icu_utf8_to_utf16(ARRAY_TYPE(icu_utf16) *dest_utf16,
- const char *src_utf8)
+void lang_icu_utf8_to_utf16(ARRAY_TYPE(icu_utf16) *dest_utf16,
+ const char *src_utf8)
{
buffer_t *dest_buf = dest_utf16->arr.buffer;
UErrorCode err = U_ZERO_ERROR;
i_assert(retp == dest_data);
}
-void fts_icu_utf16_to_utf8(string_t *dest_utf8, const UChar *src_utf16,
- unsigned int src_len)
+void lang_icu_utf16_to_utf8(string_t *dest_utf8, const UChar *src_utf16,
+ unsigned int src_len)
{
int32_t dest_len = 0;
int32_t sub_num = 0;
i_assert(retp == dest_data);
}
-int fts_icu_translate(ARRAY_TYPE(icu_utf16) *dest_utf16, const UChar *src_utf16,
- unsigned int src_len, UTransliterator *transliterator,
- const char **error_r)
+int lang_icu_translate(ARRAY_TYPE(icu_utf16) *dest_utf16, const UChar *src_utf16,
+ unsigned int src_len, UTransliterator *transliterator,
+ const char **error_r)
{
buffer_t *dest_buf = dest_utf16->arr.buffer;
UErrorCode err = U_ZERO_ERROR;
return 0;
}
-void fts_icu_lcase(string_t *dest_utf8, const char *src_utf8)
+void lang_icu_lcase(string_t *dest_utf8, const char *src_utf8)
{
- struct UCaseMap *csm = fts_icu_csm();
+ struct UCaseMap *csm = lang_icu_csm();
size_t avail_bytes, dest_pos = dest_utf8->used;
char *dest_data;
int dest_full_len;
buffer_set_used_size(dest_utf8, dest_full_len);
}
-void fts_icu_deinit(void)
+void lang_icu_deinit(void)
{
if (icu_csm != NULL) {
ucasemap_close(icu_csm);
u_cleanup();
}
-int fts_icu_transliterator_create(const char *id,
- UTransliterator **transliterator_r,
- const char **error_r)
+int lang_icu_transliterator_create(const char *id,
+ UTransliterator **transliterator_r,
+ const char **error_r)
{
UErrorCode err = U_ZERO_ERROR;
UParseError perr;
i_zero(&perr);
t_array_init(&id_utf16, strlen(id));
- fts_icu_utf8_to_utf16(&id_utf16, id);
+ lang_icu_utf8_to_utf16(&id_utf16, id);
*transliterator_r = utrans_openU(array_front(&id_utf16),
array_count(&id_utf16),
UTRANS_FORWARD, NULL, 0, &perr, &err);
-#ifndef HAVE_FTS_ICU_H
-#define HAVE_FTS_ICU_H
+#ifndef HAVE_LANG_ICU_H
+#define HAVE_LANG_ICU_H
#include <unicode/ustring.h>
#include <unicode/utrans.h>
ARRAY_DEFINE_TYPE(icu_utf16, UChar);
/* Convert UTF-8 input to UTF-16 output. */
-void fts_icu_utf8_to_utf16(ARRAY_TYPE(icu_utf16) *dest_utf16,
- const char *src_utf8);
+void lang_icu_utf8_to_utf16(ARRAY_TYPE(icu_utf16) *dest_utf16,
+ const char *src_utf8);
/* Convert UTF-16 input to UTF-8 output. */
-void fts_icu_utf16_to_utf8(string_t *dest_utf8, const UChar *src_utf16,
- unsigned int src_len);
+void lang_icu_utf16_to_utf8(string_t *dest_utf8, const UChar *src_utf16,
+ unsigned int src_len);
/* Run ICU translation for the string. Returns 0 on success, -1 on error. */
-int fts_icu_translate(ARRAY_TYPE(icu_utf16) *dest_utf16, const UChar *src_utf16,
- unsigned int src_len, UTransliterator *transliterator,
- const char **error_r);
+int lang_icu_translate(ARRAY_TYPE(icu_utf16) *dest_utf16, const UChar *src_utf16,
+ unsigned int src_len, UTransliterator *transliterator,
+ const char **error_r);
/* Lowercase the given UTF-8 string. */
-void fts_icu_lcase(string_t *dest_utf8, const char *src_utf8);
+void lang_icu_lcase(string_t *dest_utf8, const char *src_utf8);
/* Free all the memory used by ICU functions. */
-void fts_icu_deinit(void);
+void lang_icu_deinit(void);
-int fts_icu_transliterator_create(const char *id,
- UTransliterator **transliterator_r,
- const char **error_r) ;
+int lang_icu_transliterator_create(const char *id,
+ UTransliterator **transliterator_r,
+ const char **error_r) ;
#endif
-#ifndef FTS_INDEXER_STATUS_H
-#define FTS_INDEXER_STATUS_H
+#ifndef LANG_INDEXER_STATUS_H
+#define LANG_INDEXER_STATUS_H
enum indexer_state {
INDEXER_STATE_PROCESSING = 0,
#include "lang-filter.h"
#include "lang-library.h"
-void fts_library_init(void)
+void lang_library_init(void)
{
- fts_languages_init();
- fts_tokenizers_init();
- fts_filters_init();
+ languages_init();
+ lang_tokenizers_init();
+ lang_filters_init();
}
-void fts_library_deinit(void)
+void lang_library_deinit(void)
{
- fts_languages_deinit();
- fts_tokenizers_deinit();
- fts_filters_deinit();
+ languages_deinit();
+ lang_tokenizers_deinit();
+ lang_filters_deinit();
}
-#ifndef FTS_LIBRARY_H
-#define FTS_LIBRARY_H
+#ifndef LANG_LIBRARY_H
+#define LANG_LIBRARY_H
-void fts_library_init(void);
-void fts_library_deinit(void);
+void lang_library_init(void);
+void lang_library_deinit(void);
#endif
#define IS_DTEXT(c) \
(rfc822_atext_chars[(int)(unsigned char)(c)] == 2)
-#define FTS_DEFAULT_ADDRESS_MAX_LENGTH 254
+#define LANG_DEFAULT_ADDRESS_MAX_LENGTH 254
enum email_address_parser_state {
EMAIL_ADDRESS_PARSER_STATE_NONE = 0,
EMAIL_ADDRESS_PARSER_STATE_SKIP,
};
-struct email_address_fts_tokenizer {
- struct fts_tokenizer tokenizer;
+struct email_address_lang_tokenizer {
+ struct lang_tokenizer tokenizer;
enum email_address_parser_state state;
string_t *last_word;
string_t *parent_data; /* Copy of input data between tokens. */
};
static int
-fts_tokenizer_email_address_create(const char *const *settings,
- struct fts_tokenizer **tokenizer_r,
- const char **error_r)
+lang_tokenizer_email_address_create(const char *const *settings,
+ struct lang_tokenizer **tokenizer_r,
+ const char **error_r)
{
- struct email_address_fts_tokenizer *tok;
+ struct email_address_lang_tokenizer *tok;
bool search = FALSE;
- unsigned int max_length = FTS_DEFAULT_ADDRESS_MAX_LENGTH;
+ unsigned int max_length = LANG_DEFAULT_ADDRESS_MAX_LENGTH;
unsigned int i;
for (i = 0; settings[i] != NULL; i += 2) {
}
}
- tok = i_new(struct email_address_fts_tokenizer, 1);
- tok->tokenizer = *fts_tokenizer_email_address;
+ tok = i_new(struct email_address_lang_tokenizer, 1);
+ tok->tokenizer = *lang_tokenizer_email_address;
tok->last_word = str_new(default_pool, 128);
tok->parent_data = str_new(default_pool, 128);
tok->max_length = max_length;
return 0;
}
-static void fts_tokenizer_email_address_destroy(struct fts_tokenizer *_tok)
+static void lang_tokenizer_email_address_destroy(struct lang_tokenizer *_tok)
{
- struct email_address_fts_tokenizer *tok =
- (struct email_address_fts_tokenizer *)_tok;
+ struct email_address_lang_tokenizer *tok =
+ (struct email_address_lang_tokenizer *)_tok;
str_free(&tok->last_word);
str_free(&tok->parent_data);
}
static bool
-fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok,
- const char **token_r)
+lang_tokenizer_address_current_token(struct email_address_lang_tokenizer *tok,
+ const char **token_r)
{
const unsigned char *data = tok->last_word->data;
size_t len = tok->last_word->used;
IS_DTEXT() does not actually allow utf8 addresses
yet though. */
len = tok->last_word->used;
- fts_tokenizer_delete_trailing_partial_char(data, &len);
+ lang_tokenizer_delete_trailing_partial_char(data, &len);
i_assert(len <= tok->max_length);
}
if (len > 0)
- fts_tokenizer_delete_trailing_invalid_char(data, &len);
+ lang_tokenizer_delete_trailing_invalid_char(data, &len);
*token_r = len == 0 ? "" :
t_strndup(data, len);
return len > 0;
}
static bool
-fts_tokenizer_address_parent_data(struct email_address_fts_tokenizer *tok,
- const char **token_r)
+lang_tokenizer_address_parent_data(struct email_address_lang_tokenizer *tok,
+ const char **token_r)
{
if (tok->tokenizer.parent == NULL || str_len(tok->parent_data) == 0)
return FALSE;
}
static bool
-fts_tokenizer_email_address_too_large(struct email_address_fts_tokenizer *tok,
- size_t pos)
+lang_tokenizer_email_address_too_large(struct email_address_lang_tokenizer *tok,
+ size_t pos)
{
if (str_len(tok->last_word) + pos <= tok->max_length)
return FALSE;
Truncate the input that was added so far to the token, so all of it
gets sent to the parent tokenizer in
- fts_tokenizer_address_parent_data(). */
+ lang_tokenizer_address_parent_data(). */
str_truncate(tok->last_word, 0);
return TRUE;
}
static enum email_address_parser_state
-fts_tokenizer_email_address_parse_local(struct email_address_fts_tokenizer *tok,
- const unsigned char *data, size_t size,
- size_t *skip_r)
+lang_tokenizer_email_address_parse_local(struct email_address_lang_tokenizer *tok,
+ const unsigned char *data, size_t size,
+ size_t *skip_r)
{
size_t pos = 0;
bool seen_at = FALSE;
break;
}
- if (fts_tokenizer_email_address_too_large(tok, pos)) {
+ if (lang_tokenizer_email_address_too_large(tok, pos)) {
*skip_r = 0;
return EMAIL_ADDRESS_PARSER_STATE_SKIP;
}
return EMAIL_ADDRESS_PARSER_STATE_NONE;
}
-static bool domain_is_empty(struct email_address_fts_tokenizer *tok)
+static bool domain_is_empty(struct email_address_lang_tokenizer *tok)
{
const char *p, *str = str_c(tok->last_word);
}
static enum email_address_parser_state
-fts_tokenizer_email_address_parse_domain(struct email_address_fts_tokenizer *tok,
- const unsigned char *data, size_t size,
- size_t *skip_r)
+lang_tokenizer_email_address_parse_domain(struct email_address_lang_tokenizer *tok,
+ const unsigned char *data, size_t size,
+ size_t *skip_r)
{
size_t pos = 0;
while (pos < size && (IS_DTEXT(data[pos]) || data[pos] == '.' || data[pos] == '-'))
pos++;
- if (fts_tokenizer_email_address_too_large(tok, pos)) {
+ if (lang_tokenizer_email_address_too_large(tok, pos)) {
*skip_r = 0;
return EMAIL_ADDRESS_PARSER_STATE_SKIP;
}
}
static bool
-fts_tokenizer_address_skip(const unsigned char *data, size_t size,
- size_t *skip_r)
+lang_tokenizer_address_skip(const unsigned char *data, size_t size,
+ size_t *skip_r)
{
for (size_t pos = 0; pos < size; pos++) {
if (!(IS_ATEXT(data[pos]) || data[pos] == '.' ||
/* Buffer raw data for parent. */
static void
-fts_tokenizer_address_update_parent(struct email_address_fts_tokenizer *tok,
- const unsigned char *data, size_t size)
+lang_tokenizer_address_update_parent(struct email_address_lang_tokenizer *tok,
+ const unsigned char *data, size_t size)
{
if (tok->tokenizer.parent != NULL)
str_append_data(tok->parent_data, data, size);
}
-static void fts_tokenizer_email_address_reset(struct fts_tokenizer *_tok)
+static void lang_tokenizer_email_address_reset(struct lang_tokenizer *_tok)
{
- struct email_address_fts_tokenizer *tok =
- (struct email_address_fts_tokenizer *)_tok;
+ struct email_address_lang_tokenizer *tok =
+ (struct email_address_lang_tokenizer *)_tok;
tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE;
str_truncate(tok->last_word, 0);
}
static int
-fts_tokenizer_email_address_next(struct fts_tokenizer *_tok,
- const unsigned char *data, size_t size,
- size_t *skip_r, const char **token_r,
- const char **error_r ATTR_UNUSED)
+lang_tokenizer_email_address_next(struct lang_tokenizer *_tok,
+ const unsigned char *data, size_t size,
+ size_t *skip_r, const char **token_r,
+ const char **error_r ATTR_UNUSED)
{
- struct email_address_fts_tokenizer *tok =
- (struct email_address_fts_tokenizer *)_tok;
+ struct email_address_lang_tokenizer *tok =
+ (struct email_address_lang_tokenizer *)_tok;
size_t pos = 0, local_skip;
bool finished;
if (tok->state == EMAIL_ADDRESS_PARSER_STATE_COMPLETE) {
*skip_r = pos;
- if (fts_tokenizer_address_current_token(tok, token_r))
+ if (lang_tokenizer_address_current_token(tok, token_r))
return 1;
}
tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE;
}
- if (fts_tokenizer_address_parent_data(tok, token_r))
+ if (lang_tokenizer_address_parent_data(tok, token_r))
return 1;
if (tok->state == EMAIL_ADDRESS_PARSER_STATE_DOMAIN) {
- if (fts_tokenizer_address_current_token(tok, token_r))
+ if (lang_tokenizer_address_current_token(tok, token_r))
return 1;
}
tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE;
/* no part of address found yet. remove possible
earlier data */
str_truncate(tok->last_word, 0);
- if (fts_tokenizer_address_parent_data(tok, token_r)) {
+ if (lang_tokenizer_address_parent_data(tok, token_r)) {
*skip_r = pos;
return 1;
}
the beginning of data to see if it contains a full
local-part@ */
tok->state =
- fts_tokenizer_email_address_parse_local(tok,
+ lang_tokenizer_email_address_parse_local(tok,
data + pos,
size - pos,
&local_skip);
- fts_tokenizer_address_update_parent(tok, data+pos,
+ lang_tokenizer_address_update_parent(tok, data+pos,
local_skip);
pos += local_skip;
to see if it contains a valid domain. */
tok->state =
- fts_tokenizer_email_address_parse_domain(tok,
+ lang_tokenizer_email_address_parse_domain(tok,
data + pos,
size - pos,
&local_skip);
- fts_tokenizer_address_update_parent(tok, data+pos,
+ lang_tokenizer_address_update_parent(tok, data+pos,
local_skip);
pos += local_skip;
break;
case EMAIL_ADDRESS_PARSER_STATE_COMPLETE:
*skip_r = pos;
- if (fts_tokenizer_address_parent_data(tok, token_r))
+ if (lang_tokenizer_address_parent_data(tok, token_r))
return 1;
- if (fts_tokenizer_address_current_token(tok, token_r))
+ if (lang_tokenizer_address_current_token(tok, token_r))
return 1;
break;
case EMAIL_ADDRESS_PARSER_STATE_SKIP:
simply skipping over it, but the input is being
passed to the parent tokenizer. */
*skip_r = pos;
- if (fts_tokenizer_address_parent_data(tok, token_r))
+ if (lang_tokenizer_address_parent_data(tok, token_r))
return 1;
- finished = fts_tokenizer_address_skip(data + pos,
+ finished = lang_tokenizer_address_skip(data + pos,
size - pos,
&local_skip);
- fts_tokenizer_address_update_parent(tok, data+pos,
+ lang_tokenizer_address_update_parent(tok, data+pos,
local_skip);
pos += local_skip;
if (finished) {
*skip_r = pos;
- if (fts_tokenizer_address_parent_data(tok, token_r)) {
+ if (lang_tokenizer_address_parent_data(tok, token_r)) {
tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE;
return 1;
}
return 0;
}
-static const struct fts_tokenizer_vfuncs email_address_tokenizer_vfuncs = {
- fts_tokenizer_email_address_create,
- fts_tokenizer_email_address_destroy,
- fts_tokenizer_email_address_reset,
- fts_tokenizer_email_address_next
+static const struct lang_tokenizer_vfuncs email_address_tokenizer_vfuncs = {
+ lang_tokenizer_email_address_create,
+ lang_tokenizer_email_address_destroy,
+ lang_tokenizer_email_address_reset,
+ lang_tokenizer_email_address_next
};
-static const struct fts_tokenizer fts_tokenizer_email_address_real = {
+static const struct lang_tokenizer lang_tokenizer_email_address_real = {
.name = "email-address",
.v = &email_address_tokenizer_vfuncs,
.stream_to_parents = TRUE,
};
-const struct fts_tokenizer *fts_tokenizer_email_address =
- &fts_tokenizer_email_address_real;
+const struct lang_tokenizer *lang_tokenizer_email_address =
+ &lang_tokenizer_email_address_real;
#include "unichar.h"
#include "lang-tokenizer-common.h"
void
-fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
- size_t *len)
+lang_tokenizer_delete_trailing_partial_char(const unsigned char *data,
+ size_t *len)
{
size_t pos;
unsigned int char_bytes;
*len = pos;
}
}
-void fts_tokenizer_delete_trailing_invalid_char(const unsigned char *data,
- size_t *len)
+void lang_tokenizer_delete_trailing_invalid_char(const unsigned char *data,
+ size_t *len)
{
size_t pos = *len;
-#ifndef FTS_TOKENIZER_COMMON_H
-#define FTS_TOKENIZER_COMMON_H
+#ifndef LANG_TOKENIZER_COMMON_H
+#define LANG_TOKENIZER_COMMON_H
void
-fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
+lang_tokenizer_delete_trailing_partial_char(const unsigned char *data,
size_t *len);
void
-fts_tokenizer_delete_trailing_invalid_char(const unsigned char *data,
+lang_tokenizer_delete_trailing_invalid_char(const unsigned char *data,
size_t *len);
#endif
-#ifndef FTS_TOKENIZER_GENERIC_PRIVATE_H
-#define FTS_TOKENIZER_GENERIC_PRIVATE_H
+#ifndef LANG_TOKENIZER_GENERIC_PRIVATE_H
+#define LANG_TOKENIZER_GENERIC_PRIVATE_H
-extern const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple;
-extern const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29;
+extern const struct lang_tokenizer_vfuncs generic_tokenizer_vfuncs_simple;
+extern const struct lang_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29;
/* Word boundary letter type */
enum letter_type {
#define ALGORITHM_TR29_NAME "tr29"
};
-struct generic_fts_tokenizer {
- struct fts_tokenizer tokenizer;
+struct generic_lang_tokenizer {
+ struct lang_tokenizer tokenizer;
unsigned int max_length;
bool prefixsplat; /* for search strings, accept a trailing '*' for explicit prefix */
bool wb5a; /* TR29 rule for prefix separation
#include "word-break-data.c"
/* see comments below between is_base64() and skip_base64() */
-#define FTS_SKIP_BASE64_MIN_SEQUENCES 1
-#define FTS_SKIP_BASE64_MIN_CHARS 50
+#define LANG_SKIP_BASE64_MIN_SEQUENCES 1
+#define LANG_SKIP_BASE64_MIN_CHARS 50
-#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
-#define FTS_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */
+#define LANG_DEFAULT_TOKEN_MAX_LENGTH 30
+#define LANG_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */
-static unsigned char fts_ascii_word_breaks[128] = {
+static unsigned char lang_ascii_word_breaks[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */
};
static int
-fts_tokenizer_generic_create(const char *const *settings,
- struct fts_tokenizer **tokenizer_r,
- const char **error_r)
+lang_tokenizer_generic_create(const char *const *settings,
+ struct lang_tokenizer **tokenizer_r,
+ const char **error_r)
{
- struct generic_fts_tokenizer *tok;
- unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
+ struct generic_lang_tokenizer *tok;
+ unsigned int max_length = LANG_DEFAULT_TOKEN_MAX_LENGTH;
enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
bool wb5a = FALSE;
bool search = FALSE;
return -1;
}
- tok = i_new(struct generic_fts_tokenizer, 1);
+ tok = i_new(struct generic_lang_tokenizer, 1);
if (algo == BOUNDARY_ALGORITHM_TR29)
tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
else
}
static void
-fts_tokenizer_generic_destroy(struct fts_tokenizer *_tok)
+lang_tokenizer_generic_destroy(struct lang_tokenizer *_tok)
{
- struct generic_fts_tokenizer *tok =
- container_of(_tok, struct generic_fts_tokenizer, tokenizer);
+ struct generic_lang_tokenizer *tok =
+ container_of(_tok, struct generic_lang_tokenizer, tokenizer);
buffer_free(&tok->token);
i_free(tok);
}
static inline void
-shift_prev_type(struct generic_fts_tokenizer *tok, enum letter_type lt)
+shift_prev_type(struct generic_lang_tokenizer *tok, enum letter_type lt)
{
tok->prev_prev_type = tok->prev_type;
tok->prev_type = lt;
}
static inline void
-add_prev_type(struct generic_fts_tokenizer *tok, enum letter_type lt)
+add_prev_type(struct generic_lang_tokenizer *tok, enum letter_type lt)
{
if(tok->prev_type != LETTER_TYPE_NONE)
tok->prev_prev_type = tok->prev_type;
}
static inline void
-add_letter(struct generic_fts_tokenizer *tok, unichar_t c)
+add_letter(struct generic_lang_tokenizer *tok, unichar_t c)
{
if(tok->letter != 0)
tok->prev_letter = tok->letter;
}
static bool
-fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
- const char **token_r)
+lang_tokenizer_generic_simple_current_token(struct generic_lang_tokenizer *tok,
+ const char **token_r)
{
const unsigned char *data = tok->token->data;
size_t len = tok->token->used;
i_assert(len > 0 && data[len-1] != '*');
}
} else {
- fts_tokenizer_delete_trailing_partial_char(data, &len);
+ lang_tokenizer_delete_trailing_partial_char(data, &len);
}
i_assert(len <= tok->max_length);
BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
-static bool fts_uni_word_break(unichar_t c)
+static bool lang_uni_word_break(unichar_t c)
{
unsigned int idx;
return FALSE;
}
-enum fts_break_type {
- FTS_FROM_STOP = 0,
- FTS_FROM_WORD = 2,
- FTS_TO_STOP= 0,
- FTS_TO_WORD = 1,
-#define FROM_TO(f,t) FTS_##f##_TO_##t = FTS_FROM_##f | FTS_TO_##t
+enum lang_break_type {
+ LANG_FROM_STOP = 0,
+ LANG_FROM_WORD = 2,
+ LANG_TO_STOP= 0,
+ LANG_TO_WORD = 1,
+#define FROM_TO(f,t) LANG_##f##_TO_##t = LANG_FROM_##f | LANG_TO_##t
FROM_TO(STOP,STOP),
FROM_TO(STOP,WORD),
FROM_TO(WORD,STOP),
FROM_TO(WORD,WORD),
};
-static inline enum fts_break_type
-fts_simple_is_word_break(const struct generic_fts_tokenizer *tok,
+static inline enum lang_break_type
+lang_simple_is_word_break(const struct generic_lang_tokenizer *tok,
unichar_t c, bool apostrophe)
{
/* Until we know better, a letter followed by an apostrophe is a continuation of the word.
However, if we see non-word letters afterwards, we'll reverse that decision. */
if (apostrophe)
- return tok->prev_type == LETTER_TYPE_ALETTER ? FTS_WORD_TO_WORD : FTS_STOP_TO_STOP;
+ return tok->prev_type == LETTER_TYPE_ALETTER ? LANG_WORD_TO_WORD : LANG_STOP_TO_STOP;
- bool new_breakiness = (c < 0x80) ? (fts_ascii_word_breaks[c] != 0) : fts_uni_word_break(c);
+ bool new_breakiness = (c < 0x80) ? (lang_ascii_word_breaks[c] != 0) : lang_uni_word_break(c);
- return (new_breakiness ? FTS_TO_STOP : FTS_TO_WORD)
+ return (new_breakiness ? LANG_TO_STOP : LANG_TO_WORD)
+ (tok->prev_type == LETTER_TYPE_ALETTER ||
tok->prev_type == LETTER_TYPE_SINGLE_QUOTE
- ? FTS_FROM_WORD : FTS_FROM_STOP);
+ ? LANG_FROM_WORD : LANG_FROM_STOP);
}
-static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
+static void lang_tokenizer_generic_reset(struct lang_tokenizer *_tok)
{
- struct generic_fts_tokenizer *tok =
- container_of(_tok, struct generic_fts_tokenizer, tokenizer);
+ struct generic_lang_tokenizer *tok =
+ container_of(_tok, struct generic_lang_tokenizer, tokenizer);
tok->prev_type = LETTER_TYPE_NONE;
tok->prev_prev_type = LETTER_TYPE_NONE;
buffer_set_used_size(tok->token, 0);
}
-static void tok_append_truncated(struct generic_fts_tokenizer *tok,
+static void tok_append_truncated(struct generic_lang_tokenizer *tok,
const unsigned char *data, size_t size)
{
buffer_append(tok->token, data,
criteria on its own to be discarded. What we pay is we will fail to reject
small base64 segments instead of rejecting the whole sequence.
- When skip_base64() is invoked in fts_tokenizer_generic_XX_next(), we know
+ When skip_base64() is invoked in lang_tokenizer_generic_XX_next(), we know
that we are not halfway through the collection of a token.
As (after the previous token) the buffer will contain non-token characters
const unsigned char *past;
for (past = first; past < end && is_base64(*past); past++);
- if (past - first < FTS_SKIP_BASE64_MIN_CHARS)
+ if (past - first < LANG_SKIP_BASE64_MIN_CHARS)
break;
if (past < end && memchr(allowed_base64_trailers, *past,
N_ELEMENTS(allowed_base64_trailers)) == NULL)
start = past;
matches++;
}
- return matches < FTS_SKIP_BASE64_MIN_SEQUENCES ? 0 : start - data;
+ return matches < LANG_SKIP_BASE64_MIN_SEQUENCES ? 0 : start - data;
}
static int
-fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
- const unsigned char *data, size_t size,
- size_t *skip_r, const char **token_r,
- const char **error_r ATTR_UNUSED)
+lang_tokenizer_generic_simple_next(struct lang_tokenizer *_tok,
+ const unsigned char *data, size_t size,
+ size_t *skip_r, const char **token_r,
+ const char **error_r ATTR_UNUSED)
{
- struct generic_fts_tokenizer *tok =
- container_of(_tok, struct generic_fts_tokenizer, tokenizer);
+ struct generic_lang_tokenizer *tok =
+ container_of(_tok, struct generic_lang_tokenizer, tokenizer);
size_t i, start;
int char_size;
unichar_t c;
bool apostrophe;
- enum fts_break_type break_type;
+ enum lang_break_type break_type;
start = tok->token->used > 0 ? 0 : skip_base64(data, size);
for (i = start; i < size; i += char_size) {
(tok->prev_type == LETTER_TYPE_ALETTER)) {
/* this might be a prefix-matching query */
shift_prev_type(tok, LETTER_TYPE_PREFIXSPLAT);
- } else if ((break_type = fts_simple_is_word_break(tok, c, apostrophe))
- != FTS_WORD_TO_WORD) {
+ } else if ((break_type = lang_simple_is_word_break(tok, c, apostrophe))
+ != LANG_WORD_TO_WORD) {
tok_append_truncated(tok, data + start, i - start);
- shift_prev_type(tok, (break_type & FTS_TO_WORD) != 0
+ shift_prev_type(tok, (break_type & LANG_TO_WORD) != 0
? LETTER_TYPE_ALETTER : LETTER_TYPE_NONE);
- if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
+ if (lang_tokenizer_generic_simple_current_token(tok, token_r)) {
*skip_r = i;
- if (break_type != FTS_STOP_TO_WORD) /* therefore *_TO_STOP */
+ if (break_type != LANG_STOP_TO_WORD) /* therefore *_TO_STOP */
*skip_r += char_size;
return 1;
}
- if ((break_type & FTS_TO_WORD) == 0)
+ if ((break_type & LANG_TO_WORD) == 0)
start = i + char_size;
} else if (apostrophe) {
/* all apostrophes require special handling */
/* return the last token */
if (size == 0) {
shift_prev_type(tok, LETTER_TYPE_NONE);
- if (fts_tokenizer_generic_simple_current_token(tok, token_r))
+ if (lang_tokenizer_generic_simple_current_token(tok, token_r))
return 1;
}
return LETTER_TYPE_OTHER;
}
-static bool letter_panic(struct generic_fts_tokenizer *tok ATTR_UNUSED)
+static bool letter_panic(struct generic_lang_tokenizer *tok ATTR_UNUSED)
{
i_panic("Letter type should not be used.");
}
/* WB3, WB3a and WB3b, but really different since we try to eat
whitespace between words. */
-static bool letter_cr_lf_newline(struct generic_fts_tokenizer *tok ATTR_UNUSED)
+static bool letter_cr_lf_newline(struct generic_lang_tokenizer *tok ATTR_UNUSED)
{
return TRUE;
}
-static bool letter_extend_format(struct generic_fts_tokenizer *tok ATTR_UNUSED)
+static bool letter_extend_format(struct generic_lang_tokenizer *tok ATTR_UNUSED)
{
/* WB4 */
return FALSE;
}
-static bool letter_regional_indicator(struct generic_fts_tokenizer *tok)
+static bool letter_regional_indicator(struct generic_lang_tokenizer *tok)
{
/* WB13c */
if (tok->prev_type == LETTER_TYPE_REGIONAL_INDICATOR)
return TRUE; /* Any / Any */
}
-static bool letter_katakana(struct generic_fts_tokenizer *tok)
+static bool letter_katakana(struct generic_lang_tokenizer *tok)
{
/* WB13 */
if (tok->prev_type == LETTER_TYPE_KATAKANA)
return TRUE; /* Any / Any */
}
-static bool letter_hebrew(struct generic_fts_tokenizer *tok)
+static bool letter_hebrew(struct generic_lang_tokenizer *tok)
{
/* WB5 */
if (tok->prev_type == LETTER_TYPE_HEBREW_LETTER)
return TRUE; /* Any / Any */
}
-static bool letter_aletter(struct generic_fts_tokenizer *tok)
+static bool letter_aletter(struct generic_lang_tokenizer *tok)
{
/* WB5a */
- if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
+ if (tok->wb5a && tok->token->used <= LANG_WB5A_PREFIX_MAX_LENGTH)
if (IS_WB5A_APOSTROPHE(tok->prev_letter) && IS_VOWEL(tok->letter)) {
tok->seen_wb5a = TRUE;
return TRUE;
return TRUE; /* Any / Any */
}
-static bool letter_single_quote(struct generic_fts_tokenizer *tok)
+static bool letter_single_quote(struct generic_lang_tokenizer *tok)
{
/* WB6 */
if (tok->prev_type == LETTER_TYPE_ALETTER ||
return TRUE; /* Any / Any */
}
-static bool letter_double_quote(struct generic_fts_tokenizer *tok)
+static bool letter_double_quote(struct generic_lang_tokenizer *tok)
{
if (tok->prev_type == LETTER_TYPE_DOUBLE_QUOTE)
return TRUE; /* Any / Any */
}
-static bool letter_midnumlet(struct generic_fts_tokenizer *tok ATTR_UNUSED)
+static bool letter_midnumlet(struct generic_lang_tokenizer *tok ATTR_UNUSED)
{
/* Break at MidNumLet, non-conformant with WB6/WB7 */
return TRUE;
}
-static bool letter_midletter(struct generic_fts_tokenizer *tok)
+static bool letter_midletter(struct generic_lang_tokenizer *tok)
{
/* WB6 */
if (tok->prev_type == LETTER_TYPE_ALETTER ||
return TRUE; /* Any / Any */
}
-static bool letter_midnum(struct generic_fts_tokenizer *tok)
+static bool letter_midnum(struct generic_lang_tokenizer *tok)
{
/* WB12 */
if (tok->prev_type == LETTER_TYPE_NUMERIC)
return TRUE; /* Any / Any */
}
-static bool letter_numeric(struct generic_fts_tokenizer *tok)
+static bool letter_numeric(struct generic_lang_tokenizer *tok)
{
/* WB8 */
if (tok->prev_type == LETTER_TYPE_NUMERIC)
return TRUE; /* Any / Any */
}
-static bool letter_extendnumlet(struct generic_fts_tokenizer *tok)
+static bool letter_extendnumlet(struct generic_lang_tokenizer *tok)
{
/* WB13a */
return TRUE; /* Any / Any */
}
-static bool letter_apostrophe(struct generic_fts_tokenizer *tok)
+static bool letter_apostrophe(struct generic_lang_tokenizer *tok)
{
if (tok->prev_type == LETTER_TYPE_ALETTER ||
return TRUE; /* Any / Any */
}
-static bool letter_prefixsplat(struct generic_fts_tokenizer *tok ATTR_UNUSED)
+static bool letter_prefixsplat(struct generic_lang_tokenizer *tok ATTR_UNUSED)
{
/* Dovecot explicit-prefix specific */
return TRUE; /* Always induces a word break - but with special handling */
}
-static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
+static bool letter_other(struct generic_lang_tokenizer *tok ATTR_UNUSED)
{
return TRUE; /* Any / Any */
}
very kludgy and should be coded into the rules themselves
somehow.
*/
-static bool is_one_past_end(struct generic_fts_tokenizer *tok)
+static bool is_one_past_end(struct generic_lang_tokenizer *tok)
{
/* WB6/7 false positive detected at one past end. */
if (tok->prev_type == LETTER_TYPE_MIDLETTER ||
}
static void
-fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
- const char **token_r)
+lang_tokenizer_generic_tr29_current_token(struct generic_lang_tokenizer *tok,
+ const char **token_r)
{
const unsigned char *data = tok->token->data;
size_t len = tok->token->used;
i_assert(len > 0);
len--;
} else if (tok->untruncated_length > tok->max_length) {
- fts_tokenizer_delete_trailing_partial_char(data, &len);
+ lang_tokenizer_delete_trailing_partial_char(data, &len);
}
/* we're skipping all non-token chars at the beginning of the word,
so by this point we must have something here - even if we just
tok->untruncated_length = 0;
}
-static void wb5a_reinsert(struct generic_fts_tokenizer *tok)
+static void wb5a_reinsert(struct generic_lang_tokenizer *tok)
{
string_t *utf8_str = t_str_new(6);
}
struct letter_fn {
- bool (*fn)(struct generic_fts_tokenizer *tok);
+ bool (*fn)(struct generic_lang_tokenizer *tok);
};
static struct letter_fn letter_fns[] = {
{letter_panic}, {letter_cr_lf_newline}, {letter_cr_lf_newline},
/*
Find word boundaries in input text. Based on Unicode standard annex
- #29, but tailored for FTS purposes.
+ #29, but tailored for language tokenization purposes.
http://www.unicode.org/reports/tr29/
Note: The text of tr29 is a living standard, so it keeps
Adaptions:
* Added optional WB5a as a configurable option. The cut of prefix is
- max FTS_WB5A_PREFIX chars.
+ max LANG_WB5A_PREFIX_MAX_LENGTH chars.
* No word boundary at Start-Of-Text or End-of-Text (WB1 and WB2).
* Break just once, not before and after.
* Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
to assist in finding individual words.
*/
static bool
-uni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)
+uni_found_word_boundary(struct generic_lang_tokenizer *tok, enum letter_type lt)
{
/* No rule knows what to do with just one char, except the linebreaks
we eat away (above) anyway. */
}
static int
-fts_tokenizer_generic_tr29_next(struct fts_tokenizer *_tok,
- const unsigned char *data, size_t size,
- size_t *skip_r, const char **token_r,
- const char **error_r ATTR_UNUSED)
+lang_tokenizer_generic_tr29_next(struct lang_tokenizer *_tok,
+ const unsigned char *data, size_t size,
+ size_t *skip_r, const char **token_r,
+ const char **error_r ATTR_UNUSED)
{
- struct generic_fts_tokenizer *tok =
- container_of(_tok, struct generic_fts_tokenizer, tokenizer);
+ struct generic_lang_tokenizer *tok =
+ container_of(_tok, struct generic_lang_tokenizer, tokenizer);
unichar_t c;
size_t i, char_start_i, start_pos;
enum letter_type lt;
continue;
}
- if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
+ if (tok->wb5a && tok->token->used <= LANG_WB5A_PREFIX_MAX_LENGTH)
add_letter(tok, c);
if (uni_found_word_boundary(tok, lt)) {
tok_append_truncated(tok, data + start_pos,
char_start_i - start_pos);
if (lt == LETTER_TYPE_PREFIXSPLAT && tok->prefixsplat) {
- const unsigned char prefix_char = FTS_PREFIX_SPLAT_CHAR;
+ const unsigned char prefix_char = LANG_PREFIX_SPLAT_CHAR;
tok_append_truncated(tok, &prefix_char, 1);
}
*skip_r = i;
- fts_tokenizer_generic_tr29_current_token(tok, token_r);
+ lang_tokenizer_generic_tr29_current_token(tok, token_r);
return 1;
} else if (lt == LETTER_TYPE_APOSTROPHE ||
lt == LETTER_TYPE_SINGLE_QUOTE) {
if (size == 0 && tok->token->used > 0) {
/* return the last token */
*skip_r = 0;
- fts_tokenizer_generic_tr29_current_token(tok, token_r);
+ lang_tokenizer_generic_tr29_current_token(tok, token_r);
return 1;
}
return 0;
}
static int
-fts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
- const unsigned char *data ATTR_UNUSED,
- size_t size ATTR_UNUSED,
- size_t *skip_r ATTR_UNUSED,
- const char **token_r ATTR_UNUSED,
- const char **error_r ATTR_UNUSED)
+lang_tokenizer_generic_next(struct lang_tokenizer *_tok ATTR_UNUSED,
+ const unsigned char *data ATTR_UNUSED,
+ size_t size ATTR_UNUSED,
+ size_t *skip_r ATTR_UNUSED,
+ const char **token_r ATTR_UNUSED,
+ const char **error_r ATTR_UNUSED)
{
i_unreached();
}
-static const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs = {
- fts_tokenizer_generic_create,
- fts_tokenizer_generic_destroy,
- fts_tokenizer_generic_reset,
- fts_tokenizer_generic_next
+static const struct lang_tokenizer_vfuncs generic_tokenizer_vfuncs = {
+ lang_tokenizer_generic_create,
+ lang_tokenizer_generic_destroy,
+ lang_tokenizer_generic_reset,
+ lang_tokenizer_generic_next
};
-static const struct fts_tokenizer fts_tokenizer_generic_real = {
+static const struct lang_tokenizer lang_tokenizer_generic_real = {
.name = "generic",
.v = &generic_tokenizer_vfuncs
};
-const struct fts_tokenizer *fts_tokenizer_generic = &fts_tokenizer_generic_real;
+const struct lang_tokenizer *lang_tokenizer_generic = &lang_tokenizer_generic_real;
-const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
- fts_tokenizer_generic_create,
- fts_tokenizer_generic_destroy,
- fts_tokenizer_generic_reset,
- fts_tokenizer_generic_simple_next
+const struct lang_tokenizer_vfuncs generic_tokenizer_vfuncs_simple = {
+ lang_tokenizer_generic_create,
+ lang_tokenizer_generic_destroy,
+ lang_tokenizer_generic_reset,
+ lang_tokenizer_generic_simple_next
};
-const struct fts_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
- fts_tokenizer_generic_create,
- fts_tokenizer_generic_destroy,
- fts_tokenizer_generic_reset,
- fts_tokenizer_generic_tr29_next
+const struct lang_tokenizer_vfuncs generic_tokenizer_vfuncs_tr29 = {
+ lang_tokenizer_generic_create,
+ lang_tokenizer_generic_destroy,
+ lang_tokenizer_generic_reset,
+ lang_tokenizer_generic_tr29_next
};
-#ifndef FTS_TOKENIZER_PRIVATE_H
-#define FTS_TOKENIZER_PRIVATE_H
+#ifndef LANG_TOKENIZER_PRIVATE_H
+#define LANG_TOKENIZER_PRIVATE_H
#include "lang-tokenizer.h"
-#define FTS_TOKENIZER_CLASSES_NR 2
+#define LANG_TOKENIZER_CLASSES_NR 2
-struct fts_tokenizer_vfuncs {
+struct lang_tokenizer_vfuncs {
int (*create)(const char *const *settings,
- struct fts_tokenizer **tokenizer_r, const char **error_r);
- void (*destroy)(struct fts_tokenizer *tok);
+ struct lang_tokenizer **tokenizer_r, const char **error_r);
+ void (*destroy)(struct lang_tokenizer *tok);
- void (*reset)(struct fts_tokenizer *tok);
- int (*next)(struct fts_tokenizer *tok, const unsigned char *data,
+ void (*reset)(struct lang_tokenizer *tok);
+ int (*next)(struct lang_tokenizer *tok, const unsigned char *data,
size_t size, size_t *skip_r, const char **token_r,
const char **error_r);
};
-enum fts_tokenizer_parent_state {
- FTS_TOKENIZER_PARENT_STATE_ADD_DATA = 0,
- FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT,
- FTS_TOKENIZER_PARENT_STATE_FINALIZE
+enum lang_tokenizer_parent_state {
+ LANG_TOKENIZER_PARENT_STATE_ADD_DATA = 0,
+ LANG_TOKENIZER_PARENT_STATE_NEXT_OUTPUT,
+ LANG_TOKENIZER_PARENT_STATE_FINALIZE
};
-struct fts_tokenizer {
+struct lang_tokenizer {
const char *name;
- const struct fts_tokenizer_vfuncs *v;
+ const struct lang_tokenizer_vfuncs *v;
int refcount;
- struct fts_tokenizer *parent;
+ struct lang_tokenizer *parent;
buffer_t *parent_input;
- enum fts_tokenizer_parent_state parent_state;
+ enum lang_tokenizer_parent_state parent_state;
const unsigned char *prev_data;
size_t prev_size;
bool finalize_parent_pending;
};
-void fts_tokenizer_register(const struct fts_tokenizer *tok_class);
-void fts_tokenizer_unregister(const struct fts_tokenizer *tok_class);
+void lang_tokenizer_register(const struct lang_tokenizer *tok_class);
+void lang_tokenizer_unregister(const struct lang_tokenizer *tok_class);
#endif
#include "lang-tokenizer.h"
#include "lang-tokenizer-private.h"
-static ARRAY(const struct fts_tokenizer *) fts_tokenizer_classes;
+static ARRAY(const struct lang_tokenizer *) lang_tokenizer_classes;
-void fts_tokenizers_init(void)
+void lang_tokenizers_init(void)
{
- if (!array_is_created(&fts_tokenizer_classes)) {
- fts_tokenizer_register(fts_tokenizer_generic);
- fts_tokenizer_register(fts_tokenizer_email_address);
+ if (!array_is_created(&lang_tokenizer_classes)) {
+ lang_tokenizer_register(lang_tokenizer_generic);
+ lang_tokenizer_register(lang_tokenizer_email_address);
}
}
-void fts_tokenizers_deinit(void)
+void lang_tokenizers_deinit(void)
{
- if (array_is_created(&fts_tokenizer_classes))
- array_free(&fts_tokenizer_classes);
+ if (array_is_created(&lang_tokenizer_classes))
+ array_free(&lang_tokenizer_classes);
}
/* private */
-void fts_tokenizer_register(const struct fts_tokenizer *tok_class)
+void lang_tokenizer_register(const struct lang_tokenizer *tok_class)
{
- if (!array_is_created(&fts_tokenizer_classes))
- i_array_init(&fts_tokenizer_classes, FTS_TOKENIZER_CLASSES_NR);
- array_push_back(&fts_tokenizer_classes, &tok_class);
+ if (!array_is_created(&lang_tokenizer_classes))
+ i_array_init(&lang_tokenizer_classes, LANG_TOKENIZER_CLASSES_NR);
+ array_push_back(&lang_tokenizer_classes, &tok_class);
}
/* private */
/* Remove the first registered class whose name equals tok_class->name.
   Matching is done by name (strcmp), not by pointer identity. When the last
   class is removed the array itself is freed. The function returns right
   after the deletion, so iteration never continues over the modified array.
   Falling out of the loop without a match reaches i_unreached(): asking to
   unregister a class that was never registered is treated as a programming
   error. */
-void fts_tokenizer_unregister(const struct fts_tokenizer *tok_class)
+void lang_tokenizer_unregister(const struct lang_tokenizer *tok_class)
{
- const struct fts_tokenizer *const *tp;
+ const struct lang_tokenizer *const *tp;
unsigned int idx;
- array_foreach(&fts_tokenizer_classes, tp) {
+ array_foreach(&lang_tokenizer_classes, tp) {
if (strcmp((*tp)->name, tok_class->name) == 0) {
- idx = array_foreach_idx(&fts_tokenizer_classes, tp);
- array_delete(&fts_tokenizer_classes, idx, 1);
- if (array_count(&fts_tokenizer_classes) == 0)
- array_free(&fts_tokenizer_classes);
+ idx = array_foreach_idx(&lang_tokenizer_classes, tp);
+ array_delete(&lang_tokenizer_classes, idx, 1);
+ if (array_count(&lang_tokenizer_classes) == 0)
+ array_free(&lang_tokenizer_classes);
return;
}
}
i_unreached();
}
/* Look up a registered tokenizer class by exact name (strcmp). Returns the
   first matching class, or NULL when no registered class has that name. */
-const struct fts_tokenizer *fts_tokenizer_find(const char *name)
+const struct lang_tokenizer *lang_tokenizer_find(const char *name)
{
- const struct fts_tokenizer *tok;
+ const struct lang_tokenizer *tok;
- array_foreach_elem(&fts_tokenizer_classes, tok) {
+ array_foreach_elem(&lang_tokenizer_classes, tok) {
if (strcmp(tok->name, name) == 0)
return tok;
}
return NULL;
}
/* Return the tokenizer's name field. The returned pointer is the stored
   string itself, not a copy. */
-const char *fts_tokenizer_name(const struct fts_tokenizer *tok)
+const char *lang_tokenizer_name(const struct lang_tokenizer *tok)
{
return tok->name;
}
/* Reset the wrapper-level input tracking of this tokenizer: forget the
   previously seen data block (prev_data/prev_size) and mark the previous
   reply as finished. This does not call the tokenizer class's own reset
   callback. */
-static void fts_tokenizer_self_reset(struct fts_tokenizer *tok)
+static void lang_tokenizer_self_reset(struct lang_tokenizer *tok)
{
tok->prev_data = NULL;
tok->prev_size = 0;
tok->prev_reply_finished = TRUE;
}
-int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
- struct fts_tokenizer *parent,
- const char *const *settings,
- struct fts_tokenizer **tokenizer_r,
- const char **error_r)
+int lang_tokenizer_create(const struct lang_tokenizer *tok_class,
+ struct lang_tokenizer *parent,
+ const char *const *settings,
+ struct lang_tokenizer **tokenizer_r,
+ const char **error_r)
{
- struct fts_tokenizer *tok;
+ struct lang_tokenizer *tok;
const char *empty_settings = NULL;
i_assert(settings == NULL || str_array_length(settings) % 2 == 0);
return -1;
}
tok->refcount = 1;
- fts_tokenizer_self_reset(tok);
+ lang_tokenizer_self_reset(tok);
if (parent != NULL) {
- fts_tokenizer_ref(parent);
+ lang_tokenizer_ref(parent);
tok->parent = parent;
tok->parent_input = buffer_create_dynamic(default_pool, 128);
}
return 0;
}
/* Take one additional reference to the tokenizer. The tokenizer must
   already hold at least one reference (asserted). */
-void fts_tokenizer_ref(struct fts_tokenizer *tok)
+void lang_tokenizer_ref(struct lang_tokenizer *tok)
{
i_assert(tok->refcount > 0);
tok->refcount++;
}
-void fts_tokenizer_unref(struct fts_tokenizer **_tok)
+void lang_tokenizer_unref(struct lang_tokenizer **_tok)
{
- struct fts_tokenizer *tok = *_tok;
+ struct lang_tokenizer *tok = *_tok;
i_assert(tok->refcount > 0);
*_tok = NULL;
buffer_free(&tok->parent_input);
if (tok->parent != NULL)
- fts_tokenizer_unref(&tok->parent);
+ lang_tokenizer_unref(&tok->parent);
tok->v->destroy(tok);
}
static int
-fts_tokenizer_next_self(struct fts_tokenizer *tok,
- const unsigned char *data, size_t size,
- const char **token_r, const char **error_r)
+lang_tokenizer_next_self(struct lang_tokenizer *tok,
+ const unsigned char *data, size_t size,
+ const char **token_r, const char **error_r)
{
int ret = 0;
size_t skip = 0;
} else if (ret == 0) {
/* Need more data to get the next token. The next call will
provide a whole new data block, so reset the prev_* state. */
- fts_tokenizer_self_reset(tok);
+ lang_tokenizer_self_reset(tok);
}
return ret;
}
/* Reset the tokenizer: first invoke the class-specific reset callback, then
   clear the wrapper's own prev_* input tracking state. */
-void fts_tokenizer_reset(struct fts_tokenizer *tok)
+void lang_tokenizer_reset(struct lang_tokenizer *tok)
{
tok->v->reset(tok);
- fts_tokenizer_self_reset(tok);
+ lang_tokenizer_self_reset(tok);
}
-int fts_tokenizer_next(struct fts_tokenizer *tok,
- const unsigned char *data, size_t size,
- const char **token_r, const char **error_r)
+int lang_tokenizer_next(struct lang_tokenizer *tok,
+ const unsigned char *data, size_t size,
+ const char **token_r, const char **error_r)
{
int ret;
switch (tok->parent_state) {
- case FTS_TOKENIZER_PARENT_STATE_ADD_DATA:
+ case LANG_TOKENIZER_PARENT_STATE_ADD_DATA:
/* Try to get the next token using this tokenizer */
- ret = fts_tokenizer_next_self(tok, data, size, token_r, error_r);
+ ret = lang_tokenizer_next_self(tok, data, size, token_r, error_r);
if (ret <= 0) {
/* error / more data needed */
if (ret == 0 && size == 0 &&
tokenizer still needs to be finalized. */
tok->finalize_parent_pending = FALSE;
tok->parent_state =
- FTS_TOKENIZER_PARENT_STATE_FINALIZE;
- return fts_tokenizer_next(tok, NULL, 0, token_r, error_r);
+ LANG_TOKENIZER_PARENT_STATE_FINALIZE;
+ return lang_tokenizer_next(tok, NULL, 0, token_r, error_r);
}
break;
}
buffer_append(tok->parent_input, *token_r, strlen(*token_r));
tok->parent_state++;
/* fall through */
- case FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
+ case LANG_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
/* Return the next token from parent tokenizer */
- ret = fts_tokenizer_next(tok->parent, tok->parent_input->data,
+ ret = lang_tokenizer_next(tok->parent, tok->parent_input->data,
tok->parent_input->used, token_r, error_r);
if (ret != 0)
break;
tok->parent_state++;
/* fall through */
- case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
+ case LANG_TOKENIZER_PARENT_STATE_FINALIZE:
/* No more input is coming from the child tokenizer. Return the
final token(s) from the parent tokenizer. */
if (!tok->stream_to_parents || size == 0) {
- ret = fts_tokenizer_next(tok->parent, NULL, 0,
+ ret = lang_tokenizer_next(tok->parent, NULL, 0,
token_r, error_r);
if (ret != 0)
break;
/* We're finished handling the previous child token. See if
there are more child tokens available with this same data
input. */
- tok->parent_state = FTS_TOKENIZER_PARENT_STATE_ADD_DATA;
- return fts_tokenizer_next(tok, data, size, token_r, error_r);
+ tok->parent_state = LANG_TOKENIZER_PARENT_STATE_ADD_DATA;
+ return lang_tokenizer_next(tok, data, size, token_r, error_r);
default:
i_unreached();
}
return ret;
}
/* Flush variant of the "next" call: identical to calling it with
   data=NULL and size=0, and returns whatever that call returns. */
-int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
- const char **error_r)
+int lang_tokenizer_final(struct lang_tokenizer *tok, const char **token_r,
+ const char **error_r)
{
- return fts_tokenizer_next(tok, NULL, 0, token_r, error_r);
+ return lang_tokenizer_next(tok, NULL, 0, token_r, error_r);
}
-#ifndef FTS_TOKENIZER_H
-#define FTS_TOKENIZER_H
+#ifndef LANG_TOKENIZER_H
+#define LANG_TOKENIZER_H
/*
Settings are given in the form of a const char * const *settings =
"search" Remove addresses from parent data stream, so they are not processed
further. Defaults to disabled. Enable by defining the keyword (and any
value). */
-extern const struct fts_tokenizer *fts_tokenizer_email_address;
+extern const struct lang_tokenizer *lang_tokenizer_email_address;
/* Generic email content tokenizer. Cuts text into tokens. */
/* Settings:
"maxlen" Maximum length of token, before an arbitrary cut off is made.
- Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.
+ Defaults to LANG_DEFAULT_TOKEN_MAX_LENGTH.
"algorithm", accepted values are "simple" or "tr29". Defines the
method for looking for word boundaries. Simple is faster and will
is also significantly slower than simple. The algorithms also
differ in some details, e.g. simple will cut "a.b" and tr29 will
not. The default is "simple" */
-extern const struct fts_tokenizer *fts_tokenizer_generic;
+extern const struct lang_tokenizer *lang_tokenizer_generic;
/*
Tokenizing workflow, find --> create --> filter --> destroy.
*/
/* Register all built-in tokenizers. */
-void fts_tokenizers_init(void);
-void fts_tokenizers_deinit(void);
+void lang_tokenizers_init(void);
+void lang_tokenizers_deinit(void);
-const struct fts_tokenizer *fts_tokenizer_find(const char *name);
+const struct lang_tokenizer *lang_tokenizer_find(const char *name);
/* Create a new tokenizer. The settings are described above. */
-int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
- struct fts_tokenizer *parent,
- const char *const *settings,
- struct fts_tokenizer **tokenizer_r,
- const char **error_r);
-void fts_tokenizer_ref(struct fts_tokenizer *tok);
-void fts_tokenizer_unref(struct fts_tokenizer **tok);
+int lang_tokenizer_create(const struct lang_tokenizer *tok_class,
+ struct lang_tokenizer *parent,
+ const char *const *settings,
+ struct lang_tokenizer **tokenizer_r,
+ const char **error_r);
+void lang_tokenizer_ref(struct lang_tokenizer *tok);
+void lang_tokenizer_unref(struct lang_tokenizer **tok);
-/* Reset FTS tokenizer state */
-void fts_tokenizer_reset(struct fts_tokenizer *tok);
+/* Reset lang tokenizer state */
+void lang_tokenizer_reset(struct lang_tokenizer *tok);
/*
Returns 1 if *token_r was returned, 0 if more data is needed, -1 on error.
This function should be called with the same data+size until it
- returns 0. After that fts_tokenizer_final() should be called until it
+ returns 0. After that lang_tokenizer_final() should be called until it
returns 0 to flush out the final token(s).
data must contain only valid complete UTF-8 sequences, but otherwise it
may be broken into however small pieces. (Input to this function typically
comes from message-decoder, which returns only complete UTF-8 sequences.) */
-int fts_tokenizer_next(struct fts_tokenizer *tok,
- const unsigned char *data, size_t size,
- const char **token_r, const char **error_r);
-/* Returns same as fts_tokenizer_next(). */
-int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
- const char **error_r);
+int lang_tokenizer_next(struct lang_tokenizer *tok,
+ const unsigned char *data, size_t size,
+ const char **token_r, const char **error_r);
+/* Returns same as lang_tokenizer_next(). */
+int lang_tokenizer_final(struct lang_tokenizer *tok, const char **token_r,
+ const char **error_r);
-const char *fts_tokenizer_name(const struct fts_tokenizer *tok);
+const char *lang_tokenizer_name(const struct lang_tokenizer *tok);
#endif
#define DETECT_STR_MAX_LEN 200
-struct fts_textcat {
+struct textcat {
int refcount;
void *handle;
char *config_path, *data_dir, *failed;
};
-struct fts_language_list {
+struct language_list {
pool_t pool;
- ARRAY_TYPE(fts_language) languages;
- struct fts_textcat *textcat;
+ ARRAY_TYPE(language) languages;
+ struct textcat *textcat;
const char *textcat_config;
const char *textcat_datadir;
};
-pool_t fts_languages_pool;
-ARRAY_TYPE(fts_language) fts_languages;
+pool_t languages_pool;
+ARRAY_TYPE(language) languages;
#ifdef HAVE_FTS_EXTTEXTCAT
-static struct fts_textcat *fts_textcat_cache = NULL;
+static struct textcat *textcat_cache = NULL;
#endif
/* ISO 639-1 alpha 2 codes for languages */
-const struct fts_language fts_languages_builtin [] = {
+const struct language languages_builtin [] = {
{ "da" }, /* Danish */
{ "de" }, /* German */
{ "en" }, /* English */
{ "tr" }, /* Turkish */
};
-const struct fts_language fts_language_data = {
+const struct language language_data = {
"data"
};
#ifdef HAVE_FTS_EXTTEXTCAT
-static void fts_textcat_unref(struct fts_textcat *textcat)
+static void textcat_unref(struct textcat *textcat)
{
i_assert(textcat->refcount > 0);
if (--textcat->refcount > 0)
return;
- if (textcat == fts_textcat_cache)
- fts_textcat_cache = NULL;
+ if (textcat == textcat_cache)
+ textcat_cache = NULL;
i_free(textcat->config_path);
i_free(textcat->data_dir);
}
#endif
/* Create the global languages pool and the global languages array, then
   fill the array with pointers to each entry of the built-in language table.
   The array stores pointers into the static table; the entries themselves
   are not copied. */
-void fts_languages_init(void)
+void languages_init(void)
{
unsigned int i;
- const struct fts_language *lp;
-
- fts_languages_pool = pool_alloconly_create("fts_language",
- sizeof(fts_languages_builtin));
- p_array_init(&fts_languages, fts_languages_pool,
- N_ELEMENTS(fts_languages_builtin));
- for (i = 0; i < N_ELEMENTS(fts_languages_builtin); i++){
- lp = &fts_languages_builtin[i];
- array_push_back(&fts_languages, &lp);
+ const struct language *lp;
+
+ languages_pool = pool_alloconly_create("language",
+ sizeof(languages_builtin));
+ p_array_init(&languages, languages_pool, N_ELEMENTS(languages_builtin));
+ for (i = 0; i < N_ELEMENTS(languages_builtin); i++){
+ lp = &languages_builtin[i];
+ array_push_back(&languages, &lp);
}
}
/* Tear down the global language state. When built with
   HAVE_FTS_EXTTEXTCAT, the cached textcat reference is dropped first (if one
   exists); then the languages pool is unreferenced. */
-void fts_languages_deinit(void)
+void languages_deinit(void)
{
#ifdef HAVE_FTS_EXTTEXTCAT
- if (fts_textcat_cache != NULL)
- fts_textcat_unref(fts_textcat_cache);
+ if (textcat_cache != NULL)
+ textcat_unref(textcat_cache);
#endif
- pool_unref(&fts_languages_pool);
+ pool_unref(&languages_pool);
}
/* Add a language called `name` to the global list of known languages.
   Nothing happens if a language with that name is already known. Both the
   new language struct and its copy of the name are allocated from the
   global languages pool. */
-void fts_language_register(const char *name)
+void language_register(const char *name)
{
- struct fts_language *lang;
+ struct language *lang;
- if (fts_language_find(name) != NULL)
+ if (language_find(name) != NULL)
return;
- lang = p_new(fts_languages_pool, struct fts_language, 1);
- lang->name = p_strdup(fts_languages_pool, name);
- array_push_back(&fts_languages, (const struct fts_language **)&lang);
+ lang = p_new(languages_pool, struct language, 1);
+ lang->name = p_strdup(languages_pool, name);
+ array_push_back(&languages, (const struct language **)&lang);
}
/* Search the global list of known languages for an exact name match
   (strcmp). Returns the matching language, or NULL if none is found. */
-const struct fts_language *fts_language_find(const char *name)
+const struct language *language_find(const char *name)
{
- const struct fts_language *lang;
+ const struct language *lang;
- array_foreach_elem(&fts_languages, lang) {
+ array_foreach_elem(&languages, lang) {
if (strcmp(lang->name, name) == 0)
return lang;
}
return NULL;
}
-int fts_language_list_init(const char *const *settings,
- struct fts_language_list **list_r,
- const char **error_r)
+int language_list_init(const char *const *settings,
+ struct language_list **list_r,
+ const char **error_r)
{
- struct fts_language_list *lp;
+ struct language_list *lp;
pool_t pool;
unsigned int i;
const char *conf = NULL, *data = NULL;
}
}
- pool = pool_alloconly_create("fts_language_list", 128);
- lp = p_new(pool, struct fts_language_list, 1);
+ pool = pool_alloconly_create("language_list", 128);
+ lp = p_new(pool, struct language_list, 1);
lp->pool = pool;
if (conf != NULL)
lp->textcat_config = p_strdup(pool, conf);
return 0;
}
/* Destroy a language list. The caller's pointer is set to NULL before
   anything is released. When built with HAVE_FTS_EXTTEXTCAT, the list's
   textcat reference (if any) is dropped; finally the list's pool is
   unreferenced. */
-void fts_language_list_deinit(struct fts_language_list **list)
+void language_list_deinit(struct language_list **list)
{
- struct fts_language_list *lp = *list;
+ struct language_list *lp = *list;
*list = NULL;
#ifdef HAVE_FTS_EXTTEXTCAT
if (lp->textcat != NULL)
- fts_textcat_unref(lp->textcat);
+ textcat_unref(lp->textcat);
#endif
pool_unref(&lp->pool);
}
-static const struct fts_language *
-fts_language_list_find(struct fts_language_list *list, const char *name)
+static const struct language *
+language_list_find(struct language_list *list, const char *name)
{
- const struct fts_language *lang;
+ const struct language *lang;
array_foreach_elem(&list->languages, lang) {
if (strcmp(lang->name, name) == 0)
return NULL;
}
/* Append `lang` to the list of wanted languages. Only the pointer is
   stored. Adding a language whose name is already in the list is a caller
   bug and trips the assertion. */
-void fts_language_list_add(struct fts_language_list *list,
- const struct fts_language *lang)
+void language_list_add(struct language_list *list,
+ const struct language *lang)
{
- i_assert(fts_language_list_find(list, lang->name) == NULL);
+ i_assert(language_list_find(list, lang->name) == NULL);
array_push_back(&list->languages, &lang);
}
/* Add every language named in `names` to the list of wanted languages.
   `names` is split with t_strsplit_spaces() using ", " as the separator set,
   i.e. on commas and spaces. Each name must be a known language (looked up
   in the global list); names already present in `list` are skipped.
   Returns TRUE when all names were known. On the first unknown name, sets
   *unknown_name_r to it and returns FALSE; languages added before that point
   stay in the list. *unknown_name_r is not written on success. */
-bool fts_language_list_add_names(struct fts_language_list *list,
- const char *names,
- const char **unknown_name_r)
+bool language_list_add_names(struct language_list *list,
+ const char *names,
+ const char **unknown_name_r)
{
const char *const *langs;
- const struct fts_language *lang;
+ const struct language *lang;
for (langs = t_strsplit_spaces(names, ", "); *langs != NULL; langs++) {
- lang = fts_language_find(*langs);
+ lang = language_find(*langs);
if (lang == NULL) {
/* unknown language */
*unknown_name_r = *langs;
return FALSE;
}
- if (fts_language_list_find(list, lang->name) == NULL)
- fts_language_list_add(list, lang);
+ if (language_list_find(list, lang->name) == NULL)
+ language_list_add(list, lang);
}
return TRUE;
}
/* Return a pointer to the list's own array of wanted languages. No copy is
   made, so the result refers to storage owned by `list`. */
-const ARRAY_TYPE(fts_language) *
-fts_language_list_get_all(struct fts_language_list *list)
+const ARRAY_TYPE(language) *
+language_list_get_all(struct language_list *list)
{
return &list->languages;
}
/* Return the first language in the list of wanted languages.
   NOTE(review): array_front() is presumably not valid on an empty array, so
   callers should add at least one language first - confirm. */
-const struct fts_language *
-fts_language_list_get_first(struct fts_language_list *list)
+const struct language *
+language_list_get_first(struct language_list *list)
{
- const struct fts_language *const *langp;
+ const struct language *const *langp;
langp = array_front(&list->languages);
return *langp;
}
#ifdef HAVE_FTS_EXTTEXTCAT
-static bool fts_language_match_lists(struct fts_language_list *list,
- candidate_t *candp, int candp_len,
- const struct fts_language **lang_r)
+static bool language_match_lists(struct language_list *list,
+ candidate_t *candp, int candp_len,
+ const struct language **lang_r)
{
const char *name;
/* For Norwegian we treat both bokmal and nynorsk as "no". */
if (strcmp(name, "nb") == 0 || strcmp(name, "nn") == 0)
name = "no";
- if ((*lang_r = fts_language_list_find(list, name)) != NULL)
+ if ((*lang_r = language_list_find(list, name)) != NULL)
return TRUE;
}
return FALSE;
#endif
#ifdef HAVE_FTS_EXTTEXTCAT
-static int fts_language_textcat_init(struct fts_language_list *list,
- const char **error_r)
+static int language_textcat_init(struct language_list *list,
+ const char **error_r)
{
const char *config_path;
const char *data_dir;
TEXTCAT_DATADIR"/fpdb.conf";
data_dir = list->textcat_datadir != NULL ? list->textcat_datadir :
TEXTCAT_DATADIR"/";
- if (fts_textcat_cache != NULL) {
- if (strcmp(fts_textcat_cache->config_path, config_path) == 0 &&
- strcmp(fts_textcat_cache->data_dir, data_dir) == 0) {
- list->textcat = fts_textcat_cache;
+ if (textcat_cache != NULL) {
+ if (strcmp(textcat_cache->config_path, config_path) == 0 &&
+ strcmp(textcat_cache->data_dir, data_dir) == 0) {
+ list->textcat = textcat_cache;
list->textcat->refcount++;
return 0;
}
- fts_textcat_unref(fts_textcat_cache);
+ textcat_unref(textcat_cache);
}
- fts_textcat_cache = list->textcat = i_new(struct fts_textcat, 1);
- fts_textcat_cache->refcount = 2;
- fts_textcat_cache->config_path = i_strdup(config_path);
- fts_textcat_cache->data_dir = i_strdup(data_dir);
- fts_textcat_cache->handle = special_textcat_Init(config_path, data_dir);
- if (fts_textcat_cache->handle == NULL) {
- fts_textcat_cache->failed = i_strdup_printf(
+ textcat_cache = list->textcat = i_new(struct textcat, 1);
+ textcat_cache->refcount = 2;
+ textcat_cache->config_path = i_strdup(config_path);
+ textcat_cache->data_dir = i_strdup(data_dir);
+ textcat_cache->handle = special_textcat_Init(config_path, data_dir);
+ if (textcat_cache->handle == NULL) {
+ textcat_cache->failed = i_strdup_printf(
"special_textcat_Init(%s, %s) failed",
config_path, data_dir);
- *error_r = fts_textcat_cache->failed;
+ *error_r = textcat_cache->failed;
return -1;
}
/* The textcat minimum document size could be set here. It
}
#endif
-static enum fts_language_result
-fts_language_detect_textcat(struct fts_language_list *list ATTR_UNUSED,
- const unsigned char *text ATTR_UNUSED,
- size_t size ATTR_UNUSED,
- const struct fts_language **lang_r ATTR_UNUSED,
- const char **error_r ATTR_UNUSED)
+static enum language_result
+language_detect_textcat(struct language_list *list ATTR_UNUSED,
+ const unsigned char *text ATTR_UNUSED,
+ size_t size ATTR_UNUSED,
+ const struct language **lang_r ATTR_UNUSED,
+ const char **error_r ATTR_UNUSED)
{
#ifdef HAVE_FTS_EXTTEXTCAT
candidate_t *candp; /* textcat candidate result array pointer */
int cnt;
bool match = FALSE;
- if (fts_language_textcat_init(list, error_r) < 0)
- return FTS_LANGUAGE_RESULT_ERROR;
+ if (language_textcat_init(list, error_r) < 0)
+ return LANGUAGE_RESULT_ERROR;
candp = textcat_GetClassifyFullOutput(list->textcat->handle);
if (candp == NULL)
I_MIN(size, DETECT_STR_MAX_LEN), candp);
if (cnt > 0) {
T_BEGIN {
- match = fts_language_match_lists(list, candp, cnt, lang_r);
+ match = language_match_lists(list, candp, cnt, lang_r);
} T_END;
textcat_ReleaseClassifyFullOutput(list->textcat->handle, candp);
if (match)
- return FTS_LANGUAGE_RESULT_OK;
+ return LANGUAGE_RESULT_OK;
else
- return FTS_LANGUAGE_RESULT_UNKNOWN;
+ return LANGUAGE_RESULT_UNKNOWN;
} else {
textcat_ReleaseClassifyFullOutput(list->textcat->handle, candp);
switch (cnt) {
case TEXTCAT_RESULT_SHORT:
i_assert(size < DETECT_STR_MAX_LEN);
- return FTS_LANGUAGE_RESULT_SHORT;
+ return LANGUAGE_RESULT_SHORT;
case TEXTCAT_RESULT_UNKNOWN:
- return FTS_LANGUAGE_RESULT_UNKNOWN;
+ return LANGUAGE_RESULT_UNKNOWN;
default:
i_unreached();
}
}
#else
- return FTS_LANGUAGE_RESULT_UNKNOWN;
+ return LANGUAGE_RESULT_UNKNOWN;
#endif
}
/* Decide which of the list's wanted languages the given text is written in.
   The list must contain at least one language (asserted). If it contains
   exactly one, that language is stored in *lang_r and the OK result is
   returned without examining the text. Otherwise the decision, the result
   code, *lang_r and *error_r are all left to the textcat-based detector. */
-enum fts_language_result
-fts_language_detect(struct fts_language_list *list,
- const unsigned char *text ATTR_UNUSED,
- size_t size ATTR_UNUSED,
- const struct fts_language **lang_r,
- const char **error_r)
+enum language_result
+language_detect(struct language_list *list,
+ const unsigned char *text ATTR_UNUSED,
+ size_t size ATTR_UNUSED,
+ const struct language **lang_r,
+ const char **error_r)
{
i_assert(array_count(&list->languages) > 0);
/* if there's only a single wanted language, return it always. */
if (array_count(&list->languages) == 1) {
- const struct fts_language *const *langp =
+ const struct language *const *langp =
array_front(&list->languages);
*lang_r = *langp;
- return FTS_LANGUAGE_RESULT_OK;
+ return LANGUAGE_RESULT_OK;
}
- return fts_language_detect_textcat(list, text, size, lang_r, error_r);
+ return language_detect_textcat(list, text, size, lang_r, error_r);
}
-#ifndef FTS_LANGUAGE_H
-#define FTS_LANGUAGE_H
+#ifndef LANGUAGE_H
+#define LANGUAGE_H
-struct fts_language_list;
+struct language_list;
-enum fts_language_result {
+enum language_result {
/* Provided sample is too short. */
- FTS_LANGUAGE_RESULT_SHORT,
+ LANGUAGE_RESULT_SHORT,
/* Language is unknown or not in the provided list. */
- FTS_LANGUAGE_RESULT_UNKNOWN,
+ LANGUAGE_RESULT_UNKNOWN,
- FTS_LANGUAGE_RESULT_OK,
+ LANGUAGE_RESULT_OK,
/* textcat library initialization failed. */
- FTS_LANGUAGE_RESULT_ERROR
+ LANGUAGE_RESULT_ERROR
};
-struct fts_language {
+struct language {
/* Two-letter language name lowercased, e.g. "en" */
const char *name;
};
-ARRAY_DEFINE_TYPE(fts_language, const struct fts_language *);
+ARRAY_DEFINE_TYPE(language, const struct language *);
/* Used for raw data that is indexed. This data shouldn't go through any
language-specific filters. */
-extern const struct fts_language fts_language_data;
+extern const struct language language_data;
/*
Language module API.
*/
-void fts_languages_init(void);
-void fts_languages_deinit(void);
+void languages_init(void);
+void languages_deinit(void);
/* Add a language to the list of supported languages. */
-void fts_language_register(const char *name);
+void language_register(const char *name);
/* Find a language by name. The lookup searches the internal list of
supported languages and returns NULL if the name is not found. */
-const struct fts_language *fts_language_find(const char *name);
+const struct language *language_find(const char *name);
/*
Language list API
*/
-int fts_language_list_init(const char *const *settings,
- struct fts_language_list **list_r,
- const char **error_r);
-void fts_language_list_deinit(struct fts_language_list **list);
+int language_list_init(const char *const *settings,
+ struct language_list **list_r,
+ const char **error_r);
+void language_list_deinit(struct language_list **list);
/* Add a language to the list of wanted languages. */
-void fts_language_list_add(struct fts_language_list *list,
- const struct fts_language *lang);
+void language_list_add(struct language_list *list,
+ const struct language *lang);
/* Add wanted languages from a list of language names separated by spaces
and/or commas. Duplicates are ignored. Returns TRUE if ok; returns FALSE and
sets unknown_name_r if an unknown language was found in the list. */
-bool fts_language_list_add_names(struct fts_language_list *list,
- const char *names,
- const char **unknown_name_r);
+bool language_list_add_names(struct language_list *list,
+ const char *names,
+ const char **unknown_name_r);
/* Return an array of all wanted languages. */
-const ARRAY_TYPE(fts_language) *
-fts_language_list_get_all(struct fts_language_list *list);
+const ARRAY_TYPE(language) * language_list_get_all(struct language_list *list);
/* Returns the first wanted language (default language). */
-const struct fts_language *
-fts_language_list_get_first(struct fts_language_list *list);
+const struct language *
+language_list_get_first(struct language_list *list);
/* If text was detected to be one of the languages in the list,
- returns FTS_LANGUAGE_RESULT_OK and (a pointer to) the language (in
- the list). error_r is set for FTS_LANGUAGE_RESULT_ERROR. */
-enum fts_language_result
-fts_language_detect(struct fts_language_list *list,
- const unsigned char *text, size_t size,
- const struct fts_language **lang_r,
- const char **error_r);
+ returns LANGUAGE_RESULT_OK and (a pointer to) the language (in
+ the list). error_r is set for LANGUAGE_RESULT_ERROR. */
+enum language_result
+language_detect(struct language_list *list,
+ const unsigned char *text, size_t size,
+ const struct language **lang_r,
+ const char **error_r);
#endif
#include <stdio.h>
static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
-static struct fts_language english_language = { .name = "en" };
-static struct fts_language french_language = { .name = "fr" };
-static struct fts_language norwegian_language = { .name = "no" };
+static struct language english_language = { .name = "en" };
+static struct language french_language = { .name = "fr" };
+static struct language norwegian_language = { .name = "no" };
#if defined(HAVE_LIBICU) && defined(HAVE_FTS_STEMMER)
-static struct fts_language swedish_language = { .name = "sv" };
+static struct language swedish_language = { .name = "sv" };
#endif
-static void test_fts_filter_find(void)
+static void test_lang_filter_find(void)
{
- test_begin("fts filter find");
- test_assert(fts_filter_find("stopwords") == fts_filter_stopwords);
- test_assert(fts_filter_find("snowball") == fts_filter_stemmer_snowball);
- test_assert(fts_filter_find("normalizer-icu") == fts_filter_normalizer_icu);
- test_assert(fts_filter_find("lowercase") == fts_filter_lowercase);
- test_assert(fts_filter_find("contractions") == fts_filter_contractions);
+ test_begin("lang filter find");
+ test_assert(lang_filter_find("stopwords") == lang_filter_stopwords);
+ test_assert(lang_filter_find("snowball") == lang_filter_stemmer_snowball);
+ test_assert(lang_filter_find("normalizer-icu") == lang_filter_normalizer_icu);
+ test_assert(lang_filter_find("lowercase") == lang_filter_lowercase);
+ test_assert(lang_filter_find("contractions") == lang_filter_contractions);
test_end();
}
-static void test_fts_filter_contractions_fail(void)
+static void test_lang_filter_contractions_fail(void)
{
- struct fts_filter *filter;
+ struct lang_filter *filter;
const char *error;
- test_begin("fts filter contractions, unsupported language");
- test_assert(fts_filter_create(fts_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0);
+ test_begin("lang filter contractions, unsupported language");
+ test_assert(lang_filter_create(lang_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0);
test_assert(error != NULL);
test_end();
}
-static void test_fts_filter_contractions_fr(void)
+static void test_lang_filter_contractions_fr(void)
{
static const struct {
const char *input;
{ "quelqu'un", "quelqu'un" },
{ "l'esprit", "esprit" }
};
- struct fts_filter *filter;
+ struct lang_filter *filter;
const char *error;
const char *token;
unsigned int i;
int ret;
- test_begin("fts filter contractions, French");
- test_assert(fts_filter_create(fts_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0);
+ test_begin("lang filter contractions, French");
+ test_assert(lang_filter_create(lang_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
- ret = fts_filter_filter(filter, &token, &error);
+ ret = lang_filter(filter, &token, &error);
test_assert(ret >= 0);
if (ret > 0)
test_assert_idx(strcmp(token, tests[i].output) == 0, i);
else if (ret == 0)
test_assert_idx(token == NULL && tests[i].output == NULL, i);
}
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_end();
}
-static void test_fts_filter_lowercase(void)
+static void test_lang_filter_lowercase(void)
{
static const struct {
const char *input;
{ "FOO", "foo" },
{ "fOo", "foo" }
};
- struct fts_filter *filter;
+ struct lang_filter *filter;
const char *error;
const char *token;
unsigned int i;
- test_begin("fts filter lowercase");
- test_assert(fts_filter_create(fts_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0);
+ test_begin("lang filter lowercase");
+ test_assert(lang_filter_create(lang_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
- test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
+ test_assert_idx(lang_filter(filter, &token, &error) > 0 &&
strcmp(token, tests[i].output) == 0, 0);
}
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_end();
}
#ifdef HAVE_LIBICU
-static void test_fts_filter_lowercase_utf8(void)
+static void test_lang_filter_lowercase_utf8(void)
{
static const struct {
const char *input;
{ "F\xC3\x85\xC3\x85", "f\xC3\xA5\xC3\xA5" },
{ "F\xC3\x85\xC3\xA5", "f\xC3\xA5\xC3\xA5" }
};
- struct fts_filter *filter;
+ struct lang_filter *filter;
const char *error;
const char *token;
unsigned int i;
- test_begin("fts filter lowercase, UTF8");
- test_assert(fts_filter_create(fts_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0);
+ test_begin("lang filter lowercase, UTF8");
+ test_assert(lang_filter_create(lang_filter_lowercase, NULL, &english_language, NULL, &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
- test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
+ test_assert_idx(lang_filter(filter, &token, &error) > 0 &&
strcmp(token, tests[i].output) == 0, 0);
}
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_end();
}
-static void test_fts_filter_lowercase_too_long_utf8(void)
+static void test_lang_filter_lowercase_too_long_utf8(void)
{
static const struct {
const char *input;
{ "abc\xC3\x85""defghijklmnopqrstuvwxyz", "abc\xC3\xA5""defghijklmnopqrstuvw" },
{ "abcdefghijklmnopqrstuvwx\xC3\x85", "abcdefghijklmnopqrstuvwx" }
};
- struct fts_filter *filter;
+ struct lang_filter *filter;
const char *error;
const char *token;
const char * const settings[] = {"maxlen", "25", NULL};
unsigned int i;
- test_begin("fts filter lowercase, too long UTF8");
- test_assert(fts_filter_create(fts_filter_lowercase, NULL, &english_language, settings, &filter, &error) == 0);
+ test_begin("lang filter lowercase, too long UTF8");
+ test_assert(lang_filter_create(lang_filter_lowercase, NULL, &english_language, settings, &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
- test_assert_idx(fts_filter_filter(filter, &token, &error) > 0 &&
+ test_assert_idx(lang_filter(filter, &token, &error) > 0 &&
strcmp(token, tests[i].output) == 0, 0);
}
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_end();
}
#endif
-static void test_fts_filter_stopwords_eng(void)
+static void test_lang_filter_stopwords_eng(void)
{
- struct fts_filter *filter;
+ struct lang_filter *filter;
const char *error;
int ret;
const char *input[] = {"an", "elephant", "and", "a", "bear",
const char **ip, **op;
const char *token;
- test_begin("fts filter stopwords, English");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0);
+ test_begin("lang filter stopwords, English");
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0);
ip = input;
op = output;
while (*ip != NULL) {
token = *ip;
- ret = fts_filter_filter(filter, &token, &error);
+ ret = lang_filter(filter, &token, &error);
if (ret <= 0) {
test_assert(ret == 0);
test_assert(*op == NULL);
ip++;
}
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_assert(filter == NULL);
test_end();
}
-static void test_fts_filter_stopwords_fin(void)
+static void test_lang_filter_stopwords_fin(void)
{
- const struct fts_language finnish = { .name = "fi" };
- struct fts_filter *filter;
+ const struct language finnish = { .name = "fi" };
+ struct lang_filter *filter;
const char *error;
int ret;
const char *input[] = {"olla", "vaiko", "eik\xC3\xB6", "olla",
const char **ip, **op;
const char *token;
- test_begin("fts filter stopwords, Finnish");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0);
+ test_begin("lang filter stopwords, Finnish");
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0);
ip = input;
op = output;
while (*ip != NULL) {
token = *ip;
- ret = fts_filter_filter(filter, &token, &error);
+ ret = lang_filter(filter, &token, &error);
if (ret <= 0) {
test_assert(ret == 0);
test_assert(*op == NULL);
ip++;
}
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_assert(filter == NULL);
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &finnish, stopword_settings, &filter, &error) == 0);
ip = input2;
op = output2;
while (*ip != NULL) {
token = *ip;
- ret = fts_filter_filter(filter, &token, &error);
+ ret = lang_filter(filter, &token, &error);
if (ret <= 0) {
test_assert(ret == 0);
test_assert(*op == NULL);
ip++;
}
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_assert(filter == NULL);
test_end();
}
-static void test_fts_filter_stopwords_fra(void)
+static void test_lang_filter_stopwords_fra(void)
{
- struct fts_filter *filter;
+ struct lang_filter *filter;
const char *error;
int ret;
const char **ip, **op;
const char *token;
- test_begin("fts filter stopwords, French");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0);
+ test_begin("lang filter stopwords, French");
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0);
ip = input;
op = output;
while (*ip != NULL) {
token = *ip;
- ret = fts_filter_filter(filter, &token, &error);
+ ret = lang_filter(filter, &token, &error);
if (ret <= 0) {
test_assert(ret == 0);
test_assert(*op == NULL);
ip++;
}
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_assert(filter == NULL);
test_end();
}
-static void test_fts_filter_stopwords_no(void)
+static void test_lang_filter_stopwords_no(void)
{
- struct fts_filter *filter;
+ struct lang_filter *filter;
const char *error;
int ret;
const char **ip, **op;
const char *token;
- test_begin("fts filter stopwords, Norwegian");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0);
+ test_begin("lang filter stopwords, Norwegian");
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0);
ip = input;
op = output;
while (*ip != NULL) {
token = *ip;
- ret = fts_filter_filter(filter, &token, &error);
+ ret = lang_filter(filter, &token, &error);
if (ret <= 0) {
test_assert(ret == 0);
test_assert(*op == NULL);
ip++;
}
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_assert(filter == NULL);
test_end();
}
-static void test_fts_filter_stopwords_fail_lazy_init(void)
+static void test_lang_filter_stopwords_fail_lazy_init(void)
{
- const struct fts_language unknown = { .name = "bebobidoop" };
- struct fts_filter *filter = NULL;
+ const struct language unknown = { .name = "bebobidoop" };
+ struct lang_filter *filter = NULL;
const char *error = NULL, *token = "foobar";
- test_begin("fts filter stopwords, fail filter() (lazy init)");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &unknown, stopword_settings, &filter, &error) == 0);
+ test_begin("lang filter stopwords, fail filter() (lazy init)");
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &unknown, stopword_settings, &filter, &error) == 0);
test_assert(filter != NULL && error == NULL);
- test_assert(fts_filter_filter(filter, &token, &error) < 0 && error != NULL);
- fts_filter_unref(&filter);
+ test_assert(lang_filter(filter, &token, &error) < 0 && error != NULL);
+ lang_filter_unref(&filter);
test_end();
}
-static void test_fts_filter_stopwords_malformed(void)
+static void test_lang_filter_stopwords_malformed(void)
{
- const struct fts_language malformed = { .name = "malformed" };
- struct fts_filter *filter = NULL;
+ const struct language malformed = { .name = "malformed" };
+ struct lang_filter *filter = NULL;
const char *error = NULL, *token = "foobar";
- test_begin("fts filter stopwords, malformed list");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &malformed, stopword_settings, &filter, &error) == 0);
- test_assert(fts_filter_filter(filter, &token, &error) < 0);
+ test_begin("lang filter stopwords, malformed list");
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &malformed, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter(filter, &token, &error) < 0);
test_assert(strstr(error, "seems empty. Is the file correctly formatted?") != NULL);
test_expect_no_more_errors();
- fts_filter_unref(&filter);
+ lang_filter_unref(&filter);
test_end();
}
#ifdef HAVE_FTS_STEMMER
-static void test_fts_filter_stemmer_snowball_stem_english(void)
+static void test_lang_filter_stemmer_snowball_stem_english(void)
{
- struct fts_filter *stemmer;
+ struct lang_filter *stemmer;
const char *error;
const char *token = NULL;
const char * const tokens[] = {
const char * const *tpp;
const char * const *bpp;
- test_begin("fts filter stem English");
- test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &english_language, NULL, &stemmer, &error) == 0);
+ test_begin("lang filter stem English");
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, &english_language, NULL, &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
token = *tpp;
- test_assert(fts_filter_filter(stemmer, &token, &error) > 0);
+ test_assert(lang_filter(stemmer, &token, &error) > 0);
test_assert(token != NULL);
test_assert(null_strcmp(token, *bpp) == 0);
bpp++;
}
- fts_filter_unref(&stemmer);
+ lang_filter_unref(&stemmer);
test_assert(stemmer == NULL);
test_end();
}
-static void test_fts_filter_stemmer_snowball_stem_french(void)
+static void test_lang_filter_stemmer_snowball_stem_french(void)
{
- struct fts_filter *stemmer;
+ struct lang_filter *stemmer;
const char *error;
const char *token = NULL;
const char * const tokens[] = {
const char * const *tpp;
const char * const *bpp;
- test_begin("fts filter stem French");
- test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0);
+ test_begin("lang filter stem French");
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
token = *tpp;
- test_assert(fts_filter_filter(stemmer, &token, &error) > 0);
+ test_assert(lang_filter(stemmer, &token, &error) > 0);
test_assert(token != NULL);
test_assert(null_strcmp(token, *bpp) == 0);
bpp++;
}
- fts_filter_unref(&stemmer);
+ lang_filter_unref(&stemmer);
test_assert(stemmer == NULL);
test_end();
}
-static void test_fts_filter_stopwords_stemmer_eng(void)
+static void test_lang_filter_stopwords_stemmer_eng(void)
{
int ret;
- struct fts_filter *stemmer;
- struct fts_filter *filter;
+ struct lang_filter *stemmer;
+ struct lang_filter *filter;
const char *error;
const char *token = NULL;
const char * const tokens[] = {
const char * const *tpp;
const char * const *bpp;
- test_begin("fts filters stopwords and stemming chained, English");
+ test_begin("lang filters stopwords and stemming chained, English");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0);
- test_assert(fts_filter_create(fts_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &english_language, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
token = *tpp;
- ret = fts_filter_filter(stemmer, &token, &error);
+ ret = lang_filter(stemmer, &token, &error);
test_assert(ret >= 0);
if (ret == 0)
test_assert(*bpp == NULL);
}
bpp++;
}
- fts_filter_unref(&stemmer);
- fts_filter_unref(&filter);
+ lang_filter_unref(&stemmer);
+ lang_filter_unref(&filter);
test_assert(stemmer == NULL);
test_assert(filter == NULL);
test_end();
#endif
#ifdef HAVE_LIBICU
-static void test_fts_filter_normalizer_swedish_short(void)
+static void test_lang_filter_normalizer_swedish_short(void)
{
- struct fts_filter *norm = NULL;
+ struct lang_filter *norm = NULL;
const char *input[] = {
"Vem",
"\xC3\x85",
const char *token = NULL;
unsigned int i;
- test_begin("fts filter normalizer Swedish short text");
+ test_begin("lang filter normalizer Swedish short text");
- test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(input); i++) {
token = input[i];
- test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i);
+ test_assert_idx(lang_filter(norm, &token, &error) == 1, i);
test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
}
- fts_filter_unref(&norm);
+ lang_filter_unref(&norm);
test_assert(norm == NULL);
test_end();
}
-static void test_fts_filter_normalizer_swedish_short_default_id(void)
+static void test_lang_filter_normalizer_swedish_short_default_id(void)
{
- struct fts_filter *norm = NULL;
+ struct lang_filter *norm = NULL;
const char *input[] = {
"Vem",
"\xC3\x85",
const char *token = NULL;
unsigned int i;
- test_begin("fts filter normalizer Swedish short text using default ID");
+ test_begin("lang filter normalizer Swedish short text using default ID");
- test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, NULL, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, NULL, &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(input); i++) {
token = input[i];
- test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i);
+ test_assert_idx(lang_filter(norm, &token, &error) == 1, i);
test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
}
- fts_filter_unref(&norm);
+ lang_filter_unref(&norm);
test_assert(norm == NULL);
test_end();
}
/* UDHRDIR comes from Automake AM_CPPFLAGS */
#define UDHR_FRA_NAME "/udhr_fra.txt"
-static void test_fts_filter_normalizer_french(void)
+static void test_lang_filter_normalizer_french(void)
{
- struct fts_filter *norm = NULL;
+ struct lang_filter *norm = NULL;
FILE *input;
const char * const settings[] =
{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
0x8c, 0xd6, 0x7a, 0xb7, 0xc5, 0xc6, 0x85, 0x00};
const char *udhr_path;
- test_begin("fts filter normalizer French UDHR");
+ test_begin("lang filter normalizer French UDHR");
udhr_path = t_strconcat(UDHRDIR, UDHR_FRA_NAME, NULL);
- test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
input = fopen(udhr_path, "r");
test_assert(input != NULL);
sha512_init(&ctx);
while (NULL != fgets(buf, sizeof(buf), input)) {
tokens = buf;
- if (fts_filter_filter(norm, &tokens, &error) != 1){
+ if (lang_filter(norm, &tokens, &error) != 1){
break;
}
sha512_loop(&ctx, tokens, strlen(tokens));
sha512_result(&ctx, sha512_digest);
test_assert(memcmp(sha512_digest, correct_digest,
sizeof(sha512_digest)) == 0);
- fts_filter_unref(&norm);
+ lang_filter_unref(&norm);
test_assert(norm == NULL);
test_end();
}
-static void test_fts_filter_normalizer_empty(void)
+static void test_lang_filter_normalizer_empty(void)
{
/* test just a couple of these */
static const char *empty_tokens[] = {
};
const char * const settings[] =
{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; [\\x20] Remove", NULL};
- struct fts_filter *norm;
+ struct lang_filter *norm;
const char *error;
unsigned int i;
- test_begin("fts filter normalizer empty tokens");
- test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_begin("lang filter normalizer empty tokens");
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(empty_tokens); i++) {
const char *token = empty_tokens[i];
- test_assert_idx(fts_filter_filter(norm, &token, &error) == 0, i);
+ test_assert_idx(lang_filter(norm, &token, &error) == 0, i);
}
- fts_filter_unref(&norm);
+ lang_filter_unref(&norm);
test_end();
}
-static void test_fts_filter_normalizer_baddata(void)
+static void test_lang_filter_normalizer_baddata(void)
{
const char * const settings[] =
{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", NULL};
- struct fts_filter *norm;
+ struct lang_filter *norm;
const char *token, *error;
string_t *str;
unichar_t i;
- test_begin("fts filter normalizer bad data");
+ test_begin("lang filter normalizer bad data");
- test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
str = t_str_new(128);
for (i = 1; i < 0x1ffff; i++) {
if (!uni_is_valid_ucs4(i)) continue;
uni_ucs4_to_utf8_c(i, str);
token = str_c(str);
T_BEGIN {
- test_assert_idx(fts_filter_filter(norm, &token, &error) >= 0, i);
+ test_assert_idx(lang_filter(norm, &token, &error) >= 0, i);
} T_END;
}
str_truncate(str, 0);
uni_ucs4_to_utf8_c(UNICHAR_T_MAX, str);
token = str_c(str);
- test_assert(fts_filter_filter(norm, &token, &error) >= 0);
+ test_assert(lang_filter(norm, &token, &error) >= 0);
- fts_filter_unref(&norm);
+ lang_filter_unref(&norm);
test_end();
}
-static void test_fts_filter_normalizer_invalid_id(void)
+static void test_lang_filter_normalizer_invalid_id(void)
{
- struct fts_filter *norm = NULL;
+ struct lang_filter *norm = NULL;
const char *settings[] =
{"id", "Any-One-Out-There; DKFN; [: Nonspacing Mark :] Remove",
NULL};
const char *error = NULL, *token = "foo";
- test_begin("fts filter normalizer invalid id");
- test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_begin("lang filter normalizer invalid id");
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
test_assert(error == NULL);
- test_assert(fts_filter_filter(norm, &token, &error) < 0 && error != NULL);
- fts_filter_unref(&norm);
+ test_assert(lang_filter(norm, &token, &error) < 0 && error != NULL);
+ lang_filter_unref(&norm);
test_end();
}
-static void test_fts_filter_normalizer_oversized(void)
+static void test_lang_filter_normalizer_oversized(void)
{
- struct fts_filter *norm = NULL;
+ struct lang_filter *norm = NULL;
const char *settings[] =
{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove", "maxlen", "250",
NULL};
"\xe6\xae\xb4\xe9\x8a\x85\xc4\xb9\xe4\x90\xb2\xe9\x96\xad\xef\x90"
"\x9c\xe5\xa6\xae\xe9\x93\x91\xe8\x87\xa1";
- test_begin("fts filter normalizer over-sized token");
- test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
+ test_begin("lang filter normalizer over-sized token");
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, settings, &norm, &error) == 0);
test_assert(error == NULL);
- test_assert(fts_filter_filter(norm, &token, &error) >= 0);
+ test_assert(lang_filter(norm, &token, &error) >= 0);
test_assert(strlen(token) <= 250);
- fts_filter_unref(&norm);
+ lang_filter_unref(&norm);
test_end();
}
-static void test_fts_filter_normalizer_truncation(void)
+static void test_lang_filter_normalizer_truncation(void)
{
- struct fts_filter *norm = NULL;
+ struct lang_filter *norm = NULL;
const char *settings[] =
{"id", "Any-Lower;", "maxlen", "10",
NULL};
const char *error = NULL;
const char *token = "abcdefghi\xC3\x85";
- test_begin("fts filter normalizer token truncated mid letter");
- test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL,
+ test_begin("lang filter normalizer token truncated mid letter");
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL,
settings, &norm, &error) == 0);
test_assert(error == NULL);
- test_assert(fts_filter_filter(norm, &token, &error) >= 0);
+ test_assert(lang_filter(norm, &token, &error) >= 0);
test_assert(strcmp(token, "abcdefghi") == 0);
- fts_filter_unref(&norm);
+ lang_filter_unref(&norm);
test_end();
}
#ifdef HAVE_FTS_STEMMER
-static void test_fts_filter_normalizer_stopwords_stemmer_eng(void)
+static void test_lang_filter_normalizer_stopwords_stemmer_eng(void)
{
int ret;
- struct fts_filter *normalizer;
- struct fts_filter *stemmer;
- struct fts_filter *filter;
+ struct lang_filter *normalizer;
+ struct lang_filter *stemmer;
+ struct lang_filter *filter;
const char *error;
const char * const id_settings[] =
//{"id", "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC", NULL};
const char * const *tpp;
const char * const *bpp;
- test_begin("fts filters normalizer, stopwords and stemming chained, English");
+ test_begin("lang filters normalizer, stopwords and stemming chained, English");
- test_assert(fts_filter_create(fts_filter_normalizer_icu, NULL, NULL, id_settings, &normalizer, &error) == 0);
- test_assert(fts_filter_create(fts_filter_stopwords, normalizer, &english_language, stopword_settings, &filter, &error) == 0);
- test_assert(fts_filter_create(fts_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, NULL, id_settings, &normalizer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, normalizer, &english_language, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, &english_language, NULL, &stemmer, &error) == 0);
bpp = bases;
for (tpp = tokens; *tpp != NULL; tpp++) {
token = *tpp;
- ret = fts_filter_filter(stemmer, &token, &error);
+ ret = lang_filter(stemmer, &token, &error);
if (ret <= 0) {
test_assert(ret == 0);
test_assert(*bpp == NULL);
}
bpp++;
}
- fts_filter_unref(&stemmer);
- fts_filter_unref(&filter);
- fts_filter_unref(&normalizer);
+ lang_filter_unref(&stemmer);
+ lang_filter_unref(&filter);
+ lang_filter_unref(&normalizer);
test_assert(stemmer == NULL);
test_assert(filter == NULL);
test_assert(normalizer == NULL);
test_end();
}
-static void test_fts_filter_stopwords_normalizer_stemmer_no(void)
+static void test_lang_filter_stopwords_normalizer_stemmer_no(void)
{
int ret;
- struct fts_filter *normalizer;
- struct fts_filter *stemmer;
- struct fts_filter *filter;
+ struct lang_filter *normalizer;
+ struct lang_filter *stemmer;
+ struct lang_filter *filter;
const char *error;
const char *token = NULL;
const char * const tokens[] = {
const char * const *tpp;
const char * const *bpp;
- test_begin("fts filters with stopwords, default normalizer and stemming chained, Norwegian");
+ test_begin("lang filters with stopwords, default normalizer and stemming chained, Norwegian");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0);
- test_assert(fts_filter_create(fts_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
- test_assert(fts_filter_create(fts_filter_stemmer_snowball, normalizer, &norwegian_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &norwegian_language, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, &norwegian_language, NULL, &stemmer, &error) == 0);
bpp = bases;
for (tpp = tokens; *tpp != NULL; tpp++) {
token = *tpp;
- ret = fts_filter_filter(stemmer, &token, &error);
+ ret = lang_filter(stemmer, &token, &error);
if (ret <= 0) {
test_assert(ret == 0);
test_assert(*bpp == NULL);
}
bpp++;
}
- fts_filter_unref(&stemmer);
- fts_filter_unref(&normalizer);
- fts_filter_unref(&filter);
+ lang_filter_unref(&stemmer);
+ lang_filter_unref(&normalizer);
+ lang_filter_unref(&filter);
test_assert(stemmer == NULL);
test_assert(filter == NULL);
test_assert(normalizer == NULL);
test_end();
}
-static void test_fts_filter_stopwords_normalizer_stemmer_sv(void)
+static void test_lang_filter_stopwords_normalizer_stemmer_sv(void)
{
int ret;
- struct fts_filter *normalizer;
- struct fts_filter *stemmer;
- struct fts_filter *filter;
+ struct lang_filter *normalizer;
+ struct lang_filter *stemmer;
+ struct lang_filter *filter;
const char *error;
const char *token = NULL;
const char * const tokens[] = {
const char * const *tpp;
const char * const *bpp;
- test_begin("fts filters with stopwords, default normalizer and stemming chained, Swedish");
+ test_begin("lang filters with stopwords, default normalizer and stemming chained, Swedish");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &swedish_language, stopword_settings, &filter, &error) == 0);
- test_assert(fts_filter_create(fts_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
- test_assert(fts_filter_create(fts_filter_stemmer_snowball, normalizer, &swedish_language, NULL, &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, &swedish_language, stopword_settings, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, &swedish_language, NULL, &stemmer, &error) == 0);
bpp = bases;
for (tpp = tokens; *tpp != NULL; tpp++) {
token = *tpp;
- ret = fts_filter_filter(stemmer, &token, &error);
+ ret = lang_filter(stemmer, &token, &error);
if (ret <= 0) {
test_assert(ret == 0);
test_assert(*bpp == NULL);
}
bpp++;
}
- fts_filter_unref(&stemmer);
- fts_filter_unref(&normalizer);
- fts_filter_unref(&filter);
+ lang_filter_unref(&stemmer);
+ lang_filter_unref(&normalizer);
+ lang_filter_unref(&filter);
test_assert(stemmer == NULL);
test_assert(filter == NULL);
test_assert(normalizer == NULL);
#endif
#endif
-static void test_fts_filter_english_possessive(void)
+static void test_lang_filter_english_possessive(void)
{
- struct fts_filter *norm = NULL;
+ struct lang_filter *norm = NULL;
const char *input[] = {
"foo'",
const char *token = NULL;
unsigned int i;
- test_begin("fts filter english possessive");
+ test_begin("lang filter english possessive");
- test_assert(fts_filter_create(fts_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(input); i++) {
token = input[i];
- test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i);
+ test_assert_idx(lang_filter(norm, &token, &error) == 1, i);
test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
}
- fts_filter_unref(&norm);
+ lang_filter_unref(&norm);
test_assert(norm == NULL);
test_end();
}
int main(void)
{
static void (*const test_functions[])(void) = {
- test_fts_filter_find,
- test_fts_filter_contractions_fail,
- test_fts_filter_contractions_fr,
- test_fts_filter_lowercase,
+ test_lang_filter_find,
+ test_lang_filter_contractions_fail,
+ test_lang_filter_contractions_fr,
+ test_lang_filter_lowercase,
#ifdef HAVE_LIBICU
- test_fts_filter_lowercase_utf8,
- test_fts_filter_lowercase_too_long_utf8,
+ test_lang_filter_lowercase_utf8,
+ test_lang_filter_lowercase_too_long_utf8,
#endif
- test_fts_filter_stopwords_eng,
- test_fts_filter_stopwords_fin,
- test_fts_filter_stopwords_fra,
- test_fts_filter_stopwords_no,
- test_fts_filter_stopwords_fail_lazy_init,
- test_fts_filter_stopwords_malformed,
+ test_lang_filter_stopwords_eng,
+ test_lang_filter_stopwords_fin,
+ test_lang_filter_stopwords_fra,
+ test_lang_filter_stopwords_no,
+ test_lang_filter_stopwords_fail_lazy_init,
+ test_lang_filter_stopwords_malformed,
#ifdef HAVE_FTS_STEMMER
- test_fts_filter_stemmer_snowball_stem_english,
- test_fts_filter_stemmer_snowball_stem_french,
- test_fts_filter_stopwords_stemmer_eng,
+ test_lang_filter_stemmer_snowball_stem_english,
+ test_lang_filter_stemmer_snowball_stem_french,
+ test_lang_filter_stopwords_stemmer_eng,
#endif
#ifdef HAVE_LIBICU
- test_fts_filter_normalizer_swedish_short,
- test_fts_filter_normalizer_swedish_short_default_id,
- test_fts_filter_normalizer_french,
- test_fts_filter_normalizer_empty,
- test_fts_filter_normalizer_baddata,
- test_fts_filter_normalizer_invalid_id,
- test_fts_filter_normalizer_oversized,
- test_fts_filter_normalizer_truncation,
+ test_lang_filter_normalizer_swedish_short,
+ test_lang_filter_normalizer_swedish_short_default_id,
+ test_lang_filter_normalizer_french,
+ test_lang_filter_normalizer_empty,
+ test_lang_filter_normalizer_baddata,
+ test_lang_filter_normalizer_invalid_id,
+ test_lang_filter_normalizer_oversized,
+ test_lang_filter_normalizer_truncation,
#ifdef HAVE_FTS_STEMMER
- test_fts_filter_normalizer_stopwords_stemmer_eng,
- test_fts_filter_stopwords_normalizer_stemmer_no,
- test_fts_filter_stopwords_normalizer_stemmer_sv,
+ test_lang_filter_normalizer_stopwords_stemmer_eng,
+ test_lang_filter_stopwords_normalizer_stemmer_no,
+ test_lang_filter_stopwords_normalizer_stemmer_sv,
#endif
#endif
- test_fts_filter_english_possessive,
+ test_lang_filter_english_possessive,
NULL
};
int ret;
- fts_filters_init();
+ lang_filters_init();
ret = test_run(test_functions);
- fts_filters_deinit();
+ lang_filters_deinit();
return ret;
}
#include <unicode/uclean.h>
-static void test_fts_icu_utf8_to_utf16_ascii_resize(void)
+static void test_lang_icu_utf8_to_utf16_ascii_resize(void)
{
ARRAY_TYPE(icu_utf16) dest;
- test_begin("fts_icu_utf8_to_utf16 ascii resize");
+ test_begin("lang_icu_utf8_to_utf16 ascii resize");
t_array_init(&dest, 2);
test_assert(buffer_get_writable_size(dest.arr.buffer) == 4);
- fts_icu_utf8_to_utf16(&dest, "12");
+ lang_icu_utf8_to_utf16(&dest, "12");
test_assert(array_count(&dest) == 2);
test_assert(buffer_get_writable_size(dest.arr.buffer) == 4);
- fts_icu_utf8_to_utf16(&dest, "123");
+ lang_icu_utf8_to_utf16(&dest, "123");
test_assert(array_count(&dest) == 3);
test_assert(buffer_get_writable_size(dest.arr.buffer) == 7);
- fts_icu_utf8_to_utf16(&dest, "12345");
+ lang_icu_utf8_to_utf16(&dest, "12345");
test_assert(array_count(&dest) == 5);
test_end();
}
-static void test_fts_icu_utf8_to_utf16_32bit_resize(void)
+static void test_lang_icu_utf8_to_utf16_32bit_resize(void)
{
ARRAY_TYPE(icu_utf16) dest;
unsigned int i;
- test_begin("fts_icu_utf8_to_utf16 32bit resize");
+ test_begin("lang_icu_utf8_to_utf16 32bit resize");
for (i = 1; i <= 2; i++) {
t_array_init(&dest, i);
test_assert(buffer_get_writable_size(dest.arr.buffer) == i*2);
- fts_icu_utf8_to_utf16(&dest, "\xF0\x90\x90\x80"); /* 0x10400 */
+ lang_icu_utf8_to_utf16(&dest, "\xF0\x90\x90\x80"); /* 0x10400 */
test_assert(array_count(&dest) == 2);
}
test_end();
}
-static void test_fts_icu_utf16_to_utf8(void)
+static void test_lang_icu_utf16_to_utf8(void)
{
string_t *dest = t_str_new(64);
const UChar src[] = { 0xbd, 'b', 'c' };
unsigned int i;
- test_begin("fts_icu_utf16_to_utf8");
+ test_begin("lang_icu_utf16_to_utf8");
for (i = N_ELEMENTS(src); i > 0; i--) {
- fts_icu_utf16_to_utf8(dest, src, i);
+ lang_icu_utf16_to_utf8(dest, src, i);
test_assert(dest->used == i+1);
}
test_end();
}
-static void test_fts_icu_utf16_to_utf8_resize(void)
+static void test_lang_icu_utf16_to_utf8_resize(void)
{
string_t *dest;
const UChar src = UNICODE_REPLACEMENT_CHAR;
unsigned int i;
- test_begin("fts_icu_utf16_to_utf8 resize");
+ test_begin("lang_icu_utf16_to_utf8 resize");
for (i = 2; i <= 6; i++) {
dest = t_str_new(i);
test_assert(buffer_get_writable_size(dest) == i);
- fts_icu_utf16_to_utf8(dest, &src, 1);
+ lang_icu_utf16_to_utf8(dest, &src, 1);
test_assert(dest->used == 3);
test_assert(strcmp(str_c(dest), UNICODE_REPLACEMENT_CHAR_UTF8) == 0);
}
UParseError perr;
t_array_init(&id_utf16, 8);
- fts_icu_utf8_to_utf16(&id_utf16, id);
+ lang_icu_utf8_to_utf16(&id_utf16, id);
translit = utrans_openU(array_front(&id_utf16),
array_count(&id_utf16),
UTRANS_FORWARD, NULL, 0, &perr, &err);
return translit;
}
-static void test_fts_icu_translate(void)
+static void test_lang_icu_translate(void)
{
const char *translit_id = "Any-Lower";
UTransliterator *translit;
const char *error;
unsigned int i;
- test_begin("fts_icu_translate");
+ test_begin("lang_icu_translate");
t_array_init(&dest, 32);
translit = get_translit(translit_id);
for (i = N_ELEMENTS(src); i > 0; i--) {
array_clear(&dest);
- test_assert(fts_icu_translate(&dest, src, i,
+ test_assert(lang_icu_translate(&dest, src, i,
translit, &error) == 0);
test_assert(array_count(&dest) == i);
}
test_end();
}
-static void test_fts_icu_translate_resize(void)
+static void test_lang_icu_translate_resize(void)
{
const char *translit_id = "Any-Hex";
const char *src_utf8 = "FOO";
const char *error;
unsigned int i;
- test_begin("fts_icu_translate_resize resize");
+ test_begin("lang_icu_translate_resize resize");
t_array_init(&src_utf16, 8);
translit = get_translit(translit_id);
for (i = 1; i <= 10; i++) {
array_clear(&src_utf16);
- fts_icu_utf8_to_utf16(&src_utf16, src_utf8);
+ lang_icu_utf8_to_utf16(&src_utf16, src_utf8);
t_array_init(&dest, i);
test_assert(buffer_get_writable_size(dest.arr.buffer) == i*2);
- test_assert(fts_icu_translate(&dest, array_front(&src_utf16),
+ test_assert(lang_icu_translate(&dest, array_front(&src_utf16),
array_count(&src_utf16),
translit, &error) == 0);
}
test_end();
}
-static void test_fts_icu_lcase(void)
+static void test_lang_icu_lcase(void)
{
const char *src = "aBcD\xC3\x84\xC3\xA4";
string_t *dest = t_str_new(64);
- test_begin("fts_icu_lcase");
- fts_icu_lcase(dest, src);
+ test_begin("lang_icu_lcase");
+ lang_icu_lcase(dest, src);
test_assert(strcmp(str_c(dest), "abcd\xC3\xA4\xC3\xA4") == 0);
test_end();
}
-static void test_fts_icu_lcase_resize(void)
+static void test_lang_icu_lcase_resize(void)
{
const char *src = "a\xC3\x84";
string_t *dest;
unsigned int i;
- test_begin("fts_icu_lcase resize");
+ test_begin("lang_icu_lcase resize");
for (i = 1; i <= 3; i++) {
dest = t_str_new(i);
test_assert(buffer_get_writable_size(dest) == i);
- fts_icu_lcase(dest, src);
+ lang_icu_lcase(dest, src);
test_assert(strcmp(str_c(dest), "a\xC3\xA4") == 0);
test_assert(buffer_get_writable_size(dest) == 3);
}
test_end();
}
-static void test_fts_icu_lcase_resize_invalid_utf8(void)
+static void test_lang_icu_lcase_resize_invalid_utf8(void)
{
string_t *dest;
- test_begin("fts_icu_lcase resize invalid utf8");
+ test_begin("lang_icu_lcase resize invalid utf8");
dest = t_str_new(1);
- fts_icu_lcase(dest, ".\x80.");
+ lang_icu_lcase(dest, ".\x80.");
test_end();
}
/* Test entry point: runs the NULL-terminated list of ICU test functions
   through test_run(), calls the ICU deinit function afterwards, and returns
   test_run()'s result. */
int main(void)
{
static void (*const test_functions[])(void) = {
- test_fts_icu_utf8_to_utf16_ascii_resize,
- test_fts_icu_utf8_to_utf16_32bit_resize,
- test_fts_icu_utf16_to_utf8,
- test_fts_icu_utf16_to_utf8_resize,
- test_fts_icu_translate,
- test_fts_icu_translate_resize,
- test_fts_icu_lcase,
- test_fts_icu_lcase_resize,
- test_fts_icu_lcase_resize_invalid_utf8,
+ test_lang_icu_utf8_to_utf16_ascii_resize,
+ test_lang_icu_utf8_to_utf16_32bit_resize,
+ test_lang_icu_utf16_to_utf8,
+ test_lang_icu_utf16_to_utf8_resize,
+ test_lang_icu_translate,
+ test_lang_icu_translate_resize,
+ test_lang_icu_lcase,
+ test_lang_icu_lcase_resize,
+ test_lang_icu_lcase_resize_invalid_utf8,
NULL
};
/* the result is kept so that deinit can run before returning it */
int ret = test_run(test_functions);
- fts_icu_deinit();
+ lang_icu_deinit();
return ret;
}
"l" SQ "homme l" SQ "humanit\xC3\xA9 d" SQ "immixtions qu" SQ "il aujourd'hui que'euq"
};
/* Checks that looking up the names "email-address" and "generic" returns the
   email-address and generic tokenizer objects respectively. */
-static void test_fts_tokenizer_find(void)
+static void test_lang_tokenizer_find(void)
{
- test_begin("fts tokenizer find");
- test_assert(fts_tokenizer_find("email-address") == fts_tokenizer_email_address);
- test_assert(fts_tokenizer_find("generic") == fts_tokenizer_generic);
+ test_begin("lang tokenizer find");
+ test_assert(lang_tokenizer_find("email-address") == lang_tokenizer_email_address);
+ test_assert(lang_tokenizer_find("generic") == lang_tokenizer_generic);
test_end();
}
static unsigned int
-test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
+test_tokenizer_inputoutput(struct lang_tokenizer *tok, const char *_input,
const char *const *expected_output,
unsigned int first_outi)
{
/* test all input at once */
outi = first_outi;
- while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
+ while (lang_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
test_assert_strcmp(token, expected_output[outi]);
outi++;
}
- while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
+ while (lang_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
test_assert_strcmp(token, expected_output[outi]);
outi++;
}
outi = first_outi;
for (i = 0; i < input_len; i += char_len) {
char_len = uni_utf8_char_bytes(input[i]);
- while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
+ while (lang_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
test_assert_strcmp(token, expected_output[outi]);
outi++;
}
}
- while (fts_tokenizer_final(tok, &token, &error) > 0) {
+ while (lang_tokenizer_final(tok, &token, &error) > 0) {
test_assert_strcmp(token, expected_output[outi]);
outi++;
}
max = i_rand_minmax(1, input_len - i);
for (char_len = 0; char_len < max; )
char_len += uni_utf8_char_bytes(input[i+char_len]);
- while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
+ while (lang_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
test_assert_strcmp(token, expected_output[outi]);
outi++;
}
}
- while (fts_tokenizer_final(tok, &token, &error) > 0) {
+ while (lang_tokenizer_final(tok, &token, &error) > 0) {
test_assert_strcmp(token, expected_output[outi]);
outi++;
}
}
static void
-test_tokenizer_inputs(struct fts_tokenizer *tok,
+test_tokenizer_inputs(struct lang_tokenizer *tok,
const char *const *inputs, unsigned int count,
const char *const *expected_output)
{
test_assert_idx(expected_output[outi] == NULL, outi);
}
/* Runs the shared test_inputs through a generic tokenizer created with NULL
   settings and compares the produced tokens against expected_output. It also
   pins that a tokenizer created this way uses BOUNDARY_ALGORITHM_SIMPLE. */
-static void test_fts_tokenizer_generic_only(void)
+static void test_lang_tokenizer_generic_only(void)
{
static const char *const expected_output[] = {
"hello", "world", "And",
NULL
};
- struct fts_tokenizer *tok;
+ struct lang_tokenizer *tok;
const char *error;
/* the cast below reaches into the generic tokenizer's own struct to read
   which boundary algorithm was selected */
- test_begin("fts tokenizer generic simple");
- test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
- test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
+ test_begin("lang tokenizer generic simple");
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
+ test_assert(((struct generic_lang_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
- fts_tokenizer_unref(&tok);
+ lang_tokenizer_unref(&tok);
test_end();
}
/* TODO: U+206F is in "Format" and therefore currently not word break.
This definitely needs to be remapped. */
/* Runs the shared test_inputs through a generic tokenizer created with
   tr29_settings and compares the produced tokens against expected_output.
   The visible expectations keep apostrophe-joined words such as "l'homme"
   as single tokens. */
-static void test_fts_tokenizer_generic_tr29_only(void)
+static void test_lang_tokenizer_generic_tr29_only(void)
{
static const char *const expected_output[] = {
"hello", "world", "And",
"l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
NULL
};
- struct fts_tokenizer *tok;
+ struct lang_tokenizer *tok;
const char *error;
- test_begin("fts tokenizer generic TR29");
- test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
+ test_begin("lang tokenizer generic TR29");
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
- fts_tokenizer_unref(&tok);
+ lang_tokenizer_unref(&tok);
test_end();
}
/* TODO: U+206F is in "Format" and therefore currently not word break.
This definitely needs to be remapped. */
/* Same driver as the plain TR29 test, but the generic tokenizer is created
   with tr29_settings_wb5a; the produced tokens are compared against this
   function's own expected_output. */
-static void test_fts_tokenizer_generic_tr29_wb5a(void)
+static void test_lang_tokenizer_generic_tr29_wb5a(void)
{
static const char *const expected_output[] = {
"hello", "world", "And",
NULL
};
- struct fts_tokenizer *tok;
+ struct lang_tokenizer *tok;
const char *error;
- test_begin("fts tokenizer generic TR29 with WB5a");
- test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
+ test_begin("lang tokenizer generic TR29 with WB5a");
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0);
test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
- fts_tokenizer_unref(&tok);
+ lang_tokenizer_unref(&tok);
test_end();
}
/* Tokenizes TEST_INPUT_ADDRESS with the email-address tokenizer created with
   NULL for both its second argument and its settings (the second argument is
   where the "+ parent" tests pass a generic tokenizer). The visible
   expectation is the complete address as a single token. */
-static void test_fts_tokenizer_address_only(void)
+static void test_lang_tokenizer_address_only(void)
{
static const char input[] = TEST_INPUT_ADDRESS;
static const char *const expected_output[] = {
"hypen@hypen-hypen-sick.com",
NULL
};
- struct fts_tokenizer *tok;
+ struct lang_tokenizer *tok;
const char *error;
- test_begin("fts tokenizer email address only");
- test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
+ test_begin("lang tokenizer email address only");
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, NULL, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
- fts_tokenizer_unref(&tok);
+ lang_tokenizer_unref(&tok);
test_end();
}
/* Shared body for the "email address + parent" tests.
   Creates a generic tokenizer from the given settings, creates an
   email-address tokenizer that takes it as its second creation argument, and
   checks the tokens produced for TEST_INPUT_ADDRESS. The visible expectation
   is the address split into words, followed by the complete address.
   name is only used to build the test title. */
-static void test_fts_tokenizer_address_parent(const char *name, const char * const *settings)
+static void test_lang_tokenizer_address_parent(const char *name, const char * const *settings)
{
static const char input[] = TEST_INPUT_ADDRESS;
static const char *const expected_output[] = {
"hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com",
NULL
};
- struct fts_tokenizer *tok, *gen_tok;
+ struct lang_tokenizer *tok, *gen_tok;
const char *error;
- test_begin(t_strdup_printf("fts tokenizer email address + parent %s", name));
- test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
- test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
+ test_begin(t_strdup_printf("lang tokenizer email address + parent %s", name));
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
/* both tokenizers were created here, so both references are dropped here */
- fts_tokenizer_unref(&tok);
- fts_tokenizer_unref(&gen_tok);
+ lang_tokenizer_unref(&tok);
+ lang_tokenizer_unref(&gen_tok);
test_end();
}
/* NULL-terminated key/value list selecting "simple" for the "algorithm"
   setting. */
const char *const simple_settings[] = {"algorithm", "simple", NULL};
/* Runs the shared "email address + parent" test body with simple_settings
   for the generic tokenizer. */
-static void test_fts_tokenizer_address_parent_simple(void)
+static void test_lang_tokenizer_address_parent_simple(void)
{
- test_fts_tokenizer_address_parent("simple", simple_settings);
+ test_lang_tokenizer_address_parent("simple", simple_settings);
}
/* Runs the shared "email address + parent" test body with tr29_settings for
   the generic tokenizer. */
-static void test_fts_tokenizer_address_parent_tr29(void)
+static void test_lang_tokenizer_address_parent_tr29(void)
{
- test_fts_tokenizer_address_parent("tr29", tr29_settings);
+ test_lang_tokenizer_address_parent("tr29", tr29_settings);
}
/* Exercises an email-address tokenizer created with the "search" setting on
   top of a generic tokenizer. After the shared TEST_INPUT_ADDRESS check it
   feeds several short inputs, each followed by the final call, to verify
   that nothing from one input leaks into the next. It ends with an explicit
   reset in the middle of an input. */
-static void test_fts_tokenizer_address_search(void)
+static void test_lang_tokenizer_address_search(void)
{
static const char input[] = TEST_INPUT_ADDRESS;
static const char *const expected_output[] = {
NULL
};
static const char *const settings[] = { "search", "", NULL };
- struct fts_tokenizer *tok, *gen_tok;
+ struct lang_tokenizer *tok, *gen_tok;
const char *token, *error;
- test_begin("fts tokenizer search email address + parent");
- test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
- test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
+ test_begin("lang tokenizer search email address + parent");
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
/* make sure state is forgotten at EOF */
/* pattern used below: the next call is expected to return 0 (no token yet),
   the first final call returns the token, the second final call returns 0 */
- test_assert(fts_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
- test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
+ test_assert(lang_tokenizer_next(tok, (const void *)"foo", 3, &token, &error) == 0);
+ test_assert(lang_tokenizer_final(tok, &token, &error) > 0 &&
strcmp(token, "foo") == 0);
- test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
+ test_assert(lang_tokenizer_final(tok, &token, &error) == 0);
/* a complete address comes back as one token */
- test_assert(fts_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
- test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
+ test_assert(lang_tokenizer_next(tok, (const void *)"bar@baz", 7, &token, &error) == 0);
+ test_assert(lang_tokenizer_final(tok, &token, &error) > 0 &&
strcmp(token, "bar@baz") == 0);
- test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
+ test_assert(lang_tokenizer_final(tok, &token, &error) == 0);
/* a trailing '@' with nothing after it yields only the part before it */
- test_assert(fts_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
- test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
+ test_assert(lang_tokenizer_next(tok, (const void *)"foo@", 4, &token, &error) == 0);
+ test_assert(lang_tokenizer_final(tok, &token, &error) > 0 &&
strcmp(token, "foo") == 0);
- test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
+ test_assert(lang_tokenizer_final(tok, &token, &error) == 0);
/* test reset explicitly */
/* the "a" fed before the reset must not appear in the token that follows */
- test_assert(fts_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
- fts_tokenizer_reset(tok);
- test_assert(fts_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
- test_assert(fts_tokenizer_final(tok, &token, &error) > 0 &&
+ test_assert(lang_tokenizer_next(tok, (const void *)"a", 1, &token, &error) == 0);
+ lang_tokenizer_reset(tok);
+ test_assert(lang_tokenizer_next(tok, (const void *)"b@c", 3, &token, &error) == 0);
+ test_assert(lang_tokenizer_final(tok, &token, &error) > 0 &&
strcmp(token, "b@c") == 0);
- test_assert(fts_tokenizer_final(tok, &token, &error) == 0);
+ test_assert(lang_tokenizer_final(tok, &token, &error) == 0);
- fts_tokenizer_unref(&tok);
- fts_tokenizer_unref(&gen_tok);
+ lang_tokenizer_unref(&tok);
+ lang_tokenizer_unref(&gen_tok);
test_end();
}
-static void test_fts_tokenizer_delete_trailing_partial_char(void)
+static void test_lang_tokenizer_delete_trailing_partial_char(void)
{
static const struct {
const char *str;
unsigned int i;
size_t size;
- test_begin("fts tokenizer delete trailing partial char");
+ test_begin("lang tokenizer delete trailing partial char");
for (i = 0; i < N_ELEMENTS(tests); i++) {
size = strlen(tests[i].str);
- fts_tokenizer_delete_trailing_partial_char((const unsigned char *)tests[i].str, &size);
+ lang_tokenizer_delete_trailing_partial_char((const unsigned char *)tests[i].str, &size);
test_assert(size == tests[i].truncated_len);
}
test_end();
}
/* Drives an email-address tokenizer created with maxlen=5 over the input
   "..." + the 3-byte UTF-8 sequence EF BF BD (U+FFFD) + "@a".
   All tokens are discarded; apart from successful creation nothing is
   asserted. NOTE(review): this looks like a crash guard for the length limit
   falling inside a multibyte character - confirm. */
-static void test_fts_tokenizer_address_maxlen(void)
+static void test_lang_tokenizer_address_maxlen(void)
{
const char *const settings[] = {"maxlen", "5", NULL};
const char *input = "...\357\277\275@a";
- struct fts_tokenizer *tok;
+ struct lang_tokenizer *tok;
const char *token, *error;
- test_begin("fts tokenizer address maxlen");
- test_assert(fts_tokenizer_create(fts_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
+ test_begin("lang tokenizer address maxlen");
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, settings, &tok, &error) == 0);
/* both loops have empty bodies: they only drain the tokenizer */
- while (fts_tokenizer_next(tok, (const unsigned char *)input,
+ while (lang_tokenizer_next(tok, (const unsigned char *)input,
strlen(input), &token, &error) > 0) ;
- while (fts_tokenizer_final(tok, &token, &error) > 0) ;
- fts_tokenizer_unref(&tok);
+ while (lang_tokenizer_final(tok, &token, &error) > 0) ;
+ lang_tokenizer_unref(&tok);
test_end();
}
-static void test_fts_tokenizer_random(void)
+static void test_lang_tokenizer_random(void)
{
const unsigned char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' };
const char *const settings[] = {"algorithm", "simple", NULL};
unsigned int i;
unsigned char addr[10] = { 0 };
string_t *str = t_str_new(20);
- struct fts_tokenizer *tok, *gen_tok;
+ struct lang_tokenizer *tok, *gen_tok;
const char *token, *error;
- test_begin("fts tokenizer random");
- test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
- test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);
+ test_begin("lang tokenizer random");
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0);
for (i = 0; i < 10000; i++) T_BEGIN {
for (unsigned int j = 0; j < sizeof(addr); j++)
str_truncate(str, 0);
if (uni_utf8_get_valid_data(addr, sizeof(addr), str))
str_append_data(str, addr, sizeof(addr));
- while (fts_tokenizer_next(tok, str_data(str), str_len(str),
+ while (lang_tokenizer_next(tok, str_data(str), str_len(str),
&token, &error) > 0) ;
- while (fts_tokenizer_final(tok, &token, &error) > 0) ;
+ while (lang_tokenizer_final(tok, &token, &error) > 0) ;
} T_END;
- fts_tokenizer_unref(&tok);
- fts_tokenizer_unref(&gen_tok);
+ lang_tokenizer_unref(&tok);
+ lang_tokenizer_unref(&gen_tok);
test_end();
}
static void
-test_fts_tokenizer_explicit_prefix(void)
+test_lang_tokenizer_explicit_prefix(void)
{
const char *input = "* ** "
"*pre *both* post* "
algos[algo],
searches[search],
prefixes[explicitprefix]));
- struct fts_tokenizer *tok;
+ struct lang_tokenizer *tok;
const char *error;
- test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, settings,
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings,
&tok, &error) == 0);
test_tokenizer_inputs(
tok, &input, 1,
(search!=0) && (explicitprefix!=0)
? expected_star : expected_nostar);
- fts_tokenizer_unref(&tok);
+ lang_tokenizer_unref(&tok);
test_end();
}
}
}
}
-static void test_fts_tokenizer_skip_base64(void)
+static void test_lang_tokenizer_skip_base64(void)
{
/* The skip_base64 works on the data already available in the buffer
of the tokenizer, it does not pull more data to see if a base64
use test_tokenizer_inputoutput that also tests with one-byte-at-once
or random chunking, as those are known to fail with the current
implementation */
- struct fts_tokenizer *tok;
+ struct lang_tokenizer *tok;
const char *error;
const char *token;
NULL
};
- test_begin("fts tokenizer skip base64");
- test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
+ test_begin("lang tokenizer skip base64");
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
size_t index = 0;
- while (fts_tokenizer_next(tok, (const unsigned char *) input, strlen(input), &token, &error) > 0) {
+ while (lang_tokenizer_next(tok, (const unsigned char *) input, strlen(input), &token, &error) > 0) {
i_assert(index < N_ELEMENTS(expected_output));
test_assert_strcmp(token, expected_output[index]);
++index;
}
- while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
+ while (lang_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
i_assert(index < N_ELEMENTS(expected_output));
test_assert_strcmp(token, expected_output[index]);
++index;
i_assert(index < N_ELEMENTS(expected_output));
test_assert_idx(expected_output[index] == NULL, index);
- fts_tokenizer_unref(&tok);
+ lang_tokenizer_unref(&tok);
test_end();
}
/* Test entry point: calls the tokenizers' init function, runs the
   NULL-terminated list of tokenizer tests through test_run(), calls the
   matching deinit function, and returns test_run()'s result. */
int main(void)
{
static void (*const test_functions[])(void) = {
- test_fts_tokenizer_skip_base64,
- test_fts_tokenizer_find,
- test_fts_tokenizer_generic_only,
- test_fts_tokenizer_generic_tr29_only,
- test_fts_tokenizer_generic_tr29_wb5a,
- test_fts_tokenizer_address_only,
- test_fts_tokenizer_address_parent_simple,
- test_fts_tokenizer_address_parent_tr29,
- test_fts_tokenizer_address_maxlen,
- test_fts_tokenizer_address_search,
- test_fts_tokenizer_delete_trailing_partial_char,
- test_fts_tokenizer_random,
- test_fts_tokenizer_explicit_prefix,
+ test_lang_tokenizer_skip_base64,
+ test_lang_tokenizer_find,
+ test_lang_tokenizer_generic_only,
+ test_lang_tokenizer_generic_tr29_only,
+ test_lang_tokenizer_generic_tr29_wb5a,
+ test_lang_tokenizer_address_only,
+ test_lang_tokenizer_address_parent_simple,
+ test_lang_tokenizer_address_parent_tr29,
+ test_lang_tokenizer_address_maxlen,
+ test_lang_tokenizer_address_search,
+ test_lang_tokenizer_delete_trailing_partial_char,
+ test_lang_tokenizer_random,
+ test_lang_tokenizer_explicit_prefix,
NULL
};
int ret;
- fts_tokenizers_init();
+ lang_tokenizers_init();
ret = test_run(test_functions);
- fts_tokenizers_deinit();
+ lang_tokenizers_deinit();
return ret;
}
"fts_language_data", TEXTCAT_DATADIR"/", NULL};
/* Detect Finnish. fi--utf8 */
/* Initializes a language list from settings, adds the names "de, fi, en",
   and runs detection on a Finnish UTF-8 sample passed without its
   terminating NUL. Expects an OK result and the detected language name
   "fi". */
-static void test_fts_language_detect_finnish(void)
+static void test_language_detect_finnish(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char finnish[] =
"Yhdistyneiden kansakuntien kolmas yleiskokous hyv\xC3\xA4ksyi "\
"ja julkisti ihmisoikeuksien yleismaailmallisen julistuksen "\
"\xC3\xA4\xC3\xA4nest\xC3\xA4m\xC3\xA4st\xC3\xA4.";
const char names[] = "de, fi, en";
const char *unknown, *error;
- test_begin("fts language detect Finnish");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, finnish, sizeof(finnish)-1, &lang_r, &error)
- == FTS_LANGUAGE_RESULT_OK);
+ test_begin("language detect Finnish");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, finnish, sizeof(finnish)-1, &lang_r, &error)
+ == LANGUAGE_RESULT_OK);
test_assert(strcmp(lang_r->name, "fi") == 0);
- fts_language_list_deinit(&lp);
+ language_list_deinit(&lp);
test_end();
}
/* Detect English */
-static void test_fts_language_detect_english(void)
+static void test_language_detect_english(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char english[] = "Whereas recognition of the inherent dignity and"\
" of the equal and inalienable rights of all members of the human"\
"family is the foundation of freedom, justice and peace in the "\
const char names[] = "fi, de, fr, en";
const char *unknown, *error;
- test_begin("fts language detect English");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, english, sizeof(english)-1, &lang_r, &error)
- == FTS_LANGUAGE_RESULT_OK);
+ test_begin("language detect English");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, english, sizeof(english)-1, &lang_r, &error)
+ == LANGUAGE_RESULT_OK);
test_assert(strcmp(lang_r->name, "en") == 0);
- fts_language_list_deinit(&lp);
+ language_list_deinit(&lp);
test_end();
}
/* Detect French */
-static void test_fts_language_detect_french(void)
+static void test_language_detect_french(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char french[] =
"D\xC3\xA9""claration universelle des droits de l\xE2\x80\x99"
"homme Pr\xC3\xA9""ambule Consid\xC3\xA9rant que la "\
const char names[] = "de, fi, fr, en";
const char *unknown, *error;
- test_begin("fts language detect French");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, french, sizeof(french)-1, &lang_r, &error)
- == FTS_LANGUAGE_RESULT_OK);
+ test_begin("language detect French");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, french, sizeof(french)-1, &lang_r, &error)
+ == LANGUAGE_RESULT_OK);
test_assert(strcmp(lang_r->name, "fr") == 0);
- fts_language_list_deinit(&lp);
+ language_list_deinit(&lp);
test_end();
}
/* Detect German */
-static void test_fts_language_detect_german(void)
+static void test_language_detect_german(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char german[] =
"Artikel 1"\
"Alle Menschen sind frei und gleich an W\xC3\xBCrde und Rechten "\
const char names[] = "fi, de, fr, en";
const char *unknown, *error;
- test_begin("fts language detect German");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, german, sizeof(german)-1, &lang_r, &error)
- == FTS_LANGUAGE_RESULT_OK);
+ test_begin("language detect German");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, german, sizeof(german)-1, &lang_r, &error)
+ == LANGUAGE_RESULT_OK);
test_assert(strcmp(lang_r->name, "de") == 0);
- fts_language_list_deinit(&lp);
+ language_list_deinit(&lp);
test_end();
}
/* Detect Swedish */
-static void test_fts_language_detect_swedish(void)
+static void test_language_detect_swedish(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char swedish[] =
"Artikel 1."\
"Alla m\xC3\xA4nniskor \xC3\xA4ro f\xC3\xB6""dda fria och lika"\
const char names[] = "fi, de, sv, fr, en";
const char *unknown, *error;
- test_begin("fts language detect Swedish");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, swedish, sizeof(swedish)-1, &lang_r, &error)
- == FTS_LANGUAGE_RESULT_OK);
+ test_begin("language detect Swedish");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, swedish, sizeof(swedish)-1, &lang_r, &error)
+ == LANGUAGE_RESULT_OK);
test_assert(strcmp(lang_r->name, "sv") == 0);
- fts_language_list_deinit(&lp);
+ language_list_deinit(&lp);
test_end();
}
/* Detect Bokmal */
-static void test_fts_language_detect_bokmal(void)
+static void test_language_detect_bokmal(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char bokmal[] =
"Artikkel 1.\n"\
"Alle mennesker er f\xC3\xB8""dt frie og med samme menneskeverd"\
const char names[] = "fi, de, sv, no, fr, en";
const char *unknown, *error;
- test_begin("fts language detect Bokmal as Norwegian");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, bokmal, sizeof(bokmal)-1, &lang_r, &error)
- == FTS_LANGUAGE_RESULT_OK);
+ test_begin("language detect Bokmal as Norwegian");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, bokmal, sizeof(bokmal)-1, &lang_r, &error)
+ == LANGUAGE_RESULT_OK);
test_assert(strcmp(lang_r->name, "no") == 0);
- fts_language_list_deinit(&lp);
+ language_list_deinit(&lp);
test_end();
}
/* Detect Nynorsk */
-static void test_fts_language_detect_nynorsk(void)
+static void test_language_detect_nynorsk(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char nynorsk[] =
"Artikkel 1.\n"\
"Alle menneske er f\xC3\xB8""dde til fridom og med same "\
const char names[] = "fi, de, sv, no, fr, en";
const char *unknown, *error;
- test_begin("fts language detect Nynorsk as Norwegian");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, nynorsk, sizeof(nynorsk)-1, &lang_r, &error)
- == FTS_LANGUAGE_RESULT_OK);
+ test_begin("language detect Nynorsk as Norwegian");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, nynorsk, sizeof(nynorsk)-1, &lang_r, &error)
+ == LANGUAGE_RESULT_OK);
test_assert(strcmp(lang_r->name, "no") == 0);
- fts_language_list_deinit(&lp);
+ language_list_deinit(&lp);
test_end();
}
/* Detect Finnish as English */
/* With "en" as the only name added to the language list, detection on a
   Finnish UTF-8 sample is still expected to return an OK result, with the
   language name "en". */
-static void test_fts_language_detect_finnish_as_english(void)
+static void test_language_detect_finnish_as_english(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char finnish[] =
"Yhdistyneiden kansakuntien kolmas yleiskokous hyv\xC3\xA4ksyi "\
"ja julkisti ihmisoikeuksien yleismaailmallisen julistuksen "\
"\xC3\xA4\xC3\xA4nest\xC3\xA4m\xC3\xA4st\xC3\xA4.";
const char names[] = "en";
const char *unknown, *error;
- test_begin("fts language detect Finnish as English");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, finnish, sizeof(finnish)-1, &lang_r, &error)
- == FTS_LANGUAGE_RESULT_OK);
+ test_begin("language detect Finnish as English");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, finnish, sizeof(finnish)-1, &lang_r, &error)
+ == LANGUAGE_RESULT_OK);
test_assert(strcmp(lang_r->name, "en") == 0);
- fts_language_list_deinit(&lp);
+ language_list_deinit(&lp);
test_end();
}
/* Successfully avoid detecting English, when en is not in language list. */
-static void test_fts_language_detect_na(void)
+static void test_language_detect_na(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char english[] = "Whereas recognition of the inherent dignity and"\
" of the equal and inalienable rights of all members of the human"\
"family is the foundation of freedom, justice and peace in the "\
const char names[] = "fi, de, fr";
const char *unknown, *error;
- test_begin("fts language detect not available");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, english, sizeof(english)-1, &lang_r, &error)
- == FTS_LANGUAGE_RESULT_UNKNOWN);
- fts_language_list_deinit(&lp);
+ test_begin("language detect not available");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, english, sizeof(english)-1, &lang_r, &error)
+ == LANGUAGE_RESULT_UNKNOWN);
+ language_list_deinit(&lp);
test_end();
}
/* Successfully detect, that Klingon is unknown. */
/* With "fi, de, fr" added to the language list, detection on a Klingon
   sample is expected to return the UNKNOWN result.
   NOTE(review): the length passed is sizeof(klingon), which counts the
   string's terminating NUL, whereas the other detection tests in this file
   pass sizeof(...)-1 - confirm whether including the NUL is intentional. */
-static void test_fts_language_detect_unknown(void)
+static void test_language_detect_unknown(void)
{
- struct fts_language_list *lp = NULL;
- const struct fts_language *lang_r = NULL;
+ struct language_list *lp = NULL;
+ const struct language *lang_r = NULL;
const unsigned char klingon[] = "nobwI''a'pu'qoqvam'e' "\
"nuHegh'eghrupqa'moHlaHbe'law'lI'neS "\
"SeH'eghtaHghach'a'na'chajmo'.";
const char names[] = "fi, de, fr";
const char *unknown, *error;
- test_begin("fts language detect unknown");
- test_assert(fts_language_list_init(settings, &lp, &error) == 0);
- test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
- test_assert(fts_language_detect(lp, klingon, sizeof(klingon), &lang_r, &error)
- == FTS_LANGUAGE_RESULT_UNKNOWN);
- fts_language_list_deinit(&lp);
+ test_begin("language detect unknown");
+ test_assert(language_list_init(settings, &lp, &error) == 0);
+ test_assert(language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(language_detect(lp, klingon, sizeof(klingon), &lang_r, &error)
+ == LANGUAGE_RESULT_UNKNOWN);
+ language_list_deinit(&lp);
test_end();
}
/* Looks up the language "en" without registering anything in this test and
   checks that the returned entry's name is "en". */
-static void test_fts_language_find_builtin(void)
+static void test_language_find_builtin(void)
{
- const struct fts_language *lp;
- test_begin("fts language find built-in");
- lp = fts_language_find("en");
+ const struct language *lp;
+ test_begin("language find built-in");
+ lp = language_find("en");
/* i_assert rather than test_assert: the next line dereferences lp */
i_assert(lp != NULL);
test_assert(strcmp(lp->name, "en") == 0);
test_end();
}
-static void test_fts_language_register(void)
+static void test_language_register(void)
{
- const struct fts_language *lp;
- test_begin("fts language register");
- fts_language_register("jp");
- lp = fts_language_find("jp");
+ const struct language *lp;
+ test_begin("language register");
+ language_register("jp");
+ lp = language_find("jp");
i_assert(lp != NULL);
test_assert(strcmp(lp->name, "jp") == 0);
test_end();
{
int ret;
static void (*const test_functions[])(void) = {
- test_fts_language_detect_finnish,
- test_fts_language_detect_english,
- test_fts_language_detect_french,
- test_fts_language_detect_german,
- test_fts_language_detect_swedish,
- test_fts_language_detect_bokmal,
- test_fts_language_detect_nynorsk,
- test_fts_language_detect_finnish_as_english,
- test_fts_language_detect_na,
- test_fts_language_detect_unknown,
- test_fts_language_find_builtin,
- test_fts_language_register,
+ test_language_detect_finnish,
+ test_language_detect_english,
+ test_language_detect_french,
+ test_language_detect_german,
+ test_language_detect_swedish,
+ test_language_detect_bokmal,
+ test_language_detect_nynorsk,
+ test_language_detect_finnish_as_english,
+ test_language_detect_na,
+ test_language_detect_unknown,
+ test_language_find_builtin,
+ test_language_register,
NULL
};
- fts_languages_init();
+ languages_init();
ret = test_run(test_functions);
- fts_languages_deinit();
+ languages_deinit();
return ret;
}
struct mail_namespace *ns = mail_namespace_find_inbox(user->namespaces);
struct fts_backend *backend;
struct fts_user_language *user_lang;
- const struct fts_language *lang = NULL;
+ const struct language *lang = NULL;
int ret, ret2;
bool final = FALSE;
}
if (ctx->language == NULL) {
- struct fts_language_list *lang_list =
+ struct language_list *lang_list =
fts_user_get_language_list(user);
- enum fts_language_result result;
+ enum language_result result;
const char *error;
- result = fts_language_detect(lang_list,
+ result = language_detect(lang_list,
(const unsigned char *)ctx->tokens, strlen(ctx->tokens),
&lang, &error);
if (lang == NULL)
- lang = fts_language_list_get_first(lang_list);
+ lang = language_list_get_first(lang_list);
switch (result) {
- case FTS_LANGUAGE_RESULT_SHORT:
+ case LANGUAGE_RESULT_SHORT:
e_warning(user->event,
"Text too short, can't detect its language - assuming %s",
lang->name);
break;
- case FTS_LANGUAGE_RESULT_UNKNOWN:
+ case LANGUAGE_RESULT_UNKNOWN:
e_warning(user->event,
"Can't detect its language - assuming %s",
lang->name);
break;
- case FTS_LANGUAGE_RESULT_OK:
+ case LANGUAGE_RESULT_OK:
break;
- case FTS_LANGUAGE_RESULT_ERROR:
+ case LANGUAGE_RESULT_ERROR:
e_error(user->event,
"Language detection library initialization failed: %s",
error);
i_unreached();
}
} else {
- lang = fts_language_find(ctx->language);
+ lang = language_find(ctx->language);
if (lang == NULL) {
e_error(user->event,
"Unknown language: %s", ctx->language);
return -1;
}
- fts_tokenizer_reset(user_lang->index_tokenizer);
+ lang_tokenizer_reset(user_lang->index_tokenizer);
for (;;) {
const char *token, *error;
if (!final) {
- ret = fts_tokenizer_next(user_lang->index_tokenizer,
+ ret = lang_tokenizer_next(user_lang->index_tokenizer,
(const unsigned char *)ctx->tokens, strlen(ctx->tokens),
&token, &error);
} else {
- ret = fts_tokenizer_final(user_lang->index_tokenizer,
+ ret = lang_tokenizer_final(user_lang->index_tokenizer,
&token, &error);
}
if (ret < 0)
break;
if (ret > 0 && user_lang->filter != NULL) {
- ret2 = fts_filter_filter(user_lang->filter, &token, &error);
+ ret2 = lang_filter(user_lang->filter, &token, &error);
if (ret2 > 0)
doveadm_print(token);
else if (ret2 < 0)
bool fts_backend_default_can_lookup(struct fts_backend *backend,
const struct mail_search_arg *args);
-void fts_filter_uids(ARRAY_TYPE(seq_range) *definite_dest,
+void lang_filter_uids(ARRAY_TYPE(seq_range) *definite_dest, /* NOTE(review): operates on seq_range UID arrays, not on a struct lang_filter like the other lang_filter_*() calls - confirm the fts_ -> lang_ rename was intended for this helper */
const ARRAY_TYPE(seq_range) *definite_filter,
ARRAY_TYPE(seq_range) *maybe_dest,
const ARRAY_TYPE(seq_range) *maybe_filter);
}
}
-void fts_filter_uids(ARRAY_TYPE(seq_range) *definite_dest,
+void lang_filter_uids(ARRAY_TYPE(seq_range) *definite_dest,
const ARRAY_TYPE(seq_range) *definite_filter,
ARRAY_TYPE(seq_range) *maybe_dest,
const ARRAY_TYPE(seq_range) *maybe_filter)
/* reset tokenizer between fields - just to be sure no state
leaks between fields (especially if previous indexing had
failed) */
- fts_tokenizer_reset(user_lang->index_tokenizer);
+ lang_tokenizer_reset(user_lang->index_tokenizer);
}
static void
fts_build_add_tokens_with_filter(struct fts_mail_build_context *ctx,
const unsigned char *data, size_t size)
{
- struct fts_tokenizer *tokenizer = ctx->cur_user_lang->index_tokenizer;
- struct fts_filter *filter = ctx->cur_user_lang->filter;
+ struct lang_tokenizer *tokenizer = ctx->cur_user_lang->index_tokenizer;
+ struct lang_filter *filter = ctx->cur_user_lang->filter;
const char *token, *error;
int ret = 1, ret2;
while (ret > 0) T_BEGIN {
- ret = ret2 = fts_tokenizer_next(tokenizer, data, size, &token, &error);
+ ret = ret2 = lang_tokenizer_next(tokenizer, data, size, &token, &error);
if (ret2 > 0 && filter != NULL)
- ret2 = fts_filter_filter(filter, &token, &error);
+ ret2 = lang_filter(filter, &token, &error);
if (ret2 < 0) {
mail_set_critical(ctx->mail,
"fts: Couldn't create indexable tokens: %s",
static int
fts_detect_language(struct fts_mail_build_context *ctx,
const unsigned char *data, size_t size, bool last,
- const struct fts_language **lang_r)
+ const struct language **lang_r)
{
struct mail_user *user = ctx->update_ctx->backend->ns->user;
- struct fts_language_list *lang_list = fts_user_get_language_list(user);
- const struct fts_language *lang;
+ struct language_list *lang_list = fts_user_get_language_list(user);
+ const struct language *lang;
const char *error;
- switch (fts_language_detect(lang_list, data, size, &lang, &error)) {
- case FTS_LANGUAGE_RESULT_SHORT:
+ switch (language_detect(lang_list, data, size, &lang, &error)) {
+ case LANGUAGE_RESULT_SHORT:
/* save the input so far and try again later */
buffer_append(ctx->pending_input, data, size);
if (last) {
/* we've run out of data. use the default language. */
- *lang_r = fts_language_list_get_first(lang_list);
+ *lang_r = language_list_get_first(lang_list);
return 1;
}
return 0;
- case FTS_LANGUAGE_RESULT_UNKNOWN:
+ case LANGUAGE_RESULT_UNKNOWN:
/* use the default language */
- *lang_r = fts_language_list_get_first(lang_list);
+ *lang_r = language_list_get_first(lang_list);
return 1;
- case FTS_LANGUAGE_RESULT_OK:
+ case LANGUAGE_RESULT_OK:
*lang_r = lang;
return 1;
- case FTS_LANGUAGE_RESULT_ERROR:
+ case LANGUAGE_RESULT_ERROR:
/* internal language detection library failure
(e.g. invalid config). don't index anything. */
mail_set_critical(ctx->mail,
const unsigned char *data, size_t size, bool last)
{
struct mail_user *user = ctx->update_ctx->backend->ns->user;
- const struct fts_language *lang;
+ const struct language *lang;
int ret;
if (ctx->cur_user_lang != NULL) {
void fts_plugin_init(struct module *module)
{
- fts_library_init();
+ lang_library_init(); /* language library init (was fts_library_init()); runs before the storage hooks are added */
mail_storage_hooks_add(module, &fts_mail_storage_hooks);
}
void fts_plugin_deinit(void)
{
- fts_library_deinit();
+ lang_library_deinit(); /* counterpart of the lang_library_init() call made in fts_plugin_init() */
fts_parsers_unload();
mail_storage_hooks_remove(&fts_mail_storage_hooks);
}
}
static int
-fts_backend_dovecot_expand_tokens(struct fts_filter *filter,
+fts_backend_dovecot_expand_tokens(struct lang_filter *filter,
pool_t pool,
struct mail_search_arg *parent_arg,
const struct mail_search_arg *orig_arg,
/* add the word filtered */
if (filter != NULL) {
token2 = t_strdup(token);
- ret = fts_filter_filter(filter, &token2, &error);
+ ret = lang_filter(filter, &token2, &error);
if (ret > 0) {
token2 = t_strdup(token2);
array_push_back(&tokens, &token2);
/* reset tokenizer between search args in case there's any state left
from some previous failure */
- fts_tokenizer_reset(user_lang->search_tokenizer);
- while ((ret = fts_tokenizer_next(user_lang->search_tokenizer,
+ lang_tokenizer_reset(user_lang->search_tokenizer);
+ while ((ret = lang_tokenizer_next(user_lang->search_tokenizer,
(const void *)orig_token,
orig_token_len, &token, &error)) > 0) {
if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
return -1;
}
while (ret >= 0 &&
- (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) {
+ (ret = lang_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) {
if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
and_arg, orig_arg, orig_token,
token, error_r) < 0)
union mail_user_module_context module_ctx;
int refcount;
- struct fts_language_list *lang_list;
+ struct language_list *lang_list;
struct fts_user_language *data_lang;
ARRAY_TYPE(fts_user_language) languages, data_languages;
lang_config[1] = mail_user_plugin_getenv(user, "fts_language_config");
if (lang_config[1] != NULL)
lang_config[0] = "fts_language_config";
- if (fts_language_list_init(lang_config, &fuser->lang_list, error_r) < 0)
+ if (language_list_init(lang_config, &fuser->lang_list, error_r) < 0)
return -1;
- if (!fts_language_list_add_names(fuser->lang_list, languages, &unknown)) {
+ if (!language_list_add_names(fuser->lang_list, languages, &unknown)) {
*error_r = t_strdup_printf(
"fts_languages: Unknown language '%s'", unknown);
return -1;
}
- if (array_count(fts_language_list_get_all(fuser->lang_list)) == 0) {
+ if (array_count(language_list_get_all(fuser->lang_list)) == 0) {
*error_r = "fts_languages setting is empty";
return -1;
}
}
static int
-fts_user_create_filters(struct mail_user *user, const struct fts_language *lang,
- struct fts_filter **filter_r, const char **error_r)
+fts_user_create_filters(struct mail_user *user, const struct language *lang,
+ struct lang_filter **filter_r, const char **error_r)
{
- const struct fts_filter *filter_class;
- struct fts_filter *filter = NULL, *parent = NULL;
+ const struct lang_filter *filter_class;
+ struct lang_filter *filter = NULL, *parent = NULL;
const char *filters_key, *const *filters, *filter_set_name;
const char *str, *error, *set_key;
unsigned int i;
filters = t_strsplit_spaces(str, " ");
for (i = 0; filters[i] != NULL; i++) {
- filter_class = fts_filter_find(filters[i]);
+ filter_class = lang_filter_find(filters[i]);
if (filter_class == NULL) {
*error_r = t_strdup_printf("%s: Unknown filter '%s'",
filters_key, filters[i]);
str = mail_user_plugin_getenv(user, set_key);
}
- if (fts_filter_create(filter_class, parent, lang,
- str_keyvalues_to_array(str),
- &filter, &error) < 0) {
+ if (lang_filter_create(filter_class, parent, lang,
+ str_keyvalues_to_array(str),
+ &filter, &error) < 0) {
*error_r = t_strdup_printf("%s: %s", set_key, error);
ret = -1;
break;
}
if (parent != NULL)
- fts_filter_unref(&parent);
+ lang_filter_unref(&parent);
parent = filter;
}
if (ret < 0) {
if (parent != NULL)
- fts_filter_unref(&parent);
+ lang_filter_unref(&parent);
return -1;
}
*filter_r = filter;
static int
fts_user_create_tokenizer(struct mail_user *user,
- const struct fts_language *lang,
- struct fts_tokenizer **tokenizer_r, bool search,
+ const struct language *lang,
+ struct lang_tokenizer **tokenizer_r, bool search,
const char **error_r)
{
- const struct fts_tokenizer *tokenizer_class;
- struct fts_tokenizer *tokenizer = NULL, *parent = NULL;
+ const struct lang_tokenizer *tokenizer_class;
+ struct lang_tokenizer *tokenizer = NULL, *parent = NULL;
const char *tokenizers_key, *const *tokenizers, *tokenizer_set_name;
const char *str, *error, *set_key;
unsigned int i;
tokenizers = t_strsplit_spaces(str, " ");
for (i = 0; tokenizers[i] != NULL; i++) {
- tokenizer_class = fts_tokenizer_find(tokenizers[i]);
+ tokenizer_class = lang_tokenizer_find(tokenizers[i]);
if (tokenizer_class == NULL) {
*error_r = t_strdup_printf("%s: Unknown tokenizer '%s'",
tokenizers_key, tokenizers[i]);
if (search)
str = t_strconcat("search=yes ", str, NULL);
- if (fts_tokenizer_create(tokenizer_class, parent,
- str_keyvalues_to_array(str),
- &tokenizer, &error) < 0) {
+ if (lang_tokenizer_create(tokenizer_class, parent,
+ str_keyvalues_to_array(str),
+ &tokenizer, &error) < 0) {
*error_r = t_strdup_printf("%s: %s", set_key, error);
ret = -1;
break;
}
if (parent != NULL)
- fts_tokenizer_unref(&parent);
+ lang_tokenizer_unref(&parent);
parent = tokenizer;
}
if (ret < 0) {
if (parent != NULL)
- fts_tokenizer_unref(&parent);
+ lang_tokenizer_unref(&parent);
return -1;
}
*tokenizer_r = tokenizer;
struct fts_user_language *
fts_user_language_find(struct mail_user *user,
- const struct fts_language *lang)
+ const struct language *lang)
{
struct fts_user_language *user_lang;
struct fts_user *fuser = FTS_USER_CONTEXT_REQUIRE(user);
static int fts_user_language_create(struct mail_user *user,
struct fts_user *fuser,
- const struct fts_language *lang,
+ const struct language *lang,
const char **error_r)
{
struct fts_user_language *user_lang;
struct fts_user *fuser,
const char **error_r)
{
- const struct fts_language *lang;
+ const struct language *lang;
- array_foreach_elem(fts_language_list_get_all(fuser->lang_list), lang) {
+ array_foreach_elem(language_list_get_all(fuser->lang_list), lang) {
if (fts_user_language_create(user, fuser, lang, error_r) < 0)
return -1;
}
const char *error;
user_lang = p_new(user->pool, struct fts_user_language, 1);
- user_lang->lang = &fts_language_data;
+ user_lang->lang = &language_data;
if (fts_user_language_init_tokenizers(user, user_lang, error_r) < 0)
return -1;
- if (fts_filter_create(fts_filter_lowercase, NULL, user_lang->lang, NULL,
- &user_lang->filter, &error) < 0)
+ if (lang_filter_create(lang_filter_lowercase, NULL, user_lang->lang, NULL,
+ &user_lang->filter, &error) < 0)
i_unreached();
i_assert(user_lang->filter != NULL);
return 0;
}
-struct fts_language_list *fts_user_get_language_list(struct mail_user *user)
+struct language_list *fts_user_get_language_list(struct mail_user *user)
{
struct fts_user *fuser = FTS_USER_CONTEXT_REQUIRE(user);
static void fts_user_language_free(struct fts_user_language *user_lang)
{
if (user_lang->filter != NULL)
- fts_filter_unref(&user_lang->filter);
+ lang_filter_unref(&user_lang->filter); /* each member is unreferenced only when it is non-NULL */
if (user_lang->index_tokenizer != NULL)
- fts_tokenizer_unref(&user_lang->index_tokenizer);
+ lang_tokenizer_unref(&user_lang->index_tokenizer); /* tokenizer used by the indexing code paths */
if (user_lang->search_tokenizer != NULL)
- fts_tokenizer_unref(&user_lang->search_tokenizer);
+ lang_tokenizer_unref(&user_lang->search_tokenizer); /* tokenizer used when expanding search args */
}
static void fts_user_free(struct fts_user *fuser)
struct fts_user_language *user_lang;
if (fuser->lang_list != NULL)
- fts_language_list_deinit(&fuser->lang_list);
+ language_list_deinit(&fuser->lang_list);
if (array_is_created(&fuser->languages)) {
array_foreach_elem(&fuser->languages, user_lang)
#define FTS_USER_H
struct fts_user_language {
- const struct fts_language *lang;
- struct fts_filter *filter;
- struct fts_tokenizer *index_tokenizer, *search_tokenizer;
+ const struct language *lang; /* language descriptor (struct language, formerly struct fts_language) */
+ struct lang_filter *filter; /* may be NULL: callers check before passing tokens through lang_filter() */
+ struct lang_tokenizer *index_tokenizer, *search_tokenizer; /* index_tokenizer feeds indexing, search_tokenizer feeds search-arg expansion */
};
ARRAY_DEFINE_TYPE(fts_user_language, struct fts_user_language *);
struct fts_user_language *
fts_user_language_find(struct mail_user *user,
- const struct fts_language *lang);
-struct fts_language_list *fts_user_get_language_list(struct mail_user *user);
+ const struct language *lang);
+struct language_list *fts_user_get_language_list(struct mail_user *user); /* callers hand the returned list to language_detect() and language_list_get_first() */
const ARRAY_TYPE(fts_user_language) *
fts_user_get_all_languages(struct mail_user *user);
struct fts_user_language *fts_user_get_data_lang(struct mail_user *user);