#include "rfc822-parser.h"
#include "lang-tokenizer-private.h"
#include "lang-tokenizer-common.h"
+#include "lang-settings.h"
#define IS_DTEXT(c) \
(rfc822_atext_chars[(int)(unsigned char)(c)] == 2)
-#define LANG_DEFAULT_ADDRESS_MAX_LENGTH 254
-
enum email_address_parser_state {
EMAIL_ADDRESS_PARSER_STATE_NONE = 0,
EMAIL_ADDRESS_PARSER_STATE_LOCALPART,
};
static int
-lang_tokenizer_email_address_create(const char *const *settings,
+lang_tokenizer_email_address_create(const struct lang_settings *set,
enum lang_tokenizer_flags flags,
struct lang_tokenizer **tokenizer_r,
- const char **error_r)
+ const char **error_r ATTR_UNUSED)
{
struct email_address_lang_tokenizer *tok;
- bool search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH);
- unsigned int max_length = LANG_DEFAULT_ADDRESS_MAX_LENGTH;
- unsigned int i;
-
- for (i = 0; settings[i] != NULL; i += 2) {
- const char *key = settings[i], *value = settings[i+1];
-
- if (strcmp(key, "maxlen") == 0) {
- if (str_to_uint(value, &max_length) < 0 ||
- max_length == 0) {
- *error_r = t_strdup_printf("Invalid maxlen setting: %s", value);
- return -1;
- }
- } else {
- *error_r = t_strdup_printf("Unknown setting: %s", key);
- return -1;
- }
- }
-
tok = i_new(struct email_address_lang_tokenizer, 1);
tok->tokenizer = *lang_tokenizer_email_address;
tok->last_word = str_new(default_pool, 128);
tok->parent_data = str_new(default_pool, 128);
- tok->max_length = max_length;
- tok->search = search;
+ tok->max_length = set->tokenizer_address_token_maxlen;
+ tok->search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH);
*tokenizer_r = &tok->tokenizer;
return 0;
}
#include "lang-tokenizer-private.h"
#include "lang-tokenizer-generic-private.h"
#include "lang-tokenizer-common.h"
+#include "lang-settings.h"
#include "word-boundary-data.c"
#include "word-break-data.c"
/* see comments below between is_base64() and skip_base64() */
#define LANG_SKIP_BASE64_MIN_SEQUENCES 1
#define LANG_SKIP_BASE64_MIN_CHARS 50
-
-#define LANG_DEFAULT_TOKEN_MAX_LENGTH 30
#define LANG_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */
static unsigned char lang_ascii_word_breaks[128] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0 /* 112-127: {|}~ */
};
+struct algorithm {
+ const char *name;
+ enum boundary_algorithm id;
+ const struct lang_tokenizer_vfuncs *v;
+};
+
+static const struct algorithm algorithms[] = {
+ { ALGORITHM_SIMPLE_NAME, BOUNDARY_ALGORITHM_SIMPLE, &generic_tokenizer_vfuncs_simple },
+ { ALGORITHM_TR29_NAME, BOUNDARY_ALGORITHM_TR29, &generic_tokenizer_vfuncs_tr29 },
+ { NULL, 0, NULL }
+};
+
+static const struct algorithm *parse_algorithm(const char *name)
+{
+ for (const struct algorithm *entry = algorithms; entry->name != NULL; entry++)
+ if (strcmp(name, entry->name) == 0)
+ return entry;
+ return NULL;
+}
+
static int
-lang_tokenizer_generic_create(const char *const *settings,
+lang_tokenizer_generic_create(const struct lang_settings *set,
enum lang_tokenizer_flags flags,
struct lang_tokenizer **tokenizer_r,
const char **error_r)
{
- struct generic_lang_tokenizer *tok;
- unsigned int max_length = LANG_DEFAULT_TOKEN_MAX_LENGTH;
- enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
- bool wb5a = FALSE;
- bool search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH);
- bool explicitprefix = FALSE;
- unsigned int i;
-
- for (i = 0; settings[i] != NULL; i += 2) {
- const char *key = settings[i], *value = settings[i+1];
-
- if (strcmp(key, "maxlen") == 0) {
- if (str_to_uint(value, &max_length) < 0 ||
- max_length == 0) {
- *error_r = t_strdup_printf(
- "Invalid maxlen setting: %s", value);
- return -1;
- }
- } else if (strcmp(key, "algorithm") == 0) {
- if (strcmp(value, ALGORITHM_TR29_NAME) == 0)
- algo = BOUNDARY_ALGORITHM_TR29;
- else if (strcmp(value, ALGORITHM_SIMPLE_NAME) == 0)
- ;
- else {
- *error_r = t_strdup_printf(
- "Invalid algorithm: %s", value);
- return -1;
- }
- } else if (strcasecmp(key, "wb5a") == 0) {
- if (strcasecmp(value, "no") == 0)
- wb5a = FALSE;
- else
- wb5a = TRUE;
- } else if (strcasecmp(key, "explicitprefix") == 0) {
- explicitprefix = TRUE;
- } else {
- *error_r = t_strdup_printf("Unknown setting: %s", key);
- return -1;
- }
+ const struct algorithm *algo = parse_algorithm(set->tokenizer_generic_algorithm);
+ if (algo == NULL) {
+ *error_r = t_strdup_printf(
+ "Unknown language_tokenizer_generic_algorithm: %s",
+ set->tokenizer_generic_algorithm);
+ return -1;
}
- /* Tokenise normally unless tokenising an explicit prefix query */
- if (!search)
- explicitprefix = FALSE;
-
- if (wb5a && algo != BOUNDARY_ALGORITHM_TR29) {
- *error_r = "Can not use WB5a for algorithms other than TR29.";
+ bool wb5a = set->tokenizer_generic_wb5a;
+ if (wb5a && algo->id != BOUNDARY_ALGORITHM_TR29) {
+ *error_r = "Can not use language_tokenizer_generic_wb5a for "
+ "algorithms other than language_tokenizer_generic_algorithm = tr29";
return -1;
}
+ bool search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH);
+
+ struct generic_lang_tokenizer *tok;
tok = i_new(struct generic_lang_tokenizer, 1);
- if (algo == BOUNDARY_ALGORITHM_TR29)
- tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
- else
- tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
- tok->max_length = max_length;
- tok->algorithm = algo;
+ tok->tokenizer.v = algo->v;
+ tok->max_length = set->tokenizer_generic_token_maxlen;
+ tok->algorithm = algo->id;
tok->wb5a = wb5a;
- tok->prefixsplat = explicitprefix;
+ tok->prefixsplat = search && set->tokenizer_generic_explicit_prefix;
tok->token = buffer_create_dynamic(default_pool, 64);
*tokenizer_r = &tok->tokenizer;
#define LANG_TOKENIZER_CLASSES_NR 2
struct lang_tokenizer_vfuncs {
- int (*create)(const char *const *settings, unsigned int flags,
- struct lang_tokenizer **tokenizer_r, const char **error_r);
+ int (*create)(const struct lang_settings *set,
+ enum lang_tokenizer_flags flags,
+ struct lang_tokenizer **tokenizer_r,
+ const char **error_r);
void (*destroy)(struct lang_tokenizer *tok);
void (*reset)(struct lang_tokenizer *tok);
int lang_tokenizer_create(const struct lang_tokenizer *tok_class,
struct lang_tokenizer *parent,
- const char *const *settings,
+ const struct lang_settings *set,
enum lang_tokenizer_flags flags,
struct lang_tokenizer **tokenizer_r,
const char **error_r)
{
struct lang_tokenizer *tok;
- const char *empty_settings = NULL;
-
- i_assert(settings == NULL || str_array_length(settings) % 2 == 0);
-
- if (settings == NULL)
- settings = &empty_settings;
-
- if (tok_class->v->create(settings, flags, &tok, error_r) < 0) {
+ if (tok_class->v->create(set, flags, &tok, error_r) < 0) {
*tokenizer_r = NULL;
return -1;
}
#ifndef LANG_TOKENIZER_H
#define LANG_TOKENIZER_H
+struct lang_settings;
+
/*
  Tokenizer configuration is passed as a parsed const struct lang_settings *
  (see lang-settings.h) instead of "key"/"value" string pairs. Some
/* Create a new tokenizer. The settings are described above. */
int lang_tokenizer_create(const struct lang_tokenizer *tok_class,
struct lang_tokenizer *parent,
- const char *const *settings,
+ const struct lang_settings *set,
enum lang_tokenizer_flags flags,
struct lang_tokenizer **tokenizer_r,
const char **error_r);
#include "lang-tokenizer-common.h"
#include "lang-tokenizer-private.h"
#include "lang-tokenizer-generic-private.h"
+#include "lang-settings.h"
+
+static struct lang_settings simple_settings;
+static struct lang_settings tr29_settings;
+static struct lang_settings tr29_wb5a_settings;
+
+static void init_lang_settings(void)
+{
+ simple_settings = lang_default_settings;
+ simple_settings.tokenizer_generic_algorithm = "simple";
+
+ tr29_settings = lang_default_settings;
+ tr29_settings.tokenizer_generic_algorithm = "tr29";
+
+ tr29_wb5a_settings = lang_default_settings;
+ tr29_wb5a_settings.tokenizer_generic_algorithm = "tr29";
+ tr29_wb5a_settings.tokenizer_generic_wb5a = TRUE;
+}
/* There should be a trailing space ' ' at the end of each string except the last one. */
#define TEST_INPUT_ADDRESS \
const char *error;
test_begin("lang tokenizer generic simple");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &lang_default_settings, 0, &tok, &error) == 0);
test_assert(((struct generic_lang_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
test_end();
}
-const char *const tr29_settings[] = {"algorithm", "tr29", NULL};
-
/* TODO: U+206F is in "Format" and therefore currently not word break.
This definitely needs to be remapped. */
static void test_lang_tokenizer_generic_tr29_only(void)
const char *error;
test_begin("lang tokenizer generic TR29");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_settings, 0, &tok, &error) == 0);
test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
lang_tokenizer_unref(&tok);
test_end();
}
-const char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
-
/* TODO: U+206F is in "Format" and therefore currently not word break.
This definitely needs to be remapped. */
static void test_lang_tokenizer_generic_tr29_wb5a(void)
const char *error;
test_begin("lang tokenizer generic TR29 with WB5a");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings_wb5a, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_wb5a_settings, 0, &tok, &error) == 0);
test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
lang_tokenizer_unref(&tok);
test_end();
const char *error;
test_begin("lang tokenizer email address only");
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, NULL, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, &lang_default_settings, 0, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
lang_tokenizer_unref(&tok);
test_end();
}
-static void test_lang_tokenizer_address_parent(const char *name, const char * const *settings, unsigned int flags)
+static void test_lang_tokenizer_address_parent(const char *name, struct lang_settings *set, enum lang_tokenizer_flags flags)
{
static const char input[] = TEST_INPUT_ADDRESS;
static const char *const expected_output[] = {
const char *error;
test_begin(t_strdup_printf("lang tokenizer email address + parent %s", name));
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, flags, &gen_tok, &error) == 0);
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, NULL, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, set, flags, &gen_tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &lang_default_settings, 0, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
lang_tokenizer_unref(&tok);
lang_tokenizer_unref(&gen_tok);
test_end();
}
-const char *const simple_settings[] = {"algorithm", "simple", NULL};
/* Email-address tokenizer chained to a generic parent configured with
   the "simple" boundary algorithm. */
static void test_lang_tokenizer_address_parent_simple(void)
{
	test_lang_tokenizer_address_parent("simple", &simple_settings, 0);
}
/* Email-address tokenizer chained to a generic parent configured with
   the TR29 boundary algorithm. */
static void test_lang_tokenizer_address_parent_tr29(void)
{
	test_lang_tokenizer_address_parent("tr29", &tr29_settings, 0);
}
static void test_lang_tokenizer_address_search(void)
const char *token, *error;
test_begin("lang tokenizer search email address + parent");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, 0, &gen_tok, &error) == 0);
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, NULL, LANG_TOKENIZER_FLAG_SEARCH, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &lang_default_settings, 0, &gen_tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &lang_default_settings, LANG_TOKENIZER_FLAG_SEARCH, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
/* make sure state is forgotten at EOF */
static void test_lang_tokenizer_address_maxlen(void)
{
- const char *const settings[] = {"maxlen", "5", NULL};
+ struct lang_settings set = lang_default_settings;
+ set.tokenizer_address_token_maxlen = 5;
+
const char *input = "...\357\277\275@a";
struct lang_tokenizer *tok;
const char *token, *error;
test_begin("lang tokenizer address maxlen");
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, &set, 0, &tok, &error) == 0);
while (lang_tokenizer_next(tok, (const unsigned char *)input,
strlen(input), &token, &error) > 0) ;
static void test_lang_tokenizer_random(void)
{
const unsigned char test_chars[] = { 0, ' ', '.', 'a', 'b', 'c', '-', '@', '\xC3', '\xA4' };
- const char *const settings[] = {"algorithm", "simple", NULL};
- const char *const email_settings[] = {"maxlen", "9", NULL};
+
+ struct lang_settings set = lang_default_settings;
+ set.tokenizer_generic_algorithm = "simple";
+
+ struct lang_settings email_set = lang_default_settings;
+ email_set.tokenizer_address_token_maxlen = 9;
+
unsigned int i;
unsigned char addr[10] = { 0 };
string_t *str = t_str_new(20);
const char *token, *error;
test_begin("lang tokenizer random");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, 0, &gen_tok, &error) == 0);
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, email_settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &set, 0, &gen_tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &email_set, 0, &tok, &error) == 0);
for (i = 0; i < 10000; i++) T_BEGIN {
for (unsigned int j = 0; j < sizeof(addr); j++)
"twopre", "twoboth", "twopost",
NULL, NULL };
- const char *settings[9] = { "algorithm", "tr29", "wb5a", "yes" };
- const char **setptr;
+ const struct algo {
+ const char *name;
+ bool wb5a;
+ } algos[] = {
+ { ALGORITHM_SIMPLE_NAME, FALSE },
+ { ALGORITHM_TR29_NAME, FALSE },
+ { ALGORITHM_TR29_NAME, TRUE },
+ };
- const char *algos[] = { ALGORITHM_SIMPLE_NAME,
- ALGORITHM_TR29_NAME,
- ALGORITHM_TR29_NAME "+wb5a" };
- const char *searches[] = { "indexing", "searching" };
- const char *prefixes[] = { "fixed", "prefix" };
+ struct lang_settings set = lang_default_settings;
+ for (unsigned int algo_index = 0; algo_index < N_ELEMENTS(algos); algo_index++) {
+ const struct algo *algo = &algos[algo_index];
+ set.tokenizer_generic_wb5a = algo->wb5a;
+ set.tokenizer_generic_algorithm = algo->name;
+ const char *algo_str = t_strdup_printf("%s%s", algo->name, algo->wb5a ? "+wb5a" : "");
- for (int algo = 2; algo >= 0; algo--) { /* We overwrite the settings over time */
for (int search = 0; search < 2; search++) {
enum lang_tokenizer_flags flags = search > 0 ? LANG_TOKENIZER_FLAG_SEARCH : 0;
+ const char *search_str = search > 0 ? "searching" : "indexing";
+
for (int explicitprefix = 0; explicitprefix < 2; explicitprefix++) {
- setptr = &settings[algo*2]; /* 4, 2, or 0 settings strings preserved */
- if (explicitprefix > 0) { *setptr++ = "explicitprefix"; *setptr++ = "y"; }
- *setptr++ = NULL;
+ set.tokenizer_generic_explicit_prefix = explicitprefix > 0;
+ const char *prefix_str = explicitprefix > 0 ? "prefix" : "fixed";
test_begin(t_strdup_printf("prefix search %s:%s:%s",
- algos[algo],
- searches[search],
- prefixes[explicitprefix]));
+ algo_str, search_str, prefix_str));
struct lang_tokenizer *tok;
const char *error;
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings,
- flags, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL,
+ &set, flags, &tok, &error) == 0);
test_tokenizer_inputs(
tok, &input, 1,
(search!=0) && (explicitprefix!=0)
};
test_begin("lang tokenizer skip base64");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_settings, 0, &tok, &error) == 0);
size_t index = 0;
while (lang_tokenizer_next(tok, (const unsigned char *) input, strlen(input), &token, &error) > 0) {
int main(void)
{
+ init_lang_settings();
static void (*const test_functions[])(void) = {
test_lang_tokenizer_skip_base64,
test_lang_tokenizer_find,
static MODULE_CONTEXT_DEFINE_INIT(lang_user_module,
&mail_user_module_register);
/* Removed by this change: tokenizer settings are no longer passed as
   space-separated "key=value" strings, so splitting them into a
   {key, value, ...} array is unnecessary — lang_tokenizer_create() now
   takes the parsed struct lang_settings directly. */
-static const char *const *str_keyvalues_to_array(const char *str)
-{
-	const char *key, *value, *const *keyvalues;
-	ARRAY_TYPE(const_string) arr;
-	unsigned int i;
-
-	if (str == NULL)
-		return NULL;
-
-	t_array_init(&arr, 8);
-	keyvalues = t_strsplit_spaces(str, " ");
-	for (i = 0; keyvalues[i] != NULL; i++) {
-		value = strchr(keyvalues[i], '=');
-		if (value != NULL)
-			key = t_strdup_until(keyvalues[i], value++);
-		else {
-			key = keyvalues[i];
-			value = "";
-		}
-		array_push_back(&arr, &key);
-		array_push_back(&arr, &value);
-	}
-	array_append_zero(&arr);
-	return array_front(&arr);
-}
-
/* Returns the settings for the given language, or, if the language is not
   defined, the settings for the default language (which is always the first
   in the array) */
-const struct lang_settings *
+static const struct lang_settings *
lang_user_settings_get(struct mail_user *user, const char *lang)
{
struct lang_settings *set;
struct lang_tokenizer **tokenizer_r, bool search,
const char **error_r)
{
- const struct lang_tokenizer *tokenizer_class;
- struct lang_tokenizer *tokenizer = NULL, *parent = NULL;
- const char *tokenizers_key, *const *tokenizers, *tokenizer_set_name;
- const char *str, *error, *set_key;
- unsigned int i;
- int ret = 0;
-
- tokenizers_key = t_strconcat("fts_tokenizers_", lang->name, NULL);
- str = mail_user_plugin_getenv(user, tokenizers_key);
- if (str == NULL) {
- str = mail_user_plugin_getenv(user, "fts_tokenizers");
- if (str == NULL) {
- *error_r = t_strdup_printf("%s or fts_tokenizers setting must exist", tokenizers_key);
- return -1;
- }
- tokenizers_key = "fts_tokenizers";
+ const struct lang_settings *set = lang_user_settings_get(user, lang->name);
+ if (array_is_empty(&set->tokenizers)) {
+ /* No tokenizers */
+ *error_r = "Empty language_tokenizers { .. } list";
+ return -1;
}
- tokenizers = t_strsplit_spaces(str, " ");
+ int ret = 0;
+ struct lang_tokenizer *tokenizer = NULL, *parent = NULL;
+ const char *entry_name;
+ array_foreach_elem(&set->tokenizers, entry_name) {
+ const struct lang_tokenizer *entry_class =
+ lang_tokenizer_find(entry_name);
- for (i = 0; tokenizers[i] != NULL; i++) {
- tokenizer_class = lang_tokenizer_find(tokenizers[i]);
- if (tokenizer_class == NULL) {
- *error_r = t_strdup_printf("%s: Unknown tokenizer '%s'",
- tokenizers_key, tokenizers[i]);
+ if (entry_class == NULL) {
+ *error_r = t_strdup_printf(
+ "%s: Unknown tokenizer '%s'",
+ set->name, entry_name);
ret = -1;
break;
}
- tokenizer_set_name = t_str_replace(tokenizers[i], '-', '_');
- set_key = t_strdup_printf("fts_tokenizer_%s_%s", tokenizer_set_name, lang->name);
- str = mail_user_plugin_getenv(user, set_key);
- if (str == NULL) {
- set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name);
- str = mail_user_plugin_getenv(user, set_key);
- }
-
- if (lang_tokenizer_create(tokenizer_class, parent,
- str_keyvalues_to_array(str),
- search ? LANG_TOKENIZER_FLAG_SEARCH : 0,
- &tokenizer, &error) < 0) {
- *error_r = t_strdup_printf("%s: %s", set_key, error);
+ const char *error;
+ if (lang_tokenizer_create(entry_class, parent, set,
+ search ? LANG_TOKENIZER_FLAG_SEARCH : 0,
+ &tokenizer, &error) < 0) {
+ *error_r = t_strdup_printf(
+ "%s:%s %s", set->name, entry_name, error);
ret = -1;
break;
}