#define LANG_NO "no"
#define LANG_SV "sv"
+/* core filters don't use the event in lang_filter_create() */
+static struct event *const event = NULL;
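+/* Illustrative sketch only, not exercised by these tests: if a caller
+ * wanted a filter's logging tied to a real event, the call would
+ * presumably look roughly like the following. The event_create()/
+ * event_unref() pairing and the error handling shown are assumptions
+ * for illustration, not something this change introduces:
+ *
+ *   struct event *event = event_create(NULL);
+ *   if (lang_filter_create(lang_filter_stopwords, NULL,
+ *                          make_settings(LANG_EN, &stopword_settings),
+ *                          event, &filter, &error) != 0)
+ *       i_error("lang_filter_create() failed: %s", error);
+ *   ...
+ *   event_unref(&event);
+ */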
+
static struct lang_settings stopword_settings;
static void init_lang_settings(void)
{
const char *error;
test_begin("lang filter contractions, unsupported language");
- test_assert(lang_filter_create(lang_filter_contractions, NULL, make_settings(LANG_EN, NULL), &filter, &error) != 0);
+ test_assert(lang_filter_create(lang_filter_contractions, NULL, make_settings(LANG_EN, NULL), event, &filter, &error) != 0);
test_assert(error != NULL);
test_end();
}
int ret;
test_begin("lang filter contractions, French");
- test_assert(lang_filter_create(lang_filter_contractions, NULL, make_settings(LANG_FR, NULL), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_contractions, NULL, make_settings(LANG_FR, NULL), event, &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
unsigned int i;
test_begin("lang filter lowercase");
- test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, NULL), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, NULL), event, &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
unsigned int i;
test_begin("lang filter lowercase, UTF8");
- test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, NULL), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, NULL), event, &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
unsigned int i;
test_begin("lang filter lowercase, too long UTF8");
- test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, &set), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, &set), event, &filter, &error) == 0);
for (i = 0; i < N_ELEMENTS(tests); i++) {
token = tests[i].input;
const char *token;
test_begin("lang filter stopwords, English");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_EN, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_EN, &stopword_settings), event, &filter, &error) == 0);
ip = input;
op = output;
const char *token;
test_begin("lang filter stopwords, Finnish");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FI, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FI, &stopword_settings), event, &filter, &error) == 0);
ip = input;
op = output;
lang_filter_unref(&filter);
test_assert(filter == NULL);
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FI, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FI, &stopword_settings), event, &filter, &error) == 0);
ip = input2;
op = output2;
while (*ip != NULL) {
const char *token;
test_begin("lang filter stopwords, French");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FR, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_FR, &stopword_settings), event, &filter, &error) == 0);
ip = input;
op = output;
const char *token;
test_begin("lang filter stopwords, Norwegian");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_NO, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_NO, &stopword_settings), event, &filter, &error) == 0);
ip = input;
op = output;
const char *error = NULL, *token = "foobar";
test_begin("lang filter stopwords, fail filter() (lazy init)");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(UNKNOWN, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(UNKNOWN, &stopword_settings), event, &filter, &error) == 0);
test_assert(filter != NULL && error == NULL);
test_assert(lang_filter(filter, &token, &error) < 0 && error != NULL);
lang_filter_unref(&filter);
const char *error = NULL, *token = "foobar";
test_begin("lang filter stopwords, malformed list");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(MALFORMED, &stopword_settings), &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(MALFORMED, &stopword_settings), event, &filter, &error) == 0);
test_assert(lang_filter(filter, &token, &error) < 0);
test_assert(strstr(error, "seems empty. Is the file correctly formatted?") != NULL);
test_expect_no_more_errors();
const char * const *bpp;
test_begin("lang filter stem English");
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, make_settings(LANG_EN, NULL), &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, make_settings(LANG_EN, NULL), event, &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
token = *tpp;
const char * const *bpp;
test_begin("lang filter stem French");
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, make_settings(LANG_FR, NULL), &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, NULL, make_settings(LANG_FR, NULL), event, &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
token = *tpp;
test_begin("lang filters stopwords and stemming chained, English");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_EN, &stopword_settings), &filter, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, make_settings(LANG_EN, NULL), &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_EN, &stopword_settings), event, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, make_settings(LANG_EN, NULL), event, &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
test_begin("lang filter normalizer Swedish short text");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(input); i++) {
token = input[i];
test_assert_idx(lang_filter(norm, &token, &error) == 1, i);
test_begin("lang filter normalizer Swedish short text using default ID");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, NULL), &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, NULL), event, &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(input); i++) {
token = input[i];
test_assert_idx(lang_filter(norm, &token, &error) == 1, i);
test_begin("lang filter normalizer French UDHR");
udhr_path = t_strconcat(UDHRDIR, UDHR_FRA_NAME, NULL);
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &norm, &error) == 0);
input = fopen(udhr_path, "r");
test_assert(input != NULL);
sha512_init(&ctx);
unsigned int i;
test_begin("lang filter normalizer empty tokens");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(empty_tokens); i++) {
const char *token = empty_tokens[i];
test_assert_idx(lang_filter(norm, &token, &error) == 0, i);
test_begin("lang filter normalizer bad data");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &norm, &error) == 0);
str = t_str_new(128);
for (i = 1; i < 0x1ffff; i++) {
if (!uni_is_valid_ucs4(i)) continue;
const char *error = NULL, *token = "foo";
test_begin("lang filter normalizer invalid id");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &norm, &error) == 0);
test_assert(error == NULL);
test_assert(lang_filter(norm, &token, &error) < 0 && error != NULL);
lang_filter_unref(&norm);
"\x9c\xe5\xa6\xae\xe9\x93\x91\xe8\x87\xa1";
test_begin("lang filter normalizer over-sized token");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &norm, &error) == 0);
test_assert(error == NULL);
test_assert(lang_filter(norm, &token, &error) >= 0);
test_assert(strlen(token) <= 250);
const char *token = "abcdefghi\xC3\x85";
test_begin("lang filter normalizer token truncated mid letter");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &norm, &error) == 0);
test_assert(error == NULL);
test_assert(lang_filter(norm, &token, &error) >= 0);
test_assert(strcmp(token, "abcdefghi") == 0);
test_begin("lang filters normalizer, stopwords and stemming chained, English");
- test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), &normalizer, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stopwords, normalizer, make_settings(LANG_EN, &stopword_settings), &filter, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, make_settings(LANG_EN, NULL), &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &normalizer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, normalizer, make_settings(LANG_EN, &stopword_settings), event, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, filter, make_settings(LANG_EN, NULL), event, &stemmer, &error) == 0);
bpp = bases;
for (tpp = tokens; *tpp != NULL; tpp++) {
test_begin("lang filters with stopwords, default normalizer and stemming chained, Norwegian");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_NO, &stopword_settings), &filter, &error) == 0);
- test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, make_settings(NULL, NULL), &normalizer, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, make_settings(LANG_NO, NULL), &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_NO, &stopword_settings), event, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, make_settings(NULL, NULL), event, &normalizer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, make_settings(LANG_NO, NULL), event, &stemmer, &error) == 0);
bpp = bases;
for (tpp = tokens; *tpp != NULL; tpp++) {
test_begin("lang filters with stopwords, default normalizer and stemming chained, Swedish");
- test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_SV, &stopword_settings), &filter, &error) == 0);
- test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, make_settings(NULL, NULL), &normalizer, &error) == 0);
- test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, make_settings(LANG_SV, NULL), &stemmer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stopwords, NULL, make_settings(LANG_SV, &stopword_settings), event, &filter, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_normalizer_icu, filter, make_settings(NULL, NULL), event, &normalizer, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_stemmer_snowball, normalizer, make_settings(LANG_SV, NULL), event, &stemmer, &error) == 0);
bpp = bases;
for (tpp = tokens; *tpp != NULL; tpp++) {
test_begin("lang filter english possessive");
- test_assert(lang_filter_create(lang_filter_english_possessive, NULL, make_settings(NULL, NULL), &norm, &error) == 0);
+ test_assert(lang_filter_create(lang_filter_english_possessive, NULL, make_settings(NULL, NULL), event, &norm, &error) == 0);
for (i = 0; i < N_ELEMENTS(input); i++) {
token = input[i];
test_assert_idx(lang_filter(norm, &token, &error) == 1, i);
#include "lang-tokenizer-generic-private.h"
#include "lang-settings.h"
+/* core tokenizers don't use the event in lang_tokenizer_create() */
+static struct event *const event = NULL;
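+/* Illustrative sketch only, mirroring the note in the filter tests: a
+ * NULL event is assumed safe here because core tokenizers ignore it.
+ * A caller attaching a real event would presumably do roughly the
+ * following (assumed usage, not part of this change):
+ *
+ *   struct event *event = event_create(NULL);
+ *   if (lang_tokenizer_create(lang_tokenizer_generic, NULL,
+ *                             &lang_default_settings, event, 0,
+ *                             &tok, &error) != 0)
+ *       i_error("lang_tokenizer_create() failed: %s", error);
+ *   ...
+ *   event_unref(&event);
+ */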
+
static struct lang_settings simple_settings;
static struct lang_settings tr29_settings;
static struct lang_settings tr29_wb5a_settings;
const char *error;
test_begin("lang tokenizer generic simple");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &lang_default_settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &lang_default_settings, event, 0, &tok, &error) == 0);
test_assert(((struct generic_lang_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
const char *error;
test_begin("lang tokenizer generic TR29");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_settings, event, 0, &tok, &error) == 0);
test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
lang_tokenizer_unref(&tok);
test_end();
const char *error;
test_begin("lang tokenizer generic TR29 with WB5a");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_wb5a_settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_wb5a_settings, event, 0, &tok, &error) == 0);
test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output);
lang_tokenizer_unref(&tok);
test_end();
const char *error;
test_begin("lang tokenizer email address only");
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, &lang_default_settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, &lang_default_settings, event, 0, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
lang_tokenizer_unref(&tok);
test_end();
const char *error;
test_begin(t_strdup_printf("lang tokenizer email address + parent %s", name));
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, set, flags, &gen_tok, &error) == 0);
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &lang_default_settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, set, event, flags, &gen_tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &lang_default_settings, event, 0, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
lang_tokenizer_unref(&tok);
lang_tokenizer_unref(&gen_tok);
const char *token, *error;
test_begin("lang tokenizer search email address + parent");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &lang_default_settings, 0, &gen_tok, &error) == 0);
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &lang_default_settings, LANG_TOKENIZER_FLAG_SEARCH, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &lang_default_settings, event, 0, &gen_tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &lang_default_settings, event, LANG_TOKENIZER_FLAG_SEARCH, &tok, &error) == 0);
test_tokenizer_inputoutput(tok, input, expected_output, 0);
/* make sure state is forgotten at EOF */
const char *token, *error;
test_begin("lang tokenizer address maxlen");
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, &set, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, &set, event, 0, &tok, &error) == 0);
while (lang_tokenizer_next(tok, (const unsigned char *)input,
strlen(input), &token, &error) > 0) ;
const char *token, *error;
test_begin("lang tokenizer random");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &set, 0, &gen_tok, &error) == 0);
- test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &email_set, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &set, event, 0, &gen_tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, &email_set, event, 0, &tok, &error) == 0);
for (i = 0; i < 10000; i++) T_BEGIN {
for (unsigned int j = 0; j < sizeof(addr); j++)
const char *error;
test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL,
- &set, flags, &tok, &error) == 0);
+ &set, event, flags, &tok, &error) == 0);
test_tokenizer_inputs(
tok, &input, 1,
(search!=0) && (explicitprefix!=0)
};
test_begin("lang tokenizer skip base64");
- test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_settings, 0, &tok, &error) == 0);
+ test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, &tr29_settings, event, 0, &tok, &error) == 0);
size_t index = 0;
while (lang_tokenizer_next(tok, (const unsigned char *) input, strlen(input), &token, &error) > 0) {