From: Marco Bettini Date: Wed, 31 Jan 2024 14:41:30 +0000 (+0000) Subject: lib-language: Change search tokenizers pseudo-setting into proper API X-Git-Tag: 2.4.1~1009 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=e3f74208a324821ae0eaadbaf02368d3b433ad95;p=thirdparty%2Fdovecot%2Fcore.git lib-language: Change search tokenizers pseudo-setting into proper API --- diff --git a/src/lib-language/lang-tokenizer-address.c b/src/lib-language/lang-tokenizer-address.c index 57b0cc2494..f084d786d5 100644 --- a/src/lib-language/lang-tokenizer-address.c +++ b/src/lib-language/lang-tokenizer-address.c @@ -31,20 +31,19 @@ struct email_address_lang_tokenizer { static int lang_tokenizer_email_address_create(const char *const *settings, + enum lang_tokenizer_flags flags, struct lang_tokenizer **tokenizer_r, const char **error_r) { struct email_address_lang_tokenizer *tok; - bool search = FALSE; + bool search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH); unsigned int max_length = LANG_DEFAULT_ADDRESS_MAX_LENGTH; unsigned int i; for (i = 0; settings[i] != NULL; i += 2) { const char *key = settings[i], *value = settings[i+1]; - if (strcmp(key, "search") == 0) { - search = TRUE; - } else if (strcmp(key, "maxlen") == 0) { + if (strcmp(key, "maxlen") == 0) { if (str_to_uint(value, &max_length) < 0 || max_length == 0) { *error_r = t_strdup_printf("Invalid maxlen setting: %s", value); diff --git a/src/lib-language/lang-tokenizer-generic.c b/src/lib-language/lang-tokenizer-generic.c index 91b3f8283d..f2199026b2 100644 --- a/src/lib-language/lang-tokenizer-generic.c +++ b/src/lib-language/lang-tokenizer-generic.c @@ -34,6 +34,7 @@ static unsigned char lang_ascii_word_breaks[128] = { static int lang_tokenizer_generic_create(const char *const *settings, + enum lang_tokenizer_flags flags, struct lang_tokenizer **tokenizer_r, const char **error_r) { @@ -41,7 +42,7 @@ lang_tokenizer_generic_create(const char *const *settings, unsigned int max_length = 
LANG_DEFAULT_TOKEN_MAX_LENGTH; enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE; bool wb5a = FALSE; - bool search = FALSE; + bool search = HAS_ALL_BITS(flags, LANG_TOKENIZER_FLAG_SEARCH); bool explicitprefix = FALSE; unsigned int i; @@ -65,10 +66,6 @@ lang_tokenizer_generic_create(const char *const *settings, "Invalid algorithm: %s", value); return -1; } - } else if (strcmp(key, "search") == 0) { - /* tokenizing a search string - - makes no difference to us */ - search = TRUE; } else if (strcasecmp(key, "wb5a") == 0) { if (strcasecmp(value, "no") == 0) wb5a = FALSE; diff --git a/src/lib-language/lang-tokenizer-private.h b/src/lib-language/lang-tokenizer-private.h index 6ba11ee1c2..73ff5b73f6 100644 --- a/src/lib-language/lang-tokenizer-private.h +++ b/src/lib-language/lang-tokenizer-private.h @@ -6,7 +6,7 @@ #define LANG_TOKENIZER_CLASSES_NR 2 struct lang_tokenizer_vfuncs { - int (*create)(const char *const *settings, + int (*create)(const char *const *settings, enum lang_tokenizer_flags flags, struct lang_tokenizer **tokenizer_r, const char **error_r); void (*destroy)(struct lang_tokenizer *tok); diff --git a/src/lib-language/lang-tokenizer.c b/src/lib-language/lang-tokenizer.c index b732765413..ebc91bab8b 100644 --- a/src/lib-language/lang-tokenizer.c +++ b/src/lib-language/lang-tokenizer.c @@ -77,6 +77,7 @@ static void lang_tokenizer_self_reset(struct lang_tokenizer *tok) int lang_tokenizer_create(const struct lang_tokenizer *tok_class, struct lang_tokenizer *parent, const char *const *settings, + enum lang_tokenizer_flags flags, struct lang_tokenizer **tokenizer_r, const char **error_r) { @@ -88,7 +89,7 @@ int lang_tokenizer_create(const struct lang_tokenizer *tok_class, if (settings == NULL) settings = &empty_settings; - if (tok_class->v->create(settings, &tok, error_r) < 0) { + if (tok_class->v->create(settings, flags, &tok, error_r) < 0) { *tokenizer_r = NULL; return -1; } diff --git a/src/lib-language/lang-tokenizer.h b/src/lib-language/lang-tokenizer.h index 
7a54af51ec..589b3b3d43 100644 --- a/src/lib-language/lang-tokenizer.h +++ b/src/lib-language/lang-tokenizer.h @@ -4,9 +4,8 @@ /* Settings are given in the form of a const char * const *settings = {"key, "value", "key2", "value2", NULL} array of string pairs. Some - keys, like "no_parent" and "search" are a sort of boolean and the - value does not matter, just mentioning the key enables the functionality. - The array has to be NULL terminated. + keys are a sort of boolean and the value does not matter, just mentioning + the key enables the functionality. The array has to be NULL terminated. */ /* Email address header tokenizer that returns "user@domain.org" input as "user@domain.org" token as well as passing it through to the parent @@ -15,13 +14,6 @@ allows doing an explicit "user@domain" search, which returns only mails matching that exact address (instead of e.g. a mail with both user@domain2 and user2@domain words). */ -/* Settings: - "no_parent", Return only our tokens, no data for parent to process. - Defaults to disabled. Should normally not be needed. - - "search" Remove addresses from parent data stream, so they are not processed - further. Defaults to disabled. Enable by defining the keyword (and any - value). */ extern const struct lang_tokenizer *lang_tokenizer_email_address; /* Generic email content tokenizer. Cuts text into tokens. */ @@ -41,6 +33,12 @@ extern const struct lang_tokenizer *lang_tokenizer_email_address; not. The default is "simple" */ extern const struct lang_tokenizer *lang_tokenizer_generic; +enum lang_tokenizer_flags { + /* Remove addresses from parent data stream, so they are not + processed further. */ + LANG_TOKENIZER_FLAG_SEARCH = 0x01, +}; + /* Tokenizing workflow, find --> create --> filter --> destroy. Do init before first use and deinit after all done. 
@@ -56,6 +54,7 @@ const struct lang_tokenizer *lang_tokenizer_find(const char *name); int lang_tokenizer_create(const struct lang_tokenizer *tok_class, struct lang_tokenizer *parent, const char *const *settings, + enum lang_tokenizer_flags flags, struct lang_tokenizer **tokenizer_r, const char **error_r); void lang_tokenizer_ref(struct lang_tokenizer *tok); diff --git a/src/lib-language/test-lang-tokenizer.c b/src/lib-language/test-lang-tokenizer.c index 5ccc4f7f95..19f975ccdd 100644 --- a/src/lib-language/test-lang-tokenizer.c +++ b/src/lib-language/test-lang-tokenizer.c @@ -192,7 +192,7 @@ static void test_lang_tokenizer_generic_only(void) const char *error; test_begin("lang tokenizer generic simple"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, 0, &tok, &error) == 0); test_assert(((struct generic_lang_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE); test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output); @@ -250,7 +250,7 @@ static void test_lang_tokenizer_generic_tr29_only(void) const char *error; test_begin("lang tokenizer generic TR29"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, 0, &tok, &error) == 0); test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output); lang_tokenizer_unref(&tok); test_end(); @@ -307,7 +307,7 @@ static void test_lang_tokenizer_generic_tr29_wb5a(void) const char *error; test_begin("lang tokenizer generic TR29 with WB5a"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings_wb5a, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings_wb5a, 0, &tok, &error) == 0); test_tokenizer_inputs(tok, test_inputs, N_ELEMENTS(test_inputs), expected_output); 
lang_tokenizer_unref(&tok); test_end(); @@ -330,13 +330,13 @@ static void test_lang_tokenizer_address_only(void) const char *error; test_begin("lang tokenizer email address only"); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, NULL, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, NULL, 0, &tok, &error) == 0); test_tokenizer_inputoutput(tok, input, expected_output, 0); lang_tokenizer_unref(&tok); test_end(); } -static void test_lang_tokenizer_address_parent(const char *name, const char * const *settings) +static void test_lang_tokenizer_address_parent(const char *name, const char * const *settings, unsigned int flags) { static const char input[] = TEST_INPUT_ADDRESS; static const char *const expected_output[] = { @@ -366,8 +366,8 @@ static void test_lang_tokenizer_address_parent(const char *name, const char * co const char *error; test_begin(t_strdup_printf("lang tokenizer email address + parent %s", name)); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, flags, &gen_tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, NULL, 0, &tok, &error) == 0); test_tokenizer_inputoutput(tok, input, expected_output, 0); lang_tokenizer_unref(&tok); lang_tokenizer_unref(&gen_tok); @@ -377,12 +377,12 @@ static void test_lang_tokenizer_address_parent(const char *name, const char * co const char *const simple_settings[] = {"algorithm", "simple", NULL}; static void test_lang_tokenizer_address_parent_simple(void) { - test_lang_tokenizer_address_parent("simple", simple_settings); + test_lang_tokenizer_address_parent("simple", simple_settings, 0); } static void test_lang_tokenizer_address_parent_tr29(void) { - 
test_lang_tokenizer_address_parent("tr29", tr29_settings); + test_lang_tokenizer_address_parent("tr29", tr29_settings, 0); } static void test_lang_tokenizer_address_search(void) @@ -411,13 +411,12 @@ static void test_lang_tokenizer_address_search(void) "hypen@hypen-hypen-sick.com", NULL }; - static const char *const settings[] = { "search", "", NULL }; struct lang_tokenizer *tok, *gen_tok; const char *token, *error; test_begin("lang tokenizer search email address + parent"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, NULL, 0, &gen_tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, NULL, LANG_TOKENIZER_FLAG_SEARCH, &tok, &error) == 0); test_tokenizer_inputoutput(tok, input, expected_output, 0); /* make sure state is forgotten at EOF */ @@ -485,7 +484,7 @@ static void test_lang_tokenizer_address_maxlen(void) const char *token, *error; test_begin("lang tokenizer address maxlen"); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, settings, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, NULL, settings, 0, &tok, &error) == 0); while (lang_tokenizer_next(tok, (const unsigned char *)input, strlen(input), &token, &error) > 0) ; @@ -506,8 +505,8 @@ static void test_lang_tokenizer_random(void) const char *token, *error; test_begin("lang tokenizer random"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, &gen_tok, &error) == 0); - test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, email_settings, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, 0, &gen_tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_email_address, gen_tok, 
email_settings, 0, &tok, &error) == 0); for (i = 0; i < 10000; i++) T_BEGIN { for (unsigned int j = 0; j < sizeof(addr); j++) @@ -550,17 +549,12 @@ test_lang_tokenizer_explicit_prefix(void) const char *prefixes[] = { "fixed", "prefix" }; for (int algo = 2; algo >= 0; algo--) { /* We overwrite the settings over time */ - setptr = &settings[algo*2]; /* 4, 2, or 0 settings strings preserved */ - for (int search = 0; search < 2; search++) { - const char **setptr2 = setptr; - if (search > 0) { *setptr2++ = "search"; *setptr2++ = "yes"; } - + enum lang_tokenizer_flags flags = search > 0 ? LANG_TOKENIZER_FLAG_SEARCH : 0; for (int explicitprefix = 0; explicitprefix < 2; explicitprefix++) { - const char **setptr3 = setptr2; - if (explicitprefix > 0) { *setptr3++ = "explicitprefix"; *setptr3++ = "y"; } - - *setptr3++ = NULL; + setptr = &settings[algo*2]; /* 4, 2, or 0 settings strings preserved */ + if (explicitprefix > 0) { *setptr++ = "explicitprefix"; *setptr++ = "y"; } + *setptr++ = NULL; test_begin(t_strdup_printf("prefix search %s:%s:%s", algos[algo], @@ -570,7 +564,7 @@ test_lang_tokenizer_explicit_prefix(void) const char *error; test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, settings, - &tok, &error) == 0); + flags, &tok, &error) == 0); test_tokenizer_inputs( tok, &input, 1, (search!=0) && (explicitprefix!=0) @@ -641,7 +635,7 @@ static void test_lang_tokenizer_skip_base64(void) }; test_begin("lang tokenizer skip base64"); - test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0); + test_assert(lang_tokenizer_create(lang_tokenizer_generic, NULL, tr29_settings, 0, &tok, &error) == 0); size_t index = 0; while (lang_tokenizer_next(tok, (const unsigned char *) input, strlen(input), &token, &error) > 0) { diff --git a/src/plugins/fts/fts-user.c b/src/plugins/fts/fts-user.c index 48964908a7..cf8bc6e52e 100644 --- a/src/plugins/fts/fts-user.c +++ b/src/plugins/fts/fts-user.c @@ -193,13 +193,9 @@ 
fts_user_create_tokenizer(struct mail_user *user, str = mail_user_plugin_getenv(user, set_key); } - /* tell the tokenizers that we're tokenizing a search string - (instead of tokenizing indexed data) */ - if (search) - str = t_strconcat("search=yes ", str, NULL); - if (lang_tokenizer_create(tokenizer_class, parent, str_keyvalues_to_array(str), + search ? LANG_TOKENIZER_FLAG_SEARCH : 0, &tokenizer, &error) < 0) { *error_r = t_strdup_printf("%s: %s", set_key, error); ret = -1;