From: Marco Bettini Date: Wed, 28 Feb 2024 10:08:30 +0000 (+0000) Subject: lib-language: Remove truncation from filters X-Git-Tag: 2.4.1~994 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=be19311ed016021c311eae5fad950cda860a29d5;p=thirdparty%2Fdovecot%2Fcore.git lib-language: Remove truncation from filters --- diff --git a/src/lib-language/Makefile.am b/src/lib-language/Makefile.am index 1c68a3f50f..0131b66acc 100644 --- a/src/lib-language/Makefile.am +++ b/src/lib-language/Makefile.am @@ -81,7 +81,6 @@ liblanguage_la_LIBADD = \ liblanguage_la_SOURCES = \ lang-filter.c \ lang-filter-contractions.c \ - lang-filter-common.c \ lang-filter-english-possessive.c \ lang-filter-lowercase.c \ lang-filter-normalizer-icu.c \ @@ -99,7 +98,6 @@ liblanguage_la_SOURCES = \ headers = \ lang-common.h \ lang-filter.h \ - lang-filter-common.h \ lang-filter-private.h \ lang-icu.h \ language.h \ diff --git a/src/lib-language/lang-filter-common.c b/src/lib-language/lang-filter-common.c deleted file mode 100644 index ba14101ae1..0000000000 --- a/src/lib-language/lang-filter-common.c +++ /dev/null @@ -1,20 +0,0 @@ -/* Copyright (c) 2016-2018 Dovecot authors, see the included COPYING file */ - -#include "lib.h" -#include "str.h" -#include "buffer.h" -#include "unichar.h" -#include "lang-filter-private.h" -#include "lang-filter-common.h" -#include "lang-tokenizer-common.h" - -void lang_filter_truncate_token(string_t *token, size_t max_length) -{ - if (str_len(token) <= max_length) - return; - - size_t len = max_length; - lang_tokenizer_delete_trailing_partial_char(token->data, &len); - str_truncate(token, len); - i_assert(len <= max_length); -} diff --git a/src/lib-language/lang-filter-common.h b/src/lib-language/lang-filter-common.h deleted file mode 100644 index 08ca137e18..0000000000 --- a/src/lib-language/lang-filter-common.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef LANG_FILTER_COMMON_H -#define LANG_FILTER_COMMON_H - -void lang_filter_truncate_token(string_t *token, size_t max_length); - -#endif diff --git a/src/lib-language/lang-filter-lowercase.c b/src/lib-language/lang-filter-lowercase.c index 369c543abc..2d7535b4c6 100644 --- a/src/lib-language/lang-filter-lowercase.c +++ b/src/lib-language/lang-filter-lowercase.c @@ -8,11 +8,10 @@ #ifdef HAVE_LIBICU # include "lang-icu.h" -# include "lang-filter-common.h" #endif static int -lang_filter_lowercase_create(const struct lang_settings *set, +lang_filter_lowercase_create(const struct lang_settings *set ATTR_UNUSED, struct event *event ATTR_UNUSED, struct lang_filter **filter_r, const char **error_r ATTR_UNUSED) @@ -21,7 +20,6 @@ lang_filter_lowercase_create(const struct lang_settings *set, filter = i_new(struct lang_filter, 1); *filter = *lang_filter_lowercase; filter->token = str_new(default_pool, 64); - filter->max_length = set->filter_lowercase_token_maxlen; *filter_r = filter; return 0; @@ -35,7 +33,6 @@ lang_filter_lowercase_filter(struct lang_filter *filter ATTR_UNUSED, #ifdef HAVE_LIBICU str_truncate(filter->token, 0); lang_icu_lcase(filter->token, *token); - lang_filter_truncate_token(filter->token, filter->max_length); *token = str_c(filter->token); #else *token = t_str_lcase(*token); diff --git a/src/lib-language/lang-filter-normalizer-icu.c b/src/lib-language/lang-filter-normalizer-icu.c index 3c1d140380..2b83ef98ed 100644 --- a/src/lib-language/lang-filter-normalizer-icu.c +++ b/src/lib-language/lang-filter-normalizer-icu.c @@ -4,7 +4,6 @@ #include "array.h" #include "str.h" #include "unichar.h" /* unicode replacement char */ -#include "lang-filter-common.h" #include "lang-filter-private.h" #include "lang-settings.h" #include "language.h" @@ -50,7 +49,6 @@ lang_filter_normalizer_icu_create(const struct lang_settings *set, p_array_init(&np->utf16_token, pp, 64); p_array_init(&np->trans_token, pp, 64); np->utf8_token = buffer_create_dynamic(pp, 128); - np->filter.max_length = set->filter_normalizer_token_maxlen; *filter_r = &np->filter; return 0; } @@ -82,7 +80,6 @@ lang_filter_normalizer_icu_filter(struct lang_filter *filter, const char **token lang_icu_utf16_to_utf8(np->utf8_token, array_front(&np->trans_token), array_count(&np->trans_token)); - lang_filter_truncate_token(np->utf8_token, np->filter.max_length); *token = str_c(np->utf8_token); return 1; } diff --git a/src/lib-language/lang-filter-private.h b/src/lib-language/lang-filter-private.h index 880a6840f8..28a584014c 100644 --- a/src/lib-language/lang-filter-private.h +++ b/src/lib-language/lang-filter-private.h @@ -30,7 +30,6 @@ struct lang_filter { struct lang_filter_vfuncs v; struct lang_filter *parent; string_t *token; - size_t max_length; int refcount; }; diff --git a/src/lib-language/lang-settings.c b/src/lib-language/lang-settings.c index 3cbeed1591..b3b1d74088 100644 --- a/src/lib-language/lang-settings.c +++ b/src/lib-language/lang-settings.c @@ -19,9 +19,7 @@ static const struct setting_define lang_setting_defines[] = { DEF(STR, name), SETTING_DEFINE_STRUCT_BOOL("language_default", is_default, struct lang_settings), DEF(BOOLLIST, filters), - DEF(UINT, filter_lowercase_token_maxlen), DEF(STR, filter_normalizer_icu_id), - DEF(UINT, filter_normalizer_token_maxlen), DEF(STR, filter_stopwords_dir), DEF(BOOLLIST, tokenizers), DEF(UINT, tokenizer_address_token_maxlen), @@ -36,8 +34,6 @@ const struct lang_settings lang_default_settings = { .name = "", .is_default = FALSE, .filters = ARRAY_INIT, - .filter_lowercase_token_maxlen = 250, - .filter_normalizer_token_maxlen = 250, .filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC; [\\x20] Remove", .filter_stopwords_dir = DATADIR"/stopwords", .tokenizers = ARRAY_INIT, diff --git a/src/lib-language/lang-settings.h b/src/lib-language/lang-settings.h index c806e04688..fb0f551331 100644 --- a/src/lib-language/lang-settings.h +++ b/src/lib-language/lang-settings.h @@ -16,8 +16,6 @@ struct lang_settings { const char *tokenizer_generic_algorithm; ARRAY_TYPE(const_string) filters; ARRAY_TYPE(const_string) tokenizers; - unsigned int filter_lowercase_token_maxlen; - unsigned int filter_normalizer_token_maxlen; unsigned int tokenizer_address_token_maxlen; unsigned int tokenizer_generic_token_maxlen; bool tokenizer_generic_explicit_prefix; diff --git a/src/lib-language/test-lang-filter.c b/src/lib-language/test-lang-filter.c index 9fc9de8169..cc6f15254a 100644 --- a/src/lib-language/test-lang-filter.c +++ b/src/lib-language/test-lang-filter.c @@ -165,35 +165,6 @@ static void test_lang_filter_lowercase_utf8(void) test_end(); } -static void test_lang_filter_lowercase_too_long_utf8(void) -{ - static const struct { - const char *input; - const char *output; - } tests[] = { - { "f\xC3\x85\xC3\x85", "f\xC3\xA5\xC3\xA5" }, - { "abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxy" }, - { "abc\xC3\x85""defghijklmnopqrstuvwxyz", "abc\xC3\xA5""defghijklmnopqrstuvw" }, - { "abcdefghijklmnopqrstuvwx\xC3\x85", "abcdefghijklmnopqrstuvwx" } - }; - struct lang_filter *filter; - const char *error; - const char *token; - struct lang_settings set = lang_default_settings; - set.filter_lowercase_token_maxlen = 25; - unsigned int i; - - test_begin("lang filter lowercase, too long UTF8"); - test_assert(lang_filter_create(lang_filter_lowercase, NULL, make_settings(LANG_EN, &set), event, &filter, &error) == 0); - - for (i = 0; i < N_ELEMENTS(tests); i++) { - token = tests[i].input; - test_assert_idx(lang_filter(filter, &token, &error) > 0 && - strcmp(token, tests[i].output) == 0, 0); - } - lang_filter_unref(&filter); - test_end(); -} #endif static void test_lang_filter_stopwords_eng(void) @@ -713,57 +684,6 @@ static void test_lang_filter_normalizer_invalid_id(void) test_end(); } -static void test_lang_filter_normalizer_oversized(void) -{ - struct lang_filter *norm = NULL; - struct lang_settings set = lang_default_settings; - set.filter_normalizer_icu_id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove"; - set.filter_normalizer_token_maxlen = 250; - const char *error = NULL; - const char *token = "\xe4\x95\x91\x25\xe2\x94\xad\xe1\x90\xad\xee\x94\x81\xe2\x8e\x9e" - "\xe7\x9a\xb7\xea\xbf\x97\xe3\xb2\x8f\xe4\x9c\xbe\xee\xb4\x98\xe1" - "\x8d\x99\xe2\x91\x83\xe3\xb1\xb8\xef\xbf\xbd\xe8\xbb\x9c\xef\xbf" - "\xbd\xea\xbb\x98\xea\xb5\xac\xe4\x87\xae\xe4\x88\x93\xe9\x86\x8f" - "\xe9\x86\x83\xe6\x8f\x8d\xe7\xa3\x9d\xed\x89\x96\xe2\x89\x85\xe6" - "\x8c\x82\xec\x80\x98\xee\x91\x96\xe7\xa8\x8a\xec\xbc\x85\xeb\x9c" - "\xbd\xeb\x97\x95\xe3\xa4\x9d\xd7\xb1\xea\xa7\x94\xe0\xbb\xac\xee" - "\x95\x87\xd5\x9d\xe8\xba\x87\xee\x8b\xae\xe5\xb8\x80\xe9\x8d\x82" - "\xe7\xb6\x8c\xe7\x9b\xa0\xef\x82\x9f\xed\x96\xa4\xe3\x8d\xbc\xe1" - "\x81\xbd\xe9\x81\xb2\xea\xac\xac\xec\x9b\x98\xe7\x84\xb2\xee\xaf" - "\xbc\xeb\xa2\x9d\xe9\x86\xb3\xe0\xb0\x89\xeb\x80\xb6\xe3\x8c\x9d" - "\xe9\x8f\x9e\xe2\xae\x8a\xee\x9e\x9a\xef\xbf\xbd\xe7\xa3\x9b\xe4" - "\xa3\x8b\xe4\x82\xb9\xeb\x8e\x93\xec\xb5\x82\xe5\xa7\x81\xe2\x8c" - "\x97\xea\xbb\xb4\xe5\x85\xb7\xeb\x96\xbe\xe7\x97\x91\xea\xbb\x98" - "\xe6\xae\xb4\xe9\x8a\x85\xc4\xb9\xe4\x90\xb2\xe9\x96\xad\xef\x90" - "\x9c\xe5\xa6\xae\xe9\x93\x91\xe8\x87\xa1"; - - test_begin("lang filter normalizer over-sized token"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &norm, &error) == 0); - test_assert(error == NULL); - test_assert(lang_filter(norm, &token, &error) >= 0); - test_assert(strlen(token) <= 250); - lang_filter_unref(&norm); - test_end(); -} - -static void test_lang_filter_normalizer_truncation(void) -{ - struct lang_filter *norm = NULL; - struct lang_settings set = lang_default_settings; - set.filter_normalizer_icu_id = "Any-Lower;"; - set.filter_normalizer_token_maxlen = 10; - const char *error = NULL; - const char *token = "abcdefghi\xC3\x85"; - - test_begin("lang filter normalizer token truncated mid letter"); - test_assert(lang_filter_create(lang_filter_normalizer_icu, NULL, make_settings(NULL, &set), event, &norm, &error) == 0); - test_assert(error == NULL); - test_assert(lang_filter(norm, &token, &error) >= 0); - test_assert(strcmp(token, "abcdefghi") == 0); - lang_filter_unref(&norm); - test_end(); -} - #ifdef HAVE_LANG_STEMMER static void test_lang_filter_normalizer_stopwords_stemmer_eng(void) { @@ -1002,7 +922,6 @@ int main(void) test_lang_filter_lowercase, #ifdef HAVE_LIBICU test_lang_filter_lowercase_utf8, - test_lang_filter_lowercase_too_long_utf8, #endif test_lang_filter_stopwords_eng, test_lang_filter_stopwords_fin, @@ -1022,8 +941,6 @@ int main(void) test_lang_filter_normalizer_empty, test_lang_filter_normalizer_baddata, test_lang_filter_normalizer_invalid_id, - test_lang_filter_normalizer_oversized, - test_lang_filter_normalizer_truncation, #ifdef HAVE_LANG_STEMMER test_lang_filter_normalizer_stopwords_stemmer_eng, test_lang_filter_stopwords_normalizer_stemmer_no,