From: Baofeng Wang Date: Tue, 7 Jun 2016 12:58:38 +0000 (+0300) Subject: lib-fts: remove trailing period character from email-address X-Git-Tag: 2.3.0.rc1~3455 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=507ea0bc5b25efb4c96033a19dec66689a50ebd0;p=thirdparty%2Fdovecot%2Fcore.git lib-fts: remove trailing period character from email-address Any trailing period characters ('.') should be removed when email tokenization is done. --- diff --git a/src/lib-fts/fts-tokenizer-address.c b/src/lib-fts/fts-tokenizer-address.c index 96f10eccdd..13c201271c 100644 --- a/src/lib-fts/fts-tokenizer-address.c +++ b/src/lib-fts/fts-tokenizer-address.c @@ -79,6 +79,9 @@ static void fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok, const char **token_r) { + const unsigned char *data = tok->last_word->data; + size_t len = tok->last_word->used; + tok->tokenizer.skip_parents = TRUE; tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; if (str_len(tok->last_word) > tok->max_length) { @@ -86,15 +89,15 @@ fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok, /* As future proofing, delete partial utf8. IS_DTEXT() does not actually allow utf8 addresses yet though. */ - const unsigned char *data = tok->last_word->data; - size_t len = tok->last_word->used; + len = tok->last_word->used; fts_tokenizer_delete_trailing_partial_char(data, &len); i_assert(len <= tok->max_length); - *token_r = len == 0 ? "" : - t_strndup(tok->last_word->data, len); - } else { - *token_r = t_strdup(str_c(tok->last_word)); } + + if (len > 0) + fts_tokenizer_delete_trailing_invalid_char(data, &len); + *token_r = len == 0 ? 
"" : + t_strndup(data, len); } static bool @@ -189,7 +192,7 @@ fts_tokenizer_email_address_parse_domain(struct email_address_fts_tokenizer *tok while (pos < size && (IS_DTEXT(data[pos]) || data[pos] == '.')) pos++; /* A complete domain name */ - if ((pos > 1 && pos < size) || /* non-atext after atext in this data*/ + if ((pos > 0 && pos < size) || /* non-atext after atext in this data*/ (pos < size && !domain_is_empty(tok))) { /* non-atext after previous atext */ str_append_n(tok->last_word, data, pos); *skip_r = pos; diff --git a/src/lib-fts/fts-tokenizer-common.c b/src/lib-fts/fts-tokenizer-common.c index f71113d036..87faa7e9c3 100644 --- a/src/lib-fts/fts-tokenizer-common.c +++ b/src/lib-fts/fts-tokenizer-common.c @@ -20,3 +20,13 @@ fts_tokenizer_delete_trailing_partial_char(const unsigned char *data, *len = pos; } } +void fts_tokenizer_delete_trailing_invalid_char(const unsigned char *data, + size_t *len) +{ + size_t pos = *len; + + /* the token may contain '.' in the end - remove all of them. 
*/ + while (pos > 0 && data[pos-1] == '.') + pos--; + *len = pos; +} diff --git a/src/lib-fts/fts-tokenizer-common.h b/src/lib-fts/fts-tokenizer-common.h index fdd3b16313..b90e54353e 100644 --- a/src/lib-fts/fts-tokenizer-common.h +++ b/src/lib-fts/fts-tokenizer-common.h @@ -3,4 +3,7 @@ void fts_tokenizer_delete_trailing_partial_char(const unsigned char *data, size_t *len); +void +fts_tokenizer_delete_trailing_invalid_char(const unsigned char *data, + size_t *len); #endif diff --git a/src/lib-fts/test-fts-tokenizer.c b/src/lib-fts/test-fts-tokenizer.c index 06e14ed8f8..960e5119d3 100644 --- a/src/lib-fts/test-fts-tokenizer.c +++ b/src/lib-fts/test-fts-tokenizer.c @@ -4,16 +4,20 @@ #include "unichar.h" #include "test-common.h" #include "fts-tokenizer.h" +#include "fts-tokenizer-common.h" #include "fts-tokenizer-private.h" #include "fts-tokenizer-generic-private.h" - +/*there should be a trailing space ' ' at the end of each string except the last one*/ #define TEST_INPUT_ADDRESS \ "@invalid invalid@ Abc Dfg , " \ "Bar Baz " \ "Foo Bar (comment)foo.bar@host.example.org " \ "foo, foo@domain " \ - "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld" + "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \ + "trailing, period@blue.com. " \ + "multi-trialing, mul@trail.com..... 
" \ + "m@s" static const char *test_inputs[] = { /* generic things and word truncation: */ @@ -78,11 +82,11 @@ test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input, /* test all input at once */ outi = first_outi; while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) { - test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); + test_assert_strcmp(token, expected_output[outi]); outi++; } while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) { - test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); + test_assert_strcmp(token, expected_output[outi]); outi++; } test_assert_idx(expected_output[outi] == NULL, outi); @@ -92,12 +96,12 @@ test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input, for (i = 0; i < input_len; i += char_len) { char_len = uni_utf8_char_bytes(input[i]); while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) { - test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); + test_assert_strcmp(token, expected_output[outi]); outi++; } } while (fts_tokenizer_final(tok, &token, &error) > 0) { - test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); + test_assert_strcmp(token, expected_output[outi]); outi++; } test_assert_idx(expected_output[outi] == NULL, outi); @@ -109,12 +113,12 @@ test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input, for (char_len = 0; char_len < max; ) char_len += uni_utf8_char_bytes(input[i+char_len]); while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) { - test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); + test_assert_strcmp(token, expected_output[outi]); outi++; } } while (fts_tokenizer_final(tok, &token, &error) > 0) { - test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi); + test_assert_strcmp(token, expected_output[outi]); outi++; } test_assert_idx(expected_output[outi] == NULL, outi); @@ -309,7 +313,11 @@ static void 
test_fts_tokenizer_address_only(void) static const char *const expected_output[] = { "abc.dfg@example.com", "bar@example.org", "foo.bar@host.example.org", "foo@domain", - "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL + "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", + "period@blue.com", /*trailing period '.' in email */ + "mul@trail.com", + "m@s", /*one letter local-part and domain name */ + NULL }; struct fts_tokenizer *tok; const char *error; @@ -328,7 +336,11 @@ static void test_fts_tokenizer_address_parent(const char *name, const char * con "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com", "Bar", "Baz", "bar", "example", "org", "bar@example.org", "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org", - "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL + "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", 
"abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", + "trailing", "period", "blue", "com", "period@blue.com", + "multi", "trialing", "mul", "trail", "com", "mul@trail.com", + "m", "s", "m@s", + NULL }; struct fts_tokenizer *tok, *gen_tok; const char *error; @@ -360,7 +372,11 @@ static void test_fts_tokenizer_address_search(void) "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com", "Bar", "Baz", "bar@example.org", "Foo", "Bar", "comment", "foo.bar@host.example.org", - "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL + "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", + "trailing", "period@blue.com", + "multi", "trialing", "mul@trail.com", + "m@s", + NULL }; static const char *const settings[] = { "search", "", NULL }; struct fts_tokenizer *tok, *gen_tok; @@ -418,5 +434,6 @@ int main(void) fts_tokenizers_init(); ret = test_run(test_functions); fts_tokenizers_deinit(); + return ret; }