git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: remove trailing period character from email-address
author Baofeng Wang <baofeng.wang@dovecot.fi>
Tue, 7 Jun 2016 12:58:38 +0000 (15:58 +0300)
committer Timo Sirainen <timo.sirainen@dovecot.fi>
Tue, 21 Jun 2016 09:07:24 +0000 (12:07 +0300)
Any trailing period characters ('.') should be removed from an
email address when it is tokenized.

src/lib-fts/fts-tokenizer-address.c
src/lib-fts/fts-tokenizer-common.c
src/lib-fts/fts-tokenizer-common.h
src/lib-fts/test-fts-tokenizer.c

index 96f10eccddcab3ef73db8f457c11fdbd4502a3e0..13c201271c650eeae3a2d0fbb83d5371f30be9c8 100644 (file)
@@ -79,6 +79,9 @@ static void
 fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok,
                                     const char **token_r)
 {
+       const unsigned char *data = tok->last_word->data;
+       size_t len = tok->last_word->used;
+
        tok->tokenizer.skip_parents = TRUE;
        tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE;
        if (str_len(tok->last_word) > tok->max_length) {
@@ -86,15 +89,15 @@ fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok,
                /* As future proofing, delete partial utf8.
                   IS_DTEXT() does not actually allow utf8 addresses
                   yet though. */
-               const unsigned char *data = tok->last_word->data;
-               size_t len = tok->last_word->used;
+               len = tok->last_word->used;
                fts_tokenizer_delete_trailing_partial_char(data, &len);
                i_assert(len <= tok->max_length);
-               *token_r = len == 0 ? "" :
-                       t_strndup(tok->last_word->data, len);
-       } else {
-               *token_r = t_strdup(str_c(tok->last_word));
        }
+
+       if (len > 0)
+               fts_tokenizer_delete_trailing_invalid_char(data, &len);
+       *token_r = len == 0 ? "" :
+               t_strndup(data, len);
 }
 
 static bool
@@ -189,7 +192,7 @@ fts_tokenizer_email_address_parse_domain(struct email_address_fts_tokenizer *tok
        while (pos < size && (IS_DTEXT(data[pos]) || data[pos] == '.'))
                pos++;
         /* A complete domain name */
-       if ((pos > 1 && pos < size) || /* non-atext after atext in this data*/
+       if ((pos > 0 && pos < size) || /* non-atext after atext in this data*/
            (pos < size && !domain_is_empty(tok))) { /* non-atext after previous atext */
                str_append_n(tok->last_word, data, pos);
                *skip_r = pos;
index f71113d036d0c2e504f8494f2cdecb780d546537..87faa7e9c3e649a07d5954e7b7904fac51320d2d 100644 (file)
@@ -20,3 +20,13 @@ fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
                *len = pos;
        }
 }
+void fts_tokenizer_delete_trailing_invalid_char(const unsigned char *data,
+                  size_t *len)
+{
+       size_t pos = *len;
+
+       /* the token may contain '.' in the end - remove all of them. */
+       while (pos > 0 && data[pos-1] == '.')
+           pos--;
+       *len = pos;
+}
index fdd3b163137859d94d8dc37cbd69a7973f46379e..b90e54353e98cfda45cc86072c5d338b98b00008 100644 (file)
@@ -3,4 +3,7 @@
 void
 fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
                                            size_t *len);
+void
+fts_tokenizer_delete_trailing_invalid_char(const unsigned char *data,
+                  size_t *len);
 #endif
index 06e14ed8f837846b791ef442ae7d3eaeed847cc6..960e5119d3c3cc3f3497e00f8e9cb3a645018718 100644 (file)
@@ -4,16 +4,20 @@
 #include "unichar.h"
 #include "test-common.h"
 #include "fts-tokenizer.h"
+#include "fts-tokenizer-common.h"
 #include "fts-tokenizer-private.h"
 #include "fts-tokenizer-generic-private.h"
 
-
+/*there should be a trailing space ' ' at the end of each string except the last one*/
 #define TEST_INPUT_ADDRESS \
        "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
        "Bar Baz <bar@example.org>" \
        "Foo Bar (comment)foo.bar@host.example.org " \
        "foo, foo@domain " \
-       "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld"
+       "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
+       "trailing, period@blue.com. " \
+       "multi-trialing, mul@trail.com..... " \
+       "m@s"
 
 static const char *test_inputs[] = {
        /* generic things and word truncation: */
@@ -78,11 +82,11 @@ test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
        /* test all input at once */
        outi = first_outi;
        while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
-               test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+               test_assert_strcmp(token, expected_output[outi]);
                outi++;
        }
        while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
-               test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+               test_assert_strcmp(token, expected_output[outi]);
                outi++;
        }
        test_assert_idx(expected_output[outi] == NULL, outi);
@@ -92,12 +96,12 @@ test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
        for (i = 0; i < input_len; i += char_len) {
                char_len = uni_utf8_char_bytes(input[i]);
                while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
-                       test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+                       test_assert_strcmp(token, expected_output[outi]);
                        outi++;
                }
        }
        while (fts_tokenizer_final(tok, &token, &error) > 0) {
-               test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+               test_assert_strcmp(token, expected_output[outi]);
                outi++;
        }
        test_assert_idx(expected_output[outi] == NULL, outi);
@@ -109,12 +113,12 @@ test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
                for (char_len = 0; char_len < max; )
                        char_len += uni_utf8_char_bytes(input[i+char_len]);
                while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
-                       test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+                       test_assert_strcmp(token, expected_output[outi]);
                        outi++;
                }
        }
        while (fts_tokenizer_final(tok, &token, &error) > 0) {
-               test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+               test_assert_strcmp(token, expected_output[outi]);
                outi++;
        }
        test_assert_idx(expected_output[outi] == NULL, outi);
@@ -309,7 +313,11 @@ static void test_fts_tokenizer_address_only(void)
        static const char *const expected_output[] = {
                "abc.dfg@example.com", "bar@example.org",
                "foo.bar@host.example.org", "foo@domain",
-               "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL
+               "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
+               "period@blue.com", /*trailing period '.' in email */
+               "mul@trail.com",
+               "m@s", /*one letter local-part and domain name */
+               NULL
        };
        struct fts_tokenizer *tok;
        const char *error;
@@ -328,7 +336,11 @@ static void test_fts_tokenizer_address_parent(const char *name, const char * con
                "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
                "Bar", "Baz", "bar", "example", "org", "bar@example.org",
                "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
-               "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde",  "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",  NULL
+               "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde",  "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
+               "trailing", "period", "blue", "com", "period@blue.com",
+               "multi", "trialing", "mul", "trail", "com", "mul@trail.com",
+               "m", "s", "m@s",
+               NULL
        };
        struct fts_tokenizer *tok, *gen_tok;
        const char *error;
@@ -360,7 +372,11 @@ static void test_fts_tokenizer_address_search(void)
                "invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
                "Bar", "Baz", "bar@example.org",
                "Foo", "Bar", "comment", "foo.bar@host.example.org",
-               "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL
+               "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
+               "trailing", "period@blue.com",
+               "multi", "trialing", "mul@trail.com",
+               "m@s",
+               NULL
        };
        static const char *const settings[] = { "search", "", NULL };
        struct fts_tokenizer *tok, *gen_tok;
@@ -418,5 +434,6 @@ int main(void)
        fts_tokenizers_init();
        ret = test_run(test_functions);
        fts_tokenizers_deinit();
+
        return ret;
 }