fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok,
const char **token_r)
{
+ const unsigned char *data = tok->last_word->data;
+ size_t len = tok->last_word->used;
+
tok->tokenizer.skip_parents = TRUE;
tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE;
if (str_len(tok->last_word) > tok->max_length) {
/* As future proofing, delete partial utf8.
IS_DTEXT() does not actually allow utf8 addresses
yet though. */
- const unsigned char *data = tok->last_word->data;
- size_t len = tok->last_word->used;
+ len = tok->last_word->used;
fts_tokenizer_delete_trailing_partial_char(data, &len);
i_assert(len <= tok->max_length);
- *token_r = len == 0 ? "" :
- t_strndup(tok->last_word->data, len);
- } else {
- *token_r = t_strdup(str_c(tok->last_word));
}
+
+ if (len > 0)
+ fts_tokenizer_delete_trailing_invalid_char(data, &len);
+ *token_r = len == 0 ? "" :
+ t_strndup(data, len);
}
static bool
while (pos < size && (IS_DTEXT(data[pos]) || data[pos] == '.'))
pos++;
/* A complete domain name */
- if ((pos > 1 && pos < size) || /* non-atext after atext in this data*/
+ if ((pos > 0 && pos < size) || /* non-atext after atext in this data*/
(pos < size && !domain_is_empty(tok))) { /* non-atext after previous atext */
str_append_n(tok->last_word, data, pos);
*skip_r = pos;
#include "unichar.h"
#include "test-common.h"
#include "fts-tokenizer.h"
+#include "fts-tokenizer-common.h"
#include "fts-tokenizer-private.h"
#include "fts-tokenizer-generic-private.h"
-
+/* There should be a trailing space ' ' at the end of each string except the last one. */
#define TEST_INPUT_ADDRESS \
"@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
"Bar Baz <bar@example.org>" \
"Foo Bar (comment)foo.bar@host.example.org " \
"foo, foo@domain " \
- "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld"
+ "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.tld " \
+ "trailing, period@blue.com. " \
+ "multi-trialing, mul@trail.com..... " \
+ "m@s"
static const char *test_inputs[] = {
/* generic things and word truncation: */
/* test all input at once */
outi = first_outi;
while (fts_tokenizer_next(tok, input, input_len, &token, &error) > 0) {
- test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ test_assert_strcmp(token, expected_output[outi]);
outi++;
}
while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
- test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ test_assert_strcmp(token, expected_output[outi]);
outi++;
}
test_assert_idx(expected_output[outi] == NULL, outi);
for (i = 0; i < input_len; i += char_len) {
char_len = uni_utf8_char_bytes(input[i]);
while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
- test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ test_assert_strcmp(token, expected_output[outi]);
outi++;
}
}
while (fts_tokenizer_final(tok, &token, &error) > 0) {
- test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ test_assert_strcmp(token, expected_output[outi]);
outi++;
}
test_assert_idx(expected_output[outi] == NULL, outi);
for (char_len = 0; char_len < max; )
char_len += uni_utf8_char_bytes(input[i+char_len]);
while (fts_tokenizer_next(tok, input+i, char_len, &token, &error) > 0) {
- test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ test_assert_strcmp(token, expected_output[outi]);
outi++;
}
}
while (fts_tokenizer_final(tok, &token, &error) > 0) {
- test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ test_assert_strcmp(token, expected_output[outi]);
outi++;
}
test_assert_idx(expected_output[outi] == NULL, outi);
static const char *const expected_output[] = {
"abc.dfg@example.com", "bar@example.org",
"foo.bar@host.example.org", "foo@domain",
- "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL
+ "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
+ "period@blue.com", /*trailing period '.' in email */
+ "mul@trail.com",
+ "m@s", /*one letter local-part and domain name */
+ NULL
};
struct fts_tokenizer *tok;
const char *error;
"invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com",
"Bar", "Baz", "bar", "example", "org", "bar@example.org",
"Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org",
- "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL
+ "foo", "foo", "domain", "foo@domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyz","tld", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
+ "trailing", "period", "blue", "com", "period@blue.com",
+ "multi", "trialing", "mul", "trail", "com", "mul@trail.com",
+ "m", "s", "m@s",
+ NULL
};
struct fts_tokenizer *tok, *gen_tok;
const char *error;
"invalid", "invalid", "Abc", "Dfg", "abc.dfg@example.com",
"Bar", "Baz", "bar@example.org",
"Foo", "Bar", "comment", "foo.bar@host.example.org",
- "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu", NULL
+ "foo", "foo@domain", "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstu",
+ "trailing", "period@blue.com",
+ "multi", "trialing", "mul@trail.com",
+ "m@s",
+ NULL
};
static const char *const settings[] = { "search", "", NULL };
struct fts_tokenizer *tok, *gen_tok;
fts_tokenizers_init();
ret = test_run(test_functions);
fts_tokenizers_deinit();
+
return ret;
}