From e18843502604d9f4317000923a7493e8f6c8b132 Mon Sep 17 00:00:00 2001 From: Timo Sirainen Date: Tue, 26 Oct 2021 16:59:29 +0300 Subject: [PATCH] lib-fts: Fix address tokenizer to handle large input properly Previously it could have used excessive amounts of memory if the input didn't contain separator characters. The fix changes a bit how the address-tokenizer works: Previously large email addresses were saved as truncated tokens. Now they're skipped entirely by the address tokenizer. Similarly when searching long email addresses they're no longer searched as truncated tokens, but instead simply fed to the parent tokenizer which (likely) searches them in smaller pieces. Note that this also sometimes changes the order in which tokens are returned, e.g. "foo", "example", "foo@example.com", "com" instead of returning "com" before the email address. This isn't ideal, but fixing it seems annoyingly complicated and practically it doesn't matter right now. --- src/lib-fts/fts-tokenizer-address.c | 74 ++++++++++++++++++++++++++++- src/lib-fts/test-fts-tokenizer.c | 26 ++++++---- 2 files changed, 89 insertions(+), 11 deletions(-) diff --git a/src/lib-fts/fts-tokenizer-address.c b/src/lib-fts/fts-tokenizer-address.c index 4b834098a1..1a2fb3d24f 100644 --- a/src/lib-fts/fts-tokenizer-address.c +++ b/src/lib-fts/fts-tokenizer-address.c @@ -16,7 +16,8 @@ enum email_address_parser_state { EMAIL_ADDRESS_PARSER_STATE_NONE = 0, EMAIL_ADDRESS_PARSER_STATE_LOCALPART, EMAIL_ADDRESS_PARSER_STATE_DOMAIN, - EMAIL_ADDRESS_PARSER_STATE_COMPLETE + EMAIL_ADDRESS_PARSER_STATE_COMPLETE, + EMAIL_ADDRESS_PARSER_STATE_SKIP, }; struct email_address_fts_tokenizer { @@ -139,6 +140,22 @@ static size_t skip_nonlocal_part(const unsigned char *data, size_t size) return skip; } +static bool +fts_tokenizer_email_address_too_large(struct email_address_fts_tokenizer *tok, + size_t pos) +{ + if (str_len(tok->last_word) + pos <= tok->max_length) + return FALSE; + + /* The token is too large - skip over it. 
+ + Truncate the input that was added so far to the token, so all of it + gets sent to the parent tokenizer in + fts_tokenizer_address_parent_data(). */ + str_truncate(tok->last_word, 0); + return TRUE; +} + static enum email_address_parser_state fts_tokenizer_email_address_parse_local(struct email_address_fts_tokenizer *tok, const unsigned char *data, size_t size, @@ -157,6 +174,12 @@ fts_tokenizer_email_address_parse_local(struct email_address_fts_tokenizer *tok, if (seen_at) break; } + + if (fts_tokenizer_email_address_too_large(tok, pos)) { + *skip_r = 0; + return EMAIL_ADDRESS_PARSER_STATE_SKIP; + } + /* localpart and @ */ if (seen_at && (pos > 1 || str_len(tok->last_word) > 0)) { str_append_data(tok->last_word, data, pos); @@ -194,6 +217,12 @@ fts_tokenizer_email_address_parse_domain(struct email_address_fts_tokenizer *tok while (pos < size && (IS_DTEXT(data[pos]) || data[pos] == '.' || data[pos] == '-')) pos++; + + if (fts_tokenizer_email_address_too_large(tok, pos)) { + *skip_r = 0; + return EMAIL_ADDRESS_PARSER_STATE_SKIP; + } + /* A complete domain name */ if ((pos > 0 && pos < size) || /* non-atext after atext in this data*/ (pos < size && !domain_is_empty(tok))) { /* non-atext after previous atext */ @@ -212,6 +241,21 @@ fts_tokenizer_email_address_parse_domain(struct email_address_fts_tokenizer *tok return EMAIL_ADDRESS_PARSER_STATE_NONE; } +static bool +fts_tokenizer_address_skip(const unsigned char *data, size_t size, + size_t *skip_r) +{ + for (size_t pos = 0; pos < size; pos++) { + if (!(IS_ATEXT(data[pos]) || data[pos] == '.' || + data[pos] == '-') || data[pos] == '@') { + *skip_r = pos; + return TRUE; + } + } + *skip_r = size; + return FALSE; +} + /* Buffer raw data for parent. 
*/ static void fts_tokenizer_address_update_parent(struct email_address_fts_tokenizer *tok, @@ -240,6 +284,7 @@ fts_tokenizer_email_address_next(struct fts_tokenizer *_tok, struct email_address_fts_tokenizer *tok = (struct email_address_fts_tokenizer *)_tok; size_t pos = 0, local_skip; + bool finished; if (tok->tokenizer.skip_parents == TRUE) tok->tokenizer.skip_parents = FALSE; @@ -318,6 +363,30 @@ fts_tokenizer_email_address_next(struct fts_tokenizer *_tok, if (fts_tokenizer_address_current_token(tok, token_r)) return 1; break; + case EMAIL_ADDRESS_PARSER_STATE_SKIP: + /* The current token is too large to determine if it's + an email address or not. The address-tokenizer is + simply skipping over it, but the input is being + passed to the parent tokenizer. */ + *skip_r = pos; + if (fts_tokenizer_address_parent_data(tok, token_r)) + return 1; + + finished = fts_tokenizer_address_skip(data + pos, + size - pos, + &local_skip); + fts_tokenizer_address_update_parent(tok, data+pos, + local_skip); + pos += local_skip; + if (finished) { + *skip_r = pos; + if (fts_tokenizer_address_parent_data(tok, token_r)) { + tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; + return 1; + } + tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE; + } + break; default: i_unreached(); } @@ -336,7 +405,8 @@ static const struct fts_tokenizer_vfuncs email_address_tokenizer_vfuncs = { static const struct fts_tokenizer fts_tokenizer_email_address_real = { .name = "email-address", - .v = &email_address_tokenizer_vfuncs + .v = &email_address_tokenizer_vfuncs, + .stream_to_parents = TRUE, }; const struct fts_tokenizer *fts_tokenizer_email_address = &fts_tokenizer_email_address_real; diff --git a/src/lib-fts/test-fts-tokenizer.c b/src/lib-fts/test-fts-tokenizer.c index ed86b9f237..47057f7f75 100644 --- a/src/lib-fts/test-fts-tokenizer.c +++ b/src/lib-fts/test-fts-tokenizer.c @@ -319,7 +319,6 @@ static void test_fts_tokenizer_address_only(void) static const char *const expected_output[] = { 
"abc.dfg@example.com", "bar@example.org", "foo.bar@host.example.org", "foo@domain", - "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@1bcdefghijklmnopqrstuvxy1.2bcdefghijklmnopqrstuvxy2.3bcdefghijklmnopqrstuvxy3.4bcdefghijklmnopqrstuvxy4.5bcdefghijklmnopqrstuvxy5.6bcdefghijklmnopqrstuvxy6.7bcdefghijklmnopqrstu", "period@blue.com", /*trailing period '.' in email */ "mul@trail.com", "m@s", /*one letter local-part and domain name */ @@ -341,10 +340,10 @@ static void test_fts_tokenizer_address_parent(const char *name, const char * con { static const char input[] = TEST_INPUT_ADDRESS; static const char *const expected_output[] = { - "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "com", "abc.dfg@example.com", - "Bar", "Baz", "bar", "example", "org", "bar@example.org", - "Foo", "Bar", "comment", "foo", "bar", "host", "example", "org", "foo.bar@host.example.org", - "foo", "foo", "domain", "foo@domain", + "invalid", "invalid", "Abc", "Dfg", "abc", "dfg", "example", "abc.dfg@example.com", "com", + "Bar", "Baz", "bar", "example", "bar@example.org", "org", + "Foo", "Bar", "comment", "foo", "bar", "host", "example", "foo.bar@host.example.org", "org", + "foo", "foo", "foo@domain", "domain", "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", "1bcdefghijklmnopqrstuvxy1", "2bcdefghijklmnopqrstuvxy2", @@ -356,11 +355,10 @@ static void test_fts_tokenizer_address_parent(const char *name, const char * con "8bcdefghijklmnopqrstuvxy8", "9bcdefghijklmnopqrstuvxy9", "0bcdefghijklmnopqrstuvxy0", "tld", - "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@1bcdefghijklmnopqrstuvxy1.2bcdefghijklmnopqrstuvxy2.3bcdefghijklmnopqrstuvxy3.4bcdefghijklmnopqrstuvxy4.5bcdefghijklmnopqrstuvxy5.6bcdefghijklmnopqrstuvxy6.7bcdefghijklmnopqrstu", "trailing", "period", "blue", "com", "period@blue.com", "multi", "trialing", "mul", "trail", "com", "mul@trail.com", - "m", "s", "m@s", - "hypen", "hypen", "hypen", "com", 
"hypen@hypen-hypen.com", + "m", "m@s", "s", + "hypen", "hypen", "hypen", "hypen@hypen-hypen.com", "com", "hypen", "hypen", "hypen", "sick", "com", "hypen@hypen-hypen-sick.com", NULL }; @@ -395,7 +393,17 @@ static void test_fts_tokenizer_address_search(void) "Bar", "Baz", "bar@example.org", "Foo", "Bar", "comment", "foo.bar@host.example.org", "foo", "foo@domain", - "abcdefghijklmnopqrstuvxyz.abcdefghijklmnopqrstuvxyzabcdefghijklmnopqrstuvxyz@1bcdefghijklmnopqrstuvxy1.2bcdefghijklmnopqrstuvxy2.3bcdefghijklmnopqrstuvxy3.4bcdefghijklmnopqrstuvxy4.5bcdefghijklmnopqrstuvxy5.6bcdefghijklmnopqrstuvxy6.7bcdefghijklmnopqrstu", + "abcdefghijklmnopqrstuvxyz", "abcdefghijklmnopqrstuvxyzabcde", + "1bcdefghijklmnopqrstuvxy1", + "2bcdefghijklmnopqrstuvxy2", + "3bcdefghijklmnopqrstuvxy3", + "4bcdefghijklmnopqrstuvxy4", + "5bcdefghijklmnopqrstuvxy5", + "6bcdefghijklmnopqrstuvxy6", + "7bcdefghijklmnopqrstuvxy7", + "8bcdefghijklmnopqrstuvxy8", + "9bcdefghijklmnopqrstuvxy9", + "0bcdefghijklmnopqrstuvxy0", "tld", "trailing", "period@blue.com", "multi", "trialing", "mul@trail.com", "m@s", -- 2.47.3