From efa2d6c77cc80d5ca5019c56de2781a3f7f78277 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 21 Oct 2017 13:46:29 +0100 Subject: [PATCH] [Fix] Further tokenization fixes MFH: rspamd-1.6 --- src/libmime/message.c | 4 ++-- src/libstat/tokenizers/tokenizers.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 36dbee9456..7bc9bf2c2d 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -233,8 +233,8 @@ rspamd_extract_words (struct rspamd_task *task, /* Ugly workaround */ if (IS_PART_HTML (part)) { part->normalized_words = rspamd_tokenize_text ( - part->content->data, - part->content->len, IS_PART_UTF (part), task->cfg, + part->stripped_content->data, + part->stripped_content->len, IS_PART_UTF (part), task->cfg, part->exceptions, FALSE, NULL); } diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index d79d68144e..f75310fe3b 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -235,7 +235,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto process_exception; } - else if (!u_isgraph (uc) || u_ispunct (uc)) { + else if (!u_isalnum (uc)) { token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto set_token; } -- 2.47.3