From 6b33eadcf597d91a9a6aaaa9fc39bd35ccbbc9f1 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Sat, 21 Oct 2017 13:46:29 +0100
Subject: [PATCH] [Fix] Further tokenization fixes

MFH: rspamd-1.6
---
 src/libmime/message.c               | 4 ++--
 src/libstat/tokenizers/tokenizers.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/libmime/message.c b/src/libmime/message.c
index cae61643c8..fe2cdbdae9 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -234,8 +234,8 @@ rspamd_extract_words (struct rspamd_task *task,
 	/* Ugly workaround */
 	if (IS_PART_HTML (part)) {
 		part->normalized_words = rspamd_tokenize_text (
-				part->content->data,
-				part->content->len, IS_PART_UTF (part), task->cfg,
+				part->stripped_content->data,
+				part->stripped_content->len, IS_PART_UTF (part), task->cfg,
 				part->exceptions, FALSE, NULL);
 	}
 
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index d79d68144e..f75310fe3b 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -235,7 +235,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 					token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
 					goto process_exception;
 				}
-				else if (!u_isgraph (uc) || u_ispunct (uc)) {
+				else if (!u_isalnum (uc)) {
 					token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
 					goto set_token;
 				}
-- 
2.47.3