From: Vsevolod Stakhov
Date: Sat, 21 Oct 2017 12:46:29 +0000 (+0100)
Subject: [Fix] Further tokenization fixes
X-Git-Tag: 1.6.5~1^2~8
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=efa2d6c77cc80d5ca5019c56de2781a3f7f78277;p=thirdparty%2Frspamd.git

[Fix] Further tokenization fixes

MFH: rspamd-1.6
---

diff --git a/src/libmime/message.c b/src/libmime/message.c
index 36dbee9456..7bc9bf2c2d 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -233,8 +233,8 @@ rspamd_extract_words (struct rspamd_task *task,
     /* Ugly workaround */
     if (IS_PART_HTML (part)) {
         part->normalized_words = rspamd_tokenize_text (
-                part->content->data,
-                part->content->len, IS_PART_UTF (part), task->cfg,
+                part->stripped_content->data,
+                part->stripped_content->len, IS_PART_UTF (part), task->cfg,
                 part->exceptions, FALSE, NULL);
     }
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index d79d68144e..f75310fe3b 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -235,7 +235,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
                 token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
                 goto process_exception;
             }
-            else if (!u_isgraph (uc) || u_ispunct (uc)) {
+            else if (!u_isalnum (uc)) {
                 token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
                 goto set_token;
             }