From: Vsevolod Stakhov Date: Sat, 21 Oct 2017 12:46:29 +0000 (+0100) Subject: [Fix] Further tokenization fixes X-Git-Tag: 1.7.0~531 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=6b33eadcf597d91a9a6aaaa9fc39bd35ccbbc9f1;p=thirdparty%2Frspamd.git [Fix] Further tokenization fixes MFH: rspamd-1.6 --- diff --git a/src/libmime/message.c b/src/libmime/message.c index cae61643c8..fe2cdbdae9 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -234,8 +234,8 @@ rspamd_extract_words (struct rspamd_task *task, /* Ugly workaround */ if (IS_PART_HTML (part)) { part->normalized_words = rspamd_tokenize_text ( - part->content->data, - part->content->len, IS_PART_UTF (part), task->cfg, + part->stripped_content->data, + part->stripped_content->len, IS_PART_UTF (part), task->cfg, part->exceptions, FALSE, NULL); } diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index d79d68144e..f75310fe3b 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -235,7 +235,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto process_exception; } - else if (!u_isgraph (uc) || u_ispunct (uc)) { + else if (!u_isalnum (uc)) { token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT; goto set_token; }