git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Further tokenization fixes
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 21 Oct 2017 12:46:29 +0000 (13:46 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 21 Oct 2017 12:46:29 +0000 (13:46 +0100)
MFH: rspamd-1.6

src/libmime/message.c
src/libstat/tokenizers/tokenizers.c

index cae61643c8267deda4ffb0e7572e933a8ea5c638..fe2cdbdae92900af309dac52e79afd4d88e0694f 100644 (file)
@@ -234,8 +234,8 @@ rspamd_extract_words (struct rspamd_task *task,
        /* Ugly workaround */
        if (IS_PART_HTML (part)) {
                part->normalized_words = rspamd_tokenize_text (
-                               part->content->data,
-                               part->content->len, IS_PART_UTF (part), task->cfg,
+                               part->stripped_content->data,
+                               part->stripped_content->len, IS_PART_UTF (part), task->cfg,
                                part->exceptions, FALSE,
                                NULL);
        }
index d79d68144e18ad94d943cda9d4e7a31dec7120d3..f75310fe3ba907ba1dd799c870ae7f66589ae924 100644 (file)
@@ -235,7 +235,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
                                token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
                                goto process_exception;
                        }
-                       else if (!u_isgraph (uc) || u_ispunct (uc)) {
+                       else if (!u_isalnum (uc)) {
                                token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
                                goto set_token;
                        }