From: Vsevolod Stakhov
Date: Mon, 23 Feb 2015 14:28:47 +0000 (+0000)
Subject: Add routines to normalize text parts.
X-Git-Tag: 0.9.0~638
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=53991167d09fb43907dcbb69e5147bc0c0011c15;p=thirdparty%2Frspamd.git

Add routines to normalize text parts.
---

diff --git a/src/libmime/message.c b/src/libmime/message.c
index bcdb86259f..e6d27563b8 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -31,6 +31,7 @@
 #include "images.h"
 #include "utlist.h"
 #include "tokenizers/tokenizers.h"
+#include "libstemmer.h"
 
 #include
 
@@ -1169,6 +1170,81 @@ detect_text_language (struct mime_text_part *part)
 	}
 }
 
+/**
+ * Build part->normalized_words from part->words: every token is copied into
+ * the task memory pool, stemmed when a stemmer can be created for the
+ * language of a UTF-8 part, and then lowercased.
+ *
+ * @param task task providing the memory pool and the message id used in logs
+ * @param part text part whose `words` array has already been filled
+ */
+static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+		struct mime_text_part *part)
+{
+	struct sb_stemmer *stem = NULL;
+	rspamd_fstring_t *w, stw;
+	const guchar *r = NULL;
+	guint i;
+
+	if (part->words == NULL) {
+		/* Nothing was tokenized, so there is nothing to normalize */
+		return;
+	}
+
+	if (part->language && part->language[0] != '\0' && part->is_utf) {
+		stem = sb_stemmer_new (part->language, "UTF_8");
+		if (stem == NULL) {
+			msg_info ("<%s> cannot create lemmatizer for %s language",
+				task->message_id, part->language);
+		}
+	}
+
+	/*
+	 * Keep the freshly created array: g_array_append_val below and
+	 * rspamd_task_free both operate on part->normalized_words.
+	 */
+	part->normalized_words = g_array_sized_new (FALSE, FALSE,
+			sizeof (rspamd_fstring_t), part->words->len);
+
+	for (i = 0; i < part->words->len; i ++) {
+		w = &g_array_index (part->words, rspamd_fstring_t, i);
+
+		if (stem != NULL) {
+			r = sb_stemmer_stem (stem, (const guchar *)w->begin, w->len);
+		}
+
+		if (r == NULL) {
+			/* No stemmer, or stemming failed: keep the original token */
+			stw.begin = rspamd_mempool_fstrdup (task->task_pool, w);
+			stw.len = w->len;
+		}
+		else {
+			/* Per libstemmer the result is owned by the stemmer: copy it */
+			stw.begin = rspamd_mempool_strdup (task->task_pool,
+					(const gchar *)r);
+			stw.len = sb_stemmer_length (stem);
+		}
+
+		/*
+		 * NOTE(review): lowercasing happens after stemming; Snowball
+		 * stemmers are documented to expect lowercase input - confirm
+		 * whether the order should be swapped.
+		 */
+		if (part->is_utf) {
+			rspamd_str_lc_utf8 (stw.begin, stw.len);
+		}
+		else {
+			rspamd_str_lc (stw.begin, stw.len);
+		}
+		g_array_append_val (part->normalized_words, stw);
+	}
+
+	if (stem != NULL) {
+		sb_stemmer_delete (stem);
+	}
+}
+
 static void
 process_text_part (struct rspamd_task *task,
 	GByteArray *part_content,
@@ -1273,6 +1349,7 @@ process_text_part (struct rspamd_task *task,
 	text_part->words =
rspamd_tokenize_text (text_part->content->data, text_part->content->len, text_part->is_utf, task->cfg->min_word_len, &text_part->urls_offset); + rspamd_normalize_text_part (task, text_part); } #ifdef GMIME24 diff --git a/src/libmime/message.h b/src/libmime/message.h index d418b6cf67..ef881ebd10 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -40,6 +40,7 @@ struct mime_text_part { GMimeObject *parent; rspamd_fstring_t *diff_str; GArray *words; + GArray *normalized_words; }; struct received_header { diff --git a/src/libserver/task.c b/src/libserver/task.c index 699e129abe..c442db8fef 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -243,6 +243,9 @@ rspamd_task_free (struct rspamd_task *task, gboolean is_soft) if (tp->words) { g_array_free (tp->words, TRUE); } + if (tp->normalized_words) { + g_array_free (tp->normalized_words, TRUE); + } part = g_list_next (part); }