git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Add routines to normalize text parts.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Feb 2015 14:28:47 +0000 (14:28 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 23 Feb 2015 14:28:47 +0000 (14:28 +0000)
src/libmime/message.c
src/libmime/message.h
src/libserver/task.c

index bcdb86259fc2682fddec03b67bd331cf5f35a720..e6d27563b8768d38df524a2bede24fe382af539c 100644 (file)
@@ -31,6 +31,7 @@
 #include "images.h"
 #include "utlist.h"
 #include "tokenizers/tokenizers.h"
+#include "libstemmer.h"
 
 #include <iconv.h>
 
@@ -1169,6 +1170,54 @@ detect_text_language (struct mime_text_part *part)
        }
 }
 
+/**
+ * Build the normalized word list of a text part.
+ *
+ * Each token of part->words is copied into the task memory pool, replaced by
+ * its stem when a Snowball stemmer could be created for the part's language
+ * (only attempted for UTF-8 parts with a non-empty language), lowercased and
+ * appended to part->normalized_words.
+ *
+ * The array itself is allocated with GLib and released in rspamd_task_free();
+ * the word buffers belong to task->task_pool.
+ *
+ * @param task task providing the memory pool and the message id for logging
+ * @param part text part to process; part->normalized_words is assigned here
+ */
+static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+               struct mime_text_part *part)
+{
+       struct sb_stemmer *stem = NULL;
+       rspamd_fstring_t *w, stw;
+       const guchar *r;
+       guint i;
+
+       /* The task destructor treats words as optional, so do the same here */
+       if (part->words == NULL) {
+               return;
+       }
+
+       if (part->language && part->language[0] != '\0' && part->is_utf) {
+               stem = sb_stemmer_new (part->language, "UTF_8");
+               if (stem == NULL) {
+                       msg_info ("<%s> cannot create lemmatizer for %s language",
+                               task->message_id, part->language);
+               }
+       }
+
+       /*
+        * Store the new array in the part: the result was previously discarded,
+        * which left normalized_words unset for the append below.
+        */
+       part->normalized_words = g_array_sized_new (FALSE, FALSE,
+                       sizeof (rspamd_fstring_t), part->words->len);
+
+       /* Zero any field of the temporary that is not filled in explicitly */
+       memset (&stw, 0, sizeof (stw));
+
+       for (i = 0; i < part->words->len; i ++) {
+               w = &g_array_index (part->words, rspamd_fstring_t, i);
+
+               /*
+                * NOTE(review): stemming runs before lowercasing; Snowball stemmers
+                * are generally described as expecting lowercase input - confirm
+                * whether the tokenizer already guarantees that.
+                */
+               r = NULL;
+               if (stem != NULL) {
+                       r = sb_stemmer_stem (stem, w->begin, w->len);
+               }
+
+               if (r == NULL) {
+                       /* No stemmer, or stemming failed: keep the original word */
+                       stw.begin = rspamd_mempool_fstrdup (task->task_pool, w);
+                       stw.len = w->len;
+               }
+               else {
+                       /*
+                        * The stemmer owns `r` and reuses it on the next call, so the
+                        * stem is copied; its length comes from the stemmer itself.
+                        */
+                       stw.begin = rspamd_mempool_strdup (task->task_pool,
+                                       (const gchar *) r);
+                       stw.len = sb_stemmer_length (stem);
+               }
+
+               if (part->is_utf) {
+                       rspamd_str_lc_utf8 (stw.begin, stw.len);
+               }
+               else {
+                       rspamd_str_lc (stw.begin, stw.len);
+               }
+               g_array_append_val (part->normalized_words, stw);
+       }
+
+       if (stem != NULL) {
+               sb_stemmer_delete (stem);
+       }
+}
+
 static void
 process_text_part (struct rspamd_task *task,
        GByteArray *part_content,
@@ -1273,6 +1322,7 @@ process_text_part (struct rspamd_task *task,
        text_part->words = rspamd_tokenize_text (text_part->content->data,
                        text_part->content->len, text_part->is_utf, task->cfg->min_word_len,
                        &text_part->urls_offset);
+       rspamd_normalize_text_part (task, text_part);
 }
 
 #ifdef GMIME24
index d418b6cf67a833add576ebd31b41092e6314f58b..ef881ebd10cead911a91c587b81f17b5696b2548 100644 (file)
@@ -40,6 +40,7 @@ struct mime_text_part {
        GMimeObject *parent;
        rspamd_fstring_t *diff_str;
        GArray *words;
+       GArray *normalized_words;
 };
 
 struct received_header {
index 699e129abeb54db9aa79ff238b42950ea802cfd6..c442db8fef3c0a92d3a4510d981f6edb0bf0f5d0 100644 (file)
@@ -243,6 +243,9 @@ rspamd_task_free (struct rspamd_task *task, gboolean is_soft)
                                if (tp->words) {
                                        g_array_free (tp->words, TRUE);
                                }
+                               if (tp->normalized_words) {
+                                       g_array_free (tp->normalized_words, TRUE);
+                               }
                                part = g_list_next (part);
                        }