git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Move distance calculation to message parsing.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 14 Jul 2015 22:58:56 +0000 (23:58 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 14 Jul 2015 22:58:56 +0000 (23:58 +0100)
src/libmime/message.c
src/libmime/mime_expressions.c

index b1d80f7e9dd59a4ad9300b250f99e978fc2bb0b4..fde23ccb28750065b86e1ee3975df429b89e491d 100644 (file)
@@ -1254,6 +1254,49 @@ rspamd_normalize_text_part (struct rspamd_task *task,
        }
 }
 
+#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
+
+/*
+ * Levenshtein (edit) distance between the word arrays w1 and w2, both read as rspamd_fstring_t.
+ * Returns 0 and logs an error when w1 holds more than max_words elements.
+ */
+static gint
+rspamd_words_levenshtein_distance (GArray *w1, GArray *w2)
+{
+       guint s1len, s2len, x, y, lastdiag, olddiag;
+       guint *column;
+       rspamd_fstring_t *s1, *s2;
+       gint eq;
+       static const guint max_words = 8192; /* bounds the g_alloca'd column below */
+
+       s1len = w1->len;
+       s2len = w2->len;
+       if (s1len > max_words) {
+               msg_err ("cannot compare parts with more than %ud words: %ud",
+                               max_words, s1len);
+               return 0;
+       }
+
+       column = g_alloca ((s1len + 1) * sizeof (guint));
+       /* Start at y = 0: column[0] is what gets returned when both arrays are empty */
+       for (y = 0; y <= s1len; y++) {
+               column[y] = y;
+       }
+
+       for (x = 1; x <= s2len; x++) {
+               column[0] = x;
+               for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
+                       olddiag = column[y];
+                       s1 = &g_array_index (w1, rspamd_fstring_t, y - 1);
+                       s2 = &g_array_index (w2, rspamd_fstring_t, x - 1); /* x indexes w2, not w1 */
+                       eq = rspamd_fstring_equal (s1, s2) ? 0 : 1;
+                       column[y] = MIN3 (column[y] + 1, column[y - 1] + 1, lastdiag + (eq));
+                       lastdiag = olddiag;
+               }
+       }
+       return column[s1len];
+}
+
 static int
 rspamd_gtube_cb (int strnum, int textpos, void *context)
 {
@@ -1624,6 +1667,9 @@ rspamd_message_parse (struct rspamd_task *task)
        GList *first, *cur;
        GMimePart *part;
        GMimeDataWrapper *wrapper;
+       GMimeObject *parent;
+       const GMimeContentType *ct;
+       struct mime_text_part *p1, *p2;
        struct mime_foreach_data md;
        struct received_header *recv;
        gchar *mid, *url_str;
@@ -1631,7 +1677,8 @@ rspamd_message_parse (struct rspamd_task *task)
        struct rspamd_url *subject_url;
        gsize len;
        gint64 hdr_start, hdr_end;
-       gint rc, state = 0;
+       gint rc, state = 0, diff, *pdiff;
+       guint tw, dw;
 
        tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
        p = task->msg.start;
@@ -1863,6 +1910,55 @@ rspamd_message_parse (struct rspamd_task *task)
                }
        }
 
+       /* Calculate distance for 2-parts messages */
+       if (task->text_parts->len == 2) {
+               p1 = g_ptr_array_index (task->text_parts, 0);
+               p2 = g_ptr_array_index (task->text_parts, 1);
+
+               /* First of all check parent object */
+               if (p1->parent && p1->parent == p2->parent) {
+                       parent = p1->parent;
+                       ct = g_mime_object_get_content_type (parent);
+                       if (ct == NULL ||
+                                       !g_mime_content_type_is_type ((GMimeContentType *)ct,
+                                                       "multipart", "alternative")) {
+                               debug_task (
+                                               "two parts are not belong to multipart/alternative container, skip check");
+                       }
+               }
+               else {
+                       debug_task (
+                                       "message contains two parts but they are in different multi-parts");
+               }
+
+               if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
+                               p1->normalized_words && p2->normalized_words) {
+
+                       tw = MAX (p1->normalized_words->len, p2->normalized_words->len);
+                       dw = rspamd_words_levenshtein_distance (p1->normalized_words,
+                                       p2->normalized_words);
+                       diff = tw > 0 ? (100.0 * (gdouble)(tw - dw) / (gdouble)tw) : 100;
+
+                       msg_info (
+                                       "different words: %d, total words: %d, "
+                                       "got likeliness between parts of %d%%",
+                                       dw, tw,
+                                       diff);
+
+                       pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
+                       *pdiff = diff;
+                       rspamd_mempool_set_variable (task->task_pool,
+                                       "parts_distance",
+                                       pdiff,
+                                       NULL);
+               }
+       }
+       else {
+               debug_task (
+                               "message has too many text parts, so do not try to compare "
+                               "them with each other");
+       }
+
        return TRUE;
 }
 
index 446493b4df8d10b49b532849350dbdb72df70138..be49f11d8f8f0cf0e1df927dce7d76f07e6c3735 100644 (file)
@@ -1165,42 +1165,6 @@ rspamd_header_exists (struct rspamd_task * task, GArray * args, void *unused)
        return FALSE;
 }
 
-#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
-
-static gint
-rspamd_words_levenshtein_distance (GArray *w1, GArray *w2)
-{
-       guint s1len, s2len, x, y, lastdiag, olddiag;
-       guint *column;
-       rspamd_fstring_t *s1, *s2;
-       gint eq;
-
-       s1len = w1->len;
-       s2len = w2->len;
-
-       column = g_alloca ((s1len + 1) * sizeof (guint));
-
-       for (y = 1; y <= s1len; y++) {
-               column[y] = y;
-       }
-
-       for (x = 1; x <= s2len; x++) {
-               column[0] = x;
-
-               for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
-                       olddiag = column[y];
-                       s1 = &g_array_index (w1, rspamd_fstring_t, y - 1);
-                       s2 = &g_array_index (w1, rspamd_fstring_t, x - 1);
-                       eq = rspamd_fstring_equal (s1, s2) ? 0 : 1;
-                       column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
-                                       lastdiag + (eq));
-                       lastdiag = olddiag;
-               }
-       }
-
-       return column[s1len];
-}
-
 
 /*
  * This function is designed to find difference between text/html and text/plain parts
@@ -1212,11 +1176,7 @@ gboolean
 rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
 {
        gint threshold, threshold2 = -1, diff;
-       struct mime_text_part *p1, *p2;
        struct expression_argument *arg;
-       GMimeObject *parent;
-       const GMimeContentType *ct;
-       guint tw, dw;
        gint *pdiff;
 
        if (args == NULL || args->len == 0) {
@@ -1278,98 +1238,6 @@ rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
                }
        }
 
-       if (task->text_parts->len == 2) {
-               p1 = g_ptr_array_index (task->text_parts, 0);
-               p2 = g_ptr_array_index (task->text_parts, 1);
-               pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
-               *pdiff = -1;
-
-               /* First of all check parent object */
-               if (p1->parent && p1->parent == p2->parent) {
-                       parent = p1->parent;
-                       ct = g_mime_object_get_content_type (parent);
-#ifndef GMIME24
-                       if (ct == NULL ||
-                               !g_mime_content_type_is_type (ct, "multipart", "alternative")) {
-#else
-                       if (ct == NULL ||
-                               !g_mime_content_type_is_type ((GMimeContentType *)ct,
-                               "multipart", "alternative")) {
-#endif
-                               debug_task (
-                                       "two parts are not belong to multipart/alternative container, skip check");
-                               rspamd_mempool_set_variable (task->task_pool,
-                                       "parts_distance",
-                                       pdiff,
-                                       NULL);
-                               return FALSE;
-                       }
-               }
-               else {
-                       debug_task (
-                               "message contains two parts but they are in different multi-parts");
-                       rspamd_mempool_set_variable (task->task_pool,
-                               "parts_distance",
-                               pdiff,
-                               NULL);
-                       return FALSE;
-               }
-               if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
-                               p1->normalized_words && p2->normalized_words) {
-
-                       tw = MAX (p1->normalized_words->len, p2->normalized_words->len);
-                       dw = rspamd_words_levenshtein_distance (p1->normalized_words,
-                                       p2->normalized_words);
-                       diff = tw > 0 ? (100.0 * (gdouble)(tw - dw) / (gdouble)tw) : 100;
-
-                       msg_debug (
-                               "different words: %d, total words: %d, "
-                               "got likeliness between parts of %d%%, threshold is %d%%",
-                               dw, tw,
-                               diff,
-                               threshold);
-
-                       *pdiff = diff;
-                       rspamd_mempool_set_variable (task->task_pool,
-                               "parts_distance",
-                               pdiff,
-                               NULL);
-                       if (threshold2 > 0) {
-                               if (diff >=
-                                       MIN (threshold,
-                                       threshold2) && diff < MAX (threshold, threshold2)) {
-                                       return TRUE;
-                               }
-                       }
-                       else {
-                               if (diff <= threshold) {
-                                       return TRUE;
-                               }
-                       }
-               }
-               else if ((IS_PART_EMPTY (p1) &&
-                       !IS_PART_EMPTY (p2)) || (!IS_PART_EMPTY (p1)&& IS_PART_EMPTY (p2))) {
-                       /* Empty and non empty parts are different */
-                       *pdiff = 0;
-                       rspamd_mempool_set_variable (task->task_pool,
-                               "parts_distance",
-                               pdiff,
-                               NULL);
-                       return TRUE;
-               }
-       }
-       else {
-               debug_task (
-                       "message has too many text parts, so do not try to compare them with each other");
-               rspamd_mempool_set_variable (task->task_pool,
-                       "parts_distance",
-                       pdiff,
-                       NULL);
-               return FALSE;
-       }
-
-       rspamd_mempool_set_variable (task->task_pool, "parts_distance", pdiff,
-               NULL);
        return FALSE;
 }