git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Skip same text parts when processing statistics.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 29 Jul 2015 11:59:46 +0000 (12:59 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 29 Jul 2015 11:59:46 +0000 (12:59 +0100)
src/libstat/stat_process.c

index f216d964baa8bb8c6d74f2d13a412400ddd9bbfe..93f48b3e946664dc623640075edb38121e3e80a9 100644 (file)
@@ -35,6 +35,8 @@
 #define RSPAMD_LEARN_OP 1
 #define RSPAMD_UNLEARN_OP 2
 
+static const gint similarity_treshold = 80;
+
 struct preprocess_cb_data {
        struct rspamd_task *task;
        GList *classifier_runtimes;
@@ -187,9 +189,11 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
        GArray *words;
        gchar *sub;
        guint i;
+       gint *pdiff;
        gboolean compat;
 
        compat = tok->tokenizer->is_compat (tok);
+       pdiff = rspamd_mempool_get_variable (task->task_pool, "parts_distance");
 
        for (i = 0; i < task->text_parts->len; i ++) {
                part = g_ptr_array_index (task->text_parts, i);
@@ -205,7 +209,11 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
                        }
                }
 
-               /* TODO: compare parts distance */
+               if (pdiff != NULL && *pdiff > similarity_treshold) {
+                       msg_debug ("message has two common parts (%d%%), so skip the last one",
+                                       *pdiff);
+                       break;
+               }
        }
 
        if (task->subject != NULL) {