[Fix] Fix and rescore R_PARTS_DIFFER logic

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 27 Apr 2016 15:05:15 +0000 (16:05 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 27 Apr 2016 15:05:47 +0000 (16:05 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 27 Apr 2016 15:05:15 +0000 (16:05 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 27 Apr 2016 15:05:47 +0000 (16:05 +0100)
diff --git a/rules/misc.lua b/rules/misc.lua

index b3926e46b707c03204da6fdeeb601472d93b57cc..2c6d503179ca438218fcd2e719ce5ac6b50d3283 100644 (file)
--- a/rules/misc.lua
+++ b/rules/misc.lua
@@ -33,15 +33,24 @@ reconf['R_FLASH_REDIR_IMGSHACK'] = '/^(?:http:\\/\\/)?img\\d{1,5}\\.imageshack\\
  
  -- Different text parts
  rspamd_config.R_PARTS_DIFFER = function(task)
-  local distance = task:get_mempool():get_variable('parts_distance', 'int')
+  local distance = task:get_mempool():get_variable('parts_distance', 'double')
  
    if distance then
      local nd = tonumber(distance)
-
-    if nd < 50 then
-      local score = 1 - util.tanh(nd / 100.0)
-
-      task:insert_result('R_PARTS_DIFFER', score, tostring(nd) .. '%')
+    -- ND is the ratio of different words to total words
+    if nd >= 0.5 then
+      local tw = task:get_mempool():get_variable('total_words', 'int')
+      if tw then
+        local score -- declared before the branches so insert_result can see it
+        if tw > 30 then
+          -- We are confident about difference
+          score = (nd - 0.5) * 2.0
+        else
+          -- We are not so confident about difference
+          score = (nd - 0.5)
+        end
+        task:insert_result('R_PARTS_DIFFER', score, tostring(100.0 * nd) .. '%')
+      end
      end
    end
  
diff --git a/src/libmime/message.c b/src/libmime/message.c

index cd18f6ce43c97e68e7a71716fab6da5d77a64720..444d5c7d78fb79c2e18b50f69c8a423afaf0689a 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1566,8 +1566,9 @@ rspamd_message_parse (struct rspamd_task *task)
         const gchar *p;
         gsize len;
         goffset hdr_pos;
-       gint diff, *pdiff, i;
-       guint tw, dw;
+       gint i;
+       gdouble diff, *pdiff;
+       guint tw, *ptw, dw;
  
         tmp = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
         p = task->msg.begin;
@@ -1843,26 +1844,34 @@ rspamd_message_parse (struct rspamd_task *task)
                                 if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
                                                 p1->normalized_words && p2->normalized_words) {
  
-                                       tw = MAX (p1->normalized_words->len, p2->normalized_words->len);
+                                       tw = p1->normalized_words->len + p2->normalized_words->len;
  
                                         if (tw > 0) {
                                                 dw = rspamd_words_levenshtein_distance (task,
                                                                 p1->normalized_words,
                                                                 p2->normalized_words);
-                                               diff = (100.0 * (gdouble)(tw - dw) / (gdouble)tw);
+                                               diff = (2.0 * (gdouble)dw) / (gdouble)tw;
  
-                                               debug_task (
+                                               msg_err_task (
                                                                 "different words: %d, total words: %d, "
-                                                               "got likeliness between parts of %d%%",
+                                                               "got diff between parts of %.2f",
                                                                 dw, tw,
                                                                 diff);
  
-                                               pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
+                                               pdiff = rspamd_mempool_alloc (task->task_pool,
+                                                               sizeof (gdouble));
                                                 *pdiff = diff;
                                                 rspamd_mempool_set_variable (task->task_pool,
                                                                 "parts_distance",
                                                                 pdiff,
                                                                 NULL);
+                                               ptw = rspamd_mempool_alloc (task->task_pool,
+                                                               sizeof (gint));
+                                               *ptw = tw;
+                                               rspamd_mempool_set_variable (task->task_pool,
+                                                               "total_words",
+                                                               ptw,
+                                                               NULL);
                                         }
                                 }
                         }
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c

index ea8af2dcd59be16de6c9bbdac9f8aee1b6ae6003..c107703a99a42dbfd7d5731fa8c45d071087b53c 100644 (file)
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -955,9 +955,9 @@ rspamd_header_exists (struct rspamd_task * task, GArray * args, void *unused)
  gboolean
  rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
  {
-       gint threshold, threshold2 = -1, diff;
+       gint threshold, threshold2 = -1;
         struct expression_argument *arg;
-       gint *pdiff;
+       gdouble *pdiff, diff;
  
         if (args == NULL || args->len == 0) {
                 debug_task ("no threshold is specified, assume it 100");
@@ -997,12 +997,13 @@ rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
         if ((pdiff =
                 rspamd_mempool_get_variable (task->task_pool,
                 "parts_distance")) != NULL) {
-               diff = *pdiff;
+               diff = (1.0 - (*pdiff)) * 100.0;
+
                 if (diff != -1) {
                         if (threshold2 > 0) {
-                               if (diff >=
-                                       MIN (threshold,
-                                       threshold2) && diff < MAX (threshold, threshold2)) {
+                               if (diff >= MIN (threshold, threshold2) &&
+                                       diff < MAX (threshold, threshold2)) {
+
                                         return TRUE;
                                 }
                         }
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c

index e6d34e406072a15ca66011a0917a706d5841516d..486d82c084d343c8545cd1ea40f75097855cf04a 100644 (file)
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -28,7 +28,7 @@
  #define RSPAMD_LEARN_OP 1
  #define RSPAMD_UNLEARN_OP 2
  
-static const gint similarity_treshold = 80;
+static const gdouble similarity_treshold = 80.0;
  
  static void
  rspamd_stat_tokenize_header (struct rspamd_task *task,
@@ -173,7 +173,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
         GArray *words;
         gchar *sub;
         guint i, reserved_len = 0;
-       gint *pdiff;
+       gdouble *pdiff;
  
         for (i = 0; i < task->text_parts->len; i++) {
                 part = g_ptr_array_index (task->text_parts, i);
@@ -200,7 +200,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
                 }
  
  
-               if (pdiff != NULL && *pdiff > similarity_treshold) {
+               if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) {
                         msg_debug_task ("message has two common parts (%d%%), so skip the last one",
                                         *pdiff);
                         break;
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 27 Apr 2016 15:05:15 +0000 (16:05 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 27 Apr 2016 15:05:47 +0000 (16:05 +0100)
rules/misc.lua		patch \| blob \| blame \| history
src/libmime/message.c		patch \| blob \| blame \| history
src/libmime/mime_expressions.c		patch \| blob \| blame \| history
src/libstat/stat_process.c		patch \| blob \| blame \| history