Remove the legacy words array, use only normalized_words

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 26 Nov 2015 17:04:55 +0000 (17:04 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 26 Nov 2015 17:04:55 +0000 (17:04 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 26 Nov 2015 17:04:55 +0000 (17:04 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 26 Nov 2015 17:04:55 +0000 (17:04 +0000)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index 19bef072a8a8d8058ac3398ad6dbddbcc5a80133..10d7f04f45efbc54805bc711f8b0c04b05da7f54 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -995,7 +995,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,
  #endif
  
         /* Ugly workaround */
-       tmp = rspamd_tokenize_text (part->content->data,
+       part->normalized_words = rspamd_tokenize_text (part->content->data,
                         part->content->len, IS_PART_UTF (part), task->cfg,
                         part->urls_offset, FALSE,
                         NULL);
@@ -1034,7 +1034,6 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                                 }
                         }
                 }
-               part->normalized_words = tmp;
         }
  #ifdef WITH_SNOWBALL
         if (stem != NULL) {
@@ -1246,10 +1245,6 @@ process_text_part (struct rspamd_task *task,
  
         /* Post process part */
         detect_text_language (text_part);
-       text_part->words = rspamd_tokenize_text (text_part->content->data,
-                       text_part->content->len, IS_PART_UTF (text_part), task->cfg,
-                       text_part->urls_offset, FALSE,
-                       &text_part->hash);
         rspamd_normalize_text_part (task, text_part);
  
         /* Calculate number of lines */
diff --git a/src/libmime/message.h b/src/libmime/message.h

index aea5c3750c3903d8cdc3620e88edb8fa411acc4e..13ccaa4fa3fbbcb4905c30427f41d7e50b757410 100644 (file)
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -45,7 +45,6 @@ struct mime_text_part {
         GList *urls_offset;     /**< list of offsets of urls                                            */
         GMimeObject *parent;
         struct mime_part *mime_part;
-       GArray *words;
         GArray *normalized_words;
         guint nlines;
         guint64 hash;
diff --git a/src/libserver/task.c b/src/libserver/task.c

index 7d34e830b599fc243f10aeae788bf491a941fd05..eea9057eeab0099110cf3a02463307401c80568e 100644 (file)
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -185,9 +185,6 @@ rspamd_task_free (struct rspamd_task *task)
  
                 for (i = 0; i < task->text_parts->len; i ++) {
                         tp = g_ptr_array_index (task->text_parts, i);
-                       if (tp->words) {
-                               g_array_free (tp->words, TRUE);
-                       }
                         if (tp->normalized_words) {
                                 g_array_free (tp->normalized_words, TRUE);
                         }
diff --git a/src/libstat/learn_cache/sqlite3_cache.c b/src/libstat/learn_cache/sqlite3_cache.c

index 889395b4dc012d16d174507fec0aaa63a3d7d9e3..cf4ab615a8bc6960beac0beec504c0e9cbc51a6b 100644 (file)
--- a/src/libstat/learn_cache/sqlite3_cache.c
+++ b/src/libstat/learn_cache/sqlite3_cache.c
@@ -257,9 +257,9 @@ rspamd_stat_cache_sqlite3_process (struct rspamd_task *task,
                 for (i = 0; i < task->text_parts->len; i ++) {
                         part = g_ptr_array_index (task->text_parts, i);
  
-                       if (part->words != NULL) {
-                               for (j = 0; j < part->words->len; j ++) {
-                                       word = &g_array_index (part->words, rspamd_ftok_t, j);
+                       if (part->normalized_words != NULL) {
+                               for (j = 0; j < part->normalized_words->len; j ++) {
+                                       word = &g_array_index (part->normalized_words, rspamd_ftok_t, j);
                                         rspamd_cryptobox_hash_update (&st, word->begin, word->len);
                                 }
                         }
diff --git a/src/libstat/stat_process.c b/src/libstat/stat_process.c

index 952330b4937e4485a31499ba2ca267a78cc88343..c0aad19303d5fde26600becccbfdacfdec04d0ab 100644 (file)
--- a/src/libstat/stat_process.c
+++ b/src/libstat/stat_process.c
@@ -198,17 +198,12 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
         for (i = 0; i < task->text_parts->len; i ++) {
                 part = g_ptr_array_index (task->text_parts, i);
  
-               if (!IS_PART_EMPTY (part) && part->words != NULL) {
-                       if (compat) {
-                               tok->tokenizer->tokenize_func (tok, task->task_pool,
-                                       part->words, IS_PART_UTF (part), NULL);
-                       }
-                       else {
-                               tok->tokenizer->tokenize_func (tok, task->task_pool,
+               if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
+                       tok->tokenizer->tokenize_func (tok, task->task_pool,
                                         part->normalized_words, IS_PART_UTF (part), NULL);
-                       }
                 }
  
+
                 if (pdiff != NULL && *pdiff > similarity_treshold) {
                         msg_debug_task ("message has two common parts (%d%%), so skip the last one",
                                         *pdiff);
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c

index cf58eb672221ea48da338029e8a942e1423d2538..e726419db0b189b1e4fde07a00f796d70c8cd0ce 100644 (file)
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -592,17 +592,7 @@ fuzzy_io_fin (void *ud)
  static GArray *
  fuzzy_preprocess_words (struct mime_text_part *part, rspamd_mempool_t *pool)
  {
-       GArray *res;
-
-       if (!IS_PART_UTF (part) || !part->language || part->language[0] == '\0' ||
-                       part->normalized_words == NULL) {
-               res = part->words;
-       }
-       else {
-               res = part->normalized_words;
-       }
-
-       return res;
+       return part->normalized_words;
  }
  
  /*
@@ -1259,14 +1249,14 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
                         continue;
                 }
  
-               if (part->words == NULL || part->words->len == 0) {
+               if (part->normalized_words == NULL || part->normalized_words->len == 0) {
                         msg_info_task ("<%s>, part hash empty, skip fuzzy check",
                                 task->message_id);
                         continue;
                 }
  
                 if (fuzzy_module_ctx->min_hash_len != 0 &&
-                       part->words->len < fuzzy_module_ctx->min_hash_len) {
+                       part->normalized_words->len < fuzzy_module_ctx->min_hash_len) {
                         msg_info_task (
                                 "<%s>, part hash is shorter than %d symbols, skip fuzzy check",
                                 task->message_id,
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 26 Nov 2015 17:04:55 +0000 (17:04 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 26 Nov 2015 17:04:55 +0000 (17:04 +0000)
src/libmime/message.c		patch \| blob \| blame \| history
src/libmime/message.h		patch \| blob \| blame \| history
src/libserver/task.c		patch \| blob \| blame \| history
src/libstat/learn_cache/sqlite3_cache.c		patch \| blob \| blame \| history
src/libstat/stat_process.c		patch \| blob \| blame \| history
src/plugins/fuzzy_check.c		patch \| blob \| blame \| history