git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Remove legacy words, use merely normalized_words
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 26 Nov 2015 17:04:55 +0000 (17:04 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 26 Nov 2015 17:04:55 +0000 (17:04 +0000)
src/libmime/message.c
src/libmime/message.h
src/libserver/task.c
src/libstat/learn_cache/sqlite3_cache.c
src/libstat/stat_process.c
src/plugins/fuzzy_check.c

index 19bef072a8a8d8058ac3398ad6dbddbcc5a80133..10d7f04f45efbc54805bc711f8b0c04b05da7f54 100644 (file)
@@ -995,7 +995,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 #endif
 
        /* Ugly workaround */
-       tmp = rspamd_tokenize_text (part->content->data,
+       part->normalized_words = rspamd_tokenize_text (part->content->data,
                        part->content->len, IS_PART_UTF (part), task->cfg,
                        part->urls_offset, FALSE,
                        NULL);
@@ -1034,7 +1034,6 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                                }
                        }
                }
-               part->normalized_words = tmp;
        }
 #ifdef WITH_SNOWBALL
        if (stem != NULL) {
@@ -1246,10 +1245,6 @@ process_text_part (struct rspamd_task *task,
 
        /* Post process part */
        detect_text_language (text_part);
-       text_part->words = rspamd_tokenize_text (text_part->content->data,
-                       text_part->content->len, IS_PART_UTF (text_part), task->cfg,
-                       text_part->urls_offset, FALSE,
-                       &text_part->hash);
        rspamd_normalize_text_part (task, text_part);
 
        /* Calculate number of lines */
index aea5c3750c3903d8cdc3620e88edb8fa411acc4e..13ccaa4fa3fbbcb4905c30427f41d7e50b757410 100644 (file)
@@ -45,7 +45,6 @@ struct mime_text_part {
        GList *urls_offset;     /**< list of offsets of urls                                            */
        GMimeObject *parent;
        struct mime_part *mime_part;
-       GArray *words;
        GArray *normalized_words;
        guint nlines;
        guint64 hash;
index 7d34e830b599fc243f10aeae788bf491a941fd05..eea9057eeab0099110cf3a02463307401c80568e 100644 (file)
@@ -185,9 +185,6 @@ rspamd_task_free (struct rspamd_task *task)
 
                for (i = 0; i < task->text_parts->len; i ++) {
                        tp = g_ptr_array_index (task->text_parts, i);
-                       if (tp->words) {
-                               g_array_free (tp->words, TRUE);
-                       }
                        if (tp->normalized_words) {
                                g_array_free (tp->normalized_words, TRUE);
                        }
index 889395b4dc012d16d174507fec0aaa63a3d7d9e3..cf4ab615a8bc6960beac0beec504c0e9cbc51a6b 100644 (file)
@@ -257,9 +257,9 @@ rspamd_stat_cache_sqlite3_process (struct rspamd_task *task,
                for (i = 0; i < task->text_parts->len; i ++) {
                        part = g_ptr_array_index (task->text_parts, i);
 
-                       if (part->words != NULL) {
-                               for (j = 0; j < part->words->len; j ++) {
-                                       word = &g_array_index (part->words, rspamd_ftok_t, j);
+                       if (part->normalized_words != NULL) {
+                               for (j = 0; j < part->normalized_words->len; j ++) {
+                                       word = &g_array_index (part->normalized_words, rspamd_ftok_t, j);
                                        rspamd_cryptobox_hash_update (&st, word->begin, word->len);
                                }
                        }
index 952330b4937e4485a31499ba2ca267a78cc88343..c0aad19303d5fde26600becccbfdacfdec04d0ab 100644 (file)
@@ -198,17 +198,12 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
        for (i = 0; i < task->text_parts->len; i ++) {
                part = g_ptr_array_index (task->text_parts, i);
 
-               if (!IS_PART_EMPTY (part) && part->words != NULL) {
-                       if (compat) {
-                               tok->tokenizer->tokenize_func (tok, task->task_pool,
-                                       part->words, IS_PART_UTF (part), NULL);
-                       }
-                       else {
-                               tok->tokenizer->tokenize_func (tok, task->task_pool,
+               if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) {
+                       tok->tokenizer->tokenize_func (tok, task->task_pool,
                                        part->normalized_words, IS_PART_UTF (part), NULL);
-                       }
                }
 
+
                if (pdiff != NULL && *pdiff > similarity_treshold) {
                        msg_debug_task ("message has two common parts (%d%%), so skip the last one",
                                        *pdiff);
index cf58eb672221ea48da338029e8a942e1423d2538..e726419db0b189b1e4fde07a00f796d70c8cd0ce 100644 (file)
@@ -592,17 +592,7 @@ fuzzy_io_fin (void *ud)
 static GArray *
 fuzzy_preprocess_words (struct mime_text_part *part, rspamd_mempool_t *pool)
 {
-       GArray *res;
-
-       if (!IS_PART_UTF (part) || !part->language || part->language[0] == '\0' ||
-                       part->normalized_words == NULL) {
-               res = part->words;
-       }
-       else {
-               res = part->normalized_words;
-       }
-
-       return res;
+       return part->normalized_words;
 }
 
 /*
@@ -1259,14 +1249,14 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule,
                        continue;
                }
 
-               if (part->words == NULL || part->words->len == 0) {
+               if (part->normalized_words == NULL || part->normalized_words->len == 0) {
                        msg_info_task ("<%s>, part hash empty, skip fuzzy check",
                                task->message_id);
                        continue;
                }
 
                if (fuzzy_module_ctx->min_hash_len != 0 &&
-                       part->words->len < fuzzy_module_ctx->min_hash_len) {
+                       part->normalized_words->len < fuzzy_module_ctx->min_hash_len) {
                        msg_info_task (
                                "<%s>, part hash is shorter than %d symbols, skip fuzzy check",
                                task->message_id,