git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Skip stop words in statistics
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 15 Nov 2018 15:02:48 +0000 (15:02 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 15 Nov 2018 15:02:48 +0000 (15:02 +0000)
src/libstat/classifiers/bayes.c
src/libstat/stat_process.c
src/libstat/tokenizers/osb.c
src/libstat/tokenizers/tokenizers.h

index 2e494e5267ca872ae9c04bd662650bdb6ed1eca2..1898df4fee3ded4bf0bbc47deb626625ceaa13ad 100644 (file)
@@ -175,7 +175,7 @@ bayes_classify_token (struct rspamd_classifier *ctx,
        }
 
        /* Probability for this token */
-       if (total_count > ctx->cfg->min_token_hits) {
+       if (total_count >= ctx->cfg->min_token_hits) {
                spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns));
                ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns));
                spam_prob = spam_freq / (spam_freq + ham_freq);
index 87c5c3190aad2d7c4b1808f9eb2c4323320d70fb..e06bd1fe366d486dd0ac831cd4d1b62a6f0c1445 100644 (file)
@@ -294,7 +294,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
 
        lua_settop (L, 0);
        st_ctx->tokenizer->tokenize_func (st_ctx,
-                       task->task_pool,
+                       task,
                        ar,
                        TRUE,
                        "META:",
@@ -345,7 +345,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
                part = g_ptr_array_index (task->text_parts, i);
 
                if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
-                       st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
+                       st_ctx->tokenizer->tokenize_func (st_ctx, task,
                                        part->utf_words, IS_PART_UTF (part),
                                        NULL, task->tokens);
                }
@@ -362,7 +362,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
                words = rspamd_tokenize_subject (task);
                if (words != NULL) {
                        st_ctx->tokenizer->tokenize_func (st_ctx,
-                                       task->task_pool,
+                                       task,
                                        words,
                                        TRUE,
                                        "SUBJECT",
index 8784a6858833d2c906a309147807a3f7b5a31ab8..d68e3bc6069ecee3cfa22dccfb401feb4efb9648 100644 (file)
  * OSB tokenizer
  */
 
+
 #include "tokenizers.h"
 #include "stat_internal.h"
+#include "libmime/lang_detection.h"
 
 /* Size for features pipe */
 #define DEFAULT_FEATURE_WINDOW_SIZE 5
@@ -259,11 +261,11 @@ struct token_pipe_entry {
 
 gint
 rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
-               rspamd_mempool_t *pool,
-               GArray *words,
-               gboolean is_utf,
-               const gchar *prefix,
-               GPtrArray *result)
+                                         struct rspamd_task *task,
+                                         GArray *words,
+                                         gboolean is_utf,
+                                         const gchar *prefix,
+                                         GPtrArray *result)
 {
        rspamd_token_t *new_tok = NULL;
        rspamd_stat_token_t *token;
@@ -303,6 +305,14 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
                token = &g_array_index (words, rspamd_stat_token_t, w);
                token_flags = token->flags;
 
+               if (task->lang_det) {
+                       if (rspamd_language_detector_is_stop_word (task->lang_det,
+                                       token->begin, token->len)) {
+                               /* Skip it */
+                               continue;
+                       }
+               }
+
                if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
                        rspamd_ftok_t ftok;
 
@@ -327,7 +337,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
                }
 
                if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
-                       new_tok = rspamd_mempool_alloc0 (pool, token_size);
+                       new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size);
                        new_tok->flags = token_flags;
                        new_tok->t1 = token;
                        new_tok->t2 = token;
@@ -339,7 +349,7 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
                }
 
 #define ADD_TOKEN do {\
-    new_tok = rspamd_mempool_alloc0 (pool, token_size); \
+    new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size); \
     new_tok->flags = token_flags; \
     new_tok->t1 = hashpipe[0].t; \
     new_tok->t2 = hashpipe[i].t; \
@@ -375,7 +385,9 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
                        processed++;
 
                        for (i = 1; i < window_size; i++) {
-                               ADD_TOKEN;
+                               if (!(hashpipe[i].t->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION)) {
+                                       ADD_TOKEN;
+                               }
                        }
                }
        }
index bfabde74f761199e886812b5cc12fd4481ab3ba8..668f08cdc829ac1f94091b8f1c516e2e4445c225 100644 (file)
@@ -18,13 +18,13 @@ struct rspamd_stat_ctx;
 struct rspamd_stat_tokenizer {
        gchar *name;
        gpointer (*get_config) (rspamd_mempool_t *pool,
-                       struct rspamd_tokenizer_config *cf, gsize *len);
+                                                       struct rspamd_tokenizer_config *cf, gsize *len);
        gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
-                       rspamd_mempool_t *pool,
-                       GArray *words,
-                       gboolean is_utf,
-                       const gchar *prefix,
-                       GPtrArray *result);
+                                                 struct rspamd_task *task,
+                                                 GArray *words,
+                                                 gboolean is_utf,
+                                                 const gchar *prefix,
+                                                 GPtrArray *result);
 };
 
 enum rspamd_tokenize_type {
@@ -47,11 +47,11 @@ GArray * rspamd_tokenize_text (const gchar *text, gsize len,
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
-               rspamd_mempool_t *pool,
-               GArray *words,
-               gboolean is_utf,
-               const gchar *prefix,
-               GPtrArray *result);
+                                                  struct rspamd_task *task,
+                                                  GArray *words,
+                                                  gboolean is_utf,
+                                                  const gchar *prefix,
+                                                  GPtrArray *result);
 
 gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
                struct rspamd_tokenizer_config *cf,