git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
More changes to tokenization.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 27 Jul 2015 12:28:15 +0000 (13:28 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 27 Jul 2015 12:28:15 +0000 (13:28 +0100)
src/libserver/cfg_file.h
src/libserver/cfg_rcl.c
src/libstat/stat_internal.h
src/libstat/stat_process.c
src/libstat/tokenizers/tokenizers.h

index e2c43845ae20d32af8fe73c96bd681780c84645f..b14690632ce0367e12ebbad861e6b0d3dcbf3dfb 100644 (file)
@@ -104,7 +104,6 @@ struct rspamd_statfile_config {
        gchar *label;                                   /**< label of this statfile                                                             */
        ucl_object_t *opts;                             /**< other options                                                                              */
        gboolean is_spam;                               /**< spam flag                                                                                  */
-       const gchar *backend;                                                   /**< name of statfile's backend                                                 */
        struct rspamd_classifier_config *clcf;                  /**< parent pointer of classifier configuration                 */
        gpointer data;                                                                  /**< opaque data                                                                                */
 };
@@ -123,6 +122,7 @@ struct rspamd_classifier_config {
        gchar *metric;                                  /**< metric of this classifier                          */
        gchar *classifier;                                      /**< classifier interface                               */
        struct rspamd_tokenizer_config *tokenizer;      /**< tokenizer used for classifier                                              */
+       const gchar *backend;                                                   /**< name of statfile's backend                                                 */
        ucl_object_t *opts;                             /**< other options                                      */
        GList *pre_callbacks;                           /**< list of callbacks that are called before classification */
        GList *post_callbacks;                          /**< list of callbacks that are called after classification */
index aba8b741b95116925d4170c858b85bc48d5ad418..13c5eb958135ed608be26facb32d39a4a8e22d14 100644 (file)
@@ -1494,6 +1494,11 @@ rspamd_rcl_config_init (void)
                rspamd_rcl_parse_struct_integer,
                G_STRUCT_OFFSET (struct rspamd_classifier_config, max_tokens),
                RSPAMD_CL_FLAG_INT_32);
+       rspamd_rcl_add_default_handler (sub,
+               "backend",
+               rspamd_rcl_parse_struct_string,
+               G_STRUCT_OFFSET (struct rspamd_classifier_config, backend),
+               0);
 
        /*
         * Statfile defaults
@@ -1519,11 +1524,6 @@ rspamd_rcl_config_init (void)
                rspamd_rcl_parse_struct_boolean,
                G_STRUCT_OFFSET (struct rspamd_statfile_config, is_spam),
                0);
-       rspamd_rcl_add_default_handler (ssub,
-               "backend",
-               rspamd_rcl_parse_struct_string,
-               G_STRUCT_OFFSET (struct rspamd_statfile_config, backend),
-               0);
 
        /**
         * Composites handler
index c7167f701942843191bae3f0e1fed89f4be0dfba..64790cddc00287fa6e117e757677a38966d3ae83 100644 (file)
@@ -45,8 +45,6 @@ struct rspamd_tokenizer_runtime {
 
 struct rspamd_statfile_runtime {
        struct rspamd_statfile_config *st;
-       struct rspamd_stat_backend *backend;
-       struct rspamd_tokenizer_runtime *tok;
        gpointer backend_runtime;
        guint64 hits;
        guint64 total_hits;
@@ -55,7 +53,8 @@ struct rspamd_statfile_runtime {
 struct rspamd_classifier_runtime {
        struct rspamd_classifier_config *clcf;
        struct rspamd_stat_classifier *cl;
-       GHashTable *tokenizers;
+       struct rspamd_stat_backend *backend;
+       struct rspamd_tokenizer_runtime *tok;
        double ham_prob;
        double spam_prob;
        enum stat_process_stage stage;
index e60cfe1d42e75eb840da21136969f2ea9a4f6811..e6d7c90c1814771150880df6e7b27b799dad88d1 100644 (file)
@@ -220,8 +220,8 @@ preprocess_init_stat_token (gpointer k, gpointer v, gpointer d)
                        res->cl_runtime = cl_runtime;
                        res->st_runtime = st_runtime;
 
-                       if (st_runtime->backend->process_token (cbdata->task, t, res,
-                                       st_runtime->backend->ctx)) {
+                       if (cl_runtime->backend->process_token (cbdata->task, t, res,
+                                       cl_runtime->backend->ctx)) {
 
                                if (cl_runtime->clcf->max_tokens > 0 &&
                                                cl_runtime->processed_tokens > cl_runtime->clcf->max_tokens) {
@@ -237,6 +237,7 @@ preprocess_init_stat_token (gpointer k, gpointer v, gpointer d)
                        i ++;
                        curst = g_list_next (curst);
                }
+
                cur = g_list_next (cur);
        }
 
@@ -244,26 +245,6 @@ preprocess_init_stat_token (gpointer k, gpointer v, gpointer d)
        return FALSE;
 }
 
-static gboolean
-rspamd_tokenizer_equal (gconstpointer a, gconstpointer b)
-{
-       struct rspamd_tokenizer_runtime *ta = a, *tb = b;
-
-       if (ta->conf_len == tb->conf_len) {
-               return memcmp (ta->config, tb->config, ta->conf_len) == 0;
-       }
-
-       return FALSE;
-}
-
-static guint
-rspamd_tokenizer_hash (gconstpointer a)
-{
-       struct rspamd_tokenizer_runtime *ta = a;
-
-       return XXH64 (ta->config, ta->conf_len, 0xdeadbabe);
-}
-
 static GList*
 rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
                struct rspamd_task *task,
@@ -274,11 +255,13 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
        struct rspamd_classifier_runtime *cl_runtime;
        struct rspamd_statfile_runtime *st_runtime;
        struct rspamd_stat_backend *bk;
-       gpointer backend_runtime;
+       gpointer backend_runtime, tok_config;
        GList *cur, *st_list = NULL, *curst;
        GList *cl_runtimes = NULL;
+       GHashTableIter it;
        guint result_size = 0, start_pos = 0, end_pos = 0;
-       struct rspamd_tokenizer_runtime *tok_runtime, srch_tok;
+       gsize conf_len;
+       struct rspamd_tokenizer_runtime *tok_runtime;
        struct preprocess_cb_data cbdata;
 
        cur = g_list_first (task->cfg->classifiers);
@@ -302,11 +285,6 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
                /* Now init runtime values */
                cl_runtime = rspamd_mempool_alloc0 (task->task_pool, sizeof (*cl_runtime));
                cl_runtime->cl = rspamd_stat_get_classifier (clcf->classifier);
-               cl_runtime->tokenizers = g_hash_table_new (rspamd_tokenizer_hash,
-                               rspamd_tokenizer_equal);
-               rspamd_mempool_add_destructor (task->task_pool,
-                               (rspamd_mempool_destruct_t)g_hash_table_destroy,
-                               cl_runtime->tokenizers);
 
                if (cl_runtime->cl == NULL) {
                        g_set_error (err, rspamd_stat_quark(), 500,
@@ -316,6 +294,12 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
                }
 
                cl_runtime->clcf = clcf;
+               bk = rspamd_stat_get_backend (clcf->backend);
+               if (bk == NULL) {
+                       msg_warn ("backend of type %s is not defined", clcf->backend);
+                       cur = g_list_next (cur);
+                       continue;
+               }
 
                curst = st_list;
                while (curst != NULL) {
@@ -327,14 +311,6 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
                                continue;
                        }
 
-                       bk = rspamd_stat_get_backend (stcf->backend);
-
-                       if (bk == NULL) {
-                               msg_warn ("backend of type %s is not defined", stcf->backend);
-                               curst = g_list_next (curst);
-                               continue;
-                       }
-
                        backend_runtime = bk->runtime (task, stcf, op != RSPAMD_CLASSIFY_OP,
                                        bk->ctx);
 
@@ -350,29 +326,42 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
                                else {
                                        /* Just skip this element */
                                        msg_warn ("backend of type %s does not exist: %s",
-                                                       stcf->backend, stcf->symbol);
+                                                       clcf->backend, stcf->symbol);
                                        curst = g_list_next (curst);
                                        continue;
                                }
                        }
 
-                       srch_tok.config = bk->load_tokenizer_config (backend_runtime,
-                                       &srch_tok.conf_len);
+                       tok_config = bk->load_tokenizer_config (backend_runtime,
+                                       &conf_len);
+
+                       if (cl_runtime->tok == NULL) {
+                               cl_runtime->tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer,
+                                               st_ctx, task, cl_runtime, tok_config, conf_len);
+
+                               if (cl_runtime->tok == NULL) {
+                                       g_set_error (err, rspamd_stat_quark(), 500,
+                                                       "cannot initialize tokenizer for statfile %s", stcf->symbol);
+                                       g_list_free (cl_runtimes);
+
+                                       return NULL;
+                               }
+                       }
+                       else {
+                               if (!cl_runtime->tok->tokenizer->compatible_config (
+                                               cl_runtime->tok, tok_config, conf_len)) {
+                                       g_set_error (err, rspamd_stat_quark(), 500,
+                                                       "incompatible tokenizer for statfile %s", stcf->symbol);
+                                       g_list_free (cl_runtimes);
+
+                                       return NULL;
+                               }
+                       }
 
                        st_runtime = rspamd_mempool_alloc0 (task->task_pool,
                                        sizeof (*st_runtime));
                        st_runtime->st = stcf;
                        st_runtime->backend_runtime = backend_runtime;
-                       st_runtime->backend = bk;
-                       st_runtime->tok = g_hash_table_lookup (cl_runtime->tokenizers, &srch_tok);
-
-                       if (st_runtime->tok == NULL) {
-                               st_runtime->tok = rspamd_stat_get_tokenizer_runtime (clcf->tokenizer,
-                                               st_ctx, task, cl_runtime, srch_tok.config,
-                                               srch_tok.conf_len);
-
-                               g_assert (st_runtime->tok != NULL);
-                       }
 
                        if (stcf->is_spam) {
                                cl_runtime->total_spam += bk->total_learns (task, backend_runtime,
@@ -420,7 +409,7 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
                cbdata.classifier_runtimes = cl_runtimes;
                cbdata.task = task;
                cbdata.tok = cl_runtime->tok;
-               g_tree_foreach (cl_runtime->tok->tokens, preprocess_init_stat_token,
+               g_tree_foreach (cbdata.tok->tokens, preprocess_init_stat_token,
                                &cbdata);
        }
 
@@ -522,9 +511,9 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
 
                while (curst) {
                        st_run = curst->data;
-                       st_run->backend->finalize_process (task,
+                       cl_run->backend->finalize_process (task,
                                        st_run->backend_runtime,
-                                       st_run->backend->ctx);
+                                       cl_run->backend->ctx);
                        curst = g_list_next (curst);
                }
 
@@ -567,8 +556,8 @@ rspamd_stat_learn_token (gpointer k, gpointer v, gpointer d)
                        res = &g_array_index (t->results, struct rspamd_token_result, i);
                        st_runtime = (struct rspamd_statfile_runtime *)curst->data;
 
-                       if (st_runtime->backend->learn_token (cbdata->task, t, res,
-                                       st_runtime->backend->ctx)) {
+                       if (cl_runtime->backend->learn_token (cbdata->task, t, res,
+                                       cl_runtime->backend->ctx)) {
                                cl_runtime->processed_tokens ++;
 
                                if (cl_runtime->clcf->max_tokens > 0 &&
@@ -698,23 +687,23 @@ rspamd_stat_learn (struct rspamd_task *task, gboolean spam, lua_State *L,
                                                st_run = (struct rspamd_statfile_runtime *)curst->data;
 
                                                if (unlearn && spam != st_run->st->is_spam) {
-                                                       nrev = st_run->backend->dec_learns (task,
+                                                       nrev = cl_run->backend->dec_learns (task,
                                                                        st_run->backend_runtime,
-                                                                       st_run->backend->ctx);
+                                                                       cl_run->backend->ctx);
                                                        msg_debug ("unlearned %s, new revision: %ul",
                                                                        st_run->st->symbol, nrev);
                                                }
                                                else {
-                                                       nrev = st_run->backend->inc_learns (task,
+                                                       nrev = cl_run->backend->inc_learns (task,
                                                                st_run->backend_runtime,
-                                                               st_run->backend->ctx);
+                                                               cl_run->backend->ctx);
                                                        msg_debug ("learned %s, new revision: %ul",
                                                                st_run->st->symbol, nrev);
                                                }
 
-                                               st_run->backend->finalize_learn (task,
-                                                                                                               st_run->backend_runtime,
-                                                                                                               st_run->backend->ctx);
+                                               cl_run->backend->finalize_learn (task,
+                                                               st_run->backend_runtime,
+                                                               cl_run->backend->ctx);
 
                                                curst = g_list_next (curst);
                                        }
@@ -762,10 +751,10 @@ rspamd_stat_statistics (struct rspamd_config *cfg, guint64 *total_learns)
                        while (curst != NULL) {
                                stcf = (struct rspamd_statfile_config *)curst->data;
 
-                               bk = rspamd_stat_get_backend (stcf->backend);
+                               bk = rspamd_stat_get_backend (clcf->backend);
 
                                if (bk == NULL) {
-                                       msg_warn ("backend of type %s is not defined", stcf->backend);
+                                       msg_warn ("backend of type %s is not defined", clcf->backend);
                                        curst = g_list_next (curst);
                                        continue;
                                }
index 1f4b0a54fc594fcfa63fc1c729871a1e7e21d929..a2ff388ef3b50c1f502d11483dbc3226784804cb 100644 (file)
@@ -9,11 +9,13 @@
 
 #define RSPAMD_DEFAULT_TOKENIZER "osb"
 
+struct rspamd_tokenizer_runtime;
+
 /* Common tokenizer structure */
 struct rspamd_stat_tokenizer {
        gchar *name;
        gpointer (*get_config) (struct rspamd_tokenizer_config *cf, gsize *len);
-       gboolean (*compatible_config) (struct rspamd_tokenizer_config *cf,
+       gboolean (*compatible_config) (struct rspamd_tokenizer_runtime *rt,
                        gpointer ptr, gsize len);
        gboolean (*load_config) (struct rspamd_tokenizer_runtime *rt,
                        gpointer ptr, gsize len);
@@ -44,7 +46,7 @@ gpointer rspamd_tokenizer_osb_get_config (struct rspamd_tokenizer_config *cf,
                gsize *len);
 
 gboolean
-rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_config *cf,
+rspamd_tokenizer_osb_compatible_config (struct rspamd_tokenizer_runtime *rt,
                        gpointer ptr, gsize len);
 
 gboolean