git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Rework] Improve bayes debug logging, remove unused stuff
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 16 Nov 2018 12:12:23 +0000 (12:12 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 16 Nov 2018 12:12:23 +0000 (12:12 +0000)
lualib/lua_stat.lua
src/libstat/classifiers/bayes.c
src/libstat/classifiers/classifiers.h
src/libstat/classifiers/lua_classifier.c
src/libstat/stat_config.c
src/libstat/stat_internal.h
src/libstat/stat_process.c
src/libutil/logger.h

index ff7d192acd6b339099de3f4f0060078ce777f287..9b72a1f22bf1207bd3696d35b982814a52854127 100644 (file)
@@ -14,6 +14,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ]]--
 
+--[[[
+-- @module lua_stat
+-- This module contains helper functions for supporting statistics
+--]]
+
 local logger = require "rspamd_logger"
 local sqlite3 = require "rspamd_sqlite3"
 local util = require "rspamd_util"
index ee21254571384dcef7555c30cc04be2d09b73a83..edaae4e79b288c5a23121344da25bfacccd093ed 100644 (file)
@@ -38,7 +38,7 @@
         G_STRFUNC, \
         __VA_ARGS__)
 
-INIT_LOG_MODULE(bayes)
+INIT_LOG_MODULE_PUBLIC(bayes)
 
 static inline GQuark
 bayes_error_quark (void)
@@ -254,13 +254,20 @@ bayes_classify_token (struct rspamd_classifier *ctx,
 
 
 gboolean
-bayes_init (rspamd_mempool_t *pool, struct rspamd_classifier *cl)
+bayes_init (struct rspamd_config *cfg,
+                       struct event_base *ev_base,
+                       struct rspamd_classifier *cl)
 {
        cl->cfg->flags |= RSPAMD_FLAG_CLASSIFIER_INTEGER;
 
        return TRUE;
 }
 
+void
+bayes_fin (struct rspamd_classifier *cl)
+{
+}
+
 gboolean
 bayes_classify (struct rspamd_classifier * ctx,
                GPtrArray *tokens,
index e30f2153aa01041b53f857fbb7266f2d9262858b..fd6daf433333785a3ca9bcf72c44cd450880f234 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "config.h"
 #include "mem_pool.h"
+#include <event.h>
 
 #define RSPAMD_DEFAULT_CLASSIFIER "bayes"
 /* Consider this value as 0 */
 
 struct rspamd_classifier_config;
 struct rspamd_task;
+struct rspamd_config;
 struct rspamd_classifier;
 
 struct token_node_s;
 
 struct rspamd_stat_classifier {
        char *name;
-       gboolean (*init_func)(rspamd_mempool_t *pool,
-                       struct rspamd_classifier *cl);
+       gboolean (*init_func)(struct rspamd_config *cfg,
+                                                 struct event_base *ev_base,
+                                                 struct rspamd_classifier *cl);
        gboolean (*classify_func)(struct rspamd_classifier * ctx,
-                       GPtrArray *tokens,
-                       struct rspamd_task *task);
+                                                         GPtrArray *tokens,
+                                                         struct rspamd_task *task);
        gboolean (*learn_spam_func)(struct rspamd_classifier * ctx,
-                       GPtrArray *input,
-                       struct rspamd_task *task,
-                       gboolean is_spam,
-                       gboolean unlearn,
-                       GError **err);
+                                                               GPtrArray *input,
+                                                               struct rspamd_task *task,
+                                                               gboolean is_spam,
+                                                               gboolean unlearn,
+                                                               GError **err);
+       void (*fin_func)(struct rspamd_classifier *cl);
 };
 
 /* Bayes algorithm */
-gboolean bayes_init (rspamd_mempool_t *pool,
-               struct rspamd_classifier *);
+gboolean bayes_init (struct rspamd_config *cfg,
+                                        struct event_base *ev_base,
+                                        struct rspamd_classifier *);
 gboolean bayes_classify (struct rspamd_classifier *ctx,
                GPtrArray *tokens,
                struct rspamd_task *task);
@@ -41,10 +46,12 @@ gboolean bayes_learn_spam (struct rspamd_classifier *ctx,
                gboolean is_spam,
                gboolean unlearn,
                GError **err);
+void bayes_fin (struct rspamd_classifier *);
 
 /* Generic lua classifier */
-gboolean lua_classifier_init (rspamd_mempool_t *pool,
-               struct rspamd_classifier *);
+gboolean lua_classifier_init (struct rspamd_config *cfg,
+                                                         struct event_base *ev_base,
+                                                         struct rspamd_classifier *);
 gboolean lua_classifier_classify (struct rspamd_classifier *ctx,
                GPtrArray *tokens,
                struct rspamd_task *task);
@@ -55,6 +62,11 @@ gboolean lua_classifier_learn_spam (struct rspamd_classifier *ctx,
                gboolean unlearn,
                GError **err);
 
+extern guint rspamd_bayes_log_id;
+#define msg_debug_bayes(...)  rspamd_conditional_debug_fast (NULL, task->from_addr, \
+        rspamd_bayes_log_id, "bayes", task->task_pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
 
 #endif
 /*
index 7b495b16594491cfa311db828bf5685af8202c12..83ce7b0e19b6ac45144da0443f1f3e630782de1f 100644 (file)
@@ -47,8 +47,9 @@ static GHashTable *lua_classifiers = NULL;
 INIT_LOG_MODULE(luacl)
 
 gboolean
-lua_classifier_init (rspamd_mempool_t *pool,
-               struct rspamd_classifier *cl)
+lua_classifier_init (struct rspamd_config *cfg,
+                                        struct event_base *ev_base,
+                                        struct rspamd_classifier *cl)
 {
        struct rspamd_lua_classifier_ctx *ctx;
        lua_State *L = cl->ctx->cfg->lua_state;
@@ -62,7 +63,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
        ctx = g_hash_table_lookup (lua_classifiers, cl->subrs->name);
 
        if (ctx != NULL) {
-               msg_err_pool ("duplicate lua classifier definition: %s",
+               msg_err_config ("duplicate lua classifier definition: %s",
                                cl->subrs->name);
 
                return FALSE;
@@ -70,7 +71,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
 
        lua_getglobal (L, "rspamd_classifiers");
        if (lua_type (L, -1) != LUA_TTABLE) {
-               msg_err_pool ("cannot register classifier %s: no rspamd_classifier global",
+               msg_err_config ("cannot register classifier %s: no rspamd_classifier global",
                                cl->subrs->name);
                lua_pop (L, 1);
 
@@ -81,7 +82,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
        lua_gettable (L, -2);
 
        if (lua_type (L, -1) != LUA_TTABLE) {
-               msg_err_pool ("cannot register classifier %s: bad lua type: %s",
+               msg_err_config ("cannot register classifier %s: bad lua type: %s",
                                cl->subrs->name, lua_typename (L, lua_type (L, -1)));
                lua_pop (L, 2);
 
@@ -92,7 +93,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
        lua_gettable (L, -2);
 
        if (lua_type (L, -1) != LUA_TFUNCTION) {
-               msg_err_pool ("cannot register classifier %s: bad lua type for classify: %s",
+               msg_err_config ("cannot register classifier %s: bad lua type for classify: %s",
                                cl->subrs->name, lua_typename (L, lua_type (L, -1)));
                lua_pop (L, 3);
 
@@ -105,7 +106,7 @@ lua_classifier_init (rspamd_mempool_t *pool,
        lua_gettable (L, -2);
 
        if (lua_type (L, -1) != LUA_TFUNCTION) {
-               msg_err_pool ("cannot register classifier %s: bad lua type for learn: %s",
+               msg_err_config ("cannot register classifier %s: bad lua type for learn: %s",
                                cl->subrs->name, lua_typename (L, lua_type (L, -1)));
                lua_pop (L, 3);
 
index 9d1e57f13ff3b919f13c96c99f8614805c33b857..d2772e9cae4c40f98b4c0debaa99138ab5fc3913 100644 (file)
@@ -28,6 +28,7 @@ static struct rspamd_stat_classifier lua_classifier = {
        .init_func = lua_classifier_init,
        .classify_func = lua_classifier_classify,
        .learn_spam_func = lua_classifier_learn_spam,
+       .fin_func = NULL,
 };
 
 static struct rspamd_stat_classifier stat_classifiers[] = {
@@ -36,6 +37,7 @@ static struct rspamd_stat_classifier stat_classifiers[] = {
                .init_func = bayes_init,
                .classify_func = bayes_classify,
                .learn_spam_func = bayes_learn_spam,
+               .fin_func = bayes_fin,
        }
 };
 
@@ -182,7 +184,7 @@ rspamd_stat_init (struct rspamd_config *cfg, struct event_base *ev_base)
                        continue;
                }
 
-               if (!cl->subrs->init_func (cfg->cfg_pool, cl)) {
+               if (!cl->subrs->init_func (cfg, ev_base, cl)) {
                        g_free (cl);
                        msg_err_config ("cannot init classifier type %s", clf->name);
                        cur = g_list_next (cur);
@@ -328,6 +330,11 @@ rspamd_stat_close (void)
                }
 
                g_array_free (cl->statfiles_ids, TRUE);
+
+               if (cl->subrs->fin_func) {
+                       cl->subrs->fin_func (cl);
+               }
+
                g_free (cl);
        }
 
@@ -475,11 +482,11 @@ rspamd_stat_ctx_register_async (rspamd_stat_async_handler handler,
        g_assert (st_ctx != NULL);
 
        elt = g_malloc0 (sizeof (*elt));
-       REF_INIT_RETAIN (elt, rspamd_async_elt_dtor);
        elt->handler = handler;
        elt->cleanup = cleanup;
        elt->ud = d;
        elt->timeout = timeout;
+       REF_INIT_RETAIN (elt, rspamd_async_elt_dtor);
        /* Enabled by default */
 
 
index 44f48ae5ab8029d880b82b1441579507f08257a8..746199d45f829ac395fe9fde95cf9d40faff7e01 100644 (file)
@@ -41,6 +41,7 @@ struct rspamd_classifier {
        gulong ham_learns;
        struct rspamd_classifier_config *cfg;
        struct rspamd_stat_classifier *subrs;
+       gpointer specific;
 };
 
 struct rspamd_statfile {
index e4f95a514ea86ea953bfd6d9ec2315be2de34289..d07e241562169951bd6d37a4f9911a729acbd9f3 100644 (file)
@@ -63,7 +63,7 @@ rspamd_stat_tokenize_header (struct rspamd_task *task,
                        }
                }
 
-               msg_debug_task ("added stat tokens for header '%s'", name);
+               msg_debug_bayes ("added stat tokens for header '%s'", name);
        }
 }
 
@@ -114,7 +114,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
                                        g_array_append_val (ar, elt);
                                }
 
-                               msg_debug_task ("added stat tokens for image '%s'", img->html_image->src);
+                               msg_debug_bayes ("added stat tokens for image '%s'", img->html_image->src);
                        }
                }
                else if (part->cd && part->cd->filename.len > 0) {
@@ -133,7 +133,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
                        elt.len = part->ct->boundary.len;
 
                        if (elt.len) {
-                               msg_debug_task ("added stat tokens for mime boundary '%*s'",
+                               msg_debug_bayes ("added stat tokens for mime boundary '%*s'",
                                                (gint)elt.len, elt.begin);
                                g_array_append_val (ar, elt);
                        }
@@ -155,13 +155,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
                if (tp->language != NULL && tp->language[0] != '\0') {
                        elt.begin = (gchar *)tp->language;
                        elt.len = strlen (elt.begin);
-                       msg_debug_task ("added stat tokens for part language '%s'", elt.begin);
+                       msg_debug_bayes ("added stat tokens for part language '%s'", elt.begin);
                        g_array_append_val (ar, elt);
                }
                if (tp->real_charset != NULL) {
                        elt.begin = (gchar *)tp->real_charset;
                        elt.len = strlen (elt.begin);
-                       msg_debug_task ("added stat tokens for part charset '%s'", elt.begin);
+                       msg_debug_bayes ("added stat tokens for part charset '%s'", elt.begin);
                        g_array_append_val (ar, elt);
                }
        }
@@ -184,124 +184,6 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
                g_array_append_val (ar, elt);
        }
 
-       /* Use more precise headers order */
-#if 0
-       cur = g_list_first (task->headers_order->head);
-       while (cur) {
-               hdr = cur->data;
-
-               if (hdr->name && hdr->type != RSPAMD_HEADER_RECEIVED) {
-                       elt.begin = hdr->name;
-                       elt.len = strlen (hdr->name);
-                       g_array_append_val (ar, elt);
-               }
-
-               cur = g_list_next (cur);
-       }
-#endif
-
-       /* Use metatokens plugin from Lua */
-       lua_getglobal (L, "rspamd_plugins");
-
-       if (lua_type (L, -1) == LUA_TTABLE) {
-               lua_pushstring (L, "stat_metatokens");
-               lua_gettable (L, -2);
-
-               if (lua_type (L, -1) == LUA_TTABLE) {
-                       gint old_top;
-
-                       old_top = lua_gettop (L);
-                       lua_pushstring (L, "callback");
-                       lua_gettable (L, -2);
-
-                       if (lua_type (L, -1) == LUA_TFUNCTION) {
-                               struct rspamd_task **ptask;
-
-                               ptask = lua_newuserdata (L, sizeof (*ptask));
-                               rspamd_lua_setclass (L, "rspamd{task}", -1);
-                               *ptask = task;
-
-                               if (lua_pcall (L, 1, LUA_MULTRET, 0) != 0) {
-                                       msg_err_task ("stat_metatokens failed: %s",
-                                                       lua_tostring (L, -1));
-                                       lua_pop (L, 1);
-                               } else {
-                                       if (lua_gettop (L) > old_top &&
-                                                       lua_istable (L, old_top + 1)) {
-                                               lua_pushvalue (L, old_top + 1);
-                                               /* Iterate over table of tables */
-                                               for (lua_pushnil (L); lua_next (L, -2);
-                                                               lua_pop (L, 1)) {
-                                                       elt.flags = RSPAMD_STAT_TOKEN_FLAG_META|
-                                                                       RSPAMD_STAT_TOKEN_FLAG_LUA_META;
-
-                                                       if (lua_isnumber (L, -1)) {
-                                                               gdouble num = lua_tonumber (L, -1);
-                                                               guint8 *pnum = rspamd_mempool_alloc (
-                                                                               task->task_pool,
-                                                                               sizeof (num));
-
-                                                               msg_debug_task ("got metatoken number: %.2f",
-                                                                               num);
-                                                               memcpy (pnum, &num, sizeof (num));
-                                                               elt.begin = (gchar *) pnum;
-                                                               elt.len = sizeof (num);
-                                                               g_array_append_val (ar, elt);
-                                                       } else if (lua_isstring (L, -1)) {
-                                                               const gchar *str;
-                                                               gsize tlen;
-
-                                                               str = lua_tolstring (L, -1, &tlen);
-                                                               guint8 *pstr = rspamd_mempool_alloc (
-                                                                               task->task_pool,
-                                                                               tlen);
-                                                               memcpy (pstr, str, tlen);
-
-                                                               msg_debug_task ("got metatoken string: %*s",
-                                                                               (gint) tlen, str);
-                                                               elt.begin = (gchar *) pstr;
-                                                               elt.len = tlen;
-                                                               g_array_append_val (ar, elt);
-                                                       }
-                                                       else if (lua_istable (L, -1)) {
-                                                               /* Treat that as unigramms */
-                                                               for (lua_pushnil (L); lua_next (L, -2);
-                                                                               lua_pop (L, 1)) {
-                                                                       if (lua_isstring (L, -1)) {
-                                                                               const gchar *str;
-                                                                               gsize tlen;
-
-                                                                               str = lua_tolstring (L, -1, &tlen);
-                                                                               guint8 *pstr = rspamd_mempool_alloc (
-                                                                                               task->task_pool,
-                                                                                               tlen);
-                                                                               memcpy (pstr, str, tlen);
-
-                                                                               msg_debug_task ("got unigramm "
-                                                                                               "metatoken string: %*s",
-                                                                                               (gint) tlen, str);
-                                                                               elt.begin = (gchar *) pstr;
-                                                                               elt.len = tlen;
-                                                                               elt.flags |= RSPAMD_STAT_TOKEN_FLAG_UNIGRAM;
-                                                                               g_array_append_val (ar, elt);
-                                                                       }
-                                                               }
-                                                       }
-                                               }
-                                       }
-                               }
-                       }
-               }
-       }
-
-       lua_settop (L, 0);
-       st_ctx->tokenizer->tokenize_func (st_ctx,
-                       task,
-                       ar,
-                       TRUE,
-                       "META:",
-                       task->tokens);
-
        rspamd_mempool_add_destructor (task->task_pool,
                        rspamd_array_free_hard, ar);
 }
@@ -354,7 +236,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
 
 
                if (pdiff != NULL && (1.0 - *pdiff) * 100.0 > similarity_treshold) {
-                       msg_debug_task ("message has two common parts (%.2f), so skip the last one",
+                       msg_debug_bayes ("message has two common parts (%.2f), so skip the last one",
                                        *pdiff);
                        break;
                }
@@ -425,7 +307,7 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
                if (!rspamd_symcache_is_symbol_enabled (task, task->cfg->cache,
                                st->stcf->symbol)) {
                        g_ptr_array_index (task->stat_runtimes, i) = NULL;
-                       msg_debug_task ("symbol %s is disabled, skip classification",
+                       msg_debug_bayes ("symbol %s is disabled, skip classification",
                                        st->stcf->symbol);
                        continue;
                }
@@ -574,7 +456,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
 
                                if (bk_run == NULL) {
                                        skip = TRUE;
-                                       msg_debug_task ("disable classifier %s as statfile symbol %s is disabled",
+                                       msg_debug_bayes ("disable classifier %s as statfile symbol %s is disabled",
                                                        cl->cfg->name, st->stcf->symbol);
                                        break;
                                }
@@ -583,7 +465,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
 
                if (!skip) {
                        if (cl->cfg->min_tokens > 0 && task->tokens->len < cl->cfg->min_tokens) {
-                               msg_debug_task (
+                               msg_debug_bayes (
                                                "<%s> contains less tokens than required for %s classifier: "
                                                "%ud < %ud",
                                                task->message_id,
@@ -593,7 +475,7 @@ rspamd_stat_classifiers_process (struct rspamd_stat_ctx *st_ctx,
                                continue;
                        }
                        else if (cl->cfg->max_tokens > 0 && task->tokens->len > cl->cfg->max_tokens) {
-                               msg_debug_task (
+                               msg_debug_bayes (
                                                "<%s> contains more tokens than allowed for %s classifier: "
                                                "%ud > %ud",
                                                task->message_id,
@@ -1090,7 +972,7 @@ rspamd_stat_has_classifier_symbols (struct rspamd_task *task,
 
                if (rspamd_task_find_symbol_result (task, st->stcf->symbol)) {
                        if (is_spam == !!st->stcf->is_spam) {
-                               msg_debug_task ("do not autolearn %s as symbol %s is already "
+                               msg_debug_bayes ("do not autolearn %s as symbol %s is already "
                                                "added", is_spam ? "spam" : "ham", st->stcf->symbol);
 
                                return TRUE;
index a969e66b060efe9f70ca9b9e51e1ee64aa7e3289..7347e67b147d0bad7f6d8b088925974c52a4b4c3 100644 (file)
@@ -111,6 +111,12 @@ guint rspamd_logger_add_debug_module (const gchar *mod);
                rspamd_##mname##_log_id = rspamd_logger_add_debug_module(#mname); \
 }
 
+#define INIT_LOG_MODULE_PUBLIC(mname) \
+       guint rspamd_##mname##_log_id = (guint)-1; \
+       RSPAMD_CONSTRUCTOR(rspamd_##mname##_log_init) { \
+               rspamd_##mname##_log_id = rspamd_logger_add_debug_module(#mname); \
+}
+
 void rspamd_logger_configure_modules (GHashTable *mods_enabled);
 
 /**