git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Allow configurable tokenizers.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 22 Feb 2015 21:32:22 +0000 (21:32 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 22 Feb 2015 21:32:22 +0000 (21:32 +0000)
src/libserver/cfg_file.h
src/libserver/cfg_rcl.c
src/libstat/stat_process.c
src/libstat/tokenizers/osb.c
src/libstat/tokenizers/tokenizers.h

index a0eb149df47511f1da6fae3d13fdaee48890cf75..3b6191306a67aa9ec439d03842d3ad430c96e5cf 100644 (file)
@@ -141,6 +141,11 @@ struct rspamd_statfile_config {
        gpointer data;                                                                  /**< opaque data                                                                                */
 };
 
+struct rspamd_tokenizer_config {
+       const ucl_object_t *opts;                        /**< tokenizer options: the UCL object that carried the "name" key; not assigned when the tokenizer is configured as a plain string */
+       const gchar *name;                                                              /**< name of tokenizer; when NULL the runtime lookup falls back to RSPAMD_DEFAULT_TOKENIZER */
+};
+
 /**
  * Classifier config definition
  */
@@ -149,7 +154,7 @@ struct rspamd_classifier_config {
        GHashTable *labels;                             /**< statfiles with labels                                                              */
        gchar *metric;                                  /**< metric of this classifier                          */
        gchar *classifier;                                      /**< classifier interface                               */
-       gchar *tokenizer;                                       /**< tokenizer used for classifier                                              */
+       struct rspamd_tokenizer_config *tokenizer;      /**< tokenizer used for classifier                                              */
        ucl_object_t *opts;                             /**< other options                                      */
        GList *pre_callbacks;                           /**< list of callbacks that are called before classification */
        GList *post_callbacks;                          /**< list of callbacks that are called after classification */
index eece86fb7e74ee6fc83c76bea86e8f1850a97d3a..0ba9423a1d5e755ed61e8690e038d36a147fa01b 100644 (file)
@@ -930,6 +930,7 @@ rspamd_rcl_classifier_handler (struct rspamd_config *cfg,
        struct rspamd_classifier_config *ccf;
        gboolean res = TRUE;
        struct rspamd_rcl_section *stat_section;
+       struct rspamd_tokenizer_config *tkcf = NULL;
 
        ccf = rspamd_config_new_classifier (cfg, NULL);
 
@@ -960,6 +961,19 @@ rspamd_rcl_classifier_handler (struct rspamd_config *cfg,
                                                }
                                        }
                                }
+                               else if (g_ascii_strcasecmp (key, "tokenizer") == 0) {
+                                       tkcf = rspamd_mempool_alloc0 (cfg->cfg_pool, sizeof (*tkcf));
+                                       if (ucl_object_type (val) == UCL_STRING) {
+                                               tkcf->name = ucl_object_tostring (val);
+                                       }
+                                       else if (ucl_object_type (val) == UCL_OBJECT) {
+                                               cur = ucl_object_find_key (val, "name");
+                                               if (cur != NULL) {
+                                                       tkcf->name = ucl_object_tostring (cur);
+                                                       tkcf->opts = val;
+                                               }
+                                       }
+                               }
                        }
                }
        }
@@ -968,6 +982,7 @@ rspamd_rcl_classifier_handler (struct rspamd_config *cfg,
        }
 
        ccf->opts = (ucl_object_t *)obj;
+       ccf->tokenizer = tkcf;
        cfg->classifiers = g_list_prepend (cfg->classifiers, ccf);
 
 
@@ -1356,11 +1371,6 @@ rspamd_rcl_config_init (void)
                rspamd_rcl_parse_struct_string,
                G_STRUCT_OFFSET (struct rspamd_classifier_config, classifier),
                0);
-       rspamd_rcl_add_default_handler (sub,
-               "tokenizer",
-               rspamd_rcl_parse_struct_string,
-               G_STRUCT_OFFSET (struct rspamd_classifier_config, tokenizer),
-               0);
        rspamd_rcl_add_default_handler (sub,
                "min_tokens",
                rspamd_rcl_parse_struct_integer,
index 511a9f8003889c49d449d977a342e82d6e43d9eb..eafbe209266099e69f756b3f784addc1a73959d2 100644 (file)
@@ -43,10 +43,19 @@ struct preprocess_cb_data {
 };
 
 static struct rspamd_tokenizer_runtime *
-rspamd_stat_get_tokenizer_runtime (const gchar *name, rspamd_mempool_t *pool,
+rspamd_stat_get_tokenizer_runtime (struct rspamd_tokenizer_config *cf,
+               rspamd_mempool_t *pool,
                struct rspamd_tokenizer_runtime **ls)
 {
        struct rspamd_tokenizer_runtime *tok = NULL, *cur;
+       const gchar *name;
+
+       if (cf == NULL || cf->name == NULL) {
+               name = RSPAMD_DEFAULT_TOKENIZER;
+       }
+       else {
+               name = cf->name;
+       }
 
        LL_FOREACH (*ls, cur) {
                if (strcmp (cur->name, name) == 0) {
@@ -270,7 +279,8 @@ rspamd_stat_preprocess (struct rspamd_stat_ctx *st_ctx,
  * Tokenize task using the tokenizer specified
  */
 static void
-rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
+rspamd_stat_process_tokenize (struct rspamd_tokenizer_config *cf,
+               struct rspamd_stat_ctx *st_ctx,
                struct rspamd_task *task, struct rspamd_tokenizer_runtime *tok)
 {
        struct mime_text_part *part;
@@ -287,7 +297,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
                        /*
                         * XXX: Use normalized words if needed here
                         */
-                       tok->tokenizer->tokenize_func (tok->tokenizer, task->task_pool,
+                       tok->tokenizer->tokenize_func (cf, task->task_pool,
                                        part->words, tok->tokens, part->is_utf);
                }
 
@@ -304,7 +314,7 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
        if (sub != NULL) {
                words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL);
                if (words != NULL) {
-                       tok->tokenizer->tokenize_func (tok->tokenizer,
+                       tok->tokenizer->tokenize_func (cf,
                                        task->task_pool,
                                        words,
                                        tok->tokens,
@@ -349,11 +359,12 @@ rspamd_stat_classify (struct rspamd_task *task, lua_State *L, GError **err)
 
                if (tok == NULL) {
                        g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined"
-                                       "for tokenizers", clcf->tokenizer);
+                                       "for tokenizers", clcf->tokenizer ?
+                                                       clcf->tokenizer->name : "unknown");
                        return RSPAMD_STAT_PROCESS_ERROR;
                }
 
-               rspamd_stat_process_tokenize (st_ctx, task, tok);
+               rspamd_stat_process_tokenize (clcf->tokenizer, st_ctx, task, tok);
 
                cur = g_list_next (cur);
        }
@@ -487,11 +498,12 @@ rspamd_stat_learn (struct rspamd_task *task, gboolean spam, lua_State *L,
 
                if (tok == NULL) {
                        g_set_error (err, rspamd_stat_quark (), 500, "type %s is not defined"
-                                       "for tokenizers", clcf->tokenizer);
+                                       "for tokenizers", clcf->tokenizer ?
+                                                       clcf->tokenizer->name : "unknown");
                        return RSPAMD_STAT_PROCESS_ERROR;
                }
 
-               rspamd_stat_process_tokenize (st_ctx, task, tok);
+               rspamd_stat_process_tokenize (clcf->tokenizer, st_ctx, task, tok);
 
                cur = g_list_next (cur);
        }
index 0a8d01ce1328b37ca3b95baee67a7d17c17ebd1e..b51e909a9f789d4b0d62803a1df2c0708fbc8fb1 100644 (file)
@@ -38,7 +38,7 @@
 extern const int primes[];
 
 int
-osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer,
+osb_tokenize_text (struct rspamd_tokenizer_config *cf,
        rspamd_mempool_t * pool,
        GArray * input,
        GTree * tree,
index d4c116e1327c35a8691869bfd6ad57c63db2218c..0bc5948423b30f00fe981564443148cab109ac6d 100644 (file)
@@ -12,7 +12,7 @@
 /* Common tokenizer structure */
 struct rspamd_stat_tokenizer {
        gchar *name;
-       gint (*tokenize_func)(struct rspamd_stat_tokenizer *rspamd_stat_tokenizer,
+       gint (*tokenize_func)(struct rspamd_tokenizer_config *cf,
                        rspamd_mempool_t *pool,
                        GArray *words,
                        GTree *result,
@@ -31,7 +31,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf,
                gsize min_len, GList **exceptions);
 
 /* OSB tokenize function */
-int osb_tokenize_text (struct rspamd_stat_tokenizer *tokenizer,
+int osb_tokenize_text (struct rspamd_tokenizer_config *cf,
        rspamd_mempool_t *pool,
        GArray *input,
        GTree *tokens,