# Symbols to check, can also be list of files or regexp:
symbols = "/^[A-Z]{2}_SURBL_MULTI$/i";
};
+
+# Settings files
+settings {
+ # json data for user's settings
+ #user_settings = "file:///some/json/file";
+
+ # json data for domain's settings
+ #domain_settings = "file:///some/other/json/file";
+};
password = "q1";
};
+# Settings for fuzzy storage interface
+worker {
+ type = "fuzzy";
+
+ # Bind socket for control interface
+ bind_socket = localhost:11335;
+
+ count = 1;
+ # Path to filesystem storage
+ hashfile = "/tmp/fuzzy.db";
+};
+
+# Options for lmtp worker
+#worker {
+ #type = "lmtp";
+ # Bind socket for lmtp interface
+ #bind_socket = localhost:11335;
+ # Metric that is considered the main one. If this metric yields a spam
+ # result, lmtp delivery fails
+ #metric = "default";
+ # Number of lmtp workers
+ #count = 1;
+#};
+
+#worker {
+ #type = "delivery";
+ # Path to delivery agent, %f is expanded as mail from address and %r
+ # is expanded as recipient address
+ # Example: agent = "/usr/local/bin/procmail -f %f -d %r"
+ #agent = "/dev/null";
+ # Bind socket for lmtp interface
+ # Example: bind_socket = localhost:25
+
+ # Whether we should use lmtp for MTA delivery
+ #lmtp = no;
+#};
+
+
# Sample metric definition
metric {
# Name of metric
name = "testmetric";
# Score to count message as spam by this metric
required_score = 10.1;
+ # Symbols cache path for optimal checks planning
+ cache_file = "/tmp/symbols.cache";
};
# Logging settings
# Default: 100M
statfile_pool_size = 40M;
-
-# Sample statfile definition
-#statfile {
- # Alias is used for learning and is used as symbol
- #alias = "test.spam";
- # Pattern is path to file, can include %r - recipient name and %f - mail from value
- #pattern = "./test.spam";
- # Weight in spam/ham classifier
- #weight = 1.0;
- # Size of this statfile class
- #size = 10M;
- # Tokenizer for this statfile
- # Deafault: osb-text
- #tokenizer = "osb-text";
-#};
-#statfile {
- #alias = "test.ham";
- #pattern = "./test.ham";
- #weight = -2.0;
- #size = 10M;
-#};
+# Classifier definition
+classifier {
+ # Type of classifier
+ type = "winnow";
+ # Tokenizer used
+ tokenizer = "osb-text";
+ # Sample statfile definition
+ statfile {
+ # Symbol is used for learning and is inserted as the result symbol
+ symbol = "WINNOW_SPAM";
+ # Pattern is path to file, can include %r - recipient name and %f - mail from value
+ path = "/tmp/test.spam";
+ # Size of this statfile class
+ size = 10M;
+ # Tokenizer for this statfile
+ # Default: osb-text
+ #tokenizer = "osb-text";
+ autolearn {
+ min_mark = 10.0;
+ };
+ };
+ statfile {
+ symbol = "WINNOW_HAM";
+ path = "/tmp/test.ham";
+ size = 10M;
+ autolearn {
+ max_mark = 0.1;
+ };
+ };
+};
# Factors coefficients
factors {
"R_MIXED_CHARSET" = 5;
"R_BAD_EMAIL" = 10.5;
};
-# Options for lmtp worker
-#worker {
- #type = "lmtp";
- # Bind socket for lmtp interface
- #bind_socket = localhost:11335;
- # Metric that is considered as main. If we have spam result on
- # this metric, lmtp delivery would be failed
- #metric = "default";
- # Number of lmtp workers
- #count = 1;
-#};
-#worker {
- #type = "delivery";
- # Path to delivery agent, %f is expanded as mail from address and %r
- # is expanded as recipient address
- # Expample: agent = "/usr/local/bin/procmail -f %f -d %r"
- #agent = "/dev/null";
- # Bind socket for lmtp interface
- # Example: bind_socket = localhost:25
-
- # Whether we should use lmtp for MTA delivery
- #lmtp = no;
-#};
# SURBL module params, note that single quotes are mandatory here
.module 'surbl' {
#blacklist = "file:///some/path/emails.lst";
};
+# Module for fuzzy checksum loading
+.module 'fuzzy_check' {
+ metric = "default";
+ symbol = "R_FUZZY";
+ # List of fuzzy storage servers, separated by ',' or ';' or simply by spaces
+ servers = "localhost:11335";
+};
+
# If enables threat each regexp as raw regex and do not try to convert
# each text part to utf8 encoding. Save a lot of resources but less
# portable.
# json data for domain's settings
#domain_settings = "file:///some/other/json/file";
};
+
+# Example of json config:
+# [
+# {
+# "name": "cebka@test.ru",
+# "metrics":
+# {
+# "default": 5.5
+# },
+# "factors":
+# {
+# "R_FUZZY": 10.1
+# },
+# "want_spam": false
+# }
+# ]
* Statfile config definition
*/
struct statfile {
- char *alias; /**< alias of statfile */
- char *pattern; /**< filesystem pattern (with %r or %f) */
- double weight; /**< weight scale */
- char *metric; /**< metric name */
+ char *symbol; /**< symbol of statfile */
+ char *path; /**< filesystem pattern (with %r or %f) */
size_t size; /**< size of statfile */
- struct tokenizer *tokenizer; /**< tokenizer used for statfile */
GList *sections; /**< list of sections in statfile */
struct statfile_autolearn_params *autolearn; /**< autolearn params */
};
+/**
+ * Classifier config definition
+ */
+struct classifier_config {
+ GList *statfiles; /**< statfiles list */
+ char *metric; /**< metric of this classifier */
+ struct classifier *classifier; /**< classifier interface */
+ struct tokenizer *tokenizer; /**< tokenizer used for classifier */
+ GHashTable *opts; /**< other options */
+};
+
/**
* Config option for importing to script module
*/
GHashTable* factors; /**< hash of factors indexed by symbol name */
GHashTable* c_modules; /**< hash of c modules indexed by module name */
GHashTable* composite_symbols; /**< hash of composite symbols indexed by its name */
- GHashTable* statfiles; /**< hash of defined statfiles indexed by alias */
+ GList *classifiers; /**< list of all classifiers defined */
+ GHashTable *classifiers_symbols; /**< hashtable indexed by symbol name of classifiers */
GHashTable* cfg_params; /**< all cfg params indexed by its name in this structure */
int clock_res; /**< resolution of clock used */
GList *views; /**< views */
void unescape_quotes (char *line);
GList* parse_comma_list (memory_pool_t *pool, char *line);
-
+struct classifier_config* check_classifier_cfg (struct config_file *cfg, struct classifier_config *c);
int yylex (void);
int yyparse (void);
%x module
%x lua
%x worker
+%x classifier
%{
YY_BUFFER_STATE include_stack[MAX_INCLUDE_DEPTH];
int line_stack[MAX_INCLUDE_DEPTH];
int include_stack_ptr = 0;
+int nested_depth = 0;
extern struct config_file *cfg;
%}
delivery return DELIVERY;
agent return AGENT;
-statfile return STATFILE;
-alias return ALIAS;
-pattern return PATTERN;
-weight return WEIGHT;
-size return SIZE;
-tokenizer return TOKENIZER;
-classifier return CLASSIFIER;
-section return SECTION;
-autolearn return AUTOLEARN;
-min_mark return MIN_MARK;
-max_mark return MAX_MARK;
+classifier BEGIN(classifier); return CLASSIFIER;
logging return LOGGING;
<module>[ \t]+ /* ignore whitespace */;
<module>[ \t]*#.* /* ignore comments */;
<module>\'[a-zA-Z0-9_-]+\' yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; return MODULE_OPT;
-<module>\{ return OBRACE;
-<module>\} BEGIN(INITIAL); return EBRACE;
+<module>\{ nested_depth ++; return OBRACE;
+<module>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE;
<module>\; return SEMICOLON;
<module>= return EQSIGN;
<module>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
<worker>\n /* ignore EOL */;
<worker>[ \t]+ /* ignore whitespace */;
<worker>[ \t]*#.* /* ignore comments */;
-<worker>\{ return OBRACE;
-<worker>\} BEGIN(INITIAL); return EBRACE;
+<worker>\{ nested_depth ++; return OBRACE;
+<worker>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE;
<worker>\; return SEMICOLON;
<worker>= return EQSIGN;
<worker>type return TYPE;
<worker>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
<worker>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING;
+<classifier>\n /* ignore EOL */;
+<classifier>[ \t]+ /* ignore whitespace */;
+<classifier>[ \t]*#.* /* ignore comments */;
+<classifier>\{ nested_depth ++; return OBRACE;
+<classifier>\} if (--nested_depth == 0) { BEGIN(INITIAL); } return EBRACE;
+<classifier>\; return SEMICOLON;
+<classifier>= return EQSIGN;
+<classifier>type return TYPE;
+<classifier>metric return METRIC; /* must precede the PARAM rule, otherwise "metric" lexes as PARAM and the classifiermetric grammar rule is never reached */
+<classifier>bind_socket return BINDSOCK;
+<classifier>count return COUNT;
+<classifier>statfile return STATFILE;
+<classifier>symbol return SYMBOL;
+<classifier>path return PATH;
+<classifier>size return SIZE;
+<classifier>tokenizer return TOKENIZER;
+<classifier>section return SECTION;
+<classifier>autolearn return AUTOLEARN;
+<classifier>min_mark return MIN_MARK;
+<classifier>max_mark return MAX_MARK;
+<classifier>[0-9]+ yylval.number=strtol(yytext, NULL, 10); return NUMBER;
+<classifier>-?[0-9]+\.?[0-9]* yylval.fract=strtod(yytext, NULL); return FRACT;
+<classifier>[0-9]+[kKmMgG]? yylval.limit=parse_limit(yytext); return SIZELIMIT;
+<classifier>\$[a-zA-Z_][a-zA-Z0-9_]+ yylval.string=strdup(yytext + 1); return VARIABLE;
+<classifier>[a-zA-Z0-9_%-]+ yylval.string=strdup(yytext); return PARAM;
+<classifier>\".+[^\\]\" yylval.string=strdup(yytext + 1); yylval.string[strlen(yylval.string) - 1] = '\0'; unescape_quotes(yylval.string); return QUOTEDSTRING;
+
<lua>\n /* ignore EOL */;
<lua>[ \t]+ /* ignore whitespace */;
<lua>[ \t]*#.* /* ignore comments */;
GList *cur_module_opt = NULL;
struct metric *cur_metric = NULL;
+struct classifier_config *cur_classifier = NULL;
struct statfile *cur_statfile = NULL;
struct statfile_section *cur_section = NULL;
struct statfile_autolearn_params *cur_autolearn = NULL;
%token DELIVERY LMTP ENABLED AGENT SECTION LUACODE RAW_MODE PROFILE_FILE COUNT
%token VIEW IP FROM SYMBOLS
%token AUTOLEARN MIN_MARK MAX_MARK
-%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS
+%token SETTINGS USER_SETTINGS DOMAIN_SETTINGS SYMBOL PATH
%type <string> STRING
%type <string> VARIABLE
| metric
| composites
| logging
- | statfile
+ | classifier
| statfile_pool_size
| luacode
| raw_mode
}
;
+
+classifier:
+ CLASSIFIER OBRACE classifierbody EBRACE {
+ if (cur_classifier == NULL || cur_classifier->classifier == NULL) {
+ yyerror ("yyparse: invalid classifier definition");
+ YYERROR;
+ }
+ if (cur_classifier->metric == NULL) {
+ cur_classifier->metric = DEFAULT_METRIC;
+ }
+ if (cur_classifier->tokenizer == NULL) {
+ cur_classifier->tokenizer = get_tokenizer ("osb-text");
+ }
+
+ cfg->classifiers = g_list_prepend (cfg->classifiers, cur_classifier);
+ cur_classifier = NULL;
+ }
+ ;
+
+classifierbody:
+ | classifiercmd SEMICOLON
+ | classifierbody classifiercmd SEMICOLON
+ ;
+
+classifiercmd:
+ | statfile
+ | classifiertype
+ | classifiermetric
+ | classifiertokenizer
+ | classifieroption
+ ;
+
+classifiertype:
+ TYPE EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ if ((cur_classifier->classifier = get_classifier ($3)) == NULL) {
+ yyerror ("yyparse: unknown classifier type: %s", $3);
+ YYERROR;
+ }
+ }
+ ;
+classifiertokenizer:
+ TOKENIZER EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ if ((cur_classifier->tokenizer = get_tokenizer ($3)) == NULL) {
+ yyerror ("yyparse: unknown tokenizer %s", $3);
+ YYERROR;
+ }
+ }
+ ;
+
+classifiermetric:
+ METRIC EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ cur_classifier->metric = $3;
+ memory_pool_add_destructor (cfg->cfg_pool, g_free, cur_classifier->metric);
+ }
+ ;
+
+classifieroption:
+ PARAM EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ g_hash_table_insert (cur_classifier->opts, $1, $3);
+ memory_pool_add_destructor (cfg->cfg_pool, g_free, $1);
+ memory_pool_add_destructor (cfg->cfg_pool, g_free, $3);
+ };
+
statfile:
STATFILE OBRACE statfilebody EBRACE {
- if (cur_statfile == NULL || cur_statfile->alias == NULL || cur_statfile->pattern == NULL
- || cur_statfile->weight == 0 || cur_statfile->size == 0) {
+ if (cur_statfile == NULL || cur_statfile->path == NULL || cur_statfile->size == 0) {
yyerror ("yyparse: not enough arguments in statfile definition");
YYERROR;
}
- if (cur_statfile->metric == NULL) {
- cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, "default");
- }
- if (cur_statfile->tokenizer == NULL) {
- cur_statfile->tokenizer = get_tokenizer ("osb-text");
- }
- g_hash_table_insert (cfg->statfiles, cur_statfile->alias, cur_statfile);
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
+ cur_classifier->statfiles = g_list_prepend (cur_classifier->statfiles, cur_statfile);
cur_statfile = NULL;
}
;
;
statfilecmd:
- | statfilealias
- | statfilepattern
- | statfileweight
+ | statfilesymbol
+ | statfilepath
| statfilesize
- | statfilemetric
- | statfiletokenizer
| statfilesection
| statfileautolearn
;
-statfilealias:
- ALIAS EQSIGN QUOTEDSTRING {
+statfilesymbol:
+ SYMBOL EQSIGN QUOTEDSTRING {
+ cur_classifier = check_classifier_cfg (cfg, cur_classifier);
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
- cur_statfile->alias = memory_pool_strdup (cfg->cfg_pool, $3);
+ cur_statfile->symbol = memory_pool_strdup (cfg->cfg_pool, $3);
+ g_hash_table_insert (cfg->classifiers_symbols, $3, cur_classifier);
}
;
-statfilepattern:
- PATTERN EQSIGN QUOTEDSTRING {
+statfilepath:
+ PATH EQSIGN QUOTEDSTRING {
if (cur_statfile == NULL) {
cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
}
- cur_statfile->pattern = memory_pool_strdup (cfg->cfg_pool, $3);
+ cur_statfile->path = memory_pool_strdup (cfg->cfg_pool, $3);
}
;
-statfileweight:
- WEIGHT EQSIGN NUMBER {
- if (cur_statfile == NULL) {
- cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
- }
- cur_statfile->weight = $3;
- }
- | WEIGHT EQSIGN FRACT {
- if (cur_statfile == NULL) {
- cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
- }
- cur_statfile->weight = $3;
- }
- ;
statfilesize:
SIZE EQSIGN NUMBER {
}
;
-statfilemetric:
- METRIC EQSIGN QUOTEDSTRING {
- if (cur_statfile == NULL) {
- cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
- }
- cur_statfile->metric = memory_pool_strdup (cfg->cfg_pool, $3);
- }
- ;
-statfiletokenizer:
- TOKENIZER EQSIGN QUOTEDSTRING {
- if (cur_statfile == NULL) {
- cur_statfile = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct statfile));
- }
- if ((cur_statfile->tokenizer = get_tokenizer ($3)) == NULL) {
- yyerror ("yyparse: unknown tokenizer %s", $3);
- YYERROR;
- }
- }
- ;
statfilesection:
SECTION OBRACE sectionbody EBRACE {
cfg->factors = g_hash_table_new (g_str_hash, g_str_equal);
cfg->c_modules = g_hash_table_new (g_str_hash, g_str_equal);
cfg->composite_symbols = g_hash_table_new (g_str_hash, g_str_equal);
- cfg->statfiles = g_hash_table_new (g_str_hash, g_str_equal);
+ cfg->classifiers_symbols = g_hash_table_new (g_str_hash, g_str_equal);
cfg->cfg_params = g_hash_table_new (g_str_hash, g_str_equal);
init_settings (cfg);
g_hash_table_unref (cfg->c_modules);
g_hash_table_remove_all (cfg->composite_symbols);
g_hash_table_unref (cfg->composite_symbols);
- g_hash_table_remove_all (cfg->statfiles);
- g_hash_table_unref (cfg->statfiles);
g_hash_table_remove_all (cfg->cfg_params);
g_hash_table_unref (cfg->cfg_params);
+ g_hash_table_destroy (cfg->classifiers_symbols);
+ g_list_free (cfg->classifiers);
g_list_free (cfg->metrics_list);
memory_pool_delete (cfg->cfg_pool);
}
return res;
}
+/*
+ * Return a classifier config that is ready to receive options: `c` itself
+ * when it is non-NULL, otherwise a new one taken from cfg->cfg_pool with
+ * memory_pool_alloc0. The opts table is created on first use and its
+ * destruction is registered on the same pool.
+ */
+struct classifier_config *
+check_classifier_cfg (struct config_file *cfg, struct classifier_config *c)
+{
+	struct classifier_config *conf = c;
+
+	if (conf == NULL) {
+		conf = memory_pool_alloc0 (cfg->cfg_pool, sizeof (struct classifier_config));
+	}
+	if (conf->opts != NULL) {
+		/* Options table already exists, nothing more to prepare */
+		return conf;
+	}
+
+	conf->opts = g_hash_table_new (g_str_hash, g_str_equal);
+	memory_pool_add_destructor (cfg->cfg_pool, (pool_destruct_func)g_hash_table_destroy, conf->opts);
+
+	return conf;
+}
+
/*
* vi:ts=4
*/
.init_func = winnow_init,
.classify_func = winnow_classify,
.learn_func = winnow_learn,
- .result_file_func = winnow_result_file
},
};
#include "../statfile.h"
#include "../tokenizers/tokenizers.h"
+struct classifier_config;
+struct worker_task;
+
struct classifier_ctx {
memory_pool_t *pool;
GHashTable *results;
+ struct classifier_config *cfg;
};
/* Common classifier structure */
struct classifier {
char *name;
- struct classifier_ctx* (*init_func)(memory_pool_t *pool);
- void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
- char *statfile, GTree *input, double scale);
+ struct classifier_ctx* (*init_func)(memory_pool_t *pool, struct classifier_config *cf);
+ void (*classify_func)(struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
void (*learn_func)(struct classifier_ctx* ctx, statfile_pool_t *pool,
- char *statfile, GTree *input, int in_class);
- char* (*result_file_func)(struct classifier_ctx *ctx, double *probability);
+ char *symbol, GTree *input, gboolean in_class);
};
/* Get classifier structure by name or return NULL if this name is not found */
struct classifier* get_classifier (char *name);
/* Winnow algorithm */
-struct classifier_ctx* winnow_init (memory_pool_t *pool);
-void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale);
-void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class);
-char* winnow_result_file (struct classifier_ctx* ctx, double *probability);
+struct classifier_ctx* winnow_init (memory_pool_t *pool, struct classifier_config *cf);
+void winnow_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task);
+void winnow_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, char *symbol, GTree *input, gboolean in_class);
/* Array of all defined classifiers */
extern struct classifier classifiers[];
*/
#include "classifiers.h"
+#include "../main.h"
+#include "../filter.h"
+#include "../cfg_file.h"
#define WINNOW_PROMOTION 1.23
#define WINNOW_DEMOTION 0.83
}
struct classifier_ctx*
-winnow_init (memory_pool_t *pool)
+winnow_init (memory_pool_t *pool, struct classifier_config *cfg)
{
struct classifier_ctx *ctx = memory_pool_alloc (pool, sizeof (struct classifier_ctx));
ctx->pool = pool;
- ctx->results = g_hash_table_new (g_str_hash, g_str_equal);
- memory_pool_add_destructor (pool, (pool_destruct_func)g_hash_table_destroy, ctx->results);
+ ctx->cfg = cfg;
return ctx;
}
void
-winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, double scale)
+winnow_classify (struct classifier_ctx *ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task)
{
struct winnow_callback_data data;
double *res = memory_pool_alloc (ctx->pool, sizeof (double));
+ double max = 0;
+ GList *cur;
+ struct statfile *st, *sel = NULL;
g_assert (pool != NULL);
g_assert (ctx != NULL);
data.count = 0;
data.now = time (NULL);
data.ctx = ctx;
-
- if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) {
- if ((data.file = statfile_pool_open (pool, statfile)) == NULL) {
- return;
+
+ cur = ctx->cfg->statfiles;
+ while (cur) {
+ st = cur->data;
+ if ((data.file = statfile_pool_is_open (pool, st->path)) == NULL) {
+ if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
+ msg_warn ("winnow_classify: cannot open %s, skip it", st->path);
+ cur = g_list_next (cur);
+ continue;
+ }
}
- }
- g_tree_foreach (input, classify_callback, &data);
+ g_tree_foreach (input, classify_callback, &data);
- if (data.count != 0) {
- *res = scale * (data.sum / data.count);
+ if (data.count != 0) {
+ *res = (data.sum / data.count);
+ }
+ else {
+ *res = 0;
+ }
+ if (*res > max) {
+ max = *res;
+ sel = st;
+ }
+ cur = g_list_next (cur);
}
- else {
- *res = 0;
+
+ if (sel != NULL) {
+ insert_result (task, ctx->cfg->metric, sel->symbol, 1, NULL);
}
-
- g_hash_table_insert (ctx->results, statfile, res);
}
void
-winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *statfile, GTree *input, int in_class)
+winnow_learn (struct classifier_ctx *ctx, statfile_pool_t *pool, char *symbol, GTree *input, int in_class)
{
struct winnow_callback_data data;
+ GList *cur;
+ struct statfile *st;
g_assert (pool != NULL);
g_assert (ctx != NULL);
data.in_class = in_class;
data.now = time (NULL);
data.ctx = ctx;
-
- if ((data.file = statfile_pool_is_open (pool, statfile)) == NULL) {
- if ((data.file = statfile_pool_open (pool, statfile)) == NULL) {
- return;
+
+ cur = g_list_first (ctx->cfg->statfiles);
+ while (cur) {
+ st = cur->data;
+ if (strcmp (symbol, st->symbol) == 0) {
+ if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
+ /* Try to create statfile */
+ if (statfile_pool_create (pool,
+ st->path, st->size / sizeof (struct stat_file_block)) == -1) {
+ msg_err ("winnow_learn: cannot create statfile %s", st->path);
+ return;
+ }
+ if ((data.file = statfile_pool_open (pool, st->path)) == NULL) {
+ msg_err ("winnow_learn: cannot create statfile %s", st->path);
+ return;
+ }
+ }
+ break;
}
+ cur = g_list_next (cur);
}
statfile_pool_lock_file (pool, data.file);
g_tree_foreach (input, learn_callback, &data);
statfile_pool_unlock_file (pool, data.file);
}
-
-struct winnow_result_data {
- char *filename;
- double max_score;
- double sum;
-};
-
-static void
-result_file_callback (gpointer key, gpointer value, gpointer data)
-{
- struct winnow_result_data *d = (struct winnow_result_data *)data;
- double w = *((double *)value);
-
- if (fabs (w) > fabs (d->max_score)) {
- d->filename = (char *)key;
- d->max_score = w;
- }
- d->sum += fabs (w);
-}
-
-char*
-winnow_result_file (struct classifier_ctx* ctx, double *probability)
-{
- struct winnow_result_data data = { NULL, 0, 0 };
- g_assert (ctx != NULL);
-
- g_hash_table_foreach (ctx->results, result_file_callback, &data);
- if (data.sum != 0) {
- *probability = data.max_score / data.sum;
- }
- else {
- *probability = 1;
- }
-
- return data.filename;
-}
int r = 0, days, hours, minutes;
time_t uptime;
unsigned long size = 0;
- struct statfile *statfile;
- stat_file_t *file;
- struct metric *metric;
+ struct classifier_config *cl;
memory_pool_stat_t mem_st;
char *password = g_hash_table_lookup (session->worker->cf->params, "password");
return;
}
- statfile = g_hash_table_lookup (session->cfg->statfiles, *cmd_args);
- if (statfile == NULL) {
+ session->learn_symbol = *cmd_args;
+ cl = g_hash_table_lookup (session->cfg->classifiers_symbols, *cmd_args);
+ if (cl == NULL) {
r = snprintf (out_buf, sizeof (out_buf), "statfile %s is not defined" CRLF, *cmd_args);
rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
return;
}
+ session->learn_classifier = cl;
- metric = g_hash_table_lookup (session->cfg->metrics, statfile->metric);
-
- session->learn_rcpt = NULL;
- session->learn_from = NULL;
- session->learn_filename = NULL;
- session->learn_tokenizer = statfile->tokenizer;
- if (metric != NULL) {
- session->learn_classifier = metric->classifier;
- }
- else {
- session->learn_classifier = get_classifier ("winnow");
- }
/* By default learn positive */
session->in_class = 1;
/* Get all arguments */
}
}
}
- session->learn_filename = resolve_stat_filename (session->session_pool, statfile->pattern,
- session->learn_rcpt, session->learn_from);
- if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) {
- /* Try to create statfile */
- if (statfile_pool_create (session->worker->srv->statfile_pool,
- session->learn_filename, statfile->size / sizeof (struct stat_file_block)) == -1) {
- r = snprintf (out_buf, sizeof (out_buf), "cannot create statfile %s" CRLF, session->learn_filename);
- rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
- return;
- }
- if ((file = statfile_pool_open (session->worker->srv->statfile_pool, session->learn_filename)) == NULL) {
- r = snprintf (out_buf, sizeof (out_buf), "cannot open statfile %s" CRLF, session->learn_filename);
- rspamd_dispatcher_write (session->dispatcher, out_buf, r, FALSE, FALSE);
- return;
- }
- }
rspamd_set_dispatcher_policy (session->dispatcher, BUFFER_CHARACTER, size);
session->state = STATE_LEARN;
}
while ((content = get_next_text_part (session->session_pool, session->parts, &cur)) != NULL) {
c.begin = content->data;
c.len = content->len;
- if (!session->learn_tokenizer->tokenize_func (session->learn_tokenizer,
+ if (!session->learn_classifier->tokenizer->tokenize_func (session->learn_classifier->tokenizer,
session->session_pool, &c, &tokens)) {
i = snprintf (out_buf, sizeof (out_buf), "learn fail, tokenizer error" CRLF);
rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE);
return;
}
}
- cls_ctx = session->learn_classifier->init_func (session->session_pool);
- session->learn_classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool,
- session->learn_filename, tokens, session->in_class);
+ cls_ctx = session->learn_classifier->classifier->init_func (session->session_pool, session->learn_classifier);
+ session->learn_classifier->classifier->learn_func (cls_ctx, session->worker->srv->statfile_pool,
+ session->learn_symbol, tokens, session->in_class);
session->worker->srv->stat->messages_learned ++;
i = snprintf (out_buf, sizeof (out_buf), "learn ok" CRLF);
rspamd_dispatcher_write (session->dispatcher, out_buf, i, FALSE, FALSE);
return FALSE;
}
-static void
+void
process_autolearn (struct statfile *st, struct worker_task *task, GTree *tokens,
struct classifier *classifier, char *filename, struct classifier_ctx* ctx)
{
}
}
- classifier->learn_func (ctx, task->worker->srv->statfile_pool, filename, tokens, 1);
+ /* learn_func now selects the statfile by symbol; passing the path in filename never matches one */
+ classifier->learn_func (ctx, task->worker->srv->statfile_pool, st->symbol, tokens, TRUE);
}
}
}
g_hash_table_foreach (task->results, composites_metric_callback, task);
}
-struct statfile_result_data {
- struct metric *metric;
- struct classifier_ctx *ctx;
-};
struct statfile_callback_data {
GHashTable *tokens;
- GHashTable *classifiers;
struct worker_task *task;
};
static void
-statfiles_callback (gpointer key, gpointer value, void *arg)
+classifiers_callback (gpointer value, void *arg)
{
struct statfile_callback_data *data= (struct statfile_callback_data *)arg;
struct worker_task *task = data->task;
- struct statfile *st = (struct statfile *)value;
- struct classifier *classifier;
- struct statfile_result_data *res_data;
- struct metric *metric;
+ struct classifier_config *cl = value;
+ struct classifier_ctx *ctx;
struct mime_text_part *text_part;
-
+ struct statfile *st;
GTree *tokens = NULL;
GList *cur;
-
- char *filename;
f_str_t c;
- if (g_list_length (task->rcpt) == 1) {
- filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, (char *)task->rcpt->data);
- }
- else {
- /* XXX: handle multiply recipients correctly */
- filename = resolve_stat_filename (task->task_pool, st->pattern, task->from, "");
- }
-
- if (statfile_pool_open (task->worker->srv->statfile_pool, filename) == NULL && !check_autolearn (st->autolearn, task)) {
- return;
- }
-
cur = g_list_first (task->text_parts);
- if ((tokens = g_hash_table_lookup (data->tokens, st->tokenizer)) == NULL) {
+ if ((tokens = g_hash_table_lookup (data->tokens, cl->tokenizer)) == NULL) {
while (cur != NULL) {
text_part = (struct mime_text_part *)cur->data;
if (text_part->is_empty) {
c.begin = text_part->content->data;
c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
- if (!st->tokenizer->tokenize_func (st->tokenizer, task->task_pool, &c, &tokens)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens)) {
msg_info ("statfiles_callback: cannot tokenize input");
return;
}
cur = g_list_next (cur);
}
- g_hash_table_insert (data->tokens, st->tokenizer, tokens);
+ g_hash_table_insert (data->tokens, cl->tokenizer, tokens);
}
- metric = g_hash_table_lookup (task->cfg->metrics, st->metric);
- if (metric == NULL) {
- classifier = get_classifier ("winnow");
- }
- else {
- classifier = metric->classifier;
- }
- if ((res_data = g_hash_table_lookup (data->classifiers, classifier)) == NULL) {
- res_data = memory_pool_alloc (task->task_pool, sizeof (struct statfile_result_data));
- res_data->ctx = classifier->init_func (task->task_pool);
- res_data->metric = metric;
- g_hash_table_insert (data->classifiers, classifier, res_data);
- }
+ ctx = cl->classifier->init_func (task->task_pool, cl);
+ cl->classifier->classify_func (ctx, task->worker->srv->statfile_pool, tokens, task);
- classifier->classify_func (res_data->ctx, task->worker->srv->statfile_pool, filename, tokens, st->weight);
-
- if (st->autolearn) {
- /* Process autolearn */
- process_autolearn (st, task, tokens, classifier, filename, res_data->ctx);
+ /* Autolearning */
+ cur = g_list_first (cl->statfiles);
+ while (cur) {
+ st = cur->data;
+ if (st->autolearn) {
+ if (check_autolearn (st->autolearn, task)) {
+ /* Process autolearn */
+ process_autolearn (st, task, tokens, cl->classifier, st->path, ctx);
+ }
+ }
+ cur = g_list_next (cur);
}
}
-static void
-statfiles_results_callback (gpointer key, gpointer value, void *arg)
-{
- struct worker_task *task = (struct worker_task *)arg;
- struct statfile_result_data *res = (struct statfile_result_data *)value;
- struct classifier *classifier = (struct classifier *)key;
- double *w;
- char *filename;
-
- w = memory_pool_alloc (task->task_pool, sizeof (double));
- filename = classifier->result_file_func (res->ctx, w);
- insert_result (task, res->metric->name, classifier->name, *w, NULL);
- msg_debug ("statfiles_results_callback: got total weight %.2f for metric %s", *w, res->metric->name);
-}
-
void
process_statfiles (struct worker_task *task)
cd.task = task;
cd.tokens = g_hash_table_new (g_direct_hash, g_direct_equal);
- cd.classifiers = g_hash_table_new (g_str_hash, g_str_equal);
- g_hash_table_foreach (task->cfg->statfiles, statfiles_callback, &cd);
- g_hash_table_foreach (cd.classifiers, statfiles_results_callback, task);
-
+ g_list_foreach (task->cfg->classifiers, classifiers_callback, &cd);
g_hash_table_destroy (cd.tokens);
- g_hash_table_destroy (cd.classifiers);
- /* Process results */
- g_hash_table_foreach (task->results, metric_process_callback_forced, task);
+ /* Process results */
task->state = WRITE_REPLY;
}
#define RSPAMD_LUA_H
#include "../config.h"
+#ifdef WITH_LUA
+
#include "../main.h"
#include "../cfg_file.h"
#include <lua.h>
double lua_consolidation_func (struct worker_task *task, const char *metric_name, const char *function_name);
void add_luabuf (const char *line);
-#endif
+#endif /* WITH_LUA */
+#endif /* RSPAMD_LUA_H */
struct config_file;
struct tokenizer;
struct classifier;
+struct classifier_config;
struct mime_part;
struct rspamd_view;
struct config_file *cfg; /**< pointer to config file */
char *learn_rcpt; /**< recipient for learning */
char *learn_from; /**< from address for learning */
- struct tokenizer *learn_tokenizer; /**< tokenizer for learning */
- struct classifier *learn_classifier; /**< classifier for learning */
- char *learn_filename; /**< real filename for learning */
+ struct classifier_config *learn_classifier;
+ char *learn_symbol; /**< symbol to train */
rspamd_io_dispatcher_t *dispatcher; /**< IO dispatcher object */
f_str_t *learn_buf; /**< learn input */
GList *parts; /**< extracted mime parts */
task->rcpt = g_list_prepend (task->rcpt, tmp);
msg_debug ("parse_header: read rcpt header, value: %s", tmp);
}
- else {
- msg_info ("parse_header: wrong header: %s", headern);
- return -1;
- }
- break;
- case 'n':
- case 'N':
- /* nrcpt */
- if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) {
+ else if (strncasecmp (headern, NRCPT_HEADER, sizeof (NRCPT_HEADER) - 1) == 0) {
tmp = memory_pool_fstrdup (task->task_pool, line);
task->nrcpt = strtoul (tmp, &err, 10);
msg_debug ("parse_header: read rcpt header, value: %d", (int)task->nrcpt);
item = &cache->items[0];
}
else {
+ if (cache == NULL) {
+ return FALSE;
+ }
/* Next pointer */
if (*saved_item - cache->items >= cache->used_items - 1) {
/* No more items in cache */
rspamd_statfile_test_func ()
{
statfile_pool_t *pool;
+ stat_file_t *st;
uint32_t random_hashes[HASHES_NUM], i, v;
time_t now;
/* Create new file */
g_assert (statfile_pool_create (pool, TEST_FILENAME, 65535) != -1);
- g_assert (statfile_pool_open (pool, TEST_FILENAME) != -1);
+ g_assert ((st = statfile_pool_open (pool, TEST_FILENAME)) != NULL);
/* Get and set random blocks */
- statfile_pool_lock_file (pool, TEST_FILENAME);
+ statfile_pool_lock_file (pool, st);
for (i = 0; i < HASHES_NUM; i ++) {
- statfile_pool_set_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now, 1.0);
+ statfile_pool_set_block (pool, st, random_hashes[i], random_hashes[i], now, 1.0);
}
- statfile_pool_unlock_file (pool, TEST_FILENAME);
+ statfile_pool_unlock_file (pool, st);
for (i = 0; i < HASHES_NUM; i ++) {
- v = statfile_pool_get_block (pool, TEST_FILENAME, random_hashes[i], random_hashes[i], now);
+ v = statfile_pool_get_block (pool, st, random_hashes[i], random_hashes[i], now);
g_assert(v == 1.0);
}
g_test_add_func ("/rspamd/statfile", rspamd_statfile_test_func);
g_test_run ();
+
+ return 0;
}
}
memory_pool_delete (pool);
+
+ return 0;
}