From 7e4b88f7bc3ccccbf462a62de865beff388deae6 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 19 Apr 2016 15:56:41 +0100 Subject: [PATCH] [Doc] Improve classifiers documentation --- doc/markdown/configuration/statistic.md | 129 ++++++++++++++---------- 1 file changed, 75 insertions(+), 54 deletions(-) diff --git a/doc/markdown/configuration/statistic.md b/doc/markdown/configuration/statistic.md index f314a31a65..18e870652c 100644 --- a/doc/markdown/configuration/statistic.md +++ b/doc/markdown/configuration/statistic.md @@ -17,7 +17,7 @@ This schema is displayed in the following picture: ![OSB algorithm](https://rspamd.com/img/rspamd-schemes.004.png "Rspamd OSB scheme") -The main disadvantage is the amount of tokens which is multiplied by size of window. In rspamd, we use a window of 5 tokens that means that +The main disadvantage is the amount of tokens which is multiplied by size of window. In rspamd, we use a window of 5 tokens that means that the number of tokens is about 5 times larger than the amount of words. Statistical tokens are stored in statfiles which, in turn, are mapped to specific backends. This architecture is displayed in the following image: @@ -30,15 +30,24 @@ Starting from rspamd 1.0, we propose to use `sqlite3` as backed and `osb` as tok metainformation in statistics. 
The following configuration demonstrates the recommended statistics configuration: ~~~ucl -classifier { - type = "bayes"; +# Classifier's algorithm is BAYES +classifier "bayes" { tokenizer { name = "osb"; } + + # Unique name used to learn the specific classifier + name = "common_bayes"; + cache { path = "${DBDIR}/learn_cache.sqlite"; } + + # Minimum number of words required for statistics processing min_tokens = 11; + # Minimum learn count for both spam and ham classes to perform classification + min_learns = 200; + backend = "sqlite3"; languages_enabled = true; statfile { @@ -67,15 +76,19 @@ It is also possible to create custom lua scripts to use customized user or langu of such a script for extracting domain names from recipients organizing thus per-domain statistics: ~~~ucl - classifier { - tokenizer { - name = "osb"; - } - name = "bayes2"; - min_tokens = 11; - backend = "sqlite3"; - per_language = true; - per_user = <