}
/* Probability for this token */
- if (total_count > ctx->cfg->min_token_hits) {
+ if (total_count >= ctx->cfg->min_token_hits) {
spam_freq = ((double)spam_count / MAX (1., (double) ctx->spam_learns));
ham_freq = ((double)ham_count / MAX (1., (double)ctx->ham_learns));
spam_prob = spam_freq / (spam_freq + ham_freq);
lua_settop (L, 0);
st_ctx->tokenizer->tokenize_func (st_ctx,
- task->task_pool,
+ task,
ar,
TRUE,
"META:",
part = g_ptr_array_index (task->text_parts, i);
if (!IS_PART_EMPTY (part) && part->utf_words != NULL) {
- st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool,
+ st_ctx->tokenizer->tokenize_func (st_ctx, task,
part->utf_words, IS_PART_UTF (part),
NULL, task->tokens);
}
words = rspamd_tokenize_subject (task);
if (words != NULL) {
st_ctx->tokenizer->tokenize_func (st_ctx,
- task->task_pool,
+ task,
words,
TRUE,
"SUBJECT",
* OSB tokenizer
*/
+
#include "tokenizers.h"
#include "stat_internal.h"
+#include "libmime/lang_detection.h"
/* Size for features pipe */
#define DEFAULT_FEATURE_WINDOW_SIZE 5
gint
rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
- rspamd_mempool_t *pool,
- GArray *words,
- gboolean is_utf,
- const gchar *prefix,
- GPtrArray *result)
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result)
{
rspamd_token_t *new_tok = NULL;
rspamd_stat_token_t *token;
token = &g_array_index (words, rspamd_stat_token_t, w);
token_flags = token->flags;
+ if (task->lang_det) {
+ if (rspamd_language_detector_is_stop_word (task->lang_det,
+ token->begin, token->len)) {
+ /* Skip it */
+ continue;
+ }
+ }
+
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
rspamd_ftok_t ftok;
}
if (token_flags & RSPAMD_STAT_TOKEN_FLAG_UNIGRAM) {
- new_tok = rspamd_mempool_alloc0 (pool, token_size);
+ new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size);
new_tok->flags = token_flags;
new_tok->t1 = token;
new_tok->t2 = token;
}
#define ADD_TOKEN do {\
- new_tok = rspamd_mempool_alloc0 (pool, token_size); \
+ new_tok = rspamd_mempool_alloc0 (task->task_pool, token_size); \
new_tok->flags = token_flags; \
new_tok->t1 = hashpipe[0].t; \
new_tok->t2 = hashpipe[i].t; \
processed++;
for (i = 1; i < window_size; i++) {
- ADD_TOKEN;
+ if (!(hashpipe[i].t->flags & RSPAMD_STAT_TOKEN_FLAG_EXCEPTION)) {
+ ADD_TOKEN;
+ }
}
}
}
/*
 * Tokenizer descriptor: a name plus two callbacks.
 * This hunk changes the second argument of tokenize_func from a memory
 * pool (rspamd_mempool_t *) to the task itself (struct rspamd_task *).
 */
struct rspamd_stat_tokenizer {
gchar *name;
/*
 * Config callback: takes a pool, a tokenizer config and a gsize pointer
 * (presumably an out-parameter for the config size - TODO confirm).
 */
gpointer (*get_config) (rspamd_mempool_t *pool,
- struct rspamd_tokenizer_config *cf, gsize *len);
+ struct rspamd_tokenizer_config *cf, gsize *len);
/*
 * Tokenize callback. Call sites in this patch pass `task` where they
 * previously passed `task->task_pool`; prefix is "META:", "SUBJECT" or
 * NULL and result is task->tokens at the call sites shown.
 */
gint (*tokenize_func)(struct rspamd_stat_ctx *ctx,
- rspamd_mempool_t *pool,
- GArray *words,
- gboolean is_utf,
- const gchar *prefix,
- GPtrArray *result);
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result);
};
enum rspamd_tokenize_type {
/*
 * OSB tokenize function.
 * With this change the second argument is the task rather than a bare
 * memory pool: the OSB implementation in this patch allocates tokens from
 * task->task_pool and, when task->lang_det is set, skips words reported as
 * stop words by rspamd_language_detector_is_stop_word().
 * `words` is indexed as an array of rspamd_stat_token_t by the
 * implementation.
 */
gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
- rspamd_mempool_t *pool,
- GArray *words,
- gboolean is_utf,
- const gchar *prefix,
- GPtrArray *result);
+ struct rspamd_task *task,
+ GArray *words,
+ gboolean is_utf,
+ const gchar *prefix,
+ GPtrArray *result);
gpointer rspamd_tokenizer_osb_get_config (rspamd_mempool_t *pool,
struct rspamd_tokenizer_config *cf,