From 36c874383c4c56fb10c737a3f5932abc173080e2 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 29 Nov 2018 15:07:26 +0000 Subject: [PATCH] [Feature] Add more words regexp classes --- src/libmime/mime_expressions.c | 8 +++++++ src/libserver/re_cache.c | 41 +++++++++++++++++++++++++++++----- src/libserver/re_cache.h | 2 ++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c index c6d258c494..535b8a1247 100644 --- a/src/libmime/mime_expressions.c +++ b/src/libmime/mime_expressions.c @@ -243,6 +243,14 @@ rspamd_parse_long_option (const gchar *start, gsize len, ret = TRUE; a->type = RSPAMD_RE_WORDS; } + else if (TYPE_CHECK (start, "raw_words", len)) { + ret = TRUE; + a->type = RSPAMD_RE_RAWWORDS; + } + else if (TYPE_CHECK (start, "stem_words", len)) { + ret = TRUE; + a->type = RSPAMD_RE_STEMWORDS; + } else if (TYPE_CHECK (start, "selector", len)) { ret = TRUE; a->type = RSPAMD_RE_SELECTOR; diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index c2c7464fc2..b323ffa0e3 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -1222,8 +1222,11 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, } break; case RSPAMD_RE_WORDS: + case RSPAMD_RE_STEMWORDS: + case RSPAMD_RE_RAWWORDS: if (task->text_parts->len > 0) { cnt = 0; + raw = FALSE; PTR_ARRAY_FOREACH (task->text_parts, i, part) { if (part->utf_words) { @@ -1241,22 +1244,50 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, guint j; rspamd_stat_token_t *tok; - if (part->utf_words) { for (j = 0; j < part->utf_words->len; j ++) { tok = &g_array_index (part->utf_words, rspamd_stat_token_t, j); - if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { - scvec[cnt] = tok->normalized.begin; - lenvec[cnt++] = tok->normalized.len; + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) { + if (!re_class->has_utf8) { + raw = TRUE; + } + else { + continue; /* Skip */ + } + } + } + else { + continue; /* Skip non text */ + } + + if (re_class->type == RSPAMD_RE_RAWWORDS) { + if (tok->original.len > 0) { + scvec[cnt] = tok->original.begin; + lenvec[cnt++] = tok->original.len; + } + } + else if (re_class->type == RSPAMD_RE_WORDS) { + if (tok->normalized.len > 0) { + scvec[cnt] = tok->normalized.begin; + lenvec[cnt++] = tok->normalized.len; + } + } + else { + /* Stemmed words */ + if (tok->stemmed.len > 0) { + scvec[cnt] = tok->stemmed.begin; + lenvec[cnt++] = tok->stemmed.len; + } } } } } ret = rspamd_re_cache_process_regexp_data (rt, re, - task, scvec, lenvec, cnt, TRUE); + task, scvec, lenvec, cnt, raw); msg_debug_re_task ("checking sa words regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); diff --git a/src/libserver/re_cache.h b/src/libserver/re_cache.h index 596ea08c27..15146c5dd8 100644 --- a/src/libserver/re_cache.h +++ b/src/libserver/re_cache.h @@ -36,6 +36,8 @@ enum rspamd_re_type { RSPAMD_RE_SABODY, /* body in SA */ RSPAMD_RE_SARAWBODY, /* rawbody in SA */ RSPAMD_RE_WORDS, /* normalized words */ + RSPAMD_RE_RAWWORDS, /* raw words */ + RSPAMD_RE_STEMWORDS, /* stemmed words */ RSPAMD_RE_SELECTOR, /* use lua selector to process regexp */ RSPAMD_RE_MAX }; -- 2.47.3