git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add more words regexp classes
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 29 Nov 2018 15:07:26 +0000 (15:07 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 29 Nov 2018 15:07:26 +0000 (15:07 +0000)
src/libmime/mime_expressions.c
src/libserver/re_cache.c
src/libserver/re_cache.h

index c6d258c494e17eaed62d02b046f2f24c23b680de..535b8a12476bcfe4b8ade43175315fdc0054f446 100644 (file)
@@ -243,6 +243,14 @@ rspamd_parse_long_option (const gchar *start, gsize len,
                ret = TRUE;
                a->type = RSPAMD_RE_WORDS;
        }
+       else if (TYPE_CHECK (start, "raw_words", len)) {
+               ret = TRUE;
+               a->type = RSPAMD_RE_RAWWORDS;
+       }
+       else if (TYPE_CHECK (start, "stem_words", len)) {
+               ret = TRUE;
+               a->type = RSPAMD_RE_STEMWORDS;
+       }
        else if (TYPE_CHECK (start, "selector", len)) {
                ret = TRUE;
                a->type = RSPAMD_RE_SELECTOR;
index c2c7464fc282b10f2b9d82e1d76a5cd3563b88a1..b323ffa0e390ee0c6aabb873c9130978722b8de2 100644 (file)
@@ -1222,8 +1222,11 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
                }
                break;
        case RSPAMD_RE_WORDS:
+       case RSPAMD_RE_STEMWORDS:
+       case RSPAMD_RE_RAWWORDS:
                if (task->text_parts->len > 0) {
                        cnt = 0;
+                       raw = FALSE;
 
                        PTR_ARRAY_FOREACH (task->text_parts, i, part) {
                                if (part->utf_words) {
@@ -1241,22 +1244,50 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
                                        guint j;
                                        rspamd_stat_token_t *tok;
 
-
                                        if (part->utf_words) {
                                                for (j = 0; j < part->utf_words->len; j ++) {
                                                        tok = &g_array_index (part->utf_words,
                                                                        rspamd_stat_token_t, j);
 
-                                                       if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
-                                                               scvec[cnt] = tok->normalized.begin;
-                                                               lenvec[cnt++] = tok->normalized.len;
+                                                       if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+                                                               if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
+                                                                       if (!re_class->has_utf8) {
+                                                                               raw = TRUE;
+                                                                       }
+                                                                       else {
+                                                                               continue; /* Skip */
+                                                                       }
+                                                               }
+                                                       }
+                                                       else {
+                                                               continue; /* Skip non text */
+                                                       }
+
+                                                       if (re_class->type == RSPAMD_RE_RAWWORDS) {
+                                                               if (tok->original.len > 0) {
+                                                                       scvec[cnt] = tok->original.begin;
+                                                                       lenvec[cnt++] = tok->original.len;
+                                                               }
+                                                       }
+                                                       else if (re_class->type == RSPAMD_RE_WORDS) {
+                                                               if (tok->normalized.len > 0) {
+                                                                       scvec[cnt] = tok->normalized.begin;
+                                                                       lenvec[cnt++] = tok->normalized.len;
+                                                               }
+                                                       }
+                                                       else {
+                                                               /* Stemmed words */
+                                                               if (tok->stemmed.len > 0) {
+                                                                       scvec[cnt] = tok->stemmed.begin;
+                                                                       lenvec[cnt++] = tok->stemmed.len;
+                                                               }
                                                        }
                                                }
                                        }
                                }
 
                                ret = rspamd_re_cache_process_regexp_data (rt, re,
-                                               task, scvec, lenvec, cnt, TRUE);
+                                               task, scvec, lenvec, cnt, raw);
 
                                msg_debug_re_task ("checking sa words regexp: %s -> %d",
                                                rspamd_regexp_get_pattern (re), ret);
index 596ea08c271313de623973e66cf012b2849c78da..15146c5dd8106a1fa13282c54050b9bba90b0ca8 100644 (file)
@@ -36,6 +36,8 @@ enum rspamd_re_type {
        RSPAMD_RE_SABODY, /* body in SA */
        RSPAMD_RE_SARAWBODY, /* rawbody in SA */
        RSPAMD_RE_WORDS, /* normalized words */
+       RSPAMD_RE_RAWWORDS, /* raw words */
+       RSPAMD_RE_STEMWORDS, /* stemmed words */
        RSPAMD_RE_SELECTOR, /* use lua selector to process regexp */
        RSPAMD_RE_MAX
 };