git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Project] Finish basic tasks in new unicode project
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 17:34:08 +0000 (17:34 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 17:34:08 +0000 (17:34 +0000)
12 files changed:
src/libmime/lang_detection.c
src/libserver/re_cache.c
src/libstat/backends/redis_backend.c
src/libstat/classifiers/bayes.c
src/libstat/stat_process.c
src/libstat/tokenizers/osb.c
src/libutil/shingles.c
src/lua/lua_mimepart.c
src/lua/lua_task.c
src/lua/lua_util.c
src/plugins/chartable.c
src/plugins/fuzzy_check.c

index dbe9dbe9542db7a418fd9a9c039164367857752e..e80a13e290a9424002a235c936186de948025099 100644 (file)
@@ -560,8 +560,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 
                        cur_ucs = ucs_elt->s;
                        nsym = 0;
+                       uc_err = U_ZERO_ERROR;
 
-                       while (keylen > 0) {
+                       while (cur_utf < end) {
                                *cur_ucs++ = ucnv_getNextUChar (d->uchar_converter, &cur_utf,
                                                end, &uc_err);
                                if (!U_SUCCESS (uc_err)) {
@@ -569,12 +570,11 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
                                }
 
                                nsym ++;
-                               keylen --;
                        }
 
                        if (!U_SUCCESS (uc_err)) {
-                               msg_warn_config ("cannot convert key to unicode: %s",
-                                               u_errorName (uc_err));
+                               msg_warn_config ("cannot convert key %*s to unicode: %s",
+                                               (gint)keylen, key, u_errorName (uc_err));
 
                                continue;
                        }
@@ -1178,7 +1178,7 @@ rspamd_language_detector_detect_type (struct rspamd_task *task,
 {
        guint nparts = MIN (words->len, nwords);
        goffset *selected_words;
-       rspamd_stat_token_t *tok, ucs_w;
+       rspamd_stat_token_t *tok;
        guint i;
 
        selected_words = g_new0 (goffset, nparts);
index 7b7cabb69651ded97185a54e55388d8685051e22..e43de2c640993e164f4f0bb5b88635825f58663d 100644 (file)
@@ -1223,9 +1223,10 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
 
                                        if (part->utf_words) {
                                                for (j = 0; j < part->utf_words->len; j ++) {
-                                                       tok = &g_array_index (part->utf_words, rspamd_stat_token_t, j);
-                                                       scvec[cnt] = tok->begin;
-                                                       lenvec[cnt++] = tok->len;
+                                                       tok = &g_array_index (part->utf_words,
+                                                                       rspamd_stat_token_t, j);
+                                                       scvec[cnt] = tok->normalized.begin;
+                                                       lenvec[cnt++] = tok->normalized.len;
                                                }
                                        }
                                }
@@ -1433,6 +1434,9 @@ rspamd_re_cache_type_to_string (enum rspamd_re_type type)
        case RSPAMD_RE_SELECTOR:
                ret = "selector";
                break;
+       case RSPAMD_RE_WORDS:
+               ret = "words";
+               break;
        case RSPAMD_RE_MAX:
                ret = "invalid class";
                break;
index 00441a7a6e3f8df8813111a17c51f9e17a74ed47..b003d5a27be53e02a78aa6040adc7cccd73bea96 100644 (file)
@@ -527,14 +527,14 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task,
                                                                "HSET %b_tokens %b %b:%b",
                                                                prefix, (size_t) prefix_len,
                                                                n0, (size_t) l0,
-                                                               tok->t1->begin, tok->t1->len,
-                                                               tok->t2->begin, tok->t2->len);
+                                                               tok->t1->stemmed.begin, tok->t1->stemmed.len,
+                                                               tok->t2->stemmed.begin, tok->t2->stemmed.len);
                                        } else if (tok->t1) {
                                                redisAsyncCommand (rt->redis, NULL, NULL,
                                                                "HSET %b_tokens %b %b",
                                                                prefix, (size_t) prefix_len,
                                                                n0, (size_t) l0,
-                                                               tok->t1->begin, tok->t1->len);
+                                                               tok->t1->stemmed.begin, tok->t1->stemmed.len);
                                        }
                                }
                                else {
@@ -548,14 +548,14 @@ rspamd_redis_tokens_to_query (struct rspamd_task *task,
                                                                "HSET %b %s %b:%b",
                                                                n0, (size_t) l0,
                                                                "tokens",
-                                                               tok->t1->begin, tok->t1->len,
-                                                               tok->t2->begin, tok->t2->len);
+                                                               tok->t1->stemmed.begin, tok->t1->stemmed.len,
+                                                               tok->t2->stemmed.begin, tok->t2->stemmed.len);
                                        } else if (tok->t1) {
                                                redisAsyncCommand (rt->redis, NULL, NULL,
                                                                "HSET %b %s %b",
                                                                n0, (size_t) l0,
                                                                "tokens",
-                                                               tok->t1->begin, tok->t1->len);
+                                                               tok->t1->stemmed.begin, tok->t1->stemmed.len);
                                        }
                                }
 
index 934c8d941383addf7093645b61008c3df861ff8f..2b0cf21e81a2ac7e9222f7b59e81c4739befff05 100644 (file)
@@ -147,8 +147,8 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                                msg_debug_bayes (
                                                "token(meta) %uL <%*s:%*s> probabilistically skipped",
                                                tok->data,
-                                               (int) tok->t1->len, tok->t1->begin,
-                                               (int) tok->t2->len, tok->t2->begin);
+                                               (int) tok->t1->original.len, tok->t1->original.begin,
+                                               (int) tok->t2->original.len, tok->t2->original.begin);
                        }
 
                        return;
@@ -199,8 +199,9 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                        msg_debug_bayes (
                                        "token %uL <%*s:%*s> skipped, prob not in range: %f",
                                        tok->data,
-                                       (int) tok->t1->len, tok->t1->begin,
-                                       (int) tok->t2->len, tok->t2->begin, bayes_spam_prob);
+                                       (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+                                       (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
+                                       bayes_spam_prob);
 
                        return;
                }
@@ -227,8 +228,8 @@ bayes_classify_token (struct rspamd_classifier *ctx,
                                        "current spam prob: %.3f, current ham prob: %.3f",
                                        token_type,
                                        tok->data,
-                                       (int) tok->t1->len, tok->t1->begin,
-                                       (int) tok->t2->len, tok->t2->begin,
+                                       (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+                                       (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
                                        fw, w, total_count, spam_count, ham_count,
                                        spam_prob, ham_prob,
                                        bayes_spam_prob, bayes_ham_prob,
@@ -541,8 +542,8 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
                        msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, "
                                        "spam_count: %d, ham_count: %d",
                                        tok->data,
-                                       (int) tok->t1->len, tok->t1->begin,
-                                       (int) tok->t2->len, tok->t2->begin,
+                                       (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+                                       (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
                                        tok->window_idx, total_cnt, spam_cnt, ham_cnt);
                }
                else {
index 0465f0c3c8693b796cf0a5125cf216052782e258..ed3f78fdeedf29aaec302c812b680aed43e0eb92 100644 (file)
@@ -41,6 +41,7 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
        lua_State *L = task->cfg->lua_state;
 
        ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16);
+       memset (&elt, 0, sizeof (elt));
        elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
 
        if (st_ctx->lua_stat_tokens_ref != -1) {
@@ -82,8 +83,13 @@ rspamd_stat_tokenize_parts_metadata (struct rspamd_stat_ctx *st_ctx,
                                        tok.begin = lua_tolstring (L, -1, &tok.len);
 
                                        if (tok.begin && tok.len > 0) {
-                                               elt.begin = rspamd_mempool_ftokdup (task->task_pool, &tok);
-                                               elt.len = tok.len;
+                                               elt.original.begin =
+                                                               rspamd_mempool_ftokdup (task->task_pool, &tok);
+                                               elt.original.len = tok.len;
+                                               elt.stemmed.begin = elt.original.begin;
+                                               elt.stemmed.len = elt.original.len;
+                                               elt.normalized.begin = elt.original.begin;
+                                               elt.normalized.len = elt.original.len;
 
                                                g_array_append_val (ar, elt);
                                        }
index a19217a890d27187189e59a1331c0883e859d6b0..0b53f8af975b2bf8566b69b73eac35040ab14a3f 100644 (file)
@@ -304,30 +304,40 @@ rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
        for (w = 0; w < words->len; w ++) {
                token = &g_array_index (words, rspamd_stat_token_t, w);
                token_flags = token->flags;
+               const gchar *begin;
+               gsize len;
 
-               if (task->lang_det) {
-                       if (token->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
-                               /* Skip stop word */
-                               continue;
-                       }
+               if (token->flags &
+                       (RSPAMD_STAT_TOKEN_FLAG_STOP_WORD|RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
+                       /* Skip stop/skipped words */
+                       continue;
+               }
+
+               if (token->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+                       begin = token->stemmed.begin;
+                       len = token->stemmed.len;
+               }
+               else {
+                       begin = token->original.begin;
+                       len = token->original.len;
                }
 
                if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
                        rspamd_ftok_t ftok;
 
-                       ftok.begin = token->begin;
-                       ftok.len = token->len;
+                       ftok.begin = begin;
+                       ftok.len = len;
                        cur = rspamd_fstrhash_lc (&ftok, is_utf);
                }
                else {
                        /* We know that the words are normalized */
                        if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) {
                                cur = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
-                                               token->begin, token->len, osb_cf->seed);
+                                               begin, len, osb_cf->seed);
                        }
                        else {
-                               rspamd_cryptobox_siphash ((guchar *)&cur, token->begin,
-                                               token->len, osb_cf->sk);
+                               rspamd_cryptobox_siphash ((guchar *)&cur, begin,
+                                               len, osb_cf->sk);
 
                                if (prefix) {
                                        cur ^= seed;
index 240facc4a180c2e88e142186a4eede9bb0fb3bd4..87099a6e7e310aac81f4d67fdbba87b8b72c4969 100644 (file)
@@ -154,7 +154,8 @@ rspamd_shingles_from_text (GArray *input,
                        if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
                                for (j = beg; j < i; j ++) {
                                        word = &g_array_index (input, rspamd_stat_token_t, j);
-                                       row = rspamd_fstring_append (row, word->begin, word->len);
+                                       row = rspamd_fstring_append (row, word->stemmed.begin,
+                                                       word->stemmed.len);
                                }
 
                                /* Now we need to create a new row here */
@@ -172,7 +173,7 @@ rspamd_shingles_from_text (GArray *input,
                }
        }
        else {
-               guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
+               guint64 window[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
 
                switch (alg) {
                case RSPAMD_SHINGLES_XXHASH:
@@ -186,27 +187,27 @@ rspamd_shingles_from_text (GArray *input,
                        break;
                }
 
-               memset (res, 0, sizeof (res));
+               memset (window, 0, sizeof (window));
                for (i = 0; i <= (gint)input->len; i ++) {
                        if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
 
                                for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
                                        /* Shift hashes window to right */
                                        for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
-                                               res[j * SHINGLES_WINDOW + k] =
-                                                               res[j * SHINGLES_WINDOW + k + 1];
+                                               window[j * SHINGLES_WINDOW + k] =
+                                                               window[j * SHINGLES_WINDOW + k + 1];
                                        }
 
                                        word = &g_array_index (input, rspamd_stat_token_t, beg);
                                        /* Insert the last element to the pipe */
                                        memcpy (&seed, keys[j], sizeof (seed));
-                                       res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
+                                       window[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
                                                        rspamd_cryptobox_fast_hash_specific (ht,
-                                                                       word->begin, word->len,
+                                                                       word->stemmed.begin, word->stemmed.len,
                                                                        seed);
                                        val = 0;
                                        for (k = 0; k < SHINGLES_WINDOW; k ++) {
-                                               val ^= res[j * SHINGLES_WINDOW + k] >>
+                                               val ^= window[j * SHINGLES_WINDOW + k] >>
                                                                (8 * (SHINGLES_WINDOW - k - 1));
                                        }
 
index 9e74c87c09286814180f2f4c6f4d8cce0fb0cabf..d2ff7e8e449d4e3e44d57aec9062aabeea29295c 100644 (file)
@@ -775,7 +775,7 @@ lua_textpart_get_words (lua_State *L)
                for (i = 0; i < part->utf_words->len; i ++) {
                        w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
 
-                       lua_pushlstring (L, w->begin, w->len);
+                       lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
                        lua_rawseti (L, -2, i + 1);
                }
        }
@@ -983,7 +983,8 @@ lua_textpart_get_fuzzy_hashes (lua_State * L)
 
                for (i = 0; i < part->utf_words->len; i ++) {
                        word = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
-                       rspamd_cryptobox_hash_update (&st, word->begin, word->len);
+                       rspamd_cryptobox_hash_update (&st,
+                                       word->stemmed.begin, word->stemmed.len);
                }
 
                rspamd_cryptobox_hash_final (&st, digest);
index b8ac864dfbb35dd88075732fdb2ec4348de050e3..4f28e94922ccab268eace018c64b930c67982f98 100644 (file)
@@ -4757,13 +4757,13 @@ lua_push_stat_token (lua_State *L, rspamd_token_t *tok)
 
        if (tok->t1) {
                lua_pushstring (L, "t1");
-               lua_pushlstring (L, tok->t1->begin, tok->t1->len);
+               lua_pushlstring (L, tok->t1->stemmed.begin, tok->t1->stemmed.len);
                lua_settable (L, -3);
        }
 
        if (tok->t2) {
                lua_pushstring (L, "t2");
-               lua_pushlstring (L, tok->t2->begin, tok->t2->len);
+               lua_pushlstring (L, tok->t2->stemmed.begin, tok->t2->stemmed.len);
                lua_settable (L, -3);
        }
 
index 9ed095c34d22c427b859377a652508aff7c80a2c..1f9b84c85448072a6edafb325c9c41374dcc3357 100644 (file)
@@ -1154,7 +1154,7 @@ lua_util_tokenize_text (lua_State *L)
 
                for (i = 0; i < res->len; i ++) {
                        w = &g_array_index (res, rspamd_stat_token_t, i);
-                       lua_pushlstring (L, w->begin, w->len);
+                       lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
                        lua_rawseti (L, -2, i + 1);
                }
        }
index 0efbe55cacf0c42dd31f9dab75eee01f7d5c115c..c566cc517b32f49ba67aa56bc87f116187d263a3 100644 (file)
@@ -358,12 +358,12 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
                                                                   guint *ncap,
                                                                   struct chartable_ctx *chartable_module_ctx)
 {
-       const gchar *p, *end;
+       const UChar32 *p, *end;
        gdouble badness = 0.0;
        UChar32 uc;
        UBlockCode sc;
        gint last_is_latin = -1;
-       guint same_script_count = 0, nsym = 0, i = 0;
+       guint same_script_count = 0, nsym = 0;
        enum {
                start_process = 0,
                got_alpha,
@@ -371,13 +371,13 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
                got_unknown,
        } state = start_process, prev_state = start_process;
 
-       p = w->begin;
-       end = p + w->len;
+       p = w->unicode.begin;
+       end = p + w->unicode.len;
 
        /* We assume that w is normalized */
 
-       while (p + i < end) {
-               U8_NEXT (p, i, w->len, uc);
+       while (p < end) {
+               uc = *p++;
 
                if (((gint32)uc) < 0) {
                        break;
@@ -464,7 +464,8 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
                }
        }
 
-       msg_debug_chartable ("word %*s, badness: %.2f", (gint)w->len, w->begin,
+       msg_debug_chartable ("word %*s, badness: %.2f",
+                       (gint)w->normalized.len, w->normalized.begin,
                        badness);
 
        return badness;
@@ -490,11 +491,11 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
                got_unknown,
        } state = start_process;
 
-       p = w->begin;
-       end = p + w->len;
+       p = w->normalized.begin;
+       end = p + w->normalized.len;
        last_sc = 0;
 
-       if (w->len > chartable_module_ctx->max_word_len) {
+       if (w->normalized.len > chartable_module_ctx->max_word_len) {
                return 0.0;
        }
 
@@ -549,7 +550,8 @@ rspamd_chartable_process_word_ascii (struct rspamd_task *task,
                badness = 4.0;
        }
 
-       msg_debug_chartable ("word %*s, badness: %.2f", (gint)w->len, w->begin,
+       msg_debug_chartable ("word %*s, badness: %.2f",
+                       (gint)w->normalized.len, w->normalized.begin,
                        badness);
 
        return badness;
@@ -572,9 +574,9 @@ rspamd_chartable_process_part (struct rspamd_task *task,
        for (i = 0; i < part->utf_words->len; i++) {
                w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
 
-               if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
+               if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
 
-                       if (IS_PART_UTF (part)) {
+                       if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
                                cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
                                                &ncap, chartable_module_ctx);
                        }
@@ -659,6 +661,8 @@ chartable_url_symbol_callback (struct rspamd_task *task,
                struct rspamd_symcache_item *item,
                void *unused)
 {
+       /* XXX: TODO: unbreak module once URLs unicode project is over */
+#if 0
        struct rspamd_url *u;
        GHashTableIter it;
        gpointer k, v;
@@ -677,10 +681,10 @@ chartable_url_symbol_callback (struct rspamd_task *task,
                }
 
                if (u->hostlen > 0) {
-                       w.begin = u->host;
-                       w.len = u->hostlen;
+                       w.stemmed.begin = u->host;
+                       w.stemmed.len = u->hostlen;
 
-                       if (g_utf8_validate (w.begin, w.len, NULL)) {
+                       if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, NULL)) {
                                cur_score += rspamd_chartable_process_word_utf (task, &w,
                                                TRUE, NULL, chartable_module_ctx);
                        }
@@ -702,10 +706,10 @@ chartable_url_symbol_callback (struct rspamd_task *task,
                }
 
                if (u->hostlen > 0) {
-                       w.begin = u->host;
-                       w.len = u->hostlen;
+                       w.stemmed.begin = u->host;
+                       w.stemmed.len = u->hostlen;
 
-                       if (g_utf8_validate (w.begin, w.len, NULL)) {
+                       if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, NULL)) {
                                cur_score += rspamd_chartable_process_word_utf (task, &w,
                                                TRUE, NULL, chartable_module_ctx);
                        }
@@ -721,6 +725,6 @@ chartable_url_symbol_callback (struct rspamd_task *task,
                                cur_score, NULL);
 
        }
-
+#endif
        rspamd_symcache_finalize_item (task, item);
 }
index fa9e9191cd97305a893cb536949465ed7ad52a9e..dd59fc542ce9243aabf6135a775d15d9156adc04 100644 (file)
@@ -1459,7 +1459,8 @@ fuzzy_cmd_from_text_part (struct rspamd_task *task,
 
                        for (i = 0; i < words->len; i ++) {
                                word = &g_array_index (words, rspamd_stat_token_t, i);
-                               rspamd_cryptobox_hash_update (&st, word->begin, word->len);
+                               rspamd_cryptobox_hash_update (&st, word->stemmed.begin,
+                                               word->stemmed.len);
                        }
 
                        rspamd_cryptobox_hash_final (&st, shcmd->basic.digest);