git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Rework] Use GLib agnostic type for words
author Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 20 Jun 2025 13:44:56 +0000 (14:44 +0100)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 20 Jun 2025 13:44:56 +0000 (14:44 +0100)
25 files changed:
src/libmime/lang_detection.c
src/libmime/lang_detection_fasttext.cxx
src/libmime/lang_detection_fasttext.h
src/libmime/message.c
src/libmime/message.h
src/libserver/re_cache.c
src/libserver/task.c
src/libserver/task.h
src/libserver/word.h [new file with mode: 0644]
src/libstat/stat_api.h
src/libstat/stat_process.c
src/libstat/tokenizers/custom_tokenizer.h
src/libstat/tokenizers/osb.c
src/libstat/tokenizers/tokenizer_manager.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/libutil/shingles.c
src/libutil/shingles.h
src/lua/lua_common.c
src/lua/lua_common.h
src/lua/lua_mimepart.c
src/lua/lua_task.c
src/plugins/chartable.cxx
src/plugins/fuzzy_check.c
test/rspamd_shingles_test.c

index 07ecff76db48314177a6148eece7a72f0395f391..b783b8325d16aef391aad2f6eb7062e3e5dc1bbe 100644 (file)
@@ -936,7 +936,7 @@ end:
 }
 
 static void
-rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
+rspamd_language_detector_random_select(rspamd_words_t *ucs_tokens, unsigned int nwords,
                                                                           goffset *offsets_out,
                                                                           uint64_t *seed)
 {
@@ -946,7 +946,7 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
 
        g_assert(nwords != 0);
        g_assert(offsets_out != NULL);
-       g_assert(ucs_tokens->len >= nwords);
+       g_assert(kv_size(*ucs_tokens) >= nwords);
        /*
         * We split input array into `nwords` parts. For each part we randomly select
         * an element from this particular split. Here is an example:
@@ -963,22 +963,22 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
         * their splits. It is not uniform distribution but it seems to be better
         * to include words from different text parts
         */
-       step_len = ucs_tokens->len / nwords;
-       remainder = ucs_tokens->len % nwords;
+       step_len = kv_size(*ucs_tokens) / nwords;
+       remainder = kv_size(*ucs_tokens) % nwords;
 
        out_idx = 0;
        coin = rspamd_random_uint64_fast_seed(seed);
        sel = coin % (step_len + remainder);
        offsets_out[out_idx] = sel;
 
-       for (i = step_len + remainder; i < ucs_tokens->len;
+       for (i = step_len + remainder; i < kv_size(*ucs_tokens);
                 i += step_len, out_idx++) {
                unsigned int ntries = 0;
                coin = rspamd_random_uint64_fast_seed(seed);
                sel = (coin % step_len) + i;
 
                for (;;) {
-                       tok = &g_array_index(ucs_tokens, rspamd_stat_token_t, sel);
+                       tok = &kv_A(*ucs_tokens, sel);
                        /* Filter bad tokens */
 
                        if (tok->unicode.len >= 2 &&
@@ -995,8 +995,8 @@ rspamd_language_detector_random_select(GArray *ucs_tokens, unsigned int nwords,
                                if (ntries < step_len) {
                                        sel = (coin % step_len) + i;
                                }
-                               else if (ntries < ucs_tokens->len) {
-                                       sel = coin % ucs_tokens->len;
+                               else if (ntries < kv_size(*ucs_tokens)) {
+                                       sel = coin % kv_size(*ucs_tokens);
                                }
                                else {
                                        offsets_out[out_idx] = sel;
@@ -1223,12 +1223,12 @@ static void
 rspamd_language_detector_detect_type(struct rspamd_task *task,
                                                                         unsigned int nwords,
                                                                         struct rspamd_lang_detector *d,
-                                                                        GArray *words,
+                                                                        rspamd_words_t *words,
                                                                         enum rspamd_language_category cat,
                                                                         khash_t(rspamd_candidates_hash) * candidates,
                                                                         struct rspamd_mime_text_part *part)
 {
-       unsigned int nparts = MIN(words->len, nwords);
+       unsigned int nparts = MIN(kv_size(*words), nwords);
        goffset *selected_words;
        rspamd_stat_token_t *tok;
        unsigned int i;
@@ -1241,8 +1241,7 @@ rspamd_language_detector_detect_type(struct rspamd_task *task,
        msg_debug_lang_det("randomly selected %d words", nparts);
 
        for (i = 0; i < nparts; i++) {
-               tok = &g_array_index(words, rspamd_stat_token_t,
-                                                        selected_words[i]);
+               tok = &kv_A(*words, selected_words[i]);
 
                if (tok->unicode.len >= 3) {
                        rspamd_language_detector_detect_word(task, d, tok, candidates,
@@ -1282,7 +1281,7 @@ static enum rspamd_language_detected_type
 rspamd_language_detector_try_ngramm(struct rspamd_task *task,
                                                                        unsigned int nwords,
                                                                        struct rspamd_lang_detector *d,
-                                                                       GArray *ucs_tokens,
+                                                                       rspamd_words_t *ucs_tokens,
                                                                        enum rspamd_language_category cat,
                                                                        khash_t(rspamd_candidates_hash) * candidates,
                                                                        struct rspamd_mime_text_part *part)
@@ -1863,7 +1862,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
                if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) {
                        rspamd_fasttext_predict_result_t fasttext_predict_result =
                                rspamd_lang_detection_fasttext_detect(d->fasttext_detector, task,
-                                                                                                         part->utf_words, 4);
+                                                                                                         &part->utf_words, 4);
 
                        ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result);
 
@@ -1930,11 +1929,11 @@ rspamd_language_detector_detect(struct rspamd_task *task,
                        if (!ret) {
                                /* Apply trigramms detection */
                                candidates = kh_init(rspamd_candidates_hash);
-                               if (part->utf_words->len < default_short_text_limit) {
+                               if (kv_size(part->utf_words) < default_short_text_limit) {
                                        r = rs_detect_none;
                                        msg_debug_lang_det("text is too short for trigrams detection: "
                                                                           "%d words; at least %d words required",
-                                                                          (int) part->utf_words->len,
+                                                                          (int) kv_size(part->utf_words),
                                                                           (int) default_short_text_limit);
                                        switch (cat) {
                                        case RSPAMD_LANGUAGE_CYRILLIC:
@@ -1960,7 +1959,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
                                        r = rspamd_language_detector_try_ngramm(task,
                                                                                                                        default_words,
                                                                                                                        d,
-                                                                                                                       part->utf_words,
+                                                                                                                       &part->utf_words,
                                                                                                                        cat,
                                                                                                                        candidates,
                                                                                                                        part);
@@ -2123,4 +2122,4 @@ int rspamd_language_detector_elt_flags(const struct rspamd_language_elt *elt)
        }
 
        return 0;
-}
\ No newline at end of file
+}
index 8ea2706e66e81cf2aee685aa47e5d8e4221fa84c..983ff78de4b9ebc4ee3810356fca7ec3b69cc278 100644 (file)
@@ -22,6 +22,7 @@
 #include "libserver/logger.h"
 #include "contrib/fmt/include/fmt/base.h"
 #include "stat_api.h"
+#include "libserver/word.h"
 #include <exception>
 #include <string_view>
 #include <vector>
@@ -180,26 +181,32 @@ bool rspamd_lang_detection_fasttext_is_enabled(void *ud)
 
 rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
                                                                                                                                           struct rspamd_task *task,
-                                                                                                                                          GArray *utf_words,
+                                                                                                                                          rspamd_words_t *utf_words,
                                                                                                                                           int k)
 {
 #ifndef WITH_FASTTEXT
        return nullptr;
 #else
        /* Avoid too long inputs */
-       static const unsigned int max_fasttext_input_len = 1024 * 1024;
+       static const size_t max_fasttext_input_len = 1024 * 1024;
        auto *real_model = FASTTEXT_MODEL_TO_C_API(ud);
        std::vector<std::int32_t> words_vec;
-       words_vec.reserve(utf_words->len);
 
-       for (auto i = 0; i < std::min(utf_words->len, max_fasttext_input_len); i++) {
-               const auto *w = &g_array_index(utf_words, rspamd_stat_token_t, i);
+       if (!utf_words || !utf_words->a) {
+               return nullptr;
+       }
+
+       auto words_count = kv_size(*utf_words);
+       words_vec.reserve(words_count);
+
+       for (auto i = 0; i < std::min(words_count, max_fasttext_input_len); i++) {
+               const auto *w = &kv_A(*utf_words, i);
                if (w->original.len > 0) {
                        real_model->word2vec(w->original.begin, w->original.len, words_vec);
                }
        }
 
-       msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), utf_words->len);
+       msg_debug_lang_det("fasttext: got %z word tokens from %ud words", words_vec.size(), words_count);
 
        auto *res = real_model->detect_language(words_vec, k);
 
@@ -266,4 +273,4 @@ void rspamd_fasttext_predict_result_destroy(rspamd_fasttext_predict_result_t res
 #endif
 }
 
-G_END_DECLS
\ No newline at end of file
+G_END_DECLS
index 2a27569680489775f25c141599926056e206cdaa..e2b67181ae15c48fa25a6079bd160365fdb7c376 100644 (file)
@@ -17,6 +17,7 @@
 #define RSPAMD_LANG_DETECTION_FASTTEXT_H
 
 #include "config.h"
+#include "libserver/word.h"
 
 G_BEGIN_DECLS
 struct rspamd_config;
@@ -53,7 +54,7 @@ typedef void *rspamd_fasttext_predict_result_t;
  * @return TRUE if language is detected
  */
 rspamd_fasttext_predict_result_t rspamd_lang_detection_fasttext_detect(void *ud,
-                                                                                                                                          struct rspamd_task *task, GArray *utf_words, int k);
+                                                                                                                                          struct rspamd_task *task, rspamd_words_t *utf_words, int k);
 
 /**
  * Get number of languages detected
index 60894d879afc3f53e5cc3c584f1e4b8819fc0402..bac67fb079bbe7d46f610af032d195dfc7bf4885 100644 (file)
@@ -72,14 +72,14 @@ rspamd_mime_part_extract_words(struct rspamd_task *task,
        rspamd_stat_token_t *w;
        unsigned int i, total_len = 0, short_len = 0;
 
-       if (part->utf_words) {
-               rspamd_stem_words(part->utf_words, task->task_pool, part->language,
+       if (part->utf_words.a) {
+               rspamd_stem_words(&part->utf_words, task->task_pool, part->language,
                                                  task->lang_det);
 
-               for (i = 0; i < part->utf_words->len; i++) {
+               for (i = 0; i < kv_size(part->utf_words); i++) {
                        uint64_t h;
 
-                       w = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+                       w = &kv_A(part->utf_words, i);
 
                        if (w->stemmed.len > 0) {
                                /*
@@ -109,7 +109,7 @@ rspamd_mime_part_extract_words(struct rspamd_task *task,
                        }
                }
 
-               if (part->utf_words->len) {
+               if (kv_size(part->utf_words)) {
                        double *avg_len_p, *short_len_p;
 
                        avg_len_p = rspamd_mempool_get_variable(task->task_pool,
@@ -186,21 +186,24 @@ rspamd_mime_part_create_words(struct rspamd_task *task,
                tok_type = RSPAMD_TOKENIZE_RAW;
        }
 
-       part->utf_words = rspamd_tokenize_text(
+       /* Initialize kvec for words */
+       kv_init(part->utf_words);
+
+       rspamd_tokenize_text(
                part->utf_stripped_content->data,
                part->utf_stripped_content->len,
                &part->utf_stripped_text,
                tok_type, task->cfg,
                part->exceptions,
                NULL,
-               NULL,
+               &part->utf_words,
                task->task_pool);
 
 
-       if (part->utf_words) {
+       if (part->utf_words.a) {
                part->normalized_hashes = g_array_sized_new(FALSE, FALSE,
-                                                                                                       sizeof(uint64_t), part->utf_words->len);
-               rspamd_normalize_words(part->utf_words, task->task_pool);
+                                                                                                       sizeof(uint64_t), kv_size(part->utf_words));
+               rspamd_normalize_words(&part->utf_words, task->task_pool);
        }
 }
 
@@ -210,7 +213,7 @@ rspamd_mime_part_detect_language(struct rspamd_task *task,
 {
        struct rspamd_lang_detector_res *lang;
 
-       if (!IS_TEXT_PART_EMPTY(part) && part->utf_words && part->utf_words->len > 0 &&
+       if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a && kv_size(part->utf_words) > 0 &&
                task->lang_det) {
                if (rspamd_language_detector_detect(task, task->lang_det, part)) {
                        lang = g_ptr_array_index(part->languages, 0);
@@ -1107,8 +1110,8 @@ rspamd_message_dtor(struct rspamd_message *msg)
 
        PTR_ARRAY_FOREACH(msg->text_parts, i, tp)
        {
-               if (tp->utf_words) {
-                       g_array_free(tp->utf_words, TRUE);
+               if (tp->utf_words.a) {
+                       kv_destroy(tp->utf_words);
                }
                if (tp->normalized_hashes) {
                        g_array_free(tp->normalized_hashes, TRUE);
@@ -1584,7 +1587,7 @@ void rspamd_message_process(struct rspamd_task *task)
 
                rspamd_mime_part_extract_words(task, text_part);
 
-               if (text_part->utf_words) {
+               if (text_part->utf_words.a) {
                        total_words += text_part->nwords;
                }
        }
index cb695773e97efe9619b53022e72b451a9a52ad86..e6b4543625178b29a53bb691f4629bb84845ffb3 100644 (file)
@@ -16,6 +16,7 @@
 #include "libserver/url.h"
 #include "libutil/ref.h"
 #include "libutil/str_util.h"
+#include "libserver/word.h"
 
 #include <unicode/uchar.h>
 #include <unicode/utext.h>
@@ -139,7 +140,7 @@ struct rspamd_mime_text_part {
        GByteArray *utf_raw_content;      /* utf raw content */
        GByteArray *utf_stripped_content; /* utf content with no newlines */
        GArray *normalized_hashes;        /* Array of uint64_t */
-       GArray *utf_words;                /* Array of rspamd_stat_token_t */
+       rspamd_words_t utf_words;         /* kvec of rspamd_word_t */
        UText utf_stripped_text;          /* Used by libicu to represent the utf8 content */
 
        GPtrArray *newlines; /**< positions of newlines in text, relative to content*/
index 06e9f3328f6ba73ef872c9f663ec34079a356218..50b155ae047ba6128895b534c81f979d529a86a3 100644 (file)
@@ -998,20 +998,21 @@ rspamd_re_cache_process_selector(struct rspamd_task *task,
        return result;
 }
 
+
 static inline unsigned int
-rspamd_process_words_vector(GArray *words,
-                                                       const unsigned char **scvec,
-                                                       unsigned int *lenvec,
-                                                       struct rspamd_re_class *re_class,
-                                                       unsigned int cnt,
-                                                       gboolean *raw)
+rspamd_process_words_vector_kvec(rspamd_words_t *words,
+                                                                const unsigned char **scvec,
+                                                                unsigned int *lenvec,
+                                                                struct rspamd_re_class *re_class,
+                                                                unsigned int cnt,
+                                                                gboolean *raw)
 {
        unsigned int j;
-       rspamd_stat_token_t *tok;
+       rspamd_word_t *tok;
 
-       if (words) {
-               for (j = 0; j < words->len; j++) {
-                       tok = &g_array_index(words, rspamd_stat_token_t, j);
+       if (words && words->a) {
+               for (j = 0; j < kv_size(*words); j++) {
+                       tok = &kv_A(*words, j);
 
                        if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
                                if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
@@ -1432,13 +1433,13 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
 
                        PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
                        {
-                               if (text_part->utf_words) {
-                                       cnt += text_part->utf_words->len;
+                               if (text_part->utf_words.a) {
+                                       cnt += kv_size(text_part->utf_words);
                                }
                        }
 
-                       if (task->meta_words && task->meta_words->len > 0) {
-                               cnt += task->meta_words->len;
+                       if (task->meta_words.a && kv_size(task->meta_words) > 0) {
+                               cnt += kv_size(task->meta_words);
                        }
 
                        if (cnt > 0) {
@@ -1449,15 +1450,15 @@ rspamd_re_cache_exec_re(struct rspamd_task *task,
 
                                PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, text_part)
                                {
-                                       if (text_part->utf_words) {
-                                               cnt = rspamd_process_words_vector(text_part->utf_words,
-                                                                                                                 scvec, lenvec, re_class, cnt, &raw);
+                                       if (text_part->utf_words.a) {
+                                               cnt = rspamd_process_words_vector_kvec(&text_part->utf_words,
+                                                                                                                          scvec, lenvec, re_class, cnt, &raw);
                                        }
                                }
 
-                               if (task->meta_words) {
-                                       cnt = rspamd_process_words_vector(task->meta_words,
-                                                                                                         scvec, lenvec, re_class, cnt, &raw);
+                               if (task->meta_words.a) {
+                                       cnt = rspamd_process_words_vector_kvec(&task->meta_words,
+                                                                                                                  scvec, lenvec, re_class, cnt, &raw);
                                }
 
                                ret = rspamd_re_cache_process_regexp_data(rt, re,
index bd1e07549c9d498b46c00d53d0b416f7b4a88440..9f5b1f00a1b08b31e606cffcb3c27e2e1b903c19 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -196,8 +196,8 @@ void rspamd_task_free(struct rspamd_task *task)
                        rspamd_email_address_free(task->from_envelope_orig);
                }
 
-               if (task->meta_words) {
-                       g_array_free(task->meta_words, TRUE);
+               if (task->meta_words.a) {
+                       kv_destroy(task->meta_words);
                }
 
                ucl_object_unref(task->messages);
index 6be3500987127c470f73dedb35d45f8407548d14..1c1778fee4d740dd511ca1bbed9723efc57d6f0b 100644 (file)
@@ -24,6 +24,7 @@
 #include "dns.h"
 #include "re_cache.h"
 #include "khash.h"
+#include "libserver/word.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -187,7 +188,7 @@ struct rspamd_task {
        struct rspamd_scan_result *result;                  /**< Metric result                                                                  */
        khash_t(rspamd_task_lua_cache) lua_cache;           /**< cache of lua objects                                                   */
        GPtrArray *tokens;                                  /**< statistics tokens */
-       GArray *meta_words;                                 /**< rspamd_stat_token_t produced from meta headers
+       rspamd_words_t meta_words;                          /**< rspamd_word_t produced from meta headers
                                                                                                                (e.g. Subject) */
 
        GPtrArray *rcpt_envelope; /**< array of rspamd_email_address                                    */
diff --git a/src/libserver/word.h b/src/libserver/word.h
new file mode 100644 (file)
index 0000000..7698bf3
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_WORD_H
+#define RSPAMD_WORD_H
+
+#include "config.h"
+#include "fstring.h"
+#include "contrib/libucl/kvec.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @file word.h
+ * Word processing structures and definitions
+ */
+
+/* Word flags */
+#define RSPAMD_WORD_FLAG_TEXT (1u << 0)
+#define RSPAMD_WORD_FLAG_META (1u << 1)
+#define RSPAMD_WORD_FLAG_LUA_META (1u << 2)
+#define RSPAMD_WORD_FLAG_EXCEPTION (1u << 3)
+#define RSPAMD_WORD_FLAG_HEADER (1u << 4)
+#define RSPAMD_WORD_FLAG_UNIGRAM (1u << 5)
+#define RSPAMD_WORD_FLAG_UTF (1u << 6)
+#define RSPAMD_WORD_FLAG_NORMALISED (1u << 7)
+#define RSPAMD_WORD_FLAG_STEMMED (1u << 8)
+#define RSPAMD_WORD_FLAG_BROKEN_UNICODE (1u << 9)
+#define RSPAMD_WORD_FLAG_STOP_WORD (1u << 10)
+#define RSPAMD_WORD_FLAG_SKIPPED (1u << 11)
+#define RSPAMD_WORD_FLAG_INVISIBLE_SPACES (1u << 12)
+#define RSPAMD_WORD_FLAG_EMOJI (1u << 13)
+
+/**
+ * One tokenized word, kept in several forms (raw, unicode, normalized, stemmed) plus flags
+ */
+typedef struct rspamd_word_s {
+       rspamd_ftok_t original;        /* raw utf8 text of the word */
+       rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
+       rspamd_ftok_t normalized;      /* normalized and lowercased utf8 */
+       rspamd_ftok_t stemmed;         /* stemmed utf8 */
+       unsigned int flags;            /* bitmask of RSPAMD_WORD_FLAG_* values */
+} rspamd_word_t;
+
+/**
+ * Vector of words using kvec
+ */
+typedef kvec_t(rspamd_word_t) rspamd_words_t;
+
+/* Legacy typedefs for backward compatibility */
+typedef rspamd_word_t rspamd_stat_token_t;
+
+/* Legacy flag aliases for backward compatibility */
+#define RSPAMD_STAT_TOKEN_FLAG_TEXT RSPAMD_WORD_FLAG_TEXT
+#define RSPAMD_STAT_TOKEN_FLAG_META RSPAMD_WORD_FLAG_META
+#define RSPAMD_STAT_TOKEN_FLAG_LUA_META RSPAMD_WORD_FLAG_LUA_META
+#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION RSPAMD_WORD_FLAG_EXCEPTION
+#define RSPAMD_STAT_TOKEN_FLAG_HEADER RSPAMD_WORD_FLAG_HEADER
+#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM RSPAMD_WORD_FLAG_UNIGRAM
+#define RSPAMD_STAT_TOKEN_FLAG_UTF RSPAMD_WORD_FLAG_UTF
+#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED RSPAMD_WORD_FLAG_NORMALISED
+#define RSPAMD_STAT_TOKEN_FLAG_STEMMED RSPAMD_WORD_FLAG_STEMMED
+#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE RSPAMD_WORD_FLAG_BROKEN_UNICODE
+#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD RSPAMD_WORD_FLAG_STOP_WORD
+#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED RSPAMD_WORD_FLAG_SKIPPED
+#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES RSPAMD_WORD_FLAG_INVISIBLE_SPACES
+#define RSPAMD_STAT_TOKEN_FLAG_EMOJI RSPAMD_WORD_FLAG_EMOJI
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RSPAMD_WORD_H */
index f28922588ef5abea97c71aaa6cf5a1112dd306e4..811566ad382581a6d4059392612b2054206645ba 100644 (file)
@@ -20,6 +20,7 @@
 #include "task.h"
 #include "lua/lua_common.h"
 #include "contrib/libev/ev.h"
+#include "libserver/word.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -30,36 +31,14 @@ extern "C" {
  * High level statistics API
  */
 
-#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1u << 0)
-#define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1)
-#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2)
-#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3)
-#define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4)
-#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5)
-#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
-#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
-#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
-#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
-#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
-#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
-#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)
-#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13)
-
-typedef struct rspamd_stat_token_s {
-       rspamd_ftok_t original;        /* utf8 raw */
-       rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
-       rspamd_ftok_t normalized;      /* normalized and lowercased utf8 */
-       rspamd_ftok_t stemmed;         /* stemmed utf8 */
-       unsigned int flags;
-} rspamd_stat_token_t;
 
 #define RSPAMD_TOKEN_VALUE_TYPE float
 typedef struct token_node_s {
        uint64_t data;
        unsigned int window_idx;
        unsigned int flags;
-       rspamd_stat_token_t *t1;
-       rspamd_stat_token_t *t2;
+       rspamd_word_t *t1;
+       rspamd_word_t *t2;
        RSPAMD_TOKEN_VALUE_TYPE values[0];
 } rspamd_token_t;
 
index 17caf4cc61f049c541e646804d051a2a1b6621a4..0bb658a3a72e657c8accb1a178397d4c58497ddd 100644 (file)
@@ -36,12 +36,13 @@ static void
 rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx,
                                                                        struct rspamd_task *task)
 {
-       GArray *ar;
-       rspamd_stat_token_t elt;
+       rspamd_words_t *words;
+       rspamd_word_t elt;
        unsigned int i;
        lua_State *L = task->cfg->lua_state;
 
-       ar = g_array_sized_new(FALSE, FALSE, sizeof(elt), 16);
+       words = rspamd_mempool_alloc(task->task_pool, sizeof(*words));
+       kv_init(*words);
        memset(&elt, 0, sizeof(elt));
        elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
 
@@ -87,7 +88,7 @@ rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx,
                                                elt.normalized.begin = elt.original.begin;
                                                elt.normalized.len = elt.original.len;
 
-                                               g_array_append_val(ar, elt);
+                                               kv_push_safe(rspamd_word_t, *words, elt, meta_words_error);
                                        }
 
                                        lua_pop(L, 1);
@@ -99,17 +100,20 @@ rspamd_stat_tokenize_parts_metadata(struct rspamd_stat_ctx *st_ctx,
        }
 
 
-       if (ar->len > 0) {
+       if (kv_size(*words) > 0) {
                st_ctx->tokenizer->tokenize_func(st_ctx,
                                                                                 task,
-                                                                                ar,
+                                                                                words,
                                                                                 TRUE,
                                                                                 "M",
                                                                                 task->tokens);
        }
+       goto meta_words_done;
 
-       rspamd_mempool_add_destructor(task->task_pool,
-                                                                 rspamd_array_free_hard, ar);
+meta_words_error:
+       /* On error, just continue without the problematic tokens */
+meta_words_done:
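+       /* The kvec header is freed with the task pool. NOTE(review): confirm the element array grown by kv_push_safe is also released (other kvecs here use kv_destroy) */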
 }
 
 /*
@@ -134,8 +138,8 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
 
        PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
        {
-               if (!IS_TEXT_PART_EMPTY(part) && part->utf_words != NULL) {
-                       reserved_len += part->utf_words->len;
+               if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a) {
+                       reserved_len += kv_size(part->utf_words);
                }
                /* XXX: normal window size */
                reserved_len += 5;
@@ -149,9 +153,9 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
 
        PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
        {
-               if (!IS_TEXT_PART_EMPTY(part) && part->utf_words != NULL) {
+               if (!IS_TEXT_PART_EMPTY(part) && part->utf_words.a) {
                        st_ctx->tokenizer->tokenize_func(st_ctx, task,
-                                                                                        part->utf_words, IS_TEXT_PART_UTF(part),
+                                                                                        &part->utf_words, IS_TEXT_PART_UTF(part),
                                                                                         NULL, task->tokens);
                }
 
@@ -163,10 +167,10 @@ void rspamd_stat_process_tokenize(struct rspamd_stat_ctx *st_ctx,
                }
        }
 
-       if (task->meta_words != NULL) {
+       if (task->meta_words.a) {
                st_ctx->tokenizer->tokenize_func(st_ctx,
                                                                                 task,
-                                                                                task->meta_words,
+                                                                                &task->meta_words,
                                                                                 TRUE,
                                                                                 "SUBJECT",
                                                                                 task->tokens);
index b620320f4286c18e9ed1afeab9fdd4b2c48e82f7..addf089200562812014714c56263f12e46debd87 100644 (file)
@@ -19,6 +19,7 @@
 
 #include "config.h"
 #include "ucl.h"
+#include "libserver/word.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -27,13 +28,10 @@ extern "C" {
 #define RSPAMD_CUSTOM_TOKENIZER_API_VERSION 1
 
 /**
- * Tokenization result - array of word positions as (start, length) pairs
- * The array is terminated by a pair with both values set to 0
+ * Tokenization result - kvec of rspamd_word_t
+ * Uses kvec to avoid exposing GLib structures to external API
  */
-struct rspamd_tokenizer_result {
-       unsigned int *positions; /* Array of (start, length) pairs */
-       size_t count;            /* Number of words (not array size!) */
-};
+typedef rspamd_words_t rspamd_tokenizer_result_t;
 
 /**
  * Custom tokenizer API that must be implemented by language-specific tokenizer plugins
@@ -71,25 +69,25 @@ typedef struct rspamd_custom_tokenizer_api {
         * Main tokenization function
         * @param text UTF-8 text to tokenize
         * @param len Length of the text in bytes
-        * @param result Output structure to fill with word positions
+        * @param result Output kvec to fill with rspamd_word_t elements
         * @return 0 on success, non-zero on failure
         *
-        * The tokenizer should allocate result->positions using its own allocator
+        * The tokenizer should allocate result->a using its own allocator
         * Rspamd will call cleanup_result() to free it after processing
         */
        int (*tokenize)(const char *text, size_t len,
-                                       struct rspamd_tokenizer_result *result);
+                                       rspamd_tokenizer_result_t *result);
 
        /**
         * Cleanup the result from tokenize()
-        * @param result Result structure returned by tokenize()
+        * @param result Result kvec returned by tokenize()
         *
-        * This function should free result->positions using the same allocator
-        * that was used in tokenize() and reset the structure fields.
+        * This function should free result->a using the same allocator
+        * that was used in tokenize() and reset the kvec fields.
         * This ensures proper memory management across DLL boundaries.
         * Note: This does NOT free the result structure itself, only its contents.
         */
-       void (*cleanup_result)(struct rspamd_tokenizer_result *result);
+       void (*cleanup_result)(rspamd_tokenizer_result_t *result);
 
        /**
         * Optional: Get language hint for better language detection
@@ -155,7 +153,7 @@ struct rspamd_custom_tokenizer *rspamd_tokenizer_manager_detect(
        const char **detected_lang_hint);
 
 /* Helper function to tokenize with exceptions handling */
-GArray *rspamd_custom_tokenizer_tokenize_with_exceptions(
+rspamd_tokenizer_result_t *rspamd_custom_tokenizer_tokenize_with_exceptions(
        struct rspamd_custom_tokenizer *tokenizer,
        const char *text,
        gsize len,
index 0bc3414a557dcf79c78a7d406dfbcffaa4c38ef1..360c71d36658b51869b5a3c2983722581cabac10 100644 (file)
@@ -21,6 +21,7 @@
 #include "tokenizers.h"
 #include "stat_internal.h"
 #include "libmime/lang_detection.h"
+#include "libserver/word.h"
 
 /* Size for features pipe */
 #define DEFAULT_FEATURE_WINDOW_SIZE 2
@@ -268,7 +269,7 @@ struct token_pipe_entry {
 
 int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                                                 struct rspamd_task *task,
-                                                GArray *words,
+                                                rspamd_words_t *words,
                                                 gboolean is_utf,
                                                 const char *prefix,
                                                 GPtrArray *result)
@@ -282,7 +283,7 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
        gsize token_size;
        unsigned int processed = 0, i, w, window_size, token_flags = 0;
 
-       if (words == NULL) {
+       if (words == NULL || !words->a) {
                return FALSE;
        }
 
@@ -306,8 +307,8 @@ int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                                 sizeof(RSPAMD_TOKEN_VALUE_TYPE) * ctx->statfiles->len;
        g_assert(token_size > 0);
 
-       for (w = 0; w < words->len; w++) {
-               token = &g_array_index(words, rspamd_stat_token_t, w);
+       for (w = 0; w < kv_size(*words); w++) {
+               token = &kv_A(*words, w);
                token_flags = token->flags;
                const char *begin;
                gsize len;
index e2011712a556742390efa5c55d49e5654b7f5b45..b9bfe0e6f9220e36e44d63f5b2497bcafa159937 100644 (file)
@@ -327,7 +327,7 @@ rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr,
 }
 
 /* Helper function to tokenize with a custom tokenizer handling exceptions */
-GArray *
+rspamd_tokenizer_result_t *
 rspamd_custom_tokenizer_tokenize_with_exceptions(
        struct rspamd_custom_tokenizer *tokenizer,
        const char *text,
@@ -335,36 +335,28 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
        GList *exceptions,
        rspamd_mempool_t *pool)
 {
-       GArray *words;
-       struct rspamd_tokenizer_result result;
+       rspamd_tokenizer_result_t *words;
+       rspamd_tokenizer_result_t result;
        struct rspamd_process_exception *ex;
        GList *cur_ex = exceptions;
        gsize pos = 0;
        unsigned int i;
        int ret;
 
-       words = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t), 128);
+       /* Allocate result kvec in pool */
+       words = rspamd_mempool_alloc(pool, sizeof(*words));
+       kv_init(*words);
 
        /* If no exceptions, tokenize the whole text */
        if (!exceptions) {
-               result.positions = NULL;
-               result.count = 0;
+               kv_init(result);
 
                ret = tokenizer->api->tokenize(text, len, &result);
-               if (ret == 0 && result.positions) {
-                       /* Convert positions to tokens */
-                       for (i = 0; i < result.count; i++) {
-                               rspamd_stat_token_t tok;
-                               unsigned int start = result.positions[i * 2];
-                               unsigned int length = result.positions[i * 2 + 1];
-
-                               if (start + length <= len) {
-                                       memset(&tok, 0, sizeof(tok));
-                                       tok.original.begin = text + start;
-                                       tok.original.len = length;
-                                       tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
-                                       g_array_append_val(words, tok);
-                               }
+               if (ret == 0 && result.a) {
+                       /* Copy tokens from result to output */
+                       for (i = 0; i < kv_size(result); i++) {
+                               rspamd_word_t tok = kv_A(result, i);
+                               kv_push(rspamd_word_t, *words, tok);
                        }
 
                        /* Use tokenizer's cleanup function */
@@ -383,23 +375,22 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
                /* Tokenize text before exception */
                if (ex->pos > pos) {
                        gsize segment_len = ex->pos - pos;
-                       result.positions = NULL;
-                       result.count = 0;
+                       kv_init(result);
 
                        ret = tokenizer->api->tokenize(text + pos, segment_len, &result);
-                       if (ret == 0 && result.positions) {
-                               /* Convert positions to tokens, adjusting for segment offset */
-                               for (i = 0; i < result.count; i++) {
-                                       rspamd_stat_token_t tok;
-                                       unsigned int start = result.positions[i * 2] + pos;
-                                       unsigned int length = result.positions[i * 2 + 1];
-
-                                       if (start + length <= ex->pos) {
-                                               memset(&tok, 0, sizeof(tok));
-                                               tok.original.begin = text + start;
-                                               tok.original.len = length;
-                                               tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
-                                               g_array_append_val(words, tok);
+                       if (ret == 0 && result.a) {
+                               /* Copy tokens from result, adjusting positions for segment offset */
+                               for (i = 0; i < kv_size(result); i++) {
+                                       rspamd_word_t tok = kv_A(result, i);
+
+                                       /* Adjust pointers to point to the original text */
+                                       gsize offset_in_segment = tok.original.begin - (text + pos);
+                                       if (offset_in_segment < segment_len) {
+                                               tok.original.begin = text + pos + offset_in_segment;
+                                               /* Ensure we don't go past the exception boundary */
+                                               if (tok.original.begin + tok.original.len <= text + ex->pos) {
+                                                       kv_push(rspamd_word_t, *words, tok);
+                                               }
                                        }
                                }
 
@@ -411,7 +402,7 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
                }
 
                /* Add exception as a special token */
-               rspamd_stat_token_t ex_tok;
+               rspamd_word_t ex_tok;
                memset(&ex_tok, 0, sizeof(ex_tok));
 
                if (ex->type == RSPAMD_EXCEPTION_URL) {
@@ -423,7 +414,7 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
                        ex_tok.original.len = ex->len;
                }
                ex_tok.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-               g_array_append_val(words, ex_tok);
+               kv_push(rspamd_word_t, *words, ex_tok);
 
                /* Move past exception */
                pos = ex->pos + ex->len;
@@ -432,23 +423,19 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
 
        /* Process remaining text after last exception */
        if (pos < len) {
-               result.positions = NULL;
-               result.count = 0;
+               kv_init(result);
 
                ret = tokenizer->api->tokenize(text + pos, len - pos, &result);
-               if (ret == 0 && result.positions) {
-                       /* Convert positions to tokens, adjusting for segment offset */
-                       for (i = 0; i < result.count; i++) {
-                               rspamd_stat_token_t tok;
-                               unsigned int start = result.positions[i * 2] + pos;
-                               unsigned int length = result.positions[i * 2 + 1];
-
-                               if (start + length <= len) {
-                                       memset(&tok, 0, sizeof(tok));
-                                       tok.original.begin = text + start;
-                                       tok.original.len = length;
-                                       tok.flags = RSPAMD_STAT_TOKEN_FLAG_TEXT | RSPAMD_STAT_TOKEN_FLAG_UTF;
-                                       g_array_append_val(words, tok);
+               if (ret == 0 && result.a) {
+                       /* Copy tokens from result, adjusting positions for segment offset */
+                       for (i = 0; i < kv_size(result); i++) {
+                               rspamd_word_t tok = kv_A(result, i);
+
+                               /* Adjust pointers to point to the original text */
+                               gsize offset_in_segment = tok.original.begin - (text + pos);
+                               if (offset_in_segment < (len - pos)) {
+                                       tok.original.begin = text + pos + offset_in_segment;
+                                       kv_push(rspamd_word_t, *words, tok);
                                }
                        }
 
index 4667976fbe308bd5f42da5fed13ca0483b637cfd..1c5b0a4c895fb15d4f8388bf8a70b7056a311546 100644 (file)
@@ -37,8 +37,8 @@
 
 #include <math.h>
 
-typedef gboolean (*token_get_function)(rspamd_stat_token_t *buf, char const **pos,
-                                                                          rspamd_stat_token_t *token,
+typedef gboolean (*token_get_function)(rspamd_word_t *buf, char const **pos,
+                                                                          rspamd_word_t *token,
                                                                           GList **exceptions, gsize *rl, gboolean check_signature);
 
 const char t_delimiters[256] = {
@@ -71,8 +71,8 @@ const char t_delimiters[256] = {
 
 /* Get next word from specified f_str_t buf */
 static gboolean
-rspamd_tokenizer_get_word_raw(rspamd_stat_token_t *buf,
-                                                         char const **cur, rspamd_stat_token_t *token,
+rspamd_tokenizer_get_word_raw(rspamd_word_t *buf,
+                                                         char const **cur, rspamd_word_t *token,
                                                          GList **exceptions, gsize *rl, gboolean unused)
 {
        gsize remain, pos;
@@ -166,7 +166,7 @@ rspamd_tokenize_check_limit(gboolean decay,
                                                        unsigned int nwords,
                                                        uint64_t *hv,
                                                        uint64_t *prob,
-                                                       const rspamd_stat_token_t *token,
+                                                       const rspamd_word_t *token,
                                                        gssize remain,
                                                        gssize total)
 {
@@ -244,9 +244,9 @@ rspamd_utf_word_valid(const unsigned char *text, const unsigned char *end,
        } while (0)
 
 static inline void
-rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
+rspamd_tokenize_exception(struct rspamd_process_exception *ex, rspamd_words_t *res)
 {
-       rspamd_stat_token_t token;
+       rspamd_word_t token;
 
        memset(&token, 0, sizeof(token));
 
@@ -255,7 +255,7 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
                token.original.len = sizeof("!!EX!!") - 1;
                token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
 
-               g_array_append_val(res, token);
+               kv_push_safe(rspamd_word_t, *res, token, exception_error);
                token.flags = 0;
        }
        else if (ex->type == RSPAMD_EXCEPTION_URL) {
@@ -273,28 +273,33 @@ rspamd_tokenize_exception(struct rspamd_process_exception *ex, GArray *res)
                }
 
                token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-               g_array_append_val(res, token);
+               kv_push_safe(rspamd_word_t, *res, token, exception_error);
                token.flags = 0;
        }
+       return;
+
+exception_error:
+       /* On error, just skip this exception token */
+       return;
 }
 
 
-GArray *
+rspamd_words_t *
 rspamd_tokenize_text(const char *text, gsize len,
                                         const UText *utxt,
                                         enum rspamd_tokenize_type how,
                                         struct rspamd_config *cfg,
                                         GList *exceptions,
                                         uint64_t *hash,
-                                        GArray *cur_words,
+                                        rspamd_words_t *output_kvec,
                                         rspamd_mempool_t *pool)
 {
-       rspamd_stat_token_t token, buf;
+       rspamd_word_t token, buf;
        const char *pos = NULL;
        gsize l = 0;
-       GArray *res;
+       rspamd_words_t *res;
        GList *cur = exceptions;
-       unsigned int min_len = 0, max_len = 0, word_decay = 0, initial_size = 128;
+       unsigned int min_len = 0, max_len = 0, word_decay = 0;
        uint64_t hv = 0;
        gboolean decay = FALSE, long_text_mode = FALSE;
        uint64_t prob = 0;
@@ -307,7 +312,7 @@ rspamd_tokenize_text(const char *text, gsize len,
        const char *detected_lang = NULL;
 
        if (text == NULL) {
-               return cur_words;
+               return output_kvec;
        }
 
        if (len > long_text_limit) {
@@ -328,17 +333,16 @@ rspamd_tokenize_text(const char *text, gsize len,
                min_len = cfg->min_word_len;
                max_len = cfg->max_word_len;
                word_decay = cfg->words_decay;
-               initial_size = word_decay * 2;
        }
 
-       if (!cur_words) {
-               res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_stat_token_t),
-                                                               initial_size);
-       }
-       else {
-               res = cur_words;
+       if (!output_kvec) {
+               /* Should not happen in normal usage */
+               return NULL;
        }
 
+       res = output_kvec;
+       kv_init(*res);
+
        /* Try custom tokenizers first if we're in UTF mode */
        if (cfg && cfg->tokenizer_manager && how == RSPAMD_TOKENIZE_UTF && utxt != NULL) {
                custom_tok = rspamd_tokenizer_manager_detect(
@@ -350,18 +354,22 @@ rspamd_tokenize_text(const char *text, gsize len,
 
                if (custom_tok && custom_confidence >= custom_tok->min_confidence) {
                        /* Use custom tokenizer with exception handling */
-                       GArray *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions(
+                       rspamd_tokenizer_result_t *custom_res = rspamd_custom_tokenizer_tokenize_with_exceptions(
                                custom_tok, text, len, exceptions, pool);
 
                        if (custom_res) {
                                msg_debug_pool("using custom tokenizer %s (confidence: %.2f) for text tokenization",
                                                           custom_tok->name, custom_confidence);
 
+                               /* Copy custom tokenizer results to output kvec */
+                               for (unsigned int i = 0; i < kv_size(*custom_res); i++) {
+                                       kv_push_safe(rspamd_word_t, *res, kv_A(*custom_res, i), custom_tokenizer_error);
+                               }
+
                                /* Calculate hash if needed */
-                               if (hash && custom_res->len > 0) {
-                                       unsigned int i;
-                                       for (i = 0; i < custom_res->len; i++) {
-                                               rspamd_stat_token_t *t = &g_array_index(custom_res, rspamd_stat_token_t, i);
+                               if (hash && kv_size(*res) > 0) {
+                                       for (unsigned int i = 0; i < kv_size(*res); i++) {
+                                               rspamd_word_t *t = &kv_A(*res, i);
                                                if (t->original.len >= sizeof(uint64_t)) {
                                                        uint64_t tmp;
                                                        memcpy(&tmp, t->original.begin, sizeof(tmp));
@@ -371,14 +379,7 @@ rspamd_tokenize_text(const char *text, gsize len,
                                        *hash = mum_hash_finish(hv);
                                }
 
-                               /* If we had existing words, append to them */
-                               if (cur_words && custom_res != cur_words) {
-                                       g_array_append_vals(cur_words, custom_res->data, custom_res->len);
-                                       g_array_free(custom_res, TRUE);
-                                       return cur_words;
-                               }
-
-                               return custom_res;
+                               return res;
                        }
                        else {
                                msg_warn_pool("custom tokenizer %s failed to tokenize text, falling back to default",
@@ -396,7 +397,7 @@ rspamd_tokenize_text(const char *text, gsize len,
                        }
 
                        if (token.original.len > 0 &&
-                               rspamd_tokenize_check_limit(decay, word_decay, res->len,
+                               rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res),
                                                                                        &hv, &prob, &token, pos - text, len)) {
                                if (!decay) {
                                        decay = TRUE;
@@ -408,28 +409,28 @@ rspamd_tokenize_text(const char *text, gsize len,
                        }
 
                        if (long_text_mode) {
-                               if ((res->len + 1) % 16 == 0) {
+                               if ((kv_size(*res) + 1) % 16 == 0) {
                                        ev_tstamp now = ev_time();
 
                                        if (now - start > max_exec_time) {
                                                msg_warn_pool_check(
                                                        "too long time has been spent on tokenization:"
-                                                       " %.1f ms, limit is %.1f ms; %d words added so far",
+                                                       " %.1f ms, limit is %.1f ms; %z words added so far",
                                                        (now - start) * 1e3, max_exec_time * 1e3,
-                                                       res->len);
+                                                       kv_size(*res));
 
                                                goto end;
                                        }
                                }
                        }
 
-                       g_array_append_val(res, token);
+                       kv_push_safe(rspamd_word_t, *res, token, tokenize_error);
 
-                       if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
+                       if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) {
                                /* Due to bug in glib ! */
                                msg_err_pool_check(
-                                       "too many words found: %d, stop tokenization to avoid DoS",
-                                       res->len);
+                                       "too many words found: %z, stop tokenization to avoid DoS",
+                                       kv_size(*res));
 
                                goto end;
                        }
@@ -576,7 +577,7 @@ rspamd_tokenize_text(const char *text, gsize len,
                                }
 
                                if (token.original.len > 0 &&
-                                       rspamd_tokenize_check_limit(decay, word_decay, res->len,
+                                       rspamd_tokenize_check_limit(decay, word_decay, kv_size(*res),
                                                                                                &hv, &prob, &token, p, len)) {
                                        if (!decay) {
                                                decay = TRUE;
@@ -589,15 +590,15 @@ rspamd_tokenize_text(const char *text, gsize len,
 
                        if (token.original.len > 0) {
                                /* Additional check for number of words */
-                               if (((gsize) res->len) * sizeof(token) > (0x1ull << 30u)) {
+                               if (kv_size(*res) * sizeof(token) > (0x1ull << 30u)) {
                                        /* Due to bug in glib ! */
-                                       msg_err("too many words found: %d, stop tokenization to avoid DoS",
-                                                       res->len);
+                                       msg_err("too many words found: %z, stop tokenization to avoid DoS",
+                                                       kv_size(*res));
 
                                        goto end;
                                }
 
-                               g_array_append_val(res, token);
+                               kv_push_safe(rspamd_word_t, *res, token, tokenize_error);
                        }
 
                        /* Also check for long text mode */
@@ -605,15 +606,15 @@ rspamd_tokenize_text(const char *text, gsize len,
                                /* Check time each 128 words added */
                                const int words_check_mask = 0x7F;
 
-                               if ((res->len & words_check_mask) == words_check_mask) {
+                               if ((kv_size(*res) & words_check_mask) == words_check_mask) {
                                        ev_tstamp now = ev_time();
 
                                        if (now - start > max_exec_time) {
                                                msg_warn_pool_check(
                                                        "too long time has been spent on tokenization:"
-                                                       " %.1f ms, limit is %.1f ms; %d words added so far",
+                                                       " %.1f ms, limit is %.1f ms; %z words added so far",
                                                        (now - start) * 1e3, max_exec_time * 1e3,
-                                                       res->len);
+                                                       kv_size(*res));
 
                                                goto end;
                                        }
@@ -643,8 +644,14 @@ end:
        }
 
        return res;
+
+tokenize_error:
+custom_tokenizer_error:
+       msg_err_pool("failed to allocate memory for tokenization");
+       return res;
 }
 
+
 #undef SHIFT_EX
 
 static void
@@ -678,32 +685,38 @@ rspamd_add_metawords_from_str(const char *beg, gsize len,
 #endif
        }
 
+       /* Initialize meta_words kvec if not already done */
+       if (!task->meta_words.a) {
+               kv_init(task->meta_words);
+       }
+
        if (valid_utf) {
                utext_openUTF8(&utxt,
                                           beg,
                                           len,
                                           &uc_err);
 
-               task->meta_words = rspamd_tokenize_text(beg, len,
-                                                                                               &utxt, RSPAMD_TOKENIZE_UTF,
-                                                                                               task->cfg, NULL, NULL,
-                                                                                               task->meta_words,
-                                                                                               task->task_pool);
+               rspamd_tokenize_text(beg, len,
+                                                        &utxt, RSPAMD_TOKENIZE_UTF,
+                                                        task->cfg, NULL, NULL,
+                                                        &task->meta_words,
+                                                        task->task_pool);
 
                utext_close(&utxt);
        }
        else {
-               task->meta_words = rspamd_tokenize_text(beg, len,
-                                                                                               NULL, RSPAMD_TOKENIZE_RAW,
-                                                                                               task->cfg, NULL, NULL, task->meta_words,
-                                                                                               task->task_pool);
+               rspamd_tokenize_text(beg, len,
+                                                        NULL, RSPAMD_TOKENIZE_RAW,
+                                                        task->cfg, NULL, NULL,
+                                                        &task->meta_words,
+                                                        task->task_pool);
        }
 }
 
 void rspamd_tokenize_meta_words(struct rspamd_task *task)
 {
        unsigned int i = 0;
-       rspamd_stat_token_t *tok;
+       rspamd_word_t *tok;
 
        if (MESSAGE_FIELD(task, subject)) {
                rspamd_add_metawords_from_str(MESSAGE_FIELD(task, subject),
@@ -720,7 +733,7 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task)
                }
        }
 
-       if (task->meta_words != NULL) {
+       if (task->meta_words.a) {
                const char *language = NULL;
 
                if (MESSAGE_FIELD(task, text_parts) &&
@@ -733,12 +746,12 @@ void rspamd_tokenize_meta_words(struct rspamd_task *task)
                        }
                }
 
-               rspamd_normalize_words(task->meta_words, task->task_pool);
-               rspamd_stem_words(task->meta_words, task->task_pool, language,
+               rspamd_normalize_words(&task->meta_words, task->task_pool);
+               rspamd_stem_words(&task->meta_words, task->task_pool, language,
                                                  task->lang_det);
 
-               for (i = 0; i < task->meta_words->len; i++) {
-                       tok = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+               for (i = 0; i < kv_size(task->meta_words); i++) {
+                       tok = &kv_A(task->meta_words, i);
                        tok->flags |= RSPAMD_STAT_TOKEN_FLAG_HEADER;
                }
        }
@@ -812,7 +825,7 @@ rspamd_ucs32_to_normalised(rspamd_stat_token_t *tok,
        tok->normalized.begin = dest;
 }
 
-void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool)
+void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool)
 {
        UErrorCode uc_err = U_ZERO_ERROR;
        UConverter *utf8_converter;
@@ -911,25 +924,27 @@ void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *po
        }
 }
 
-void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool)
+
+void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool)
 {
-       rspamd_stat_token_t *tok;
+       rspamd_word_t *tok;
        unsigned int i;
 
-       for (i = 0; i < words->len; i++) {
-               tok = &g_array_index(words, rspamd_stat_token_t, i);
+       for (i = 0; i < kv_size(*words); i++) {
+               tok = &kv_A(*words, i);
                rspamd_normalize_single_word(tok, pool);
        }
 }
 
-void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+
+void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool,
                                           const char *language,
                                           struct rspamd_lang_detector *lang_detector)
 {
        static GHashTable *stemmers = NULL;
        struct sb_stemmer *stem = NULL;
        unsigned int i;
-       rspamd_stat_token_t *tok;
+       rspamd_word_t *tok;
        char *dest;
        gsize dlen;
 
@@ -962,8 +977,8 @@ void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
                        stem = NULL;
                }
        }
-       for (i = 0; i < words->len; i++) {
-               tok = &g_array_index(words, rspamd_stat_token_t, i);
+       for (i = 0; i < kv_size(*words); i++) {
+               tok = &kv_A(*words, i);
 
                if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
                        if (stem) {
index f3066b5cf1a98e91f78b542d45cccf323085402c..bb0bb54e24f0b28b32e4dd7ccfd094fff3b76c61 100644 (file)
@@ -22,6 +22,7 @@
 #include "fstring.h"
 #include "rspamd.h"
 #include "stat_api.h"
+#include "libserver/word.h"
 
 #include <unicode/utext.h>
 
@@ -43,7 +44,7 @@ struct rspamd_stat_tokenizer {
 
        int (*tokenize_func)(struct rspamd_stat_ctx *ctx,
                                                 struct rspamd_task *task,
-                                                GArray *words,
+                                                rspamd_words_t *words,
                                                 gboolean is_utf,
                                                 const char *prefix,
                                                 GPtrArray *result);
@@ -59,20 +60,20 @@ enum rspamd_tokenize_type {
 int token_node_compare_func(gconstpointer a, gconstpointer b);
 
 
-/* Tokenize text into array of words (rspamd_stat_token_t type) */
-GArray *rspamd_tokenize_text(const char *text, gsize len,
-                                                        const UText *utxt,
-                                                        enum rspamd_tokenize_type how,
-                                                        struct rspamd_config *cfg,
-                                                        GList *exceptions,
-                                                        uint64_t *hash,
-                                                        GArray *cur_words,
-                                                        rspamd_mempool_t *pool);
+/* Tokenize text into kvec of words (rspamd_word_t type) */
+rspamd_words_t *rspamd_tokenize_text(const char *text, gsize len,
+                                                                        const UText *utxt,
+                                                                        enum rspamd_tokenize_type how,
+                                                                        struct rspamd_config *cfg,
+                                                                        GList *exceptions,
+                                                                        uint64_t *hash,
+                                                                        rspamd_words_t *output_kvec,
+                                                                        rspamd_mempool_t *pool);
 
 /* OSB tokenize function */
 int rspamd_tokenizer_osb(struct rspamd_stat_ctx *ctx,
                                                 struct rspamd_task *task,
-                                                GArray *words,
+                                                rspamd_words_t *words,
                                                 gboolean is_utf,
                                                 const char *prefix,
                                                 GPtrArray *result);
@@ -83,11 +84,11 @@ gpointer rspamd_tokenizer_osb_get_config(rspamd_mempool_t *pool,
 
 struct rspamd_lang_detector;
 
-void rspamd_normalize_single_word(rspamd_stat_token_t *tok, rspamd_mempool_t *pool);
+void rspamd_normalize_single_word(rspamd_word_t *tok, rspamd_mempool_t *pool);
 
-void rspamd_normalize_words(GArray *words, rspamd_mempool_t *pool);
-
-void rspamd_stem_words(GArray *words, rspamd_mempool_t *pool,
+/* Word processing functions */
+void rspamd_normalize_words(rspamd_words_t *words, rspamd_mempool_t *pool);
+void rspamd_stem_words(rspamd_words_t *words, rspamd_mempool_t *pool,
                                           const char *language,
                                           struct rspamd_lang_detector *lang_detector);
 
index 5fe110eb8428987445f3fa368b1edb9e0e077e4d..c69c4229270b525d3181fd076b99f88e5397779a 100644 (file)
@@ -18,6 +18,7 @@
 #include "cryptobox.h"
 #include "images.h"
 #include "libstat/stat_api.h"
+#include "libserver/word.h"
 
 #define SHINGLES_WINDOW 3
 #define SHINGLES_KEY_SIZE rspamd_cryptobox_SIPKEYBYTES
@@ -112,7 +113,7 @@ rspamd_shingles_get_keys_cached(const unsigned char key[SHINGLES_KEY_SIZE])
 }
 
 struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
-       rspamd_shingles_from_text(GArray *input,
+       rspamd_shingles_from_text(rspamd_words_t *input,
                                                          const unsigned char key[16],
                                                          rspamd_mempool_t *pool,
                                                          rspamd_shingles_filter filter,
@@ -123,12 +124,16 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
        uint64_t **hashes;
        unsigned char **keys;
        rspamd_fstring_t *row;
-       rspamd_stat_token_t *word;
+       rspamd_word_t *word;
        uint64_t val;
        int i, j, k;
        gsize hlen, ilen = 0, beg = 0, widx = 0;
        enum rspamd_cryptobox_fast_hash_type ht;
 
+       if (!input || !input->a) {
+               return NULL;
+       }
+
        if (pool != NULL) {
                res = rspamd_mempool_alloc(pool, sizeof(*res));
        }
@@ -138,10 +143,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
 
        row = rspamd_fstring_sized_new(256);
 
-       for (i = 0; i < input->len; i++) {
-               word = &g_array_index(input, rspamd_stat_token_t, i);
+       for (i = 0; i < kv_size(*input); i++) {
+               word = &kv_A(*input, i);
 
-               if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) {
+               if (!((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0)) {
                        ilen++;
                }
        }
@@ -162,10 +167,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
                                for (j = beg; j < i; j++) {
 
                                        word = NULL;
-                                       while (widx < input->len) {
-                                               word = &g_array_index(input, rspamd_stat_token_t, widx);
+                                       while (widx < kv_size(*input)) {
+                                               word = &kv_A(*input, widx);
 
-                                               if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
+                                               if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) {
                                                        widx++;
                                                }
                                                else {
@@ -237,10 +242,10 @@ struct rspamd_shingle *RSPAMD_OPTIMIZE("unroll-loops")
 
                                        word = NULL;
 
-                                       while (widx < input->len) {
-                                               word = &g_array_index(input, rspamd_stat_token_t, widx);
+                                       while (widx < kv_size(*input)) {
+                                               word = &kv_A(*input, widx);
 
-                                               if ((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0) {
+                                               if ((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0) {
                                                        widx++;
                                                }
                                                else {
index fe6f16cf85636024ce93e18ee3437b86d800b380..1ab2c684298d73109ad8531985d0be45c956b247 100644 (file)
@@ -18,6 +18,7 @@
 
 #include "config.h"
 #include "mem_pool.h"
+#include "libserver/word.h"
 
 #define RSPAMD_SHINGLE_SIZE 32
 
@@ -48,14 +49,14 @@ typedef uint64_t (*rspamd_shingles_filter)(uint64_t *input, gsize count,
 /**
  * Generate shingles from the input of fixed size strings using lemmatizer
  * if needed
- * @param input array of `rspamd_fstring_t`
+ * @param input kvec of `rspamd_word_t`
  * @param key secret key used to generate shingles
  * @param pool pool to allocate shingles array
  * @param filter hashes filtering function
  * @param filterd opaque data for filtering function
  * @return shingles array
  */
-struct rspamd_shingle *rspamd_shingles_from_text(GArray *input,
+struct rspamd_shingle *rspamd_shingles_from_text(rspamd_words_t *input,
                                                                                                 const unsigned char key[16],
                                                                                                 rspamd_mempool_t *pool,
                                                                                                 rspamd_shingles_filter filter,
index 3a0f1a06c403fbf9070e803b7c1df782574b239b..f3622868027f4c74c3abc4a6c313c68e35182e59 100644 (file)
@@ -2401,7 +2401,7 @@ rspamd_lua_try_load_redis(lua_State *L, const ucl_object_t *obj,
        return FALSE;
 }
 
-void rspamd_lua_push_full_word(lua_State *L, rspamd_stat_token_t *w)
+void rspamd_lua_push_full_word(lua_State *L, rspamd_word_t *w)
 {
        int fl_cnt;
 
@@ -2521,6 +2521,54 @@ int rspamd_lua_push_words(lua_State *L, GArray *words,
        return 1;
 }
 
+int rspamd_lua_push_words_kvec(lua_State *L, rspamd_words_t *words,
+                                                          enum rspamd_lua_words_type how)
+{
+       rspamd_word_t *w;
+       unsigned int i, cnt;
+
+       if (!words || !words->a) {
+               lua_createtable(L, 0, 0);
+               return 1;
+       }
+
+       lua_createtable(L, kv_size(*words), 0);
+
+       for (i = 0, cnt = 1; i < kv_size(*words); i++) {
+               w = &kv_A(*words, i);
+
+               switch (how) {
+               case RSPAMD_LUA_WORDS_STEM:
+                       if (w->stemmed.len > 0) {
+                               lua_pushlstring(L, w->stemmed.begin, w->stemmed.len);
+                               lua_rawseti(L, -2, cnt++);
+                       }
+                       break;
+               case RSPAMD_LUA_WORDS_NORM:
+                       if (w->normalized.len > 0) {
+                               lua_pushlstring(L, w->normalized.begin, w->normalized.len);
+                               lua_rawseti(L, -2, cnt++);
+                       }
+                       break;
+               case RSPAMD_LUA_WORDS_RAW:
+                       if (w->original.len > 0) {
+                               lua_pushlstring(L, w->original.begin, w->original.len);
+                               lua_rawseti(L, -2, cnt++);
+                       }
+                       break;
+               case RSPAMD_LUA_WORDS_FULL:
+                       rspamd_lua_push_full_word(L, w);
+                       /* Push to the resulting vector */
+                       lua_rawseti(L, -2, cnt++);
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       return 1;
+}
+
 char *
 rspamd_lua_get_module_name(lua_State *L)
 {
@@ -2658,4 +2706,4 @@ int rspamd_lua_geti(lua_State *L, int pos, int i)
 
        return lua_type(L, -1);
 }
-#endif
\ No newline at end of file
+#endif
index 5819da8cbee8249f1f62a0f43605fd906bbaf7b7..d494f09231135008e1625543e1d749dbb627a6eb 100644 (file)
@@ -539,7 +539,7 @@ enum lua_logger_escape_type {
 * @return
 */
 gsize lua_logger_out(lua_State *L, int pos, char *outbuf, gsize len,
-                                                 enum lua_logger_escape_type esc_type);
+                                        enum lua_logger_escape_type esc_type);
 
 /**
 * Safely checks userdata to match specified class
@@ -632,7 +632,7 @@ struct rspamd_stat_token_s;
 * @param L
 * @param word
 */
-void rspamd_lua_push_full_word(lua_State *L, struct rspamd_stat_token_s *word);
+void rspamd_lua_push_full_word(lua_State *L, rspamd_word_t *word);
 
 enum rspamd_lua_words_type {
        RSPAMD_LUA_WORDS_STEM = 0,
@@ -651,6 +651,9 @@ enum rspamd_lua_words_type {
 int rspamd_lua_push_words(lua_State *L, GArray *words,
                                                  enum rspamd_lua_words_type how);
 
+int rspamd_lua_push_words_kvec(lua_State *L, rspamd_words_t *words,
+                                                          enum rspamd_lua_words_type how);
+
 /**
 * Returns newly allocated name for caller module name
 * @param L
index 07dba9c931f6be8fb74b80bfce35480d1ba39e2e..982b10d901176dba24f33a0a04268f62a4dce93b 100644 (file)
@@ -901,7 +901,7 @@ lua_textpart_get_words_count(lua_State *L)
                return 1;
        }
 
-       if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+       if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
                lua_pushinteger(L, 0);
        }
        else {
@@ -943,7 +943,7 @@ lua_textpart_get_words(lua_State *L)
                return luaL_error(L, "invalid arguments");
        }
 
-       if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+       if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
                lua_createtable(L, 0, 0);
        }
        else {
@@ -957,7 +957,7 @@ lua_textpart_get_words(lua_State *L)
                        }
                }
 
-               return rspamd_lua_push_words(L, part->utf_words, how);
+               return rspamd_lua_push_words_kvec(L, &part->utf_words, how);
        }
 
        return 1;
@@ -976,7 +976,7 @@ lua_textpart_filter_words(lua_State *L)
                return luaL_error(L, "invalid arguments");
        }
 
-       if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+       if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
                lua_createtable(L, 0, 0);
        }
        else {
@@ -998,9 +998,8 @@ lua_textpart_filter_words(lua_State *L)
 
                lua_createtable(L, 8, 0);
 
-               for (i = 0, cnt = 1; i < part->utf_words->len; i++) {
-                       rspamd_stat_token_t *w = &g_array_index(part->utf_words,
-                                                                                                       rspamd_stat_token_t, i);
+               for (i = 0, cnt = 1; i < kv_size(part->utf_words); i++) {
+                       rspamd_word_t *w = &kv_A(part->utf_words, i);
 
                        switch (how) {
                        case RSPAMD_LUA_WORDS_STEM:
@@ -1194,13 +1193,13 @@ struct lua_shingle_filter_cbdata {
        rspamd_mempool_t *pool;
 };
 
-#define STORE_TOKEN(i, t)                                                     \
-       do {                                                                      \
-               if ((i) < part->utf_words->len) {                                     \
-                       word = &g_array_index(part->utf_words, rspamd_stat_token_t, (i)); \
-                       sd->t.begin = word->stemmed.begin;                                \
-                       sd->t.len = word->stemmed.len;                                    \
-               }                                                                     \
+#define STORE_TOKEN(i, t)                       \
+       do {                                        \
+               if ((i) < kv_size(part->utf_words)) {   \
+                       word = &kv_A(part->utf_words, (i)); \
+                       sd->t.begin = word->stemmed.begin;  \
+                       sd->t.len = word->stemmed.len;      \
+               }                                       \
        } while (0)
 
 static uint64_t
@@ -1210,7 +1209,7 @@ lua_shingles_filter(uint64_t *input, gsize count,
        uint64_t minimal = G_MAXUINT64;
        gsize i, min_idx = 0;
        struct lua_shingle_data *sd;
-       rspamd_stat_token_t *word;
+       rspamd_word_t *word;
        struct lua_shingle_filter_cbdata *cbd = (struct lua_shingle_filter_cbdata *) ud;
        struct rspamd_mime_text_part *part;
 
@@ -1248,7 +1247,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
        unsigned int i;
        struct lua_shingle_data *sd;
        rspamd_cryptobox_hash_state_t st;
-       rspamd_stat_token_t *word;
+       rspamd_word_t *word;
        struct lua_shingle_filter_cbdata cbd;
 
 
@@ -1256,7 +1255,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
                return luaL_error(L, "invalid arguments");
        }
 
-       if (IS_TEXT_PART_EMPTY(part) || part->utf_words == NULL) {
+       if (IS_TEXT_PART_EMPTY(part) || !part->utf_words.a) {
                lua_pushnil(L);
                lua_pushnil(L);
        }
@@ -1269,8 +1268,8 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
                /* Calculate direct hash */
                rspamd_cryptobox_hash_init(&st, key, rspamd_cryptobox_HASHKEYBYTES);
 
-               for (i = 0; i < part->utf_words->len; i++) {
-                       word = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+               for (i = 0; i < kv_size(part->utf_words); i++) {
+                       word = &kv_A(part->utf_words, i);
                        rspamd_cryptobox_hash_update(&st,
                                                                                 word->stemmed.begin, word->stemmed.len);
                }
@@ -1283,7 +1282,7 @@ lua_textpart_get_fuzzy_hashes(lua_State *L)
 
                cbd.pool = pool;
                cbd.part = part;
-               sgl = rspamd_shingles_from_text(part->utf_words, key,
+               sgl = rspamd_shingles_from_text(&part->utf_words, key,
                                                                                pool, lua_shingles_filter, &cbd, RSPAMD_SHINGLES_MUMHASH);
 
                if (sgl == NULL) {
index 97f9c496e4ed41a656336bed49c5e6aabdf63b03..0b1473b61c5b9f098f6c0395fdeec604933fc0df 100644 (file)
@@ -6943,7 +6943,7 @@ lua_task_get_meta_words(lua_State *L)
                return luaL_error(L, "invalid arguments");
        }
 
-       if (task->meta_words == NULL) {
+       if (!task->meta_words.a) {
                lua_createtable(L, 0, 0);
        }
        else {
@@ -6967,7 +6967,7 @@ lua_task_get_meta_words(lua_State *L)
                        }
                }
 
-               return rspamd_lua_push_words(L, task->meta_words, how);
+               return rspamd_lua_push_words_kvec(L, &task->meta_words, how);
        }
 
        return 1;
@@ -7039,6 +7039,76 @@ lua_lookup_words_array(lua_State *L,
        return nmatched;
 }
 
+static unsigned int
+lua_lookup_words_kvec(lua_State *L,
+                                         int cbpos,
+                                         struct rspamd_task *task,
+                                         struct rspamd_lua_map *map,
+                                         rspamd_words_t *words)
+{
+       rspamd_word_t *tok;
+       unsigned int i, nmatched = 0;
+       int err_idx;
+       gboolean matched;
+       const char *key;
+       gsize keylen;
+
+       if (!words || !words->a) {
+               return 0;
+       }
+
+       for (i = 0; i < kv_size(*words); i++) {
+               tok = &kv_A(*words, i);
+
+               matched = FALSE;
+
+               if (tok->normalized.len == 0) {
+                       continue;
+               }
+
+               key = tok->normalized.begin;
+               keylen = tok->normalized.len;
+
+               switch (map->type) {
+               case RSPAMD_LUA_MAP_SET:
+               case RSPAMD_LUA_MAP_HASH:
+                       /* We know that tok->normalized is zero terminated in fact */
+                       if (rspamd_match_hash_map(map->data.hash, key, keylen)) {
+                               matched = TRUE;
+                       }
+                       break;
+               case RSPAMD_LUA_MAP_REGEXP:
+               case RSPAMD_LUA_MAP_REGEXP_MULTIPLE:
+                       if (rspamd_match_regexp_map_single(map->data.re_map, key,
+                                                                                          keylen)) {
+                               matched = TRUE;
+                       }
+                       break;
+               default:
+                       g_assert_not_reached();
+                       break;
+               }
+
+               if (matched) {
+                       nmatched++;
+
+                       lua_pushcfunction(L, &rspamd_lua_traceback);
+                       err_idx = lua_gettop(L);
+                       lua_pushvalue(L, cbpos); /* Function */
+                       rspamd_lua_push_full_word(L, tok);
+
+                       if (lua_pcall(L, 1, 0, err_idx) != 0) {
+                               msg_err_task("cannot call callback function for lookup words: %s",
+                                                        lua_tostring(L, -1));
+                       }
+
+                       lua_settop(L, err_idx - 1);
+               }
+       }
+
+       return nmatched;
+}
+
 static int
 lua_task_lookup_words(lua_State *L)
 {
@@ -7062,13 +7132,13 @@ lua_task_lookup_words(lua_State *L)
 
        PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp)
        {
-               if (tp->utf_words) {
-                       matches += lua_lookup_words_array(L, 3, task, map, tp->utf_words);
+               if (tp->utf_words.a) {
+                       matches += lua_lookup_words_kvec(L, 3, task, map, &tp->utf_words);
                }
        }
 
-       if (task->meta_words) {
-               matches += lua_lookup_words_array(L, 3, task, map, task->meta_words);
+       if (task->meta_words.a) {
+               matches += lua_lookup_words_kvec(L, 3, task, map, &task->meta_words);
        }
 
        lua_pushinteger(L, matches);
index a5c7cb899822b967accf9938048640872a8fecf8..c82748862db7401bf2bd6fbb5d1dab319548ec8e 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright 2024 Vsevolod Stakhov
+ * Copyright 2025 Vsevolod Stakhov
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1696,7 +1696,7 @@ rspamd_can_alias_latin(int ch)
 
 static double
 rspamd_chartable_process_word_utf(struct rspamd_task *task,
-                                                                 rspamd_stat_token_t *w,
+                                                                 rspamd_word_t *w,
                                                                  gboolean is_url,
                                                                  unsigned int *ncap,
                                                                  struct chartable_ctx *chartable_module_ctx,
@@ -1842,7 +1842,7 @@ rspamd_chartable_process_word_utf(struct rspamd_task *task,
 
 static double
 rspamd_chartable_process_word_ascii(struct rspamd_task *task,
-                                                                       rspamd_stat_token_t *w,
+                                                                       rspamd_word_t *w,
                                                                        gboolean is_url,
                                                                        struct chartable_ctx *chartable_module_ctx)
 {
@@ -1931,17 +1931,17 @@ rspamd_chartable_process_part(struct rspamd_task *task,
                                                          struct chartable_ctx *chartable_module_ctx,
                                                          gboolean ignore_diacritics)
 {
-       rspamd_stat_token_t *w;
+       rspamd_word_t *w;
        unsigned int i, ncap = 0;
        double cur_score = 0.0;
 
-       if (part == nullptr || part->utf_words == nullptr ||
-               part->utf_words->len == 0 || part->nwords == 0) {
+       if (part == nullptr || part->utf_words.a == nullptr ||
+               kv_size(part->utf_words) == 0 || part->nwords == 0) {
                return FALSE;
        }
 
-       for (i = 0; i < part->utf_words->len; i++) {
-               w = &g_array_index(part->utf_words, rspamd_stat_token_t, i);
+       for (i = 0; i < kv_size(part->utf_words); i++) {
+               w = &kv_A(part->utf_words, i);
 
                if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
 
@@ -2015,13 +2015,13 @@ chartable_symbol_callback(struct rspamd_task *task,
                ignore_diacritics = TRUE;
        }
 
-       if (task->meta_words != nullptr && task->meta_words->len > 0) {
-               rspamd_stat_token_t *w;
+       if (task->meta_words.a && kv_size(task->meta_words) > 0) {
+               rspamd_word_t *w;
                double cur_score = 0;
-               gsize arlen = task->meta_words->len;
+               gsize arlen = kv_size(task->meta_words);
 
                for (i = 0; i < arlen; i++) {
-                       w = &g_array_index(task->meta_words, rspamd_stat_token_t, i);
+                       w = &kv_A(task->meta_words, i);
                        cur_score += rspamd_chartable_process_word_utf(task, w, FALSE,
                                                                                                                   nullptr, chartable_module_ctx, ignore_diacritics);
                }
index 85ea3b00c4364d7d10f85f6cf8a9c68ca37577cc..7dd5162ac720f00728a50df5b4f865d9f4f9842b 100644 (file)
@@ -1431,10 +1431,10 @@ fuzzy_io_fin(void *ud)
        close(session->fd);
 }
 
-static GArray *
+static rspamd_words_t *
 fuzzy_preprocess_words(struct rspamd_mime_text_part *part, rspamd_mempool_t *pool)
 {
-       return part->utf_words;
+       return &part->utf_words;
 }
 
 static void
@@ -1861,7 +1861,7 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task,
        unsigned int i;
        rspamd_cryptobox_hash_state_t st;
        rspamd_stat_token_t *word;
-       GArray *words;
+       rspamd_words_t *words;
        struct fuzzy_cmd_io *io;
        unsigned int additional_length;
        unsigned char *additional_data;
@@ -1970,10 +1970,10 @@ fuzzy_cmd_from_text_part(struct rspamd_task *task,
                        rspamd_cryptobox_hash_init(&st, rule->hash_key->str, rule->hash_key->len);
                        words = fuzzy_preprocess_words(part, task->task_pool);
 
-                       for (i = 0; i < words->len; i++) {
-                               word = &g_array_index(words, rspamd_stat_token_t, i);
+                       for (i = 0; i < kv_size(*words); i++) {
+                               word = &kv_A(*words, i);
 
-                               if (!((word->flags & RSPAMD_STAT_TOKEN_FLAG_SKIPPED) || word->stemmed.len == 0)) {
+                               if (!((word->flags & RSPAMD_WORD_FLAG_SKIPPED) || word->stemmed.len == 0)) {
                                        rspamd_cryptobox_hash_update(&st, word->stemmed.begin,
                                                                                                 word->stemmed.len);
                                }
@@ -2684,7 +2684,7 @@ fuzzy_insert_metric_results(struct rspamd_task *task, struct fuzzy_rule *rule,
        if (task->message) {
                PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, tp)
                {
-                       if (!IS_TEXT_PART_EMPTY(tp) && tp->utf_words != NULL && tp->utf_words->len > 0) {
+                       if (!IS_TEXT_PART_EMPTY(tp) && kv_size(tp->utf_words) > 0) {
                                seen_text_part = TRUE;
 
                                if (tp->utf_stripped_text.magic == UTEXT_MAGIC) {
index d1a10de84a40c603d04c6cbf0bbb49cd9ab4844c..5b88f4b2d0986f5eecf38a233ddabbe8ab74d82d 100644 (file)
@@ -17,6 +17,7 @@
 #include "rspamd.h"
 #include "shingles.h"
 #include "ottery.h"
+#include "libserver/word.h"
 #include <math.h>
 
 static const char *
@@ -52,63 +53,76 @@ generate_random_string(char *begin, size_t len)
        }
 }
 
-static GArray *
+static rspamd_words_t *
 generate_fuzzy_words(gsize cnt, gsize max_len)
 {
-       GArray *res;
+       rspamd_words_t *res;
        gsize i, wlen;
-       rspamd_ftok_t w;
+       rspamd_word_t word;
        char *t;
 
-       res = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), cnt);
+       res = g_malloc(sizeof(*res));
+       kv_init(*res);
 
        for (i = 0; i < cnt; i++) {
                wlen = ottery_rand_range(max_len) + 1;
                /* wlen = max_len; */
 
-               w.len = wlen;
                t = g_malloc(wlen);
                generate_random_string(t, wlen);
-               w.begin = t;
-               g_array_append_val(res, w);
+
+               memset(&word, 0, sizeof(word));
+               word.stemmed.begin = t;
+               word.stemmed.len = wlen;
+               word.original.begin = t;
+               word.original.len = wlen;
+               word.flags = 0; /* No flags set, so it won't be skipped */
+
+               kv_push(rspamd_word_t, *res, word);
        }
 
        return res;
 }
 
 static void
-permute_vector(GArray *in, double prob)
+permute_vector(rspamd_words_t *in, double prob)
 {
        gsize i, total = 0;
-       rspamd_ftok_t *w;
+       rspamd_word_t *w;
 
-       for (i = 0; i < in->len; i++) {
+       for (i = 0; i < kv_size(*in); i++) {
                if (ottery_rand_unsigned() <= G_MAXUINT * prob) {
-                       w = &g_array_index(in, rspamd_ftok_t, i);
-                       generate_random_string((char *) w->begin, w->len);
+                       w = &kv_A(*in, i);
+                       generate_random_string((char *) w->stemmed.begin, w->stemmed.len);
+                       /* Also update original since they point to same memory */
+                       w->original.begin = w->stemmed.begin;
+                       w->original.len = w->stemmed.len;
                        total++;
                }
        }
-       msg_debug("generated %z permutations of %ud words", total, in->len);
+       msg_debug("generated %z permutations of %ud words", total, (unsigned int) kv_size(*in));
 }
 
 static void
-free_fuzzy_words(GArray *ar)
+free_fuzzy_words(rspamd_words_t *ar)
 {
        gsize i;
-       rspamd_ftok_t *w;
+       rspamd_word_t *w;
 
-       for (i = 0; i < ar->len; i++) {
-               w = &g_array_index(ar, rspamd_ftok_t, i);
-               g_free((gpointer) w->begin);
+       for (i = 0; i < kv_size(*ar); i++) {
+               w = &kv_A(*ar, i);
+               g_free((gpointer) w->stemmed.begin);
        }
+
+       kv_destroy(*ar);
+       g_free(ar);
 }
 
 static void
 test_case(gsize cnt, gsize max_len, double perm_factor,
                  enum rspamd_shingle_alg alg)
 {
-       GArray *input;
+       rspamd_words_t *input;
        struct rspamd_shingle *sgl, *sgl_permuted;
        double res;
        unsigned char key[16];
@@ -281,51 +295,59 @@ void rspamd_shingles_test_func(void)
        enum rspamd_shingle_alg alg = RSPAMD_SHINGLES_OLD;
        struct rspamd_shingle *sgl;
        unsigned char key[16];
-       GArray *input;
-       rspamd_ftok_t tok;
+       rspamd_words_t input;
+       rspamd_word_t word;
        int i;
 
        memset(key, 0, sizeof(key));
-       input = g_array_sized_new(FALSE, FALSE, sizeof(rspamd_ftok_t), 5);
+       kv_init(input);
 
        for (i = 0; i < 5; i++) {
                char *b = g_alloca(8);
                memset(b, 0, 8);
                memcpy(b + 1, "test", 4);
                b[0] = 'a' + i;
-               tok.begin = b;
-               tok.len = 5 + ((i + 1) % 4);
-               g_array_append_val(input, tok);
+
+               memset(&word, 0, sizeof(word));
+               word.stemmed.begin = b;
+               word.stemmed.len = 5 + ((i + 1) % 4);
+               word.original.begin = b;
+               word.original.len = word.stemmed.len;
+               word.flags = 0; /* No flags set, so it won't be skipped */
+
+               kv_push(rspamd_word_t, input, word);
        }
 
-       sgl = rspamd_shingles_from_text(input, key, NULL,
+       sgl = rspamd_shingles_from_text(&input, key, NULL,
                                                                        rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_OLD);
        for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
                g_assert(sgl->hashes[i] == expected_old[i]);
        }
        g_free(sgl);
 
-       sgl = rspamd_shingles_from_text(input, key, NULL,
+       sgl = rspamd_shingles_from_text(&input, key, NULL,
                                                                        rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_XXHASH);
        for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
                g_assert(sgl->hashes[i] == expected_xxhash[i]);
        }
        g_free(sgl);
 
-       sgl = rspamd_shingles_from_text(input, key, NULL,
+       sgl = rspamd_shingles_from_text(&input, key, NULL,
                                                                        rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_MUMHASH);
        for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
                g_assert(sgl->hashes[i] == expected_mumhash[i]);
        }
        g_free(sgl);
 
-       sgl = rspamd_shingles_from_text(input, key, NULL,
+       sgl = rspamd_shingles_from_text(&input, key, NULL,
                                                                        rspamd_shingles_default_filter, NULL, RSPAMD_SHINGLES_FAST);
        for (i = 0; i < RSPAMD_SHINGLE_SIZE; i++) {
                g_assert(sgl->hashes[i] == expected_fasthash[i]);
        }
        g_free(sgl);
 
+       kv_destroy(input);
+
        for (alg = RSPAMD_SHINGLES_OLD; alg <= RSPAMD_SHINGLES_FAST; alg++) {
                test_case(200, 10, 0.1, alg);
                test_case(500, 20, 0.01, alg);