return #rules
end
+-- CJK languages use multi-byte characters (typically 3 bytes per character in
+-- UTF-8) and carry more semantic content per token than Latin-script languages.
+local function is_cjk_language(lang)
+ return lang and (lang == 'ja' or lang == 'zh' or lang == 'ko')
+end
+
local function check_length(task, part, rule)
local bytes = part:get_length()
local length_ok = bytes > 0
end
if rule.text_multiplier then
- adjusted_bytes = bytes * rule.text_multiplier
+ local multiplier = rule.text_multiplier
+
+ -- CJK characters are typically 3 bytes in UTF-8, so the same semantic content
+ -- takes ~3x more bytes than Latin text; boost the multiplier to compensate
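+ -- e.g. a configured text_multiplier of 2.0 would become 6.0 for Japanese text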
+ local lang = part:get_text():get_language()
+ if is_cjk_language(lang) then
+ multiplier = multiplier * 3.0
+ lua_util.debugm(N, task, 'CJK language %s: boosted text_multiplier to %s',
+ lang, multiplier)
+ end
+
+ adjusted_bytes = bytes * multiplier
end
end
if rule.text_shingles then
-- Check number of words
local min_words = rule.min_length or 0
- if min_words < 32 then
- min_words = 32 -- Minimum for shingles
+ local min_floor = 32 -- minimum word count for shingles
+
+ -- CJK morphemes carry higher semantic density per token, so fewer words are
+ -- needed for meaningful shingle generation (a 3-word window still works well
+ -- with as few as 12 tokens, which yields 10 windows)
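+ -- e.g. rule.min_length = 60 drops to 20 required words for CJK, never below the floor of 12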
+ local lang = text:get_language()
+ if is_cjk_language(lang) then
+ min_words = math.floor(min_words / 3)
+ min_floor = 12
+ lua_util.debugm(N, task, 'CJK language %s: adjusted min_words to %s (floor %s)',
+ lang, min_words, min_floor)
+ end
+
+ if min_words < min_floor then
+ min_words = min_floor
end
if wcnt < min_words then
lua_util.debugm(N, task, 'text has less than %s words: %s; disable shingles',
- rule.min_length, wcnt)
+ min_words, wcnt)
allow_shingles = false
else
lua_util.debugm(N, task, 'allow shingles in text %s, %s words',
return best_tok;
}
+/*
+ * Copy per-token strings (normalized, stemmed, unicode) from custom tokenizer
+ * allocations into the mempool so that cleanup_result can safely free the originals.
+ * Tokens from start_idx to the end of the words kvec are processed.
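+ * The original field is left untouched; it is assumed to reference the caller's input text.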
+ */
+static void
+rspamd_custom_tokens_to_mempool(rspamd_words_t *words,
+ gsize start_idx,
+ rspamd_mempool_t *pool)
+{
+ for (gsize i = start_idx; i < kv_size(*words); i++) {
+ rspamd_word_t *w = &kv_A(*words, i);
+
+ if (w->normalized.begin && w->normalized.len > 0) {
+ char *copy = rspamd_mempool_alloc(pool, w->normalized.len);
+ memcpy(copy, w->normalized.begin, w->normalized.len);
+ w->normalized.begin = copy;
+ }
+
+ if (w->stemmed.begin && w->stemmed.len > 0) {
+ char *copy = rspamd_mempool_alloc(pool, w->stemmed.len);
+ memcpy(copy, w->stemmed.begin, w->stemmed.len);
+ w->stemmed.begin = copy;
+ }
+
+ if (w->unicode.begin && w->unicode.len > 0) {
+ uint32_t *copy = rspamd_mempool_alloc(pool,
+ w->unicode.len * sizeof(uint32_t));
+ memcpy(copy, w->unicode.begin, w->unicode.len * sizeof(uint32_t));
+ w->unicode.begin = copy;
+ }
+ }
+}
+
/* Helper function to tokenize with a custom tokenizer handling exceptions */
rspamd_tokenizer_result_t *
rspamd_custom_tokenizer_tokenize_with_exceptions(
ret = tokenizer->api->tokenize(text, len, &result);
if (ret == 0 && result.a) {
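+ /* Remember where this batch of tokens starts in the output array */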
+ gsize start_idx = kv_size(*words);
+
/* Copy tokens from result to output */
for (i = 0; i < kv_size(result); i++) {
rspamd_word_t tok = kv_A(result, i);
kv_push(rspamd_word_t, *words, tok);
}
+ /* Copy per-token strings to mempool before cleanup frees them */
+ rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
/* Use tokenizer's cleanup function */
if (tokenizer->api->cleanup_result) {
tokenizer->api->cleanup_result(&result);
ret = tokenizer->api->tokenize(text + pos, segment_len, &result);
if (ret == 0 && result.a) {
+ gsize start_idx = kv_size(*words);
+
/* Copy tokens from result, adjusting positions for segment offset */
for (i = 0; i < kv_size(result); i++) {
rspamd_word_t tok = kv_A(result, i);
}
}
+ /* Copy per-token strings to mempool before cleanup frees them */
+ rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
/* Use tokenizer's cleanup function */
if (tokenizer->api->cleanup_result) {
tokenizer->api->cleanup_result(&result);
ret = tokenizer->api->tokenize(text + pos, len - pos, &result);
if (ret == 0 && result.a) {
+ gsize start_idx = kv_size(*words);
+
/* Copy tokens from result, adjusting positions for segment offset */
for (i = 0; i < kv_size(result); i++) {
rspamd_word_t tok = kv_A(result, i);
}
}
+ /* Copy per-token strings to mempool before cleanup frees them */
+ rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
/* Use tokenizer's cleanup function */
if (tokenizer->api->cleanup_result) {
tokenizer->api->cleanup_result(&result);