From: Vsevolod Stakhov
Date: Fri, 13 Feb 2026 14:09:59 +0000 (+0000)
Subject: [Fix] Fix custom tokenizer UAF and improve CJK fuzzy detection
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=75f58156f64cb647b4aac49d2f8e6ab87b92d590;p=thirdparty%2Frspamd.git

[Fix] Fix custom tokenizer UAF and improve CJK fuzzy detection

Fix use-after-free in custom tokenizer integration: when tokens were
shallow-copied from a custom tokenizer result, cleanup_result would free the
per-token normalized/stemmed/unicode strings while the copies still
referenced them. Now copy these strings into the mempool before cleanup.

Improve fuzzy hash generation for CJK (Japanese/Chinese/Korean) text:

- Lower the shingle word count threshold by 3x for CJK languages (64→21,
  floor 32→12), since CJK morphemes carry higher semantic density
- Boost text_multiplier by 3x for CJK to compensate for 3-byte UTF-8
  characters, allowing short CJK emails to pass the min_bytes gate

Previously, short Japanese phishing emails (~50 chars, ~20 words) would
generate zero fuzzy hashes, failing both the word count gate for shingles
and the byte length gate for the direct hash.
---

diff --git a/lualib/lua_fuzzy.lua b/lualib/lua_fuzzy.lua
index e953ee1467..be3610626f 100644
--- a/lualib/lua_fuzzy.lua
+++ b/lualib/lua_fuzzy.lua
@@ -292,6 +292,12 @@ exports.process_rule = function(rule)
   return #rules
 end
 
+-- CJK languages use multi-byte characters (3 bytes per char in UTF-8) and
+-- carry more semantic content per token than Latin languages.
+local function is_cjk_language(lang)
+  return lang and (lang == 'ja' or lang == 'zh' or lang == 'ko')
+end
+
 local function check_length(task, part, rule)
   local bytes = part:get_length()
   local length_ok = bytes > 0
@@ -317,7 +323,18 @@ local function check_length(task, part, rule)
     end
 
     if rule.text_multiplier then
-      adjusted_bytes = bytes * rule.text_multiplier
+      local multiplier = rule.text_multiplier
+
+      -- CJK characters are 3 bytes in UTF-8, so the same semantic content
+      -- takes ~3x more bytes than Latin text; boost the multiplier to compensate
+      local lang = part:get_text():get_language()
+      if is_cjk_language(lang) then
+        multiplier = multiplier * 3.0
+        lua_util.debugm(N, task, 'CJK language %s: boosted text_multiplier to %s',
+          lang, multiplier)
+      end
+
+      adjusted_bytes = bytes * multiplier
     end
   end
 
@@ -353,12 +370,25 @@ local function check_text_part(task, part, rule, text)
   if rule.text_shingles then
     -- Check number of words
     local min_words = rule.min_length or 0
-    if min_words < 32 then
-      min_words = 32 -- Minimum for shingles
+    local min_floor = 32
+
+    -- CJK morphemes carry higher semantic density per token, so fewer words
+    -- are needed for meaningful shingle generation (3-word window still works
+    -- well with as few as 12 tokens producing 10 windows)
+    local lang = text:get_language()
+    if is_cjk_language(lang) then
+      min_words = math.floor(min_words / 3)
+      min_floor = 12
+      lua_util.debugm(N, task, 'CJK language %s: adjusted min_words to %s (floor %s)',
+        lang, min_words, min_floor)
+    end
+
+    if min_words < min_floor then
+      min_words = min_floor
     end
     if wcnt < min_words then
       lua_util.debugm(N, task, 'text has less than %s words: %s; disable shingles',
-        rule.min_length, wcnt)
+        min_words, wcnt)
       allow_shingles = false
     else
       lua_util.debugm(N, task, 'allow shingles in text %s, %s words',
diff --git a/src/libstat/tokenizers/tokenizer_manager.c b/src/libstat/tokenizers/tokenizer_manager.c
index 8e95136453..35c0665b44 100644
--- a/src/libstat/tokenizers/tokenizer_manager.c
+++ b/src/libstat/tokenizers/tokenizer_manager.c
@@ -376,6 +376,40 @@ rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr,
     return best_tok;
 }
 
+/*
+ * Copy per-token strings (normalized, stemmed, unicode) from custom tokenizer
+ * allocations into the mempool so that cleanup_result can safely free the originals.
+ * Tokens from start_idx to the end of the words kvec are processed.
+ */
+static void
+rspamd_custom_tokens_to_mempool(rspamd_words_t *words,
+                                gsize start_idx,
+                                rspamd_mempool_t *pool)
+{
+    for (gsize i = start_idx; i < kv_size(*words); i++) {
+        rspamd_word_t *w = &kv_A(*words, i);
+
+        if (w->normalized.begin && w->normalized.len > 0) {
+            char *copy = rspamd_mempool_alloc(pool, w->normalized.len);
+            memcpy(copy, w->normalized.begin, w->normalized.len);
+            w->normalized.begin = copy;
+        }
+
+        if (w->stemmed.begin && w->stemmed.len > 0) {
+            char *copy = rspamd_mempool_alloc(pool, w->stemmed.len);
+            memcpy(copy, w->stemmed.begin, w->stemmed.len);
+            w->stemmed.begin = copy;
+        }
+
+        if (w->unicode.begin && w->unicode.len > 0) {
+            uint32_t *copy = rspamd_mempool_alloc(pool,
+                                                  w->unicode.len * sizeof(uint32_t));
+            memcpy(copy, w->unicode.begin, w->unicode.len * sizeof(uint32_t));
+            w->unicode.begin = copy;
+        }
+    }
+}
+
 /* Helper function to tokenize with a custom tokenizer handling exceptions */
 rspamd_tokenizer_result_t *
 rspamd_custom_tokenizer_tokenize_with_exceptions(
@@ -403,12 +437,17 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
         ret = tokenizer->api->tokenize(text, len, &result);
 
         if (ret == 0 && result.a) {
+            gsize start_idx = kv_size(*words);
+
             /* Copy tokens from result to output */
             for (i = 0; i < kv_size(result); i++) {
                 rspamd_word_t tok = kv_A(result, i);
                 kv_push(rspamd_word_t, *words, tok);
             }
 
+            /* Copy per-token strings to mempool before cleanup frees them */
+            rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
             /* Use tokenizer's cleanup function */
             if (tokenizer->api->cleanup_result) {
                 tokenizer->api->cleanup_result(&result);
@@ -429,6 +468,8 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
             ret = tokenizer->api->tokenize(text + pos, segment_len, &result);
 
             if (ret == 0 && result.a) {
+                gsize start_idx = kv_size(*words);
+
                 /* Copy tokens from result, adjusting positions for segment offset */
                 for (i = 0; i < kv_size(result); i++) {
                     rspamd_word_t tok = kv_A(result, i);
@@ -444,6 +485,9 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
                 }
             }
 
+            /* Copy per-token strings to mempool before cleanup frees them */
+            rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
             /* Use tokenizer's cleanup function */
             if (tokenizer->api->cleanup_result) {
                 tokenizer->api->cleanup_result(&result);
@@ -477,6 +521,8 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
         ret = tokenizer->api->tokenize(text + pos, len - pos, &result);
 
         if (ret == 0 && result.a) {
+            gsize start_idx = kv_size(*words);
+
             /* Copy tokens from result, adjusting positions for segment offset */
             for (i = 0; i < kv_size(result); i++) {
                 rspamd_word_t tok = kv_A(result, i);
@@ -489,6 +535,9 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
                 }
             }
 
+            /* Copy per-token strings to mempool before cleanup frees them */
+            rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
             /* Use tokenizer's cleanup function */
             if (tokenizer->api->cleanup_result) {
                 tokenizer->api->cleanup_result(&result);
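
Note (not part of the patch): the following standalone sketch illustrates the
lifetime bug fixed in tokenizer_manager.c above. It is not rspamd code; the
types and helpers (struct tok, fake_tokenize, fake_cleanup) are hypothetical
stand-ins for rspamd_word_t and the custom tokenizer's tokenize/cleanup_result
callbacks. A shallow struct copy keeps pointing into memory the tokenizer is
about to free; duplicating the string into caller-owned memory (the mempool in
rspamd, plain malloc here) before cleanup removes the dangling pointer.

/* Illustrative sketch only, not rspamd code. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct tok {                 /* stands in for the string fields of rspamd_word_t */
    const char *normalized;
    size_t len;
};

/* "custom tokenizer": allocates a per-token string it expects to free itself */
static void fake_tokenize(struct tok *out)
{
    char *s = malloc(5);
    memcpy(s, "word", 5);
    out->normalized = s;
    out->len = 4;
}

/* "cleanup_result": the tokenizer frees its own allocations */
static void fake_cleanup(struct tok *t)
{
    free((void *) t->normalized);
    t->normalized = NULL;
}

int main(void)
{
    struct tok result, copy;

    fake_tokenize(&result);

    /* Buggy pattern: a shallow struct copy keeps pointing into the
     * tokenizer's buffer, which fake_cleanup() is about to free. */
    copy = result;

    /* Fixed pattern: duplicate the string into caller-owned memory first
     * (rspamd uses rspamd_mempool_alloc + memcpy; plain malloc here). */
    char *owned = malloc(copy.len);
    memcpy(owned, result.normalized, copy.len);
    copy.normalized = owned;

    fake_cleanup(&result);

    /* Safe: the copy references caller-owned memory, not the freed buffer. */
    printf("%.*s\n", (int) copy.len, copy.normalized);

    free(owned);
    return 0;
}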
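
Note (not part of the patch): a worked example of the two gates adjusted in
lua_fuzzy.lua above. The rule settings (min_length = 64, text_multiplier = 4.0,
min_bytes = 1000) and the message size (~50 characters, ~20 tokens) are
assumptions chosen for illustration, not rspamd defaults, and the comparison is
simplified to "adjusted bytes must reach min_bytes".

/* Worked example of the CJK gate arithmetic; all rule values are assumed. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    /* assumed short Japanese mail: ~50 characters, ~20 tokens */
    double bytes = 50 * 3.0; /* CJK characters take 3 bytes each in UTF-8 */
    int words = 20;

    /* assumed rule settings, for illustration only */
    double text_multiplier = 4.0;
    double min_bytes = 1000.0;
    int min_length = 64;

    /* byte length gate for the direct hash */
    double adjusted_old = bytes * text_multiplier;       /* 600  -> fails  */
    double adjusted_new = bytes * text_multiplier * 3.0; /* 1800 -> passes */
    printf("byte gate: old %.0f vs %.0f (%s), new %.0f vs %.0f (%s)\n",
           adjusted_old, min_bytes, adjusted_old >= min_bytes ? "pass" : "fail",
           adjusted_new, min_bytes, adjusted_new >= min_bytes ? "pass" : "fail");

    /* word count gate for shingles: 64 -> 21 with the floor lowered 32 -> 12 */
    int threshold_old = min_length < 32 ? 32 : min_length;  /* 64 */
    int threshold_new = (int) floor(min_length / 3.0);      /* 21 */
    if (threshold_new < 12) {
        threshold_new = 12;
    }
    printf("word gate: old %d vs %d (%s), new %d vs %d (%s)\n",
           words, threshold_old, words >= threshold_old ? "pass" : "fail",
           words, threshold_new, words >= threshold_new ? "pass" : "fail");

    /* With ~20 tokens the shingle gate is still just short of the new 21-word
     * threshold, but the boosted multiplier now lets the direct hash through,
     * so the mail no longer produces zero fuzzy hashes. */
    return 0;
}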