From: Vsevolod Stakhov
Date: Fri, 13 Feb 2026 14:09:59 +0000 (+0000)
Subject: [Fix] Fix custom tokenizer UAF and improve CJK fuzzy detection
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=75f58156f64cb647b4aac49d2f8e6ab87b92d590;p=thirdparty%2Frspamd.git

[Fix] Fix custom tokenizer UAF and improve CJK fuzzy detection

Fix use-after-free in custom tokenizer integration: when tokens were
shallow-copied from a custom tokenizer result, cleanup_result would free the
per-token normalized/stemmed/unicode strings while the copies still
referenced them. Now copy these strings into the mempool before cleanup.

Improve fuzzy hash generation for CJK (Japanese/Chinese/Korean) text:

- Lower the shingle word count threshold by 3x for CJK languages (64→21,
  floor 32→12), since CJK morphemes carry higher semantic density
- Boost text_multiplier by 3x for CJK to compensate for 3-byte UTF-8
  characters, allowing short CJK emails to pass the min_bytes gate

Previously, short Japanese phishing emails (~50 chars, ~20 words) would
generate zero fuzzy hashes, failing both the word count gate for shingles
and the byte length gate for the direct hash.
---

diff --git a/lualib/lua_fuzzy.lua b/lualib/lua_fuzzy.lua
index e953ee1467..be3610626f 100644
--- a/lualib/lua_fuzzy.lua
+++ b/lualib/lua_fuzzy.lua
@@ -292,6 +292,12 @@ exports.process_rule = function(rule)
   return #rules
 end
 
+-- CJK languages use multi-byte characters (3 bytes per char in UTF-8) and
+-- carry more semantic content per token than Latin languages.
+local function is_cjk_language(lang)
+  return lang and (lang == 'ja' or lang == 'zh' or lang == 'ko')
+end
+
 local function check_length(task, part, rule)
   local bytes = part:get_length()
   local length_ok = bytes > 0
@@ -317,7 +323,18 @@ local function check_length(task, part, rule)
     end
 
     if rule.text_multiplier then
-      adjusted_bytes = bytes * rule.text_multiplier
+      local multiplier = rule.text_multiplier
+
+      -- CJK characters are 3 bytes in UTF-8, so the same semantic content
+      -- takes ~3x more bytes than Latin text; boost the multiplier to compensate
+      local lang = part:get_text():get_language()
+      if is_cjk_language(lang) then
+        multiplier = multiplier * 3.0
+        lua_util.debugm(N, task, 'CJK language %s: boosted text_multiplier to %s',
+          lang, multiplier)
+      end
+
+      adjusted_bytes = bytes * multiplier
     end
   end
 
@@ -353,12 +370,25 @@ local function check_text_part(task, part, rule, text)
   if rule.text_shingles then
     -- Check number of words
     local min_words = rule.min_length or 0
-    if min_words < 32 then
-      min_words = 32 -- Minimum for shingles
+    local min_floor = 32
+
+    -- CJK morphemes carry higher semantic density per token, so fewer words
+    -- are needed for meaningful shingle generation (3-word window still works
+    -- well with as few as 12 tokens producing 10 windows)
+    local lang = text:get_language()
+    if is_cjk_language(lang) then
+      min_words = math.floor(min_words / 3)
+      min_floor = 12
+      lua_util.debugm(N, task, 'CJK language %s: adjusted min_words to %s (floor %s)',
+        lang, min_words, min_floor)
+    end
+
+    if min_words < min_floor then
+      min_words = min_floor
     end
     if wcnt < min_words then
       lua_util.debugm(N, task, 'text has less than %s words: %s; disable shingles',
-        rule.min_length, wcnt)
+        min_words, wcnt)
       allow_shingles = false
     else
       lua_util.debugm(N, task, 'allow shingles in text %s, %s words',
diff --git a/src/libstat/tokenizers/tokenizer_manager.c b/src/libstat/tokenizers/tokenizer_manager.c
index 8e95136453..35c0665b44 100644
--- a/src/libstat/tokenizers/tokenizer_manager.c
+++ b/src/libstat/tokenizers/tokenizer_manager.c
@@ -376,6 +376,40 @@ rspamd_tokenizer_manager_detect(struct rspamd_tokenizer_manager *mgr,
     return best_tok;
 }
 
+/*
+ * Copy per-token strings (normalized, stemmed, unicode) from custom tokenizer
+ * allocations into the mempool so that cleanup_result can safely free the originals.
+ * Tokens from start_idx to the end of the words kvec are processed.
+ */
+static void
+rspamd_custom_tokens_to_mempool(rspamd_words_t *words,
+                                gsize start_idx,
+                                rspamd_mempool_t *pool)
+{
+    for (gsize i = start_idx; i < kv_size(*words); i++) {
+        rspamd_word_t *w = &kv_A(*words, i);
+
+        if (w->normalized.begin && w->normalized.len > 0) {
+            char *copy = rspamd_mempool_alloc(pool, w->normalized.len);
+            memcpy(copy, w->normalized.begin, w->normalized.len);
+            w->normalized.begin = copy;
+        }
+
+        if (w->stemmed.begin && w->stemmed.len > 0) {
+            char *copy = rspamd_mempool_alloc(pool, w->stemmed.len);
+            memcpy(copy, w->stemmed.begin, w->stemmed.len);
+            w->stemmed.begin = copy;
+        }
+
+        if (w->unicode.begin && w->unicode.len > 0) {
+            uint32_t *copy = rspamd_mempool_alloc(pool,
+                                                  w->unicode.len * sizeof(uint32_t));
+            memcpy(copy, w->unicode.begin, w->unicode.len * sizeof(uint32_t));
+            w->unicode.begin = copy;
+        }
+    }
+}
+
 /* Helper function to tokenize with a custom tokenizer handling exceptions */
 rspamd_tokenizer_result_t *
 rspamd_custom_tokenizer_tokenize_with_exceptions(
@@ -403,12 +437,17 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
         ret = tokenizer->api->tokenize(text, len, &result);
 
         if (ret == 0 && result.a) {
+            gsize start_idx = kv_size(*words);
+
             /* Copy tokens from result to output */
             for (i = 0; i < kv_size(result); i++) {
                 rspamd_word_t tok = kv_A(result, i);
                 kv_push(rspamd_word_t, *words, tok);
             }
 
+            /* Copy per-token strings to mempool before cleanup frees them */
+            rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
             /* Use tokenizer's cleanup function */
             if (tokenizer->api->cleanup_result) {
                 tokenizer->api->cleanup_result(&result);
@@ -429,6 +468,8 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
             ret = tokenizer->api->tokenize(text + pos, segment_len, &result);
 
             if (ret == 0 && result.a) {
+                gsize start_idx = kv_size(*words);
+
                 /* Copy tokens from result, adjusting positions for segment offset */
                 for (i = 0; i < kv_size(result); i++) {
                     rspamd_word_t tok = kv_A(result, i);
@@ -444,6 +485,9 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
                 }
             }
 
+            /* Copy per-token strings to mempool before cleanup frees them */
+            rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
             /* Use tokenizer's cleanup function */
             if (tokenizer->api->cleanup_result) {
                 tokenizer->api->cleanup_result(&result);
@@ -477,6 +521,8 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
         ret = tokenizer->api->tokenize(text + pos, len - pos, &result);
 
         if (ret == 0 && result.a) {
+            gsize start_idx = kv_size(*words);
+
             /* Copy tokens from result, adjusting positions for segment offset */
             for (i = 0; i < kv_size(result); i++) {
                 rspamd_word_t tok = kv_A(result, i);
@@ -489,6 +535,9 @@ rspamd_custom_tokenizer_tokenize_with_exceptions(
                 }
             }
 
+            /* Copy per-token strings to mempool before cleanup frees them */
+            rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
             /* Use tokenizer's cleanup function */
             if (tokenizer->api->cleanup_result) {
                 tokenizer->api->cleanup_result(&result);
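
Note (not part of the patch): the following standalone sketch illustrates the
lifetime bug fixed in tokenizer_manager.c above. It is not rspamd code; the
types and helpers (struct tok, fake_tokenize, fake_cleanup) are hypothetical
stand-ins for rspamd_word_t and the custom tokenizer's tokenize/cleanup_result
callbacks. A shallow struct copy keeps pointing into memory the tokenizer is
about to free; duplicating the string into caller-owned memory (the mempool in
rspamd, plain malloc here) before cleanup removes the dangling pointer.

/* Illustrative sketch only, not rspamd code. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct tok {                 /* stands in for the string fields of rspamd_word_t */
    const char *normalized;
    size_t len;
};

/* "custom tokenizer": allocates a per-token string it expects to free itself */
static void fake_tokenize(struct tok *out)
{
    char *s = malloc(5);
    memcpy(s, "word", 5);
    out->normalized = s;
    out->len = 4;
}

/* "cleanup_result": the tokenizer frees its own allocations */
static void fake_cleanup(struct tok *t)
{
    free((void *) t->normalized);
    t->normalized = NULL;
}

int main(void)
{
    struct tok result, copy;

    fake_tokenize(&result);

    /* Buggy pattern: a shallow struct copy keeps pointing into the
     * tokenizer's buffer, which fake_cleanup() is about to free. */
    copy = result;

    /* Fixed pattern: duplicate the string into caller-owned memory first
     * (rspamd uses rspamd_mempool_alloc + memcpy; plain malloc here). */
    char *owned = malloc(copy.len);
    memcpy(owned, result.normalized, copy.len);
    copy.normalized = owned;

    fake_cleanup(&result);

    /* Safe: the copy references caller-owned memory, not the freed buffer. */
    printf("%.*s\n", (int) copy.len, copy.normalized);

    free(owned);
    return 0;
}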
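
Note (not part of the patch): a worked example of the two gates adjusted in
lua_fuzzy.lua above. The rule settings (min_length = 64, text_multiplier = 4.0,
min_bytes = 1000) and the message size (~50 characters, ~20 tokens) are
assumptions chosen for illustration, not rspamd defaults, and the comparison is
simplified to "adjusted bytes must reach min_bytes".

/* Worked example of the CJK gate arithmetic; all rule values are assumed. */
#include <math.h>
#include <stdio.h>

int main(void)
{
    /* assumed short Japanese mail: ~50 characters, ~20 tokens */
    double bytes = 50 * 3.0; /* CJK characters take 3 bytes each in UTF-8 */
    int words = 20;

    /* assumed rule settings, for illustration only */
    double text_multiplier = 4.0;
    double min_bytes = 1000.0;
    int min_length = 64;

    /* byte length gate for the direct hash */
    double adjusted_old = bytes * text_multiplier;       /* 600  -> fails  */
    double adjusted_new = bytes * text_multiplier * 3.0; /* 1800 -> passes */
    printf("byte gate: old %.0f vs %.0f (%s), new %.0f vs %.0f (%s)\n",
           adjusted_old, min_bytes, adjusted_old >= min_bytes ? "pass" : "fail",
           adjusted_new, min_bytes, adjusted_new >= min_bytes ? "pass" : "fail");

    /* word count gate for shingles: 64 -> 21 with the floor lowered 32 -> 12 */
    int threshold_old = min_length < 32 ? 32 : min_length;  /* 64 */
    int threshold_new = (int) floor(min_length / 3.0);      /* 21 */
    if (threshold_new < 12) {
        threshold_new = 12;
    }
    printf("word gate: old %d vs %d (%s), new %d vs %d (%s)\n",
           words, threshold_old, words >= threshold_old ? "pass" : "fail",
           words, threshold_new, words >= threshold_new ? "pass" : "fail");

    /* With ~20 tokens the shingle gate is still just short of the new 21-word
     * threshold, but the boosted multiplier now lets the direct hash through,
     * so the mail no longer produces zero fuzzy hashes. */
    return 0;
}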