return #rules
end
+-- CJK languages use multi-byte characters (typically 3 bytes per character in
+-- UTF-8) and carry more semantic content per token than Latin-script languages.
+local function is_cjk_language(lang)
+ return lang and (lang == 'ja' or lang == 'zh' or lang == 'ko')
+end
+
local function check_length(task, part, rule)
local bytes = part:get_length()
local length_ok = bytes > 0
end
if rule.text_multiplier then
- adjusted_bytes = bytes * rule.text_multiplier
+ local multiplier = rule.text_multiplier
+
+ -- CJK characters are typically 3 bytes in UTF-8, so the same semantic content
+ -- takes ~3x more bytes than Latin text; boost the multiplier to compensate
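+ -- e.g. a configured text_multiplier of 2.0 would become 6.0 for Japanese text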
+ local lang = part:get_text():get_language()
+ if is_cjk_language(lang) then
+ multiplier = multiplier * 3.0
+ lua_util.debugm(N, task, 'CJK language %s: boosted text_multiplier to %s',
+ lang, multiplier)
+ end
+
+ adjusted_bytes = bytes * multiplier
end
end
if rule.text_shingles then
-- Check number of words
local min_words = rule.min_length or 0
- if min_words < 32 then
- min_words = 32 -- Minimum for shingles
+ local min_floor = 32 -- minimum word count for shingles
+
+ -- CJK morphemes carry higher semantic density per token, so fewer words are
+ -- needed for meaningful shingle generation (a 3-word window still works well
+ -- with as few as 12 tokens, which yields 10 windows)
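+ -- e.g. rule.min_length = 60 drops to 20 required words for CJK, never below the floor of 12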
+ local lang = text:get_language()
+ if is_cjk_language(lang) then
+ min_words = math.floor(min_words / 3)
+ min_floor = 12
+ lua_util.debugm(N, task, 'CJK language %s: adjusted min_words to %s (floor %s)',
+ lang, min_words, min_floor)
+ end
+
+ if min_words < min_floor then
+ min_words = min_floor
end
if wcnt < min_words then
lua_util.debugm(N, task, 'text has less than %s words: %s; disable shingles',
- rule.min_length, wcnt)
+ min_words, wcnt)
allow_shingles = false
else
lua_util.debugm(N, task, 'allow shingles in text %s, %s words',
return best_tok;
}
+/*
+ * Copy per-token strings (normalized, stemmed, unicode) from custom tokenizer
+ * allocations into the mempool so that cleanup_result can safely free the originals.
+ * Tokens from start_idx to the end of the words kvec are processed.
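+ * The original field is left untouched; it is assumed to reference the caller's input text.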
+ */
+static void
+rspamd_custom_tokens_to_mempool(rspamd_words_t *words,
+ gsize start_idx,
+ rspamd_mempool_t *pool)
+{
+ for (gsize i = start_idx; i < kv_size(*words); i++) {
+ rspamd_word_t *w = &kv_A(*words, i);
+
+ if (w->normalized.begin && w->normalized.len > 0) {
+ char *copy = rspamd_mempool_alloc(pool, w->normalized.len);
+ memcpy(copy, w->normalized.begin, w->normalized.len);
+ w->normalized.begin = copy;
+ }
+
+ if (w->stemmed.begin && w->stemmed.len > 0) {
+ char *copy = rspamd_mempool_alloc(pool, w->stemmed.len);
+ memcpy(copy, w->stemmed.begin, w->stemmed.len);
+ w->stemmed.begin = copy;
+ }
+
+ if (w->unicode.begin && w->unicode.len > 0) {
+ uint32_t *copy = rspamd_mempool_alloc(pool,
+ w->unicode.len * sizeof(uint32_t));
+ memcpy(copy, w->unicode.begin, w->unicode.len * sizeof(uint32_t));
+ w->unicode.begin = copy;
+ }
+ }
+}
+
/* Helper function to tokenize with a custom tokenizer handling exceptions */
rspamd_tokenizer_result_t *
rspamd_custom_tokenizer_tokenize_with_exceptions(
ret = tokenizer->api->tokenize(text, len, &result);
if (ret == 0 && result.a) {
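+ /* Remember where this batch of tokens starts in the output array */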
+ gsize start_idx = kv_size(*words);
+
/* Copy tokens from result to output */
for (i = 0; i < kv_size(result); i++) {
rspamd_word_t tok = kv_A(result, i);
kv_push(rspamd_word_t, *words, tok);
}
+ /* Copy per-token strings to mempool before cleanup frees them */
+ rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
/* Use tokenizer's cleanup function */
if (tokenizer->api->cleanup_result) {
tokenizer->api->cleanup_result(&result);
ret = tokenizer->api->tokenize(text + pos, segment_len, &result);
if (ret == 0 && result.a) {
+ gsize start_idx = kv_size(*words);
+
/* Copy tokens from result, adjusting positions for segment offset */
for (i = 0; i < kv_size(result); i++) {
rspamd_word_t tok = kv_A(result, i);
}
}
+ /* Copy per-token strings to mempool before cleanup frees them */
+ rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
/* Use tokenizer's cleanup function */
if (tokenizer->api->cleanup_result) {
tokenizer->api->cleanup_result(&result);
ret = tokenizer->api->tokenize(text + pos, len - pos, &result);
if (ret == 0 && result.a) {
+ gsize start_idx = kv_size(*words);
+
/* Copy tokens from result, adjusting positions for segment offset */
for (i = 0; i < kv_size(result); i++) {
rspamd_word_t tok = kv_A(result, i);
}
}
+ /* Copy per-token strings to mempool before cleanup frees them */
+ rspamd_custom_tokens_to_mempool(words, start_idx, pool);
+
/* Use tokenizer's cleanup function */
if (tokenizer->api->cleanup_result) {
tokenizer->api->cleanup_result(&result);