From bda797c37ceea3dc6bf6e5b1a0c717dfd7431b35 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 19 Jan 2026 09:29:44 +0000
Subject: [PATCH] [Feature] Add language-based model/URL selection for LLM embeddings

Support language-specific embedding models via the language_models config:
- Shorthand:   language_models = { ru = "model-name" }
- Full config: language_models = { ru = { model, url, api_key } }

Use get_displayed_text_part() for language detection and include the
detected language in the cache key for proper separation.
---
 lualib/plugins/neural/providers/llm.lua | 55 ++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/lualib/plugins/neural/providers/llm.lua b/lualib/plugins/neural/providers/llm.lua
index 60dea419f5..9709c231ed 100644
--- a/lualib/plugins/neural/providers/llm.lua
+++ b/lualib/plugins/neural/providers/llm.lua
@@ -10,6 +10,7 @@
 local ucl = require "ucl"
 local neural_common = require "plugins/neural"
 local lua_cache = require "lua_cache"
 local llm_common = require "llm_common"
+local lua_mime = require "lua_mime"
 
 local N = "neural.llm"
@@ -17,7 +18,19 @@
 local function select_text(task, opts)
   return llm_common.build_llm_input(task, opts)
 end
-local function compose_llm_settings(pcfg)
+-- Detect primary language from the displayed text part
+local function detect_language(task)
+  local part = lua_mime.get_displayed_text_part(task)
+  if part then
+    local lang = part:get_language()
+    if lang and lang ~= '' then
+      return lang
+    end
+  end
+  return nil
+end
+
+local function compose_llm_settings(pcfg, language)
   local gpt_settings = rspamd_config:get_all_opt('gpt') or {}
   -- Provider identity is pcfg.type=='llm'; backend type is specified via one of these keys
   local llm_type = pcfg.llm_type or pcfg.api or pcfg.backend or gpt_settings.type or 'openai'
@@ -32,6 +45,31 @@
   local url = pcfg.url
   local api_key = pcfg.api_key or gpt_settings.api_key
 
+  -- Language-specific model/URL selection
+  -- Config format: language_models = { en = { model = "...", url = "..." }, ru = { model = "..." }, ... }
+  -- Or shorthand: language_models = { en = "model-name", ru = "model-name", ... }
+  local language_models = pcfg.language_models
+  if language and language_models then
+    local lang_cfg = language_models[language]
+    if lang_cfg then
+      if type(lang_cfg) == 'string' then
+        -- Shorthand: just model name
+        model = lang_cfg
+      elseif type(lang_cfg) == 'table' then
+        -- Full config: { model = "...", url = "...", api_key = "..." }
+        if lang_cfg.model then
+          model = lang_cfg.model
+        end
+        if lang_cfg.url then
+          url = lang_cfg.url
+        end
+        if lang_cfg.api_key then
+          api_key = lang_cfg.api_key
+        end
+      end
+    end
+  end
+
   if not url then
     if llm_type == 'openai' then
       url = 'https://api.openai.com/v1/embeddings'
@@ -85,7 +123,10 @@ end
 neural_common.register_provider('llm', {
   collect_async = function(task, ctx, cont)
     local pcfg = ctx.config or {}
-    local llm = compose_llm_settings(pcfg)
+
+    -- Detect language from displayed text part for model/URL selection
+    local language = detect_language(task)
+    local llm = compose_llm_settings(pcfg, language)
 
     if not llm.model then
       rspamd_logger.debugm(N, task, 'llm provider missing model; skip')
@@ -117,8 +158,8 @@
     end
 
     local input_key = normalize_cache_key_input(input_string)
-    rspamd_logger.debugm(N, task, 'llm embedding request: model=%s url=%s len=%s', tostring(llm.model), tostring(llm.url),
-      tostring(#input_key))
+    rspamd_logger.debugm(N, task, 'llm embedding request: model=%s url=%s lang=%s len=%s',
+      tostring(llm.model), tostring(llm.url), tostring(language or 'unknown'), tostring(#input_key))
 
     local body
     if llm.type == 'openai' then
@@ -141,7 +182,8 @@
     }, N)
 
     -- Use raw key and allow cache module to hash/shorten it per context
-    local key = string.format('%s:%s:%s', llm.type, llm.model or 'model', input_key)
+    -- Include language in cache key for proper separation
+    local key = string.format('%s:%s:%s:%s', llm.type, llm.model or 'model', language or 'unk', input_key)
 
     local function finish_with_vec(vec)
       if type(vec) == 'table' and #vec > 0 then
@@ -152,8 +194,9 @@
           weight = ctx.weight or 1.0,
           model = llm.model,
           provider = llm.type,
+          language = language,
         }
-        rspamd_logger.debugm(N, task, 'llm embedding result: dim=%s', #vec)
+        rspamd_logger.debugm(N, task, 'llm embedding result: dim=%s lang=%s', #vec, language or 'unknown')
         cont(vec, meta)
       else
         rspamd_logger.debugm(N, task, 'llm embedding result: empty')
-- 
2.47.3
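
Example configuration (a sketch: the language_models key and both value
forms follow the parsing logic in this patch, but the surrounding provider
block, its placement in the neural plugin configuration, and all model
names and the URL are illustrative assumptions):

  # hypothetical excerpt, e.g. local.d/neural.conf
  providers = [
    {
      type = "llm";
      model = "text-embedding-3-small";  # default model (example name)
      language_models = {
        # shorthand: model name only
        ru = "ru-embedding-model";
        # full form: per-language model, URL and API key overrides
        de = {
          model = "de-embedding-model";
          url = "https://embeddings.example.com/v1/embeddings";
          api_key = "xxx";
        };
      };
    }
  ]

With a configuration like this, a message whose displayed text part is
detected as Russian ('ru') would be embedded with ru-embedding-model,
German ('de') messages would go to the overridden URL, and all other
languages would fall back to the default model and URL; since the detected
language is part of the cache key, embeddings produced by different
per-language models are cached separately.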