--- Detect UTF-16 text in a string extracted from a PDF and convert it to UTF-8.
--
-- Detection runs in two stages:
--   1. An even-length string starting with a UTF-16 byte order mark
--      (FE FF => big endian, FF FE => little endian) is taken as UTF-16 and
--      the BOM is stripped before conversion.
--   2. Otherwise NUL bytes are counted at odd and even (1-based) offsets.
--      ASCII-range text in UTF-16BE has its NULs at odd offsets, UTF-16LE at
--      even offsets. A ratio above 0.8 on one side and below 0.2 on the
--      other selects the charset.
--
-- Known limitation: UTF-16 text without a BOM whose high bytes are mostly
-- non-zero (e.g. non-Latin scripts) is not recognised by stage 2.
--
-- @param s string to inspect (may be nil)
-- @return the converted text as a Lua string, or `s` unchanged when it is
--         nil, shorter than 4 bytes, not recognised as UTF-16, or when the
--         conversion is unavailable or fails
local function sanitize_pdf_text(s)
  if not s or #s < 4 then
    return s
  end

  local len = #s
  local charset
  local payload = s

  -- Stage 1: explicit BOM. Only trusted for even lengths, as UTF-16 code
  -- units are two bytes wide. The BOM is dropped so it does not show up as
  -- U+FEFF in the converted output.
  if len % 2 == 0 then
    local b1, b2 = string.byte(s, 1, 2)
    if b1 == 0xFE and b2 == 0xFF then
      charset = 'UTF-16BE'
    elseif b1 == 0xFF and b2 == 0xFE then
      charset = 'UTF-16LE'
    end
    if charset then
      payload = string.sub(s, 3)
    end
  end

  -- Stage 2: NUL distribution heuristic over the whole string
  if not charset then
    local nulls_odd = 0
    local nulls_even = 0

    for i = 1, len do
      if string.byte(s, i) == 0 then
        if i % 2 == 1 then
          nulls_odd = nulls_odd + 1
        else
          nulls_even = nulls_even + 1
        end
      end
    end

    -- len >= 4 here, so neither divisor can be zero
    local ratio_odd = nulls_odd / math.ceil(len / 2)
    local ratio_even = nulls_even / math.floor(len / 2)

    if ratio_odd > 0.8 and ratio_even < 0.2 then
      charset = 'UTF-16BE'
    elseif ratio_even > 0.8 and ratio_odd < 0.2 then
      charset = 'UTF-16LE'
    end
  end

  -- to_utf8 may be missing (hence the guard); in that case the input is
  -- passed through untouched
  if charset and rspamd_util.to_utf8 then
    local conv = rspamd_util.to_utf8(payload, charset)
    if conv then
      -- NOTE(review): to_utf8 is documented as returning a text object;
      -- tostring() keeps the return type identical to the pass-through
      -- path (a plain Lua string) so callers can concatenate it safely.
      return tostring(conv)
    end
  end

  return s
end
res end diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index b6f1e74903..c3d0cb9b08 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -44,6 +44,7 @@ #include "unicode/uspoof.h" #include "unicode/uscript.h" +#include #include "rspamd_simdutf.h" /*** @@ -275,6 +276,15 @@ LUA_FUNCTION_DEF(util, normalize_utf8); */ LUA_FUNCTION_DEF(util, transliterate); +/*** + * @function util.to_utf8(str, charset) + * Converts a string from a specific charset to UTF-8 + * @param {string/text} str input string + * @param {string} charset input charset (e.g. "utf-16le", "utf-16be") + * @return {text} utf8 string or nil on error + */ +LUA_FUNCTION_DEF(util, to_utf8); + /*** * @function util.strequal_caseless(str1, str2) * Compares two strings regardless of their case using ascii comparison. @@ -732,6 +742,7 @@ static const struct luaL_reg utillib_f[] = { LUA_INTERFACE_DEF(util, lower_utf8), LUA_INTERFACE_DEF(util, normalize_utf8), LUA_INTERFACE_DEF(util, transliterate), + LUA_INTERFACE_DEF(util, to_utf8), LUA_INTERFACE_DEF(util, strequal_caseless), LUA_INTERFACE_DEF(util, strequal_caseless_utf8), LUA_INTERFACE_DEF(util, get_ticks), @@ -1709,6 +1720,48 @@ lua_util_transliterate(lua_State *L) return 1; } +static int +lua_util_to_utf8(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_lua_text *t; + const char *charset; + char *dest; + int32_t dest_len, dest_cap; + UErrorCode err = U_ZERO_ERROR; + + t = lua_check_text_or_string(L, 1); + charset = luaL_checkstring(L, 2); + + if (!t || !charset) { + return luaL_error(L, "invalid arguments"); + } + + dest_cap = t->len * 1.5 + 16; + dest = g_malloc(dest_cap); + + dest_len = ucnv_convert("UTF-8", charset, dest, dest_cap, t->start, t->len, &err); + + if (err == U_BUFFER_OVERFLOW_ERROR) { + g_free(dest); + err = U_ZERO_ERROR; + dest_cap = dest_len + 1; + dest = g_malloc(dest_cap); + dest_len = ucnv_convert("UTF-8", charset, dest, dest_cap, t->start, t->len, &err); + } + + if (U_FAILURE(err)) { + g_free(dest); + lua_pushnil(L); + 
return 1; + } + + struct rspamd_lua_text *out = lua_new_text(L, dest, dest_len, FALSE); + out->flags |= RSPAMD_TEXT_FLAG_OWN; + + return 1; +} + static int lua_util_strequal_caseless(lua_State *L) {