local C = lpeg.C
local gen = generic_grammar_elts()
+ --- Re-encode PDF text that looks like UTF-16 into UTF-8.
+ -- The byte order is guessed from where NUL bytes fall: in UTF-16 text made
+ -- mostly of code points below U+0100 the high byte of every unit is zero, so
+ -- NULs concentrated at odd (1-based) offsets suggest UTF-16BE and NULs
+ -- concentrated at even offsets suggest UTF-16LE.
+ -- @param s string to inspect; nil and strings shorter than 4 bytes are returned as is
+ -- @return string converted to UTF-8 when a UTF-16 layout is detected and the
+ --         conversion succeeds, otherwise `s` unchanged
+ local function sanitize_pdf_text(s)
+   if not s or #s < 4 then return s end
+
+   local nulls_odd = 0
+   local nulls_even = 0
+   local len = #s
+
+   for i = 1, len do
+     if string.byte(s, i) == 0 then
+       if i % 2 == 1 then
+         nulls_odd = nulls_odd + 1
+       else
+         nulls_even = nulls_even + 1
+       end
+     end
+   end
+
+   -- len >= 4 at this point, so neither divisor can be zero
+   local ratio_odd = nulls_odd / math.ceil(len / 2)
+   local ratio_even = nulls_even / math.floor(len / 2)
+   local charset
+
+   if ratio_odd > 0.8 and ratio_even < 0.2 then
+     charset = 'UTF-16BE'
+   elseif ratio_even > 0.8 and ratio_odd < 0.2 then
+     charset = 'UTF-16LE'
+   end
+
+   -- to_utf8 may be missing (the check below tolerates that); fall through to
+   -- returning the raw string in that case
+   if charset and rspamd_util.to_utf8 then
+     local conv = rspamd_util.to_utf8(s, charset)
+     if conv then
+       -- to_utf8 is documented as returning a text object, while every other
+       -- path here returns the input string; coerce so callers that concatenate
+       -- the result always receive a plain Lua string.
+       -- NOTE(review): relies on the text object implementing __tostring - confirm
+       return tostring(conv)
+     end
+   end
+
+   return s
+ end
+
local function text_op_handler(...)
local args = { ... }
local op = args[#args]
res = table.concat(tres)
end
+ res = sanitize_pdf_text(res)
+
if op == "'" or op == '"' then
return '\n' .. res
end
#include "unicode/uspoof.h"
#include "unicode/uscript.h"
+#include <unicode/ucnv.h>
#include "rspamd_simdutf.h"
/***
*/
LUA_FUNCTION_DEF(util, transliterate);
+/***
+ * @function util.to_utf8(str, charset)
+ * Converts a string or text from the given charset to UTF-8.
+ * Raises a Lua error if the arguments are invalid (`str` is neither a string
+ * nor a text, or `charset` is not a string).
+ * @param {string/text} str input data encoded in `charset`
+ * @param {string} charset name of the input charset (e.g. "utf-16le", "utf-16be")
+ * @return {text} text object holding the UTF-8 result (not a plain Lua string), or nil if the conversion fails
+ */
+LUA_FUNCTION_DEF(util, to_utf8);
+
/***
* @function util.strequal_caseless(str1, str2)
* Compares two strings regardless of their case using ascii comparison.
LUA_INTERFACE_DEF(util, lower_utf8),
LUA_INTERFACE_DEF(util, normalize_utf8),
LUA_INTERFACE_DEF(util, transliterate),
+ LUA_INTERFACE_DEF(util, to_utf8),
LUA_INTERFACE_DEF(util, strequal_caseless),
LUA_INTERFACE_DEF(util, strequal_caseless_utf8),
LUA_INTERFACE_DEF(util, get_ticks),
return 1;
}
+/*
+ * Lua binding for util.to_utf8(str, charset).
+ *
+ * Converts the bytes of argument 1 (string or text) from the charset named by
+ * argument 2 into UTF-8 using ICU's ucnv_convert. Pushes a text object that
+ * holds the converted bytes, or nil when the conversion fails or the input is
+ * too large for ICU's 32-bit length parameters. Raises a Lua error on invalid
+ * arguments. Always returns 1 (one value pushed) unless an error is raised.
+ */
+static int
+lua_util_to_utf8(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_text *t, *out;
+	const char *charset;
+	char *dest;
+	int32_t dest_len, dest_cap;
+	UErrorCode err = U_ZERO_ERROR;
+
+	t = lua_check_text_or_string(L, 1);
+	charset = luaL_checkstring(L, 2);
+
+	if (!t || !charset) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/*
+	 * ucnv_convert takes int32_t lengths: refuse inputs for which the initial
+	 * estimate (len * 1.5 + 16) would not fit, instead of overflowing.
+	 */
+	if (t->len > (G_MAXINT32 - 16) / 3 * 2) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* Same estimate as len * 1.5 + 16, kept in integer arithmetic */
+	dest_cap = (int32_t) (t->len + t->len / 2 + 16);
+	dest = g_malloc(dest_cap);
+
+	dest_len = ucnv_convert("UTF-8", charset, dest, dest_cap,
+							t->start, (int32_t) t->len, &err);
+
+	/*
+	 * On overflow ICU reports the required length; retry once with a buffer of
+	 * that size. An unusable length leaves err set, so the failure path below
+	 * handles it.
+	 */
+	if (err == U_BUFFER_OVERFLOW_ERROR && dest_len > 0 && dest_len < G_MAXINT32) {
+		g_free(dest);
+		err = U_ZERO_ERROR;
+		dest_cap = dest_len + 1;
+		dest = g_malloc(dest_cap);
+		dest_len = ucnv_convert("UTF-8", charset, dest, dest_cap,
+								t->start, (int32_t) t->len, &err);
+	}
+
+	if (U_FAILURE(err)) {
+		g_free(dest);
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/*
+	 * The text is created without copying and then flagged as owner of dest.
+	 * NOTE(review): presumably the text finalizer releases an owned buffer
+	 * with g_free - confirm against the text gc implementation.
+	 */
+	out = lua_new_text(L, dest, dest_len, FALSE);
+	out->flags |= RSPAMD_TEXT_FLAG_OWN;
+
+	return 1;
+}
+
static int
lua_util_strequal_caseless(lua_State *L)
{