From 60aad59f3bd9377a263e4411c5598c17ede338bb Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 24 Nov 2025 14:47:08 +0000 Subject: [PATCH] [Fix] Discard PDF text containing unprintable control characters --- lualib/lua_content/pdf.lua | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua index bd2482031d..de72952d58 100644 --- a/lualib/lua_content/pdf.lua +++ b/lualib/lua_content/pdf.lua @@ -400,6 +400,21 @@ local function gen_text_grammar() if charset and rspamd_util.to_utf8 then local conv = rspamd_util.to_utf8(s, charset) if conv then + -- Check for control characters to avoid garbage + local garbage_limit = 0 + local clen = #conv + for i = 1, clen do + local b = string.byte(conv, i) + if b < 32 and b ~= 9 and b ~= 10 and b ~= 13 then + garbage_limit = garbage_limit + 1 + end + end + + if garbage_limit > 0 then + -- Treat as garbage + return '' + end + return conv end end -- 2.47.3