git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Discard PDF text containing unprintable control characters
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 24 Nov 2025 14:47:08 +0000 (14:47 +0000)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 24 Nov 2025 14:47:08 +0000 (14:47 +0000)
lualib/lua_content/pdf.lua

index bd2482031d69c17aa98fa308e2a9f99ae6c60f7a..de72952d58da8ad6c539dbde4dc8646863dfa3e9 100644 (file)
@@ -400,6 +400,21 @@ local function gen_text_grammar()
     if charset and rspamd_util.to_utf8 then
        local conv = rspamd_util.to_utf8(s, charset)
        if conv then
+          -- Check for control characters to avoid garbage
+          local garbage_limit = 0
+          local clen = #conv
+          for i = 1, clen do
+            local b = string.byte(conv, i)
+            if b < 32 and b ~= 9 and b ~= 10 and b ~= 13 then
+              garbage_limit = garbage_limit + 1
+            end
+          end
+
+          if garbage_limit > 0 then
+             -- Treat as garbage
+             return ''
+          end
+
           return conv
        end
     end