git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Fix non-deterministic PDF text extraction
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 23 Nov 2025 22:11:02 +0000 (22:11 +0000)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 23 Nov 2025 22:11:02 +0000 (22:11 +0000)
- Ensure pdf_text_trie is created with deterministic array input
- Enable regex flags for PDF text pattern matching
- Aggregate PDF text parts and inject as single virtual part
- Add support for virtual parts in lua_task C API

lualib/lua_content/pdf.lua
src/libmime/message.h
src/lua/lua_task.c

index ead4706f4747052f80eba048797a761080f90da0..b356407ec47a91be2bee5e117875a45c1f24ecfd 100644 (file)
@@ -63,19 +63,6 @@ local pdf_patterns = {
   }
 }
 
-local pdf_text_patterns = {
-  start = {
-    patterns = {
-      [[\sBT\s]]
-    }
-  },
-  stop = {
-    patterns = {
-      [[\sET\b]]
-    }
-  }
-}
-
 local pdf_cmap_patterns = {
   start = {
     patterns = {
@@ -97,7 +84,6 @@ local pdf_cmap_patterns = {
 --  t[3] - value in patterns table
 --  t[4] - local pattern index
 local pdf_indexes = {}
-local pdf_text_indexes = {}
 local pdf_cmap_indexes = {}
 
 local pdf_trie
@@ -118,7 +104,7 @@ local config = {
   max_pdf_objects = 10000, -- Maximum number of objects to be considered
   max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse)
   max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer
-  pdf_process_timeout = 1.0, -- Timeout in seconds for processing
+  pdf_process_timeout = 10.0, -- Timeout in seconds for processing
 }
 
 -- Used to process patterns found in PDF
@@ -161,7 +147,10 @@ local function compile_tries()
     pdf_trie = compile_pats(pdf_patterns, pdf_indexes)
   end
   if not pdf_text_trie then
-    pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes)
+    pdf_text_trie = rspamd_trie.create({
+      [[\sBT\s]],
+      [[\sET\b]]
+    }, default_compile_flags)
   end
   if not pdf_cmap_trie then
     pdf_cmap_trie = compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
@@ -1151,6 +1140,8 @@ local function postprocess_pdf_objects(task, input, pdf)
       if now >= pdf.end_timestamp then
         pdf.timeout_processing = now - pdf.start_timestamp
 
+        io.stderr:write(string.format("DEBUG: Timeout! Start: %f, End: %f, Now: %f\n", pdf.start_timestamp, pdf.end_timestamp, now))
+
         lua_util.debugm(N, task, 'pdf: timeout processing grammars after spending %s seconds, ' ..
             '%s elements processed',
             pdf.timeout_processing, i)
@@ -1254,8 +1245,8 @@ local function search_text(task, pdf)
             end
 
             bl.data = tobj.uncompressed:span(bl.start, bl.len)
-            --lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
-            --    tobj.major, tobj.minor, bl.data)
+            lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
+                tobj.major, tobj.minor, bl.data)
 
             if bl.len < config.max_processing_size then
               local ret, obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
@@ -1308,11 +1299,31 @@ local function search_text(task, pdf)
         end
         local res = table.concat(text, '')
         obj.text = rspamd_text.fromstring(res)
+
         lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
             obj.major, obj.minor, obj.text)
       end
     end
   end
+  -- Aggregate and inject once
+  if task.inject_part then
+    local all_text = {}
+    for _, obj in ipairs(pdf.objects) do
+      if obj.text then
+        table.insert(all_text, tostring(obj.text))
+      end
+    end
+
+    if #all_text > 0 then
+      local final_text = table.concat(all_text, "\n")
+      -- Only inject if it contains non-whitespace characters
+      if final_text:match("%S") then
+        task:inject_part('text', final_text)
+      else
+        lua_util.debugm(N, task, 'skipping injection of empty/whitespace-only text')
+      end
+    end
+  end
 end
 
 -- This function searches objects for `/URI` key and parses it's content
@@ -1350,6 +1361,7 @@ local function search_urls(task, pdf, mpart)
 end
 
 local function process_pdf(input, mpart, task)
+  -- io.stderr:write("DEBUG: process_pdf called, input len: " .. tostring(#input) .. "\n")
 
   if not config.enabled then
     -- Skip processing
@@ -1359,6 +1371,7 @@ local function process_pdf(input, mpart, task)
   local matches = pdf_trie:match(input)
 
   if matches then
+    -- io.stderr:write("DEBUG: PDF matches found\n")
     local start_ts = rspamd_util.get_ticks()
     -- Temp object used to share data between pdf extraction methods
     local pdf_object = {
index dc9987d01fea9325a46f37eec04b02d4b8a32faf..10167615124eda94c35af1f833a1231313439619 100644 (file)
@@ -37,6 +37,7 @@ enum rspamd_mime_part_flags {
        RSPAMD_MIME_PART_BAD_CTE = (1u << 4u),
        RSPAMD_MIME_PART_MISSING_CTE = (1u << 5u),
        RSPAMD_MIME_PART_NO_TEXT_EXTRACTION = (1u << 6u),
+       RSPAMD_MIME_PART_COMPUTED = (1u << 7u),
 };
 
 enum rspamd_mime_part_type {
index d242bbe76d3da9a2c0fc71faa1b244c9adba5f54..a695177e8c3c01c1047e5e26467222edb120965d 100644 (file)
@@ -318,6 +318,14 @@ LUA_FUNCTION_DEF(task, get_rawbody);
  * @return {table rspamd_url} list of all email addresses found
  */
 LUA_FUNCTION_DEF(task, get_emails);
+/***
+ * @method task:inject_part(type, content)
+ * Injects a virtual mime part into the task structure
+ * @param {string} type part type (currently only "text" is supported)
+ * @param {string} content part content
+ * @return {boolean} true if part was injected
+ */
+LUA_FUNCTION_DEF(task, inject_part);
 /***
  * @method task:get_text_parts()
  * Get all text (and HTML) parts found in a message
@@ -1331,6 +1339,7 @@ static const struct luaL_reg tasklib_m[] = {
        LUA_INTERFACE_DEF(task, get_rawbody),
        LUA_INTERFACE_DEF(task, get_emails),
        LUA_INTERFACE_DEF(task, get_text_parts),
+       LUA_INTERFACE_DEF(task, inject_part),
        LUA_INTERFACE_DEF(task, get_parts),
        LUA_INTERFACE_DEF(task, get_request_header),
        LUA_INTERFACE_DEF(task, set_request_header),
@@ -2768,6 +2777,66 @@ lua_task_has_urls(lua_State *L)
        return 2;
 }
 
+/*
+ * task:inject_part(type, content): add a computed (virtual) text/plain part.
+ * Pushes true on success, false when there is no message or type != "text".
+ */
+static int
+lua_task_inject_part(lua_State *L)
+{
+       LUA_TRACE_POINT;
+       struct rspamd_task *task = lua_check_task(L, 1);
+       const char *type = luaL_checkstring(L, 2);
+       gsize content_len;
+       const char *content = luaL_checklstring(L, 3, &content_len);
+       struct rspamd_mime_part *part;
+       struct rspamd_mime_text_part *txt_part;
+       char *copy;
+
+       if (!task || !task->message || g_ascii_strcasecmp(type, "text") != 0) {
+               lua_pushboolean(L, false);
+               return 1;
+       }
+
+       part = rspamd_mempool_alloc0(task->task_pool, sizeof(*part));
+       part->part_type = RSPAMD_MIME_PART_TEXT;
+       part->flags |= RSPAMD_MIME_PART_COMPUTED;
+
+       /* Basic headers setup */
+       part->ct = rspamd_mempool_alloc0(task->task_pool, sizeof(*part->ct));
+       part->ct->type.begin = "text";
+       part->ct->type.len = 4;
+       part->ct->subtype.begin = "plain";
+       part->ct->subtype.len = 5;
+       part->ct->flags = RSPAMD_CONTENT_TYPE_TEXT;
+       part->ct->charset.begin = "utf-8";
+       part->ct->charset.len = 5;
+
+       /* Lua strings may embed NULs: copy by length so len matches the buffer */
+       copy = rspamd_mempool_alloc(task->task_pool, content_len + 1);
+       memcpy(copy, content, content_len);
+       copy[content_len] = '\0';
+       part->parsed_data.begin = copy;
+       part->parsed_data.len = content_len;
+       part->raw_data = part->parsed_data;
+
+       /* Text part specific setup */
+       txt_part = rspamd_mempool_alloc0(task->task_pool, sizeof(*txt_part));
+       txt_part->mime_part = part;
+       txt_part->raw.begin = part->parsed_data.begin;
+       txt_part->raw.len = content_len;
+       txt_part->parsed = txt_part->raw;
+       txt_part->utf_content = txt_part->raw;
+       txt_part->real_charset = "utf-8";
+
+       /* Add to message */
+       part->specific.txt = txt_part;
+       g_ptr_array_add(task->message->parts, part);
+       g_ptr_array_add(task->message->text_parts, txt_part);
+       lua_pushboolean(L, true);
+       return 1;
+}
+
 struct rspamd_url_query_to_inject_cbd {
        struct rspamd_task *task;
        struct rspamd_url *url;
@@ -2980,22 +3049,36 @@ lua_task_get_text_parts(lua_State *L)
        unsigned int i;
        struct rspamd_task *task = lua_check_task(L, 1);
        struct rspamd_mime_text_part *part, **ppart;
+       gboolean include_virtual = FALSE;
+
+       if (lua_gettop(L) >= 2) {
+               include_virtual = lua_toboolean(L, 2);
+       }
 
        if (task != NULL) {
 
                if (task->message) {
-                       if (!lua_task_get_cached(L, task, "text_parts")) {
-                               lua_createtable(L, MESSAGE_FIELD(task, text_parts)->len, 0);
+                       if (!include_virtual && lua_task_get_cached(L, task, "text_parts")) {
+                               return 1;
+                       }
 
-                               PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
-                               {
-                                       ppart = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *));
-                                       *ppart = part;
-                                       rspamd_lua_setclass(L, rspamd_textpart_classname, -1);
-                                       /* Make it array */
-                                       lua_rawseti(L, -2, i + 1);
+                       lua_newtable(L);
+                       int idx = 1;
+
+                       PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
+                       {
+                               if (!include_virtual && (part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED)) {
+                                       continue;
                                }
 
+                               ppart = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *));
+                               *ppart = part;
+                               rspamd_lua_setclass(L, rspamd_textpart_classname, -1);
+                               /* Make it array */
+                               lua_rawseti(L, -2, idx++);
+                       }
+
+                       if (!include_virtual) {
                                lua_task_set_cached(L, task, "text_parts", -1);
                        }
                }