From: Vsevolod Stakhov
Date: Sun, 23 Nov 2025 22:11:02 +0000 (+0000)
Subject: Fix non-deterministic PDF text extraction
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=0299d9b6e0bcdf901bc20611e2bbb831c664ce3f;p=thirdparty%2Frspamd.git

Fix non-deterministic PDF text extraction

- Ensure pdf_text_trie is created with deterministic array input
- Enable regex flags for PDF text pattern matching
- Aggregate PDF text parts and inject as single virtual part
- Add support for virtual parts in lua_task C API
- Raise the default pdf_process_timeout from 1.0 to 10.0 seconds
- task:get_text_parts() returns computed (virtual) parts only when called with a true argument; only the default result is cached
- task:inject_part() copies the content by its Lua length, so embedded zero bytes cannot make the stored length exceed the copied buffer
---
Review notes (not part of the commit message):
- This dump arrived with all newlines collapsed. Line breaks are restored only at mail headers, file headers and hunk headers; blank context lines inside hunks cannot be recovered, so every hunk body is kept in its collapsed form.
- Removed the pdf.lua hunk at old line 1151 (it only added an unconditional io.stderr:write debug print) and the two hunks at old lines 1350 and 1359 (they only added commented-out io.stderr:write lines). The new-file start lines of the hunks at old lines 1254 and 1308 are shifted by -2 accordingly.
- All other hunk ranges are unchanged; the lua_task.c edit replaces lines one for one.
- The index lines still carry the blob ids of the original commit.

diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua
index ead4706f47..b356407ec4 100644
--- a/lualib/lua_content/pdf.lua
+++ b/lualib/lua_content/pdf.lua
@@ -63,19 +63,6 @@ local pdf_patterns = { } } -local pdf_text_patterns = { - start = { - patterns = { - [[\sBT\s]] - } - }, - stop = { - patterns = { - [[\sET\b]] - } - } -} - local pdf_cmap_patterns = { start = { patterns = {
@@ -97,7 +84,6 @@ local pdf_cmap_patterns = { -- t[3] - value in patterns table -- t[4] - local pattern index local pdf_indexes = {} -local pdf_text_indexes = {} local pdf_cmap_indexes = {} local pdf_trie
@@ -118,7 +104,7 @@ local config = { max_pdf_objects = 10000, -- Maximum number of objects to be considered max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse) max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer - pdf_process_timeout = 1.0, -- Timeout in seconds for processing + pdf_process_timeout = 10.0, -- Timeout in seconds for processing } -- Used to process patterns found in PDF
@@ -161,7 +147,10 @@ local function compile_tries() pdf_trie = compile_pats(pdf_patterns, pdf_indexes) end if not pdf_text_trie then - pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes) + pdf_text_trie = rspamd_trie.create({ + [[\sBT\s]], + [[\sET\b]] + }, default_compile_flags) end if not pdf_cmap_trie then pdf_cmap_trie = compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
@@ -1254,8 +1243,8 @@ local function search_text(task, pdf) end bl.data = tobj.uncompressed:span(bl.start, bl.len) - --lua_util.debugm(N, task, 'extracted text from object %s:%s: %s', - -- tobj.major, tobj.minor, bl.data) + lua_util.debugm(N, task, 'extracted text from object %s:%s: %s', + tobj.major, tobj.minor, bl.data) if bl.len < config.max_processing_size then local ret, obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
@@ -1308,11 +1297,31 @@ local function search_text(task, pdf) end local res = table.concat(text, '') obj.text = rspamd_text.fromstring(res) + lua_util.debugm(N, task, 'object %s:%s is parsed to: %s', obj.major, obj.minor, obj.text) end end end + + -- Aggregate and inject once + if task.inject_part then + local all_text = {} + for _, obj in ipairs(pdf.objects) do + if obj.text then + table.insert(all_text, tostring(obj.text)) + end + end + + if #all_text > 0 then + local final_text = table.concat(all_text, "\n") + -- Only inject if it contains non-whitespace characters + if final_text:match("%S") then + task:inject_part('text', final_text) + else + lua_util.debugm(N, task, 'skipping injection of empty/whitespace-only text') + end + end + end end -- This function searches objects for `/URI` key and parses it's content
diff --git a/src/libmime/message.h b/src/libmime/message.h
index dc9987d01f..1016761512 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -37,6 +37,7 @@ enum rspamd_mime_part_flags { RSPAMD_MIME_PART_BAD_CTE = (1u << 4u), RSPAMD_MIME_PART_MISSING_CTE = (1u << 5u), RSPAMD_MIME_PART_NO_TEXT_EXTRACTION = (1u << 6u), + RSPAMD_MIME_PART_COMPUTED = (1u << 7u), }; enum rspamd_mime_part_type {
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index d242bbe76d..a695177e8c 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -318,6 +318,14 @@ LUA_FUNCTION_DEF(task, get_rawbody); * @return {table rspamd_url} list of all email addresses found */ LUA_FUNCTION_DEF(task, get_emails); +/*** + * @method task:inject_part(type, content) + * Injects a virtual mime part into the task structure + * @param {string} type part type (currently only "text" is supported) + * @param {string} content part content + * @return {boolean} true if part was injected, false if the task has no message or the type is not supported + */ +LUA_FUNCTION_DEF(task, inject_part); /*** * @method task:get_text_parts() * Get all text (and HTML) parts found in a message
@@ -1331,6 +1339,7 @@ static const struct luaL_reg tasklib_m[] = { LUA_INTERFACE_DEF(task, get_rawbody), LUA_INTERFACE_DEF(task, get_emails), LUA_INTERFACE_DEF(task, get_text_parts), + LUA_INTERFACE_DEF(task, inject_part), LUA_INTERFACE_DEF(task, get_parts), LUA_INTERFACE_DEF(task, get_request_header), LUA_INTERFACE_DEF(task, set_request_header),
@@ -2768,6 +2777,66 @@ lua_task_has_urls(lua_State *L) return 2; } +static int +lua_task_inject_part(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_task *task = lua_check_task(L, 1); + const char *type = luaL_checkstring(L, 2); + gsize content_len; + const char *content = luaL_checklstring(L, 3, &content_len); + struct rspamd_mime_part *part; + struct rspamd_mime_text_part *txt_part; + + if (task && task->message) { + if (g_ascii_strcasecmp(type, "text") == 0) { + part = rspamd_mempool_alloc0(task->task_pool, sizeof(*part)); + part->part_type = RSPAMD_MIME_PART_TEXT; + part->flags |= RSPAMD_MIME_PART_COMPUTED; + + /* Basic headers setup */ + part->ct = rspamd_mempool_alloc0(task->task_pool, sizeof(*part->ct)); + + part->ct->type.begin = "text"; + part->ct->type.len = 4; + part->ct->subtype.begin = "plain"; + part->ct->subtype.len = 5; + part->ct->flags = RSPAMD_CONTENT_TYPE_TEXT; + part->ct->charset.begin = "utf-8"; + part->ct->charset.len = 5; + + /* Content setup: copy exactly content_len bytes, a strdup copy would stop at an embedded zero byte */ + part->parsed_data.begin = memcpy(rspamd_mempool_alloc0(task->task_pool, content_len + 1), content, content_len); + part->parsed_data.len = content_len; + part->raw_data = part->parsed_data; + + /* Text part specific setup */ + txt_part = rspamd_mempool_alloc0(task->task_pool, sizeof(*txt_part)); + txt_part->mime_part = part; + txt_part->raw.begin = part->parsed_data.begin; + txt_part->raw.len = content_len; + txt_part->parsed = txt_part->raw; + txt_part->utf_content = txt_part->raw; + txt_part->real_charset = "utf-8"; + + /* Add to message */ + part->specific.txt = txt_part; + g_ptr_array_add(task->message->parts, part); + g_ptr_array_add(task->message->text_parts, txt_part); + + lua_pushboolean(L, true); + } + else { + lua_pushboolean(L, false); + } + } + else { + lua_pushboolean(L, false); + } + + return 1; +} + struct rspamd_url_query_to_inject_cbd { struct rspamd_task *task; struct rspamd_url *url;
@@ -2980,22 +3049,36 @@ lua_task_get_text_parts(lua_State *L) unsigned int i; struct rspamd_task *task = lua_check_task(L, 1); struct rspamd_mime_text_part *part, **ppart; + gboolean include_virtual = FALSE; + + if (lua_gettop(L) >= 2) { + include_virtual = lua_toboolean(L, 2); + } if (task != NULL) { if (task->message) { - if (!lua_task_get_cached(L, task, "text_parts")) { - lua_createtable(L, MESSAGE_FIELD(task, text_parts)->len, 0); + if (!include_virtual && lua_task_get_cached(L, task, "text_parts")) { + return 1; + } - PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part) - { - ppart = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *)); - *ppart = part; - rspamd_lua_setclass(L, rspamd_textpart_classname, -1); - /* Make it array */ - lua_rawseti(L, -2, i + 1); + lua_newtable(L); + int idx = 1; + + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part) + { + if (!include_virtual && (part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED)) { + continue; } + ppart = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *)); + *ppart = part; + rspamd_lua_setclass(L, rspamd_textpart_classname, -1); + /* Make it array */ + lua_rawseti(L, -2, idx++); + } + + if (!include_virtual) { lua_task_set_cached(L, task, "text_parts", -1); } }