}
}
-local pdf_text_patterns = {
- start = {
- patterns = {
- [[\sBT\s]]
- }
- },
- stop = {
- patterns = {
- [[\sET\b]]
- }
- }
-}
-
local pdf_cmap_patterns = {
start = {
patterns = {
-- t[3] - value in patterns table
-- t[4] - local pattern index
local pdf_indexes = {}
-local pdf_text_indexes = {}
local pdf_cmap_indexes = {}
local pdf_trie
max_pdf_objects = 10000, -- Maximum number of objects to be considered
max_pdf_trailer = 10 * 1024 * 1024, -- Maximum trailer size (to avoid abuse)
max_pdf_trailer_lines = 100, -- Maximum number of lines in pdf trailer
- pdf_process_timeout = 1.0, -- Timeout in seconds for processing
+ pdf_process_timeout = 10.0, -- Timeout in seconds for processing
}
-- Used to process patterns found in PDF
pdf_trie = compile_pats(pdf_patterns, pdf_indexes)
end
if not pdf_text_trie then
- pdf_text_trie = compile_pats(pdf_text_patterns, pdf_text_indexes)
+ pdf_text_trie = rspamd_trie.create({
+ [[\sBT\s]],
+ [[\sET\b]]
+ }, default_compile_flags)
end
if not pdf_cmap_trie then
pdf_cmap_trie = compile_pats(pdf_cmap_patterns, pdf_cmap_indexes)
if now >= pdf.end_timestamp then
pdf.timeout_processing = now - pdf.start_timestamp
+ io.stderr:write(string.format("DEBUG: Timeout! Start: %f, End: %f, Now: %f\n", pdf.start_timestamp, pdf.end_timestamp, now))
+
lua_util.debugm(N, task, 'pdf: timeout processing grammars after spending %s seconds, ' ..
'%s elements processed',
pdf.timeout_processing, i)
end
bl.data = tobj.uncompressed:span(bl.start, bl.len)
- --lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
- -- tobj.major, tobj.minor, bl.data)
+ lua_util.debugm(N, task, 'extracted text from object %s:%s: %s',
+ tobj.major, tobj.minor, bl.data)
if bl.len < config.max_processing_size then
local ret, obj_or_err = pcall(pdf_text_grammar.match, pdf_text_grammar,
end
local res = table.concat(text, '')
obj.text = rspamd_text.fromstring(res)
+
lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
obj.major, obj.minor, obj.text)
end
end
end
+ -- Aggregate and inject once
+ if task.inject_part then
+ local all_text = {}
+ for _, obj in ipairs(pdf.objects) do
+ if obj.text then
+ table.insert(all_text, tostring(obj.text))
+ end
+ end
+
+ if #all_text > 0 then
+ local final_text = table.concat(all_text, "\n")
+ -- Only inject if it contains non-whitespace characters
+ if final_text:match("%S") then
+ task:inject_part('text', final_text)
+ else
+ lua_util.debugm(N, task, 'skipping injection of empty/whitespace-only text')
+ end
+ end
+ end
end
-- This function searches objects for `/URI` key and parses it's content
end
local function process_pdf(input, mpart, task)
+ -- io.stderr:write("DEBUG: process_pdf called, input len: " .. tostring(#input) .. "\n")
if not config.enabled then
-- Skip processing
local matches = pdf_trie:match(input)
if matches then
+ -- io.stderr:write("DEBUG: PDF matches found\n")
local start_ts = rspamd_util.get_ticks()
-- Temp object used to share data between pdf extraction methods
local pdf_object = {
* @return {table rspamd_url} list of all email addresses found
*/
LUA_FUNCTION_DEF(task, get_emails);
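+/***
+ * @method task:inject_part(type, content)
+ * Injects a virtual (computed) mime part into the task's message; the new
+ * part is appended to both the generic parts list and the text parts list
+ * @param {string} type part type, matched case-insensitively (currently only "text" is supported)
+ * @param {string} content part content
+ * @return {boolean} true if the part was injected, false if the type is unsupported or the task has no parsed message
+ */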
+LUA_FUNCTION_DEF(task, inject_part);
/***
* @method task:get_text_parts()
* Get all text (and HTML) parts found in a message
LUA_INTERFACE_DEF(task, get_rawbody),
LUA_INTERFACE_DEF(task, get_emails),
LUA_INTERFACE_DEF(task, get_text_parts),
+ LUA_INTERFACE_DEF(task, inject_part),
LUA_INTERFACE_DEF(task, get_parts),
LUA_INTERFACE_DEF(task, get_request_header),
LUA_INTERFACE_DEF(task, set_request_header),
return 2;
}
+/*
+ * Lua binding for task:inject_part(type, content).
+ *
+ * Builds a synthetic text/plain part flagged RSPAMD_MIME_PART_COMPUTED from
+ * the supplied content and appends it to the message's `parts` and
+ * `text_parts` arrays. All memory is taken from the task memory pool.
+ *
+ * Lua stack in:  1 = task, 2 = type (string), 3 = content (string)
+ * Lua stack out: one boolean - true when a part was injected, false when the
+ *                task has no message or the type is not "text"
+ *                (the type is compared case-insensitively).
+ */
+static int
+lua_task_inject_part(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_task *task = lua_check_task(L, 1);
+	const char *type = luaL_checkstring(L, 2);
+	gsize content_len;
+	const char *content = luaL_checklstring(L, 3, &content_len);
+	struct rspamd_mime_part *part;
+	struct rspamd_mime_text_part *txt_part;
+	char *content_copy;
+	gboolean injected = FALSE;
+
+	if (task && task->message && g_ascii_strcasecmp(type, "text") == 0) {
+		part = rspamd_mempool_alloc0(task->task_pool, sizeof(*part));
+		part->part_type = RSPAMD_MIME_PART_TEXT;
+		part->flags |= RSPAMD_MIME_PART_COMPUTED;
+
+		/* Basic headers setup: advertise the part as text/plain in utf-8 */
+		part->ct = rspamd_mempool_alloc0(task->task_pool, sizeof(*part->ct));
+
+		part->ct->type.begin = "text";
+		part->ct->type.len = 4;
+		part->ct->subtype.begin = "plain";
+		part->ct->subtype.len = 5;
+		part->ct->flags = RSPAMD_CONTENT_TYPE_TEXT;
+		part->ct->charset.begin = "utf-8";
+		part->ct->charset.len = 5;
+
+		/*
+		 * Content setup.
+		 * Lua strings are length-counted and may contain embedded NUL bytes,
+		 * so copy exactly content_len bytes instead of using a strdup-style
+		 * helper: a copy truncated at the first NUL would be shorter than the
+		 * length recorded below and later readers would run past the buffer.
+		 * The buffer is zero-filled and one byte longer than the content, so
+		 * it also stays NUL-terminated.
+		 */
+		content_copy = rspamd_mempool_alloc0(task->task_pool, content_len + 1);
+		memcpy(content_copy, content, content_len);
+
+		part->parsed_data.begin = content_copy;
+		part->parsed_data.len = content_len;
+		part->raw_data = part->parsed_data;
+
+		/* Text part specific setup: every view aliases the same pool buffer */
+		txt_part = rspamd_mempool_alloc0(task->task_pool, sizeof(*txt_part));
+		txt_part->mime_part = part;
+		txt_part->raw.begin = content_copy;
+		txt_part->raw.len = content_len;
+		txt_part->parsed = txt_part->raw;
+		txt_part->utf_content = txt_part->raw;
+		txt_part->real_charset = "utf-8";
+
+		/* Add to message */
+		part->specific.txt = txt_part;
+		g_ptr_array_add(MESSAGE_FIELD(task, parts), part);
+		g_ptr_array_add(MESSAGE_FIELD(task, text_parts), txt_part);
+
+		injected = TRUE;
+	}
+
+	lua_pushboolean(L, injected);
+
+	return 1;
+}
+
struct rspamd_url_query_to_inject_cbd {
struct rspamd_task *task;
struct rspamd_url *url;
unsigned int i;
struct rspamd_task *task = lua_check_task(L, 1);
struct rspamd_mime_text_part *part, **ppart;
+ gboolean include_virtual = FALSE;
+
+ if (lua_gettop(L) >= 2) {
+ include_virtual = lua_toboolean(L, 2);
+ }
if (task != NULL) {
if (task->message) {
- if (!lua_task_get_cached(L, task, "text_parts")) {
- lua_createtable(L, MESSAGE_FIELD(task, text_parts)->len, 0);
+ if (!include_virtual && lua_task_get_cached(L, task, "text_parts")) {
+ return 1;
+ }
- PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
- {
- ppart = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *));
- *ppart = part;
- rspamd_lua_setclass(L, rspamd_textpart_classname, -1);
- /* Make it array */
- lua_rawseti(L, -2, i + 1);
+ lua_newtable(L);
+ int idx = 1;
+
+ PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, text_parts), i, part)
+ {
+ if (!include_virtual && (part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED)) {
+ continue;
}
+ ppart = lua_newuserdata(L, sizeof(struct rspamd_mime_text_part *));
+ *ppart = part;
+ rspamd_lua_setclass(L, rspamd_textpart_classname, -1);
+ /* Make it array */
+ lua_rawseti(L, -2, idx++);
+ }
+
+ if (!include_virtual) {
lua_task_set_cached(L, task, "text_parts", -1);
}
}