From: Vsevolod Stakhov Date: Sun, 23 Nov 2025 14:20:48 +0000 (+0000) Subject: [Feature] Implement basic PDF text extraction X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=9d575b165a026a39f8aecc8a361c1dfdb82107ef;p=thirdparty%2Frspamd.git [Feature] Implement basic PDF text extraction - Enable text_extraction in default config - Implement extract_text_data to collect text from Page objects - Improve PDF grammar to handle text operators and spacing (TJ, Tj, ', ") - Add logic for newline insertion based on Td/TD/Tm operators - Add heuristic for space insertion based on negative kerning in TJ arrays - Support common ligatures for StandardEncoding and MacRomanEncoding - Support FlateDecode and ASCIIHexDecode filters - Update rspamadm mime to support raw PDF extraction (-r flag) and better content type detection --- diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua index f6d5c0bc0d..ead4706f47 100644 --- a/lualib/lua_content/pdf.lua +++ b/lualib/lua_content/pdf.lua @@ -109,7 +109,7 @@ local exports = {} local config = { max_extraction_size = 512 * 1024, max_processing_size = 32 * 1024, - text_extraction = false, -- NYI feature + text_extraction = true, url_extraction = true, enabled = true, js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts @@ -181,14 +181,36 @@ local function generic_grammar_elts() -- Helper functions local function pdf_hexstring_unescape(s) + local res if #s % 2 == 0 then -- Sane hex string - return lua_util.unhex(s) + res = lua_util.unhex(s) + else + -- WTF hex string + -- Append '0' to it and unescape... + res = lua_util.unhex(s:sub(1, #s - 1)) .. lua_util.unhex((s:sub(#s) .. '0')) + end + + if res then + -- Ligature fix for StandardEncoding (common in simple PDFs) + -- 0xAB (171) -> ff + -- 0xAC (172) -> ffi + -- 0xAD (173) -> ffl + -- 0xAE (174) -> fi + -- 0xAF (175) -> fl + -- MacRomanEncoding + -- 0xDE (222) -> fi + -- 0xDF (223) -> fl + res = res:gsub('\171', 'ff') + res = res:gsub('\172', 'ffi') + res = res:gsub('\173', 'ffl') + res = res:gsub('\174', 'fi') + res = res:gsub('\175', 'fl') + res = res:gsub('\222', 'fi') + res = res:gsub('\223', 'fl') end - -- WTF hex string - -- Append '0' to it and unescape... - return lua_util.unhex(s:sub(1, #s - 1)) .. lua_util.unhex((s:sub(#s) .. '0')) + return res end local function pdf_string_unescape(s) @@ -210,6 +232,23 @@ local function generic_grammar_elts() end s = s:gsub('\\%d%d?%d?', ue_octal) + -- Ligature fix for StandardEncoding (common in simple PDFs) + -- 0xAB (171) -> ff + -- 0xAC (172) -> ffi + -- 0xAD (173) -> ffl + -- 0xAE (174) -> fi + -- 0xAF (175) -> fl + -- MacRomanEncoding + -- 0xDE (222) -> fi + -- 0xDF (223) -> fl + s = s:gsub('\171', 'ff') + s = s:gsub('\172', 'ffi') + s = s:gsub('\173', 'ffl') + s = s:gsub('\174', 'fi') + s = s:gsub('\175', 'fl') + s = s:gsub('\222', 'fi') + s = s:gsub('\223', 'fl') + return s end @@ -306,6 +345,58 @@ local function gen_text_grammar() local C = lpeg.C local gen = generic_grammar_elts() + local function text_op_handler(...) + local args = {...} + local op = args[#args] + local t = args[#args - 1] + + local res = t + if type(t) == 'table' then + local tres = {} + for _, chunk in ipairs(t) do + if type(chunk) == 'string' then + table.insert(tres, chunk) + elseif type(chunk) == 'number' then + -- Heuristic: if number is < -200, append space + if chunk < -200 then + table.insert(tres, ' ') + end + end + end + res = table.concat(tres) + end + + if op == "'" or op == '"' then + return '\n' .. 
res + end + + return res + end + + local function nary_op_handler(...) + local args = {...} + local op = args[#args] + -- local t = args[#args - 1] -- The table of numbers + + if op == 'Tm' then + return '\n' + end + + return '' + end + + local function ternary_op_handler(...) + local args = {...} + local op = args[#args] + local a2 = args[#args - 2] -- Second to last argument (ty) + + if (op == 'Td' or op == 'TD') and type(a2) == 'number' and a2 ~= 0 then + return '\n' + end + + return '' + end + local empty = "" local unary_ops = C("T*") / "\n" + C(gen_graphics_unary()) / empty @@ -313,8 +404,8 @@ local function gen_text_grammar() gen_graphics_binary() local ternary_ops = P("TD") + P("Td") + gen_graphics_ternary() local nary_op = P("Tm") + gen_graphics_nary() - local text_binary_op = P("Tj") + P("TJ") + P("'") - local text_quote_op = P('"') + local text_binary_op = C(P("Tj") + P("TJ") + P("'")) + local text_quote_op = C(P('"')) local font_op = P("Tf") return lpeg.P { @@ -324,8 +415,8 @@ local function gen_text_grammar() V("FONT") + gen.comment) * gen.ws ^ 0, UNARY = unary_ops, BINARY = V("ARG") / empty * gen.ws ^ 1 * binary_ops, - TERNARY = V("ARG") / empty * gen.ws ^ 1 * V("ARG") / empty * gen.ws ^ 1 * ternary_ops, - NARY = (gen.number / 0 * gen.ws ^ 1) ^ 1 * (gen.id / empty * gen.ws ^ 0) ^ -1 * nary_op, + TERNARY = (V("ARG") * gen.ws ^ 1 * V("ARG") * gen.ws ^ 1 * ternary_ops) / ternary_op_handler, + NARY = lpeg.Ct((V("ARG") * gen.ws ^ 1) ^ 1) * (gen.id / empty * gen.ws ^ 0) ^ -1 * nary_op / nary_op_handler, ARG = V("ARRAY") + V("DICT") + V("ATOM"), ATOM = (gen.comment + gen.boolean + gen.ref + gen.number + V("STRING") + gen.id), @@ -333,13 +424,13 @@ local function gen_text_grammar() KV_PAIR = lpeg.Cg(gen.id * gen.ws ^ 0 * V("ARG") * gen.ws ^ 0), ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct(V("ARG") ^ 0) * gen.ws ^ 0 * "]", STRING = lpeg.P { gen.str + gen.hexstr }, - TEXT = (V("TEXT_ARG") * gen.ws ^ 1 * text_binary_op) + - (V("ARG") / 0 * gen.ws ^ 1 * V("ARG") / 0 * gen.ws ^ 1 * V("TEXT_ARG") * gen.ws ^ 1 * text_quote_op), - FONT = (V("FONT_ARG") * gen.ws ^ 1 * (gen.number / 0) * gen.ws ^ 1 * font_op), + TEXT = ((V("TEXT_ARG") * gen.ws ^ 0 * text_binary_op) / text_op_handler) + + ((V("ARG") / empty * gen.ws ^ 1 * V("ARG") / empty * gen.ws ^ 1 * V("TEXT_ARG") * gen.ws ^ 0 * text_quote_op) / text_op_handler), + FONT = (V("FONT_ARG") * gen.ws ^ 1 * (gen.number / empty) * gen.ws ^ 1 * font_op) / empty, FONT_ARG = lpeg.Ct(lpeg.Cc("%font%") * gen.id), TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"), - TEXT_ARRAY = "[" * - lpeg.Ct(((gen.ws ^ 0 * (gen.ws ^ 0 * (gen.number / 0) ^ 0 * gen.ws ^ 0 * (gen.str + gen.hexstr))) ^ 1)) * gen.ws ^ 0 * "]", + TEXT_ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct((V("TEXT_ARRAY_ELT") * gen.ws ^ 0) ^ 0) * "]", + TEXT_ARRAY_ELT = gen.number + gen.str + gen.hexstr, } end @@ -351,7 +442,20 @@ pdf_outer_grammar = gen_outer_grammar() pdf_text_grammar = gen_text_grammar() local function extract_text_data(specific) - return nil -- NYI + local res = {} + if specific.objects then + for _, obj in ipairs(specific.objects) do + if obj.text then + if type(obj.text) == 'userdata' then + res[#res + 1] = tostring(obj.text) + else + res[#res + 1] = obj.text + end + end + end + end + + return res end -- Generates index for major/minor pair @@ -379,8 +483,16 @@ end -- Apply PDF stream filter local function apply_pdf_filter(input, filt) - if filt == 'FlateDecode' then + if filt == 'FlateDecode' or filt == 'Fl' then return rspamd_util.inflate(input, config.max_extraction_size) + elseif filt == 
'ASCIIHexDecode' or filt == 'AHx' then + -- Strip > at the end if present (should be stripped by parser but safety check) + -- Also strip whitespaces + local to_decode = input:gsub('%s', '') + if to_decode:sub(-1) == '>' then + to_decode = to_decode:sub(1, -2) + end + return lua_util.unhex(to_decode) end return nil @@ -392,8 +504,13 @@ local function maybe_apply_filter(dict, data, pdf, task) if dict.Filter then local filt = dict.Filter + local filts = {} + if type(filt) == 'string' then - filt = { filt } + filts = { filt } + elseif type(filt) == 'table' then + -- Array of filters + filts = filt end if dict.DecodeParms then @@ -401,16 +518,25 @@ local function maybe_apply_filter(dict, data, pdf, task) if type(decode_params) == 'table' then if decode_params.Predictor then - return nil, 'predictor exists' + -- We can handle Predictor 1 (No prediction) or maybe others in future + local predictor = tonumber(decode_params.Predictor) or 1 + if predictor > 1 then + -- For now, we just log debug and fail, or maybe try to continue if it's simple PNG prediction + -- But without implementation, better to return nil to avoid garbage + return nil, 'predictor exists: ' .. tostring(predictor) + end end end end - for _, f in ipairs(filt) do - uncompressed = apply_pdf_filter(uncompressed, f) + for _, f in ipairs(filts) do + local next_uncompressed = apply_pdf_filter(uncompressed, f) - if not uncompressed then - break + if next_uncompressed then + uncompressed = next_uncompressed + else + -- If chain fails, we stop + return nil, 'filter failed: ' .. tostring(f) end end end @@ -425,27 +551,35 @@ local function maybe_extract_object_stream(obj, pdf, task) return nil end local dict = obj.dict - if dict.Length and type(obj.stream) == 'table' then - local len = math.min(obj.stream.len, - tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0) - if len > 0 then - local real_stream = obj.stream.data:span(1, len) - - local uncompressed, filter_err = maybe_apply_filter(dict, real_stream, pdf, task) - - if uncompressed then - obj.uncompressed = uncompressed - lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)', - obj.major, obj.minor, len, uncompressed:len()) - return obj.uncompressed - else - lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s: %s', - obj.major, obj.minor, len, dict.Filter, filter_err) - end + + -- Determine length: try /Length first, fallback to parsed length + local len = obj.stream.len + local decl_len = maybe_dereference_object(dict.Length, pdf, task) + + if decl_len then + local nlen = tonumber(decl_len) + if nlen then + len = math.min(len, nlen) + end + end + + if len > 0 then + local real_stream = obj.stream.data:span(1, len) + + local uncompressed, filter_err = maybe_apply_filter(dict, real_stream, pdf, task) + + if uncompressed then + obj.uncompressed = uncompressed + lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)', + obj.major, obj.minor, len, #uncompressed) + return obj.uncompressed else - lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s', - obj.major, obj.minor, len) + lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s: %s', + obj.major, obj.minor, len, dict.Filter, filter_err) end + else + lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s', + obj.major, obj.minor, len) end end @@ -1109,6 +1243,9 @@ local function search_text(task, pdf) end end + table.sort(starts) + table.sort(ends) + offsets_to_blocks(starts, ends, text_blocks) for _, bl in ipairs(text_blocks) do if bl.len > 2 
then @@ -1125,7 +1262,15 @@ local function search_text(task, pdf) bl.data) if ret then - text[#text + 1] = obj_or_err + if #obj_or_err == 0 then + lua_util.debugm(N, task, 'empty text match from block: %s', bl.data) + end + for _, chunk in ipairs(obj_or_err) do + text[#text + 1] = chunk + end + -- Add newline after each block to separate lines + text[#text + 1] = '\n' + -- text[#text + 1] = obj_or_err lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s', obj_or_err, tobj.major, tobj.minor, obj.major, obj.minor) else @@ -1140,7 +1285,29 @@ local function search_text(task, pdf) -- Join all text data together if #text > 0 then - obj.text = rspamd_text.fromtable(text) + for i, chunk in ipairs(text) do + if type(chunk) == 'userdata' then + text[i] = tostring(chunk) + elseif type(chunk) == 'table' then + -- Nested table? + local function flatten(t) + local res = {} + for _, v in ipairs(t) do + if type(v) == 'userdata' then + res[#res + 1] = tostring(v) + elseif type(v) == 'table' then + res[#res + 1] = flatten(v) + else + res[#res + 1] = v + end + end + return table.concat(res, '') + end + text[i] = flatten(chunk) + end + end + local res = table.concat(text, '') + obj.text = rspamd_text.fromstring(res) lua_util.debugm(N, task, 'object %s:%s is parsed to: %s', obj.major, obj.minor, obj.text) end @@ -1241,6 +1408,7 @@ local function process_pdf(input, mpart, task) -- Postprocess objects postprocess_pdf_objects(task, input, pdf_object) + pdf_output.objects = pdf_object.objects if config.text_extraction then search_text(task, pdf_object, pdf_output) end diff --git a/lualib/rspamadm/mime.lua b/lualib/rspamadm/mime.lua index 6412a7f38e..9ad7238dd3 100644 --- a/lualib/rspamadm/mime.lua +++ b/lualib/rspamadm/mime.lua @@ -61,6 +61,8 @@ extract:argument "file" extract:flag "-t --text" :description "Extracts plain text data from a message" +extract:flag "-r --raw" + :description "Load as raw file" extract:flag "-H --html" :description "Extracts htm data from a message" extract:option "-o --output" @@ -292,7 +294,7 @@ local function cleanup_tokenizers() end end -local function load_task(_, fname) +local function load_task(opts, fname) if not fname then fname = '-' end @@ -301,12 +303,53 @@ local function load_task(_, fname) task:set_session(rspamadm_session) task:set_resolver(rspamadm_dns_resolver) - local res = task:load_from_file(fname) + if opts.raw then + local f + if fname == '-' then + f = io.stdin + else + f = io.open(fname, "rb") + end - if not res then - parser:error(string.format('cannot read message from %s: %s', fname, - task)) - return nil + if not f then + parser:error("cannot open file " .. fname) + end + + local content = f:read("*a") + if fname ~= '-' then + f:close() + end + + local lua_magic = require "lua_magic" + local dummy_part = { + get_content = function() + return content + end, + get_filename = function() + return fname + end, + } + local _, type_data = lua_magic.detect(dummy_part, rspamd_config) + local ct = "application/octet-stream" + if type_data and type_data.type then + ct = type_data.type + end + + if fname:match('%.pdf$') and (not ct or ct == 'application/octet-stream' or ct == 'binary') then + ct = 'application/pdf' + end + + -- Construct message + local msg = string.format("Content-Type: %s\r\nContent-Transfer-Encoding: 8bit\r\n\r\n", ct) + task:load_from_string(msg .. 
content) + else + local res = task:load_from_file(fname) + + if not res then + parser:error(string.format('cannot read message from %s: %s', fname, + task)) + return nil + end end if not task:process_message() then @@ -542,6 +585,26 @@ local function extract_handler(opts) tostring(hc:get_invisible()))) end end + else + -- Not a text part, check for PDF + local _, msubtype = mime_part:get_type() + if msubtype == 'pdf' and opts.text then + local lua_content_pdf = require "lua_content.pdf" + -- Get raw content of the part + local content = mime_part:get_content() + if content then + local res = lua_content_pdf.process(content, mime_part, task) + if res and res.extract_text then + local text_data = res.extract_text(res) + if text_data and #text_data > 0 then + maybe_print_mime_part_info(mime_part, out_elts[fname]) + for _, txt in ipairs(text_data) do + table.insert(out_elts[fname], tostring(txt)) + end + end + end + end + end end if not part then
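
Note on the TJ spacing heuristic used in text_op_handler above. The following standalone sketch is illustrative only (join_tj_array is not part of the patch): a TJ operand is an array of strings and kerning numbers, and adjustments more negative than -200 are treated as word gaps and replaced with a single space, while smaller adjustments are dropped.

    -- Minimal sketch of the TJ spacing heuristic (illustrative only).
    -- Strongly negative kerning in a TJ array usually marks a word
    -- boundary in simple PDFs; everything else is plain glyph text.
    local function join_tj_array(chunks)
      local out = {}
      for _, chunk in ipairs(chunks) do
        if type(chunk) == 'string' then
          out[#out + 1] = chunk
        elseif type(chunk) == 'number' and chunk < -200 then
          out[#out + 1] = ' '
        end
      end
      return table.concat(out)
    end

    -- [(Hel) 10 (lo) -350 (world)] TJ  ->  "Hello world"
    print(join_tj_array({ 'Hel', 10, 'lo', -350, 'world' }))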
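
Note on the ASCIIHexDecode path and the odd-length hex string handling. The sketch below is a plain-Lua approximation (ascii_hex_decode is not part of the patch, and string.char/tonumber stand in for lua_util.unhex): whitespace and the optional trailing '>' are stripped, an odd-length string is padded with '0', and the remaining hex digit pairs are decoded.

    -- Sketch of ASCIIHexDecode / odd-length hex padding (illustrative only).
    local function ascii_hex_decode(s)
      s = s:gsub('%s', ''):gsub('>$', '')
      if #s % 2 == 1 then
        s = s .. '0' -- pad the "WTF hex string" case
      end
      return (s:gsub('%x%x', function(h)
        return string.char(tonumber(h, 16))
      end))
    end

    print(ascii_hex_decode('48 65 6C 6C 6F>'))  -- prints: Hello

With the new flags wired into the extract subcommand, the feature can be exercised from the command line roughly as `rspamadm mime extract -t -r sample.pdf`, which loads the file raw, wraps it in a synthetic MIME message with the detected content type, and prints the extracted PDF text.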