local config = {
max_extraction_size = 512 * 1024,
max_processing_size = 32 * 1024,
- text_extraction = false, -- NYI feature
+ text_extraction = true,
url_extraction = true,
enabled = true,
js_fuzzy = true, -- Generate fuzzy hashes from PDF javascripts
-- Helper functions
local function pdf_hexstring_unescape(s)
+ local res
if #s % 2 == 0 then
-- Sane hex string
- return lua_util.unhex(s)
+ res = lua_util.unhex(s)
+ else
+ -- WTF hex string
+ -- Append '0' to it and unescape...
+ res = lua_util.unhex(s:sub(1, #s - 1)) .. lua_util.unhex((s:sub(#s) .. '0'))
+ end
+
+ if res then
+ -- Ligature fix for StandardEncoding (common in simple PDFs)
+ -- 0xAB (171) -> ff
+ -- 0xAC (172) -> ffi
+ -- 0xAD (173) -> ffl
+ -- 0xAE (174) -> fi
+ -- 0xAF (175) -> fl
+ -- MacRomanEncoding
+ -- 0xDE (222) -> fi
+ -- 0xDF (223) -> fl
+ res = res:gsub('\171', 'ff')
+ res = res:gsub('\172', 'ffi')
+ res = res:gsub('\173', 'ffl')
+ res = res:gsub('\174', 'fi')
+ res = res:gsub('\175', 'fl')
+ res = res:gsub('\222', 'fi')
+ res = res:gsub('\223', 'fl')
end
- -- WTF hex string
- -- Append '0' to it and unescape...
- return lua_util.unhex(s:sub(1, #s - 1)) .. lua_util.unhex((s:sub(#s) .. '0'))
+ return res
end
local function pdf_string_unescape(s)
end
s = s:gsub('\\%d%d?%d?', ue_octal)
+ -- Ligature fix for StandardEncoding (common in simple PDFs)
+ -- 0xAB (171) -> ff
+ -- 0xAC (172) -> ffi
+ -- 0xAD (173) -> ffl
+ -- 0xAE (174) -> fi
+ -- 0xAF (175) -> fl
+ -- MacRomanEncoding
+ -- 0xDE (222) -> fi
+ -- 0xDF (223) -> fl
+ s = s:gsub('\171', 'ff')
+ s = s:gsub('\172', 'ffi')
+ s = s:gsub('\173', 'ffl')
+ s = s:gsub('\174', 'fi')
+ s = s:gsub('\175', 'fl')
+ s = s:gsub('\222', 'fi')
+ s = s:gsub('\223', 'fl')
+
return s
end
local C = lpeg.C
local gen = generic_grammar_elts()
+ local function text_op_handler(...)
+ local args = {...}
+ local op = args[#args]
+ local t = args[#args - 1]
+
+ local res = t
+ if type(t) == 'table' then
+ local tres = {}
+ for _, chunk in ipairs(t) do
+ if type(chunk) == 'string' then
+ table.insert(tres, chunk)
+ elseif type(chunk) == 'number' then
+ -- Heuristic: if number is < -200, append space
+ if chunk < -200 then
+ table.insert(tres, ' ')
+ end
+ end
+ end
+ res = table.concat(tres)
+ end
+
+ if op == "'" or op == '"' then
+ return '\n' .. res
+ end
+
+ return res
+ end
+
+ local function nary_op_handler(...)
+ local args = {...}
+ local op = args[#args]
+ -- local t = args[#args - 1] -- The table of numbers
+
+ if op == 'Tm' then
+ return '\n'
+ end
+
+ return ''
+ end
+
+ local function ternary_op_handler(...)
+ local args = {...}
+ local op = args[#args]
+ local a2 = args[#args - 2] -- Second to last argument (ty)
+
+ if (op == 'Td' or op == 'TD') and type(a2) == 'number' and a2 ~= 0 then
+ return '\n'
+ end
+
+ return ''
+ end
+
local empty = ""
local unary_ops = C("T*") / "\n" +
C(gen_graphics_unary()) / empty
gen_graphics_binary()
local ternary_ops = P("TD") + P("Td") + gen_graphics_ternary()
local nary_op = P("Tm") + gen_graphics_nary()
- local text_binary_op = P("Tj") + P("TJ") + P("'")
- local text_quote_op = P('"')
+ local text_binary_op = C(P("Tj") + P("TJ") + P("'"))
+ local text_quote_op = C(P('"'))
local font_op = P("Tf")
return lpeg.P {
V("FONT") + gen.comment) * gen.ws ^ 0,
UNARY = unary_ops,
BINARY = V("ARG") / empty * gen.ws ^ 1 * binary_ops,
- TERNARY = V("ARG") / empty * gen.ws ^ 1 * V("ARG") / empty * gen.ws ^ 1 * ternary_ops,
- NARY = (gen.number / 0 * gen.ws ^ 1) ^ 1 * (gen.id / empty * gen.ws ^ 0) ^ -1 * nary_op,
+ TERNARY = (V("ARG") * gen.ws ^ 1 * V("ARG") * gen.ws ^ 1 * ternary_ops) / ternary_op_handler,
+ NARY = lpeg.Ct((V("ARG") * gen.ws ^ 1) ^ 1) * (gen.id / empty * gen.ws ^ 0) ^ -1 * nary_op / nary_op_handler,
ARG = V("ARRAY") + V("DICT") + V("ATOM"),
ATOM = (gen.comment + gen.boolean + gen.ref +
gen.number + V("STRING") + gen.id),
KV_PAIR = lpeg.Cg(gen.id * gen.ws ^ 0 * V("ARG") * gen.ws ^ 0),
ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct(V("ARG") ^ 0) * gen.ws ^ 0 * "]",
STRING = lpeg.P { gen.str + gen.hexstr },
- TEXT = (V("TEXT_ARG") * gen.ws ^ 1 * text_binary_op) +
- (V("ARG") / 0 * gen.ws ^ 1 * V("ARG") / 0 * gen.ws ^ 1 * V("TEXT_ARG") * gen.ws ^ 1 * text_quote_op),
- FONT = (V("FONT_ARG") * gen.ws ^ 1 * (gen.number / 0) * gen.ws ^ 1 * font_op),
+ TEXT = ((V("TEXT_ARG") * gen.ws ^ 0 * text_binary_op) / text_op_handler) +
+ ((V("ARG") / empty * gen.ws ^ 1 * V("ARG") / empty * gen.ws ^ 1 * V("TEXT_ARG") * gen.ws ^ 0 * text_quote_op) / text_op_handler),
+ FONT = (V("FONT_ARG") * gen.ws ^ 1 * (gen.number / empty) * gen.ws ^ 1 * font_op) / empty,
FONT_ARG = lpeg.Ct(lpeg.Cc("%font%") * gen.id),
TEXT_ARG = lpeg.Ct(V("STRING")) + V("TEXT_ARRAY"),
- TEXT_ARRAY = "[" *
- lpeg.Ct(((gen.ws ^ 0 * (gen.ws ^ 0 * (gen.number / 0) ^ 0 * gen.ws ^ 0 * (gen.str + gen.hexstr))) ^ 1)) * gen.ws ^ 0 * "]",
+ TEXT_ARRAY = "[" * gen.ws ^ 0 * lpeg.Ct((V("TEXT_ARRAY_ELT") * gen.ws ^ 0) ^ 0) * "]",
+ TEXT_ARRAY_ELT = gen.number + gen.str + gen.hexstr,
}
end
pdf_text_grammar = gen_text_grammar()
local function extract_text_data(specific)
- return nil -- NYI
+ local res = {}
+ if specific.objects then
+ for _, obj in ipairs(specific.objects) do
+ if obj.text then
+ if type(obj.text) == 'userdata' then
+ res[#res + 1] = tostring(obj.text)
+ else
+ res[#res + 1] = obj.text
+ end
+ end
+ end
+ end
+
+ return res
end
-- Generates index for major/minor pair
-- Apply PDF stream filter
local function apply_pdf_filter(input, filt)
- if filt == 'FlateDecode' then
+ if filt == 'FlateDecode' or filt == 'Fl' then
return rspamd_util.inflate(input, config.max_extraction_size)
+ elseif filt == 'ASCIIHexDecode' or filt == 'AHx' then
+ -- Strip > at the end if present (should be stripped by parser but safety check)
+ -- Also strip whitespaces
+ local to_decode = input:gsub('%s', '')
+ if to_decode:sub(-1) == '>' then
+ to_decode = to_decode:sub(1, -2)
+ end
+ return lua_util.unhex(to_decode)
end
return nil
if dict.Filter then
local filt = dict.Filter
+ local filts = {}
+
if type(filt) == 'string' then
- filt = { filt }
+ filts = { filt }
+ elseif type(filt) == 'table' then
+ -- Array of filters
+ filts = filt
end
if dict.DecodeParms then
if type(decode_params) == 'table' then
if decode_params.Predictor then
- return nil, 'predictor exists'
+ -- We can handle Predictor 1 (No prediction) or maybe others in future
+ local predictor = tonumber(decode_params.Predictor) or 1
+ if predictor > 1 then
+ -- For now, we just log debug and fail, or maybe try to continue if it's simple PNG prediction
+ -- But without implementation, better to return nil to avoid garbage
+ return nil, 'predictor exists: ' .. tostring(predictor)
+ end
end
end
end
- for _, f in ipairs(filt) do
- uncompressed = apply_pdf_filter(uncompressed, f)
+ for _, f in ipairs(filts) do
+ local next_uncompressed = apply_pdf_filter(uncompressed, f)
- if not uncompressed then
- break
+ if next_uncompressed then
+ uncompressed = next_uncompressed
+ else
+ -- If chain fails, we stop
+ return nil, 'filter failed: ' .. tostring(f)
end
end
end
return nil
end
local dict = obj.dict
- if dict.Length and type(obj.stream) == 'table' then
- local len = math.min(obj.stream.len,
- tonumber(maybe_dereference_object(dict.Length, pdf, task)) or 0)
- if len > 0 then
- local real_stream = obj.stream.data:span(1, len)
-
- local uncompressed, filter_err = maybe_apply_filter(dict, real_stream, pdf, task)
-
- if uncompressed then
- obj.uncompressed = uncompressed
- lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
- obj.major, obj.minor, len, uncompressed:len())
- return obj.uncompressed
- else
- lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s: %s',
- obj.major, obj.minor, len, dict.Filter, filter_err)
- end
+
+ -- Determine length: try /Length first, fallback to parsed length
+ local len = obj.stream.len
+ local decl_len = maybe_dereference_object(dict.Length, pdf, task)
+
+ if decl_len then
+ local nlen = tonumber(decl_len)
+ if nlen then
+ len = math.min(len, nlen)
+ end
+ end
+
+ if len > 0 then
+ local real_stream = obj.stream.data:span(1, len)
+
+ local uncompressed, filter_err = maybe_apply_filter(dict, real_stream, pdf, task)
+
+ if uncompressed then
+ obj.uncompressed = uncompressed
+ lua_util.debugm(N, task, 'extracted object %s:%s: (%s -> %s)',
+ obj.major, obj.minor, len, #uncompressed)
+ return obj.uncompressed
else
- lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s',
- obj.major, obj.minor, len)
+ lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s; filter = %s: %s',
+ obj.major, obj.minor, len, dict.Filter, filter_err)
end
+ else
+ lua_util.debugm(N, task, 'cannot extract object %s:%s; len = %s',
+ obj.major, obj.minor, len)
end
end
end
end
+ table.sort(starts)
+ table.sort(ends)
+
offsets_to_blocks(starts, ends, text_blocks)
for _, bl in ipairs(text_blocks) do
if bl.len > 2 then
bl.data)
if ret then
- text[#text + 1] = obj_or_err
+ if #obj_or_err == 0 then
+ lua_util.debugm(N, task, 'empty text match from block: %s', bl.data)
+ end
+ for _, chunk in ipairs(obj_or_err) do
+ text[#text + 1] = chunk
+ end
+ -- Add newline after each block to separate lines
+ text[#text + 1] = '\n'
+ -- text[#text + 1] = obj_or_err
lua_util.debugm(N, task, 'attached %s from content object %s:%s to %s:%s',
obj_or_err, tobj.major, tobj.minor, obj.major, obj.minor)
else
-- Join all text data together
if #text > 0 then
- obj.text = rspamd_text.fromtable(text)
+ for i, chunk in ipairs(text) do
+ if type(chunk) == 'userdata' then
+ text[i] = tostring(chunk)
+ elseif type(chunk) == 'table' then
+ -- Nested table?
+ local function flatten(t)
+ local res = {}
+ for _, v in ipairs(t) do
+ if type(v) == 'userdata' then
+ res[#res + 1] = tostring(v)
+ elseif type(v) == 'table' then
+ res[#res + 1] = flatten(v)
+ else
+ res[#res + 1] = v
+ end
+ end
+ return table.concat(res, '')
+ end
+ text[i] = flatten(chunk)
+ end
+ end
+ local res = table.concat(text, '')
+ obj.text = rspamd_text.fromstring(res)
lua_util.debugm(N, task, 'object %s:%s is parsed to: %s',
obj.major, obj.minor, obj.text)
end
-- Postprocess objects
postprocess_pdf_objects(task, input, pdf_object)
+ pdf_output.objects = pdf_object.objects
if config.text_extraction then
search_text(task, pdf_object, pdf_output)
end