From: Vsevolod Stakhov Date: Fri, 6 Feb 2026 17:42:55 +0000 (+0000) Subject: [Fix] lua_content: Move PDF ligature substitutions from string unescape to text handler X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=eb1acde80cbbe1750835a3d07b133435f9bf30ed;p=thirdparty%2Frspamd.git [Fix] lua_content: Move PDF ligature substitutions from string unescape to text handler StandardEncoding/MacRomanEncoding ligature substitutions (e.g. byte 0xAD -> 'ffl') were applied to all PDF strings including /URI annotation values. This corrupted soft hyphens (U+00AD) in URLs, preventing the URL parser from detecting zero-width space obfuscation and setting the ZW_SPACES flag. Move ligature substitutions to text_op_handler where they belong, so they only apply to rendered text content (Tj/TJ operators), not to dictionary string values. --- diff --git a/lualib/lua_content/pdf.lua b/lualib/lua_content/pdf.lua index 4c37b06385..def4889207 100644 --- a/lualib/lua_content/pdf.lua +++ b/lualib/lua_content/pdf.lua @@ -160,6 +160,21 @@ local function compile_tries() end end +-- StandardEncoding/MacRomanEncoding ligature substitutions +-- Applied only to rendered text, NOT to dictionary strings (URI values etc.) +-- to avoid corrupting soft hyphens (U+00AD = byte 0xAD = \173) in URLs +local function apply_ligature_substitutions(s) + if not s then return s end + s = s:gsub('\171', 'ff') + s = s:gsub('\172', 'ffi') + s = s:gsub('\173', 'ffl') + s = s:gsub('\174', 'fi') + s = s:gsub('\175', 'fl') + s = s:gsub('\222', 'fi') + s = s:gsub('\223', 'fl') + return s +end + -- Returns a table with generic grammar elements for PDF local function generic_grammar_elts() local P = lpeg.P @@ -183,17 +198,6 @@ local function generic_grammar_elts() res = lua_util.unhex(s:sub(1, #s - 1)) .. lua_util.unhex((s:sub(#s) .. '0')) end - if res then - -- StandardEncoding/MacRomanEncoding ligature substitutions - res = res:gsub('\171', 'ff') - res = res:gsub('\172', 'ffi') - res = res:gsub('\173', 'ffl') - res = res:gsub('\174', 'fi') - res = res:gsub('\175', 'fl') - res = res:gsub('\222', 'fi') - res = res:gsub('\223', 'fl') - end - return res end @@ -216,15 +220,6 @@ local function generic_grammar_elts() end s = s:gsub('\\%d%d?%d?', ue_octal) - -- StandardEncoding/MacRomanEncoding ligature substitutions - s = s:gsub('\171', 'ff') - s = s:gsub('\172', 'ffi') - s = s:gsub('\173', 'ffl') - s = s:gsub('\174', 'fi') - s = s:gsub('\175', 'fl') - s = s:gsub('\222', 'fi') - s = s:gsub('\223', 'fl') - return s end @@ -491,6 +486,11 @@ local function gen_text_grammar() res = table.concat(tres) end + -- Apply ligature substitutions for rendered text only (not dictionary strings like /URI) + if type(res) == 'string' then + res = apply_ligature_substitutions(res) + end + res = sanitize_pdf_text(res) -- Apply text quality filtering to reject garbage chunks