From: Vsevolod Stakhov
Date: Wed, 4 Feb 2026 14:50:35 +0000 (+0000)
Subject: [Fix] lua_magic: avoid misdetecting HTML with embedded SVG as SVG
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=170c4c5d6e4e26c9118f443a8ce4464f8d765805;p=thirdparty%2Frspamd.git

[Fix] lua_magic: avoid misdetecting HTML with embedded SVG as SVG

Add svg_format_heuristic that checks for HTML markers (<!DOCTYPE html>,
<html>, <head>, <body>, <meta>) before the <svg> tag position. If HTML
markers are present, skip SVG detection and let the text heuristic properly
classify the content as HTML.

Add functional test with HTML containing embedded SVG (should detect as
HTML) and standalone SVG (should still detect as SVG).
---

diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua
index 8258ff2493..8b5b9c52da 100644
--- a/lualib/lua_magic/heuristics.lua
+++ b/lualib/lua_magic/heuristics.lua
@@ -614,4 +614,34 @@ exports.pe_part_heuristic = function(input, log_obj, pos, part)
   return 'exe', 30
 end
 
+-- SVG heuristic: check if this is actually HTML with embedded SVG
+exports.svg_format_heuristic = function(input, log_obj, pos, part)
+  if not input then
+    return
+  end
+
+  -- Only check content before the <svg> tag position
+  local check_len = math.min(pos, 4096)
+  if check_len < 5 then
+    -- <svg> is at the very beginning, likely a real SVG
+    return 'svg', 40
+  end
+
+  local head = tostring(input:span(1, check_len)):lower()
+
+  -- Check for HTML markers that would appear before <svg> in an HTML document
+  -- If we find these, it's HTML with embedded SVG, not a standalone SVG
+  if head:find('<!doctype%s+html') or
+      head:find('<html[%s>]') or
+      head:find('<head[%s>]') or
+      head:find('<body[%s>]') or
+      head:find('<meta[%s>]') then
+    lua_util.debugm(N, log_obj, 'svg pattern found at %s but HTML markers present, skipping svg detection',
+        pos)
+    return nil
+  end
+
+  return 'svg', 40
+end
+
 return exports
diff --git a/lualib/lua_magic/patterns.lua b/lualib/lua_magic/patterns.lua
index df524f177c..2bbf4e258d 100644
--- a/lualib/lua_magic/patterns.lua
+++ b/lualib/lua_magic/patterns.lua
@@ -447,12 +447,15 @@ local patterns = {
     matches = {
       {
         -- Case-insensitive in the first chunk
+        -- Use heuristic to avoid misdetecting HTML with embedded SVG
         string = [[(?i)