From: Vsevolod Stakhov Date: Sun, 23 Nov 2025 11:38:18 +0000 (+0000) Subject: [Feature] Add rspamd_util.decode_html_entities and improve obfuscated URL detection X-Git-Tag: 3.14.1~7^2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=refs%2Fpull%2F5761%2Fhead;p=thirdparty%2Frspamd.git [Feature] Add rspamd_util.decode_html_entities and improve obfuscated URL detection - Add Lua binding for HTML entity decoding (rspamd_util.decode_html_entities) wrapping rspamd_html_decode_entitles_inplace C function - Switch obfuscated URL detection from regexp module to rspamd_trie for Hyperscan-accelerated multi-pattern matching - Fix URL flag passing (use url.create with flags table instead of add_flag) - Fix inject_url usage (doesn't return value) - Add functional tests for obfuscated URL detection --- diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c index b6f1e74903..ad5cd1ce99 100644 --- a/src/lua/lua_util.c +++ b/src/lua/lua_util.c @@ -20,6 +20,7 @@ #include "libmime/content_type.h" #include "libmime/mime_headers.h" #include "libutil/hash.h" +#include "libserver/html/html.h" #include "lua_parsers.h" @@ -95,6 +96,14 @@ LUA_FUNCTION_DEF(util, encode_qp); */ LUA_FUNCTION_DEF(util, decode_qp); +/*** + * @function util.decode_html_entities(input) + * Decodes HTML entities in text (numeric &#XX; &#xXX; and named &amp; etc) + * @param {text or string} input input data + * @return {rspamd_text} decoded data chunk + */ +LUA_FUNCTION_DEF(util, decode_html_entities); + /*** * @function util.decode_base64(input) * Decodes data from base64 ignoring whitespace characters @@ -713,6 +722,7 @@ static const struct luaL_reg utillib_f[] = { LUA_INTERFACE_DEF(util, encode_base64), LUA_INTERFACE_DEF(util, encode_qp), LUA_INTERFACE_DEF(util, decode_qp), + LUA_INTERFACE_DEF(util, decode_html_entities), LUA_INTERFACE_DEF(util, decode_base64), LUA_INTERFACE_DEF(util, encode_base32), LUA_INTERFACE_DEF(util, decode_base32), @@ -1197,6 +1207,44 @@ lua_util_decode_qp(lua_State *L) return 1; } +static 
int +lua_util_decode_html_entities(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_lua_text *t, *out; + const char *s = NULL; + gsize inlen = 0; + unsigned int outlen; + + if (lua_type(L, 1) == LUA_TSTRING) { + s = luaL_checklstring(L, 1, &inlen); + } + else if (lua_type(L, 1) == LUA_TUSERDATA) { + t = lua_check_text(L, 1); + + if (t != NULL) { + s = t->start; + inlen = t->len; + } + } + + if (s == NULL || inlen == 0) { + lua_pushnil(L); + } + else { + out = lua_newuserdata(L, sizeof(*out)); + rspamd_lua_setclass(L, rspamd_text_classname, -1); + out->start = g_malloc(inlen + 1); + out->flags = RSPAMD_TEXT_FLAG_OWN; + memcpy((char *) out->start, s, inlen); + ((char *) out->start)[inlen] = '\0'; + outlen = rspamd_html_decode_entitles_inplace((char *) out->start, inlen); + out->len = outlen; + } + + return 1; +} + static int lua_util_decode_base64(lua_State *L) { diff --git a/src/plugins/lua/url_suspect.lua b/src/plugins/lua/url_suspect.lua index b36c93a679..ec65c82425 100644 --- a/src/plugins/lua/url_suspect.lua +++ b/src/plugins/lua/url_suspect.lua @@ -54,7 +54,9 @@ local symbols = { multiple_at = "URL_MULTIPLE_AT_SIGNS", backslash = "URL_BACKSLASH_PATH", excessive_dots = "URL_EXCESSIVE_DOTS", - very_long = "URL_VERY_LONG" + very_long = "URL_VERY_LONG", + -- Obfuscated text symbol + obfuscated_text = "URL_OBFUSCATED_TEXT" } -- Default settings (work without any maps) @@ -171,8 +173,11 @@ local function normalize_obfuscated_text(text, max_len) -- 1. Remove zero-width characters (U+200B, U+200C, U+200D, BOM, soft hyphen) text = text:gsub("[\226\128\139\226\128\140\226\128\141\239\187\191\194\173]", "") - -- 2. HTML entity decode - text = rspamd_util.decode_html(text) + -- 2. HTML entity decode (using C binding for comprehensive entity support) + local decoded = rspamd_util.decode_html_entities(text) + if decoded then + text = tostring(decoded) + end -- 3. 
Normalize spaced protocol: h t t p s : / / -> https:// text = text:gsub("[hH]%s+[tT]%s+[tT]%s+[pP]%s*[sS]?%s*:%s*/%s*/", "https://") @@ -721,212 +726,180 @@ if settings.enabled then flags = 'empty,nice' }) - -- Register all symbol names as virtual - for _, symbol_name in pairs(symbols) do - rspamd_config:register_symbol({ - name = symbol_name, - type = 'virtual', - parent = id, - group = 'url' - }) + -- Register all symbol names as virtual (except obfuscated_text which is handled separately) + for key, symbol_name in pairs(symbols) do + if key ~= 'obfuscated_text' then + rspamd_config:register_symbol({ + name = symbol_name, + type = 'virtual', + parent = id, + group = 'url' + }) + end end end -- Obfuscated URL detection in message text --- Uses Hyperscan for fast pre-filtering, then normalizes and extracts URLs +-- Uses rspamd_trie (Hyperscan when available) for fast multi-pattern matching if settings.enabled and settings.checks.obfuscated_text and settings.checks.obfuscated_text.enabled then local obf_cfg = settings.checks.obfuscated_text - - -- Counters for DoS protection (per task) - local obf_state = {} - - -- Helper: try to extract and inject URL from matched text - local function process_obfuscated_match(task, txt, start_pos, end_pos, obf_type) - -- Get or initialize state for this task - local task_id = tostring(task) - if not obf_state[task_id] then - obf_state[task_id] = { - match_count = 0, - extracted_count = 0 - } - end - local state = obf_state[task_id] - - -- Check limits - state.match_count = state.match_count + 1 - if state.match_count > obf_cfg.max_matches_per_message then - lua_util.debugm(N, task, 'Reached max matches limit (%d), skipping further checks', - obf_cfg.max_matches_per_message) - return false - end - - if state.extracted_count >= obf_cfg.max_extracted_urls then - lua_util.debugm(N, task, 'Reached max extracted URLs limit (%d)', - obf_cfg.max_extracted_urls) - return false - end - - -- Extract context window - local window = 
extract_context_window(txt, start_pos, end_pos, obf_cfg) - if #window < obf_cfg.min_match_length then - return false - end - - lua_util.debugm(N, task, 'Processing %s match at %d-%d, window: %s', - obf_type, start_pos, end_pos, window:sub(1, 100)) - - -- Normalize - local normalized = normalize_obfuscated_text(window, obf_cfg.max_normalize_length) - if not normalized or #normalized < obf_cfg.min_match_length then - lua_util.debugm(N, task, 'Normalized text too short or empty') - return false - end - - lua_util.debugm(N, task, 'Normalized text: %s', normalized:sub(1, 100)) - - -- Extract URL - local extracted_url, url_type = extract_url_from_normalized(normalized) - if not extracted_url then - lua_util.debugm(N, task, 'Could not extract URL from normalized text') - return false - end - - lua_util.debugm(N, task, 'Extracted URL: %s (type: %s)', extracted_url, url_type) - - -- Create URL object - local url_obj = rspamd_url.create(task:get_mempool(), extracted_url) - if not url_obj then - lua_util.debugm(N, task, 'Failed to create URL object for: %s', extracted_url) - return false - end - - -- Set obscured flag - url_obj:add_flag('obscured') - - -- Inject URL into task - local success = task:inject_url(url_obj) - if success then - state.extracted_count = state.extracted_count + 1 - - -- Insert result symbol with details - local original_snippet = window:sub(1, 50):gsub("%s+", " ") - task:insert_result(settings.symbols.obfuscated_text, 1.0, { - string.format("type=%s", obf_type), - string.format("url=%s", extracted_url:sub(1, 50)), - string.format("orig=%s", original_snippet) - }) - - lua_util.debugm(N, task, 'Successfully injected obfuscated URL: %s (obfuscation: %s)', - extracted_url, obf_type) - return true - else - lua_util.debugm(N, task, 'Failed to inject URL: %s', extracted_url) - return false - end - end + local rspamd_trie = require "rspamd_trie" -- Helper: check if pattern is enabled local function is_pattern_enabled(pattern_name) - -- If map is configured, 
check it if maps.obfuscated_patterns then return maps.obfuscated_patterns:get_key(pattern_name) end - -- Otherwise use built-in config return obf_cfg.patterns_enabled[pattern_name] end - -- Build regex patterns - local patterns = {} - local re_conditions = {} + -- Build pattern list with metadata + local pattern_list = {} -- array of pattern strings for trie + local pattern_meta = {} -- metadata for each pattern (indexed by pattern position) if is_pattern_enabled('spaced_protocol') then - -- Match spaced protocol: h t t p s : / / - local spaced_proto_re = [[/[hH]\s+[tT]\s+[tT]\s+[pP]\s*[sS]?\s*[:\/]/L{sa_body}]] - patterns.spaced_proto = spaced_proto_re - re_conditions[spaced_proto_re] = function(task, txt, s, e) - local len = e - s - if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then - return false - end - return process_obfuscated_match(task, txt, s + 1, e, 'spaced_protocol') - end + table.insert(pattern_list, [=[[hH]\s+[tT]\s+[tT]\s+[pP]\s*[sS]?\s*[:\/]]=]) + pattern_meta[#pattern_list] = { name = 'spaced_protocol' } end if is_pattern_enabled('hxxp') then - -- Match hxxp:// or hXXp:// - local hxxp_re = [[/[hH][xX][xX][pP][sS]?:\/\//L{sa_body}]] - patterns.hxxp = hxxp_re - re_conditions[hxxp_re] = function(task, txt, s, e) - local len = e - s - if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then - return false - end - return process_obfuscated_match(task, txt, s + 1, e, 'hxxp') - end + table.insert(pattern_list, [=[[hH][xX][xX][pP][sS]?:\/\/]=]) + pattern_meta[#pattern_list] = { name = 'hxxp' } end if is_pattern_enabled('bracket_dots') then - -- Match dots in brackets: [.] (.) 
{.} - local bracket_dots_re = [[/[\[\(\{]\s*\.\s*[\]\)\}]/L{sa_body}]] - patterns.bracket_dots = bracket_dots_re - re_conditions[bracket_dots_re] = function(task, txt, s, e) - local len = e - s - if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then - return false - end - return process_obfuscated_match(task, txt, s + 1, e, 'bracket_dots') - end + table.insert(pattern_list, [=[[\[\(\{]\s*\.\s*[\]\)\}]]=]) + pattern_meta[#pattern_list] = { name = 'bracket_dots' } end if is_pattern_enabled('word_dots') then - -- Match word "dot" between word characters - local word_dot_re = [[/\w+\s+[dD][oO][tT]\s+\w+/L{sa_body}]] - patterns.word_dot = word_dot_re - re_conditions[word_dot_re] = function(task, txt, s, e) - local len = e - s - if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then - return false - end - return process_obfuscated_match(task, txt, s + 1, e, 'word_dot') - end + table.insert(pattern_list, [=[\w+\s+[dD][oO][tT]\s+\w+]=]) + pattern_meta[#pattern_list] = { name = 'word_dot' } end if is_pattern_enabled('html_entities') then - -- Match HTML entities that might be dots or slashes - local html_entity_re = [[/&#\d{2,3};[^&]{0,20}&#\d{2,3};/L{sa_body}]] - patterns.html_entity = html_entity_re - re_conditions[html_entity_re] = function(task, txt, s, e) - local len = e - s - if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then - return false - end - return process_obfuscated_match(task, txt, s + 1, e, 'html_entity') - end - end - - -- Build combined regex expression - local re_parts = {} - for _, pattern_re in pairs(patterns) do - table.insert(re_parts, string.format("(%s)", pattern_re)) + table.insert(pattern_list, [=[&#\d{2,3};[^&]{0,20}&#\d{2,3};]=]) + pattern_meta[#pattern_list] = { name = 'html_entity' } end - if #re_parts == 0 then + if #pattern_list == 0 then rspamd_logger.infox(rspamd_config, 'No obfuscated text patterns enabled, skipping registration') else - local combined_re = table.concat(re_parts, 
" + ") - - -- Register using config.regexp (like bitcoin.lua) - config.regexp[settings.symbols.obfuscated_text] = { - description = 'Obfuscated URL found in message text', - re = string.format('%s > 0', combined_re), - expression_flags = { 'noopt' }, - re_conditions = re_conditions, - score = 5.0, - one_shot = true, - group = 'url' - } + -- Create trie with regex support + -- flags: re (regex mode) + icase (case insensitive) + local trie_flags = rspamd_trie.flags.re + rspamd_trie.flags.icase + local obf_trie = rspamd_trie.create(pattern_list, trie_flags) + + if not obf_trie then + rspamd_logger.errx(rspamd_config, 'Failed to create obfuscated URL trie') + else + local has_hs = rspamd_trie.has_hyperscan() + rspamd_logger.infox(rspamd_config, 'Created obfuscated URL trie with %d patterns (hyperscan: %s)', + #pattern_list, has_hs) + + -- Prefilter callback for obfuscated URL detection + local function obfuscated_text_prefilter(task) + local text_parts = task:get_text_parts() + if not text_parts or #text_parts == 0 then + return false + end + + -- DoS protection counters + local match_count = 0 + local extracted_count = 0 + + -- Process a match + local function process_match(txt, start_pos, end_pos, pattern_idx) + match_count = match_count + 1 + if match_count > obf_cfg.max_matches_per_message then + return 1 -- stop matching + end + + if extracted_count >= obf_cfg.max_extracted_urls then + return 1 -- stop matching + end + + local meta = pattern_meta[pattern_idx] + local obf_type = meta and meta.name or 'unknown' + + -- Extract context window + local window = extract_context_window(txt, start_pos, end_pos, obf_cfg) + if #window < obf_cfg.min_match_length then + return 0 -- continue matching + end + + lua_util.debugm(N, task, 'Processing %s match at %d-%d', obf_type, start_pos, end_pos) + + -- Normalize and extract URL + local normalized = normalize_obfuscated_text(window, obf_cfg.max_normalize_length) + if not normalized or #normalized < obf_cfg.min_match_length then 
+ return 0 + end + + local extracted_url = extract_url_from_normalized(normalized) + if not extracted_url then + return 0 + end + + lua_util.debugm(N, task, 'Extracted URL: %s (type: %s)', extracted_url, obf_type) + + -- Create and inject URL with obscured flag + local url_obj = rspamd_url.create(task:get_mempool(), extracted_url, {'obscured'}) + if not url_obj then + return 0 + end + + task:inject_url(url_obj) + extracted_count = extracted_count + 1 + + local snippet = window:sub(1, 50):gsub("%s+", " ") + task:insert_result(symbols.obfuscated_text, 1.0, { + string.format("type=%s", obf_type), + string.format("url=%s", extracted_url:sub(1, 50)), + string.format("orig=%s", snippet) + }) + lua_util.debugm(N, task, 'Injected obfuscated URL: %s', extracted_url) + + return 0 -- continue matching + end + + -- Search each text part using trie + for _, part in ipairs(text_parts) do + local content = part:get_content() + if content and #content > 0 then + local txt = tostring(content) + + -- Use trie:match with callback and report_start=true for positions + obf_trie:match(txt, function(pattern_idx, match_pos) + local start_pos, end_pos + if type(match_pos) == 'table' then + start_pos, end_pos = match_pos[1], match_pos[2] + else + -- Only end position provided + end_pos = match_pos + start_pos = math.max(1, end_pos - obf_cfg.max_match_length) + end + + return process_match(txt, start_pos, end_pos, pattern_idx) + end, true) -- report_start = true + end + end - rspamd_logger.infox(rspamd_config, 'Registered obfuscated URL detection with %d patterns', - #re_parts) + return false + end + + -- Register as prefilter for early URL injection + local prefilter_id = rspamd_config:register_symbol({ + name = symbols.obfuscated_text, + type = 'prefilter', + callback = obfuscated_text_prefilter, + group = 'url', + score = 5.0, + description = 'Obfuscated URL found in message text' + }) + + rspamd_logger.infox(rspamd_config, 'Registered obfuscated URL prefilter (id=%s, hyperscan=%s)', + 
prefilter_id, has_hs) + end end end diff --git a/test/functional/cases/001_merged/400_url_suspect.robot b/test/functional/cases/001_merged/400_url_suspect.robot index b4a06ea92a..d1d8399532 100644 --- a/test/functional/cases/001_merged/400_url_suspect.robot +++ b/test/functional/cases/001_merged/400_url_suspect.robot @@ -50,3 +50,27 @@ URL Suspect - Normal URL Do Not Expect Symbol URL_USER_PASSWORD Do Not Expect Symbol URL_NUMERIC_IP Do Not Expect Symbol URL_SUSPICIOUS_TLD + +URL Suspect - Obfuscated hxxp + # Test hxxp:// obfuscation detection + Scan File ${RSPAMD_TESTDIR}/messages/url_obfuscated_hxxp.eml + ... Settings={symbols_enabled = [URL_OBFUSCATED_TEXT]} + Expect Symbol URL_OBFUSCATED_TEXT + +URL Suspect - Obfuscated Bracket Dots + # Test bracket dots obfuscation detection: example[.]com + Scan File ${RSPAMD_TESTDIR}/messages/url_obfuscated_bracket_dots.eml + ... Settings={symbols_enabled = [URL_OBFUSCATED_TEXT]} + Expect Symbol URL_OBFUSCATED_TEXT + +URL Suspect - Obfuscated Word Dot + # Test word dot obfuscation detection: example dot com + Scan File ${RSPAMD_TESTDIR}/messages/url_obfuscated_word_dot.eml + ... Settings={symbols_enabled = [URL_OBFUSCATED_TEXT]} + Expect Symbol URL_OBFUSCATED_TEXT + +URL Suspect - Obfuscated Spaced Protocol + # Test spaced protocol obfuscation: h t t p s : / / + Scan File ${RSPAMD_TESTDIR}/messages/url_obfuscated_spaced.eml + ... 
Settings={symbols_enabled = [URL_OBFUSCATED_TEXT]} + Expect Symbol URL_OBFUSCATED_TEXT diff --git a/test/functional/messages/url_obfuscated_bracket_dots.eml b/test/functional/messages/url_obfuscated_bracket_dots.eml new file mode 100644 index 0000000000..d4388e55a6 --- /dev/null +++ b/test/functional/messages/url_obfuscated_bracket_dots.eml @@ -0,0 +1,6 @@ +From: sender@example.com +To: victim@example.com +Subject: Test bracket dots obfuscation +Content-Type: text/plain; charset=utf-8 + +Visit our site at example[.]com/login diff --git a/test/functional/messages/url_obfuscated_hxxp.eml b/test/functional/messages/url_obfuscated_hxxp.eml new file mode 100644 index 0000000000..3f1b029c1a --- /dev/null +++ b/test/functional/messages/url_obfuscated_hxxp.eml @@ -0,0 +1,6 @@ +From: sender@example.com +To: victim@example.com +Subject: Test hxxp obfuscation +Content-Type: text/plain; charset=utf-8 + +Check this link: hxxp://malicious-site.com/phish diff --git a/test/functional/messages/url_obfuscated_spaced.eml b/test/functional/messages/url_obfuscated_spaced.eml new file mode 100644 index 0000000000..dd7365f6dc --- /dev/null +++ b/test/functional/messages/url_obfuscated_spaced.eml @@ -0,0 +1,6 @@ +From: sender@example.com +To: victim@example.com +Subject: Test spaced protocol obfuscation +Content-Type: text/plain; charset=utf-8 + +Visit h t t p s : / / evil-site.com/page for details diff --git a/test/functional/messages/url_obfuscated_word_dot.eml b/test/functional/messages/url_obfuscated_word_dot.eml new file mode 100644 index 0000000000..dc0e7dcba4 --- /dev/null +++ b/test/functional/messages/url_obfuscated_word_dot.eml @@ -0,0 +1,6 @@ +From: sender@example.com +To: victim@example.com +Subject: Test word dot obfuscation +Content-Type: text/plain; charset=utf-8 + +Contact us at secure-login dot net for support