[Feature] Add rspamd_util.decode_html_entities and improve obfuscated URL detection

author Vsevolod Stakhov <vsevolod@rspamd.com>

Sun, 23 Nov 2025 11:38:18 +0000 (11:38 +0000)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Sun, 23 Nov 2025 11:38:18 +0000 (11:38 +0000)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 23 Nov 2025 11:38:18 +0000 (11:38 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 23 Nov 2025 11:38:18 +0000 (11:38 +0000)
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c

index b6f1e7490351d882ee2e403ce527bdccbb60a392..ad5cd1ce99dd0d0798dc7544d4e54d055c4b4ae1 100644 (file)
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -20,6 +20,7 @@
  #include "libmime/content_type.h"
  #include "libmime/mime_headers.h"
  #include "libutil/hash.h"
+#include "libserver/html/html.h"
  
  #include "lua_parsers.h"
  
@@ -95,6 +96,14 @@ LUA_FUNCTION_DEF(util, encode_qp);
   */
  LUA_FUNCTION_DEF(util, decode_qp);
  
+/***
+ * @function util.decode_html_entities(input)
+ * Decodes HTML entities in text (numeric &#XX; and &#xXX; forms, and named ones such as &amp;)
+ * @param {text or string} input input data
+ * @return {rspamd_text} decoded data chunk, or nil if the input is empty or is neither a string nor a text
+ */
+LUA_FUNCTION_DEF(util, decode_html_entities);
+
  /***
   * @function util.decode_base64(input)
   * Decodes data from base64 ignoring whitespace characters
@@ -713,6 +722,7 @@ static const struct luaL_reg utillib_f[] = {
         LUA_INTERFACE_DEF(util, encode_base64),
         LUA_INTERFACE_DEF(util, encode_qp),
         LUA_INTERFACE_DEF(util, decode_qp),
+       LUA_INTERFACE_DEF(util, decode_html_entities),
         LUA_INTERFACE_DEF(util, decode_base64),
         LUA_INTERFACE_DEF(util, encode_base32),
         LUA_INTERFACE_DEF(util, decode_base32),
@@ -1197,6 +1207,44 @@ lua_util_decode_qp(lua_State *L)
         return 1;
  }
  
+static int
+lua_util_decode_html_entities(lua_State *L) /* Lua binding behind util.decode_html_entities(input) */
+{
+       LUA_TRACE_POINT;
+       struct rspamd_lua_text *t, *out;
+       const char *s = NULL;
+       gsize inlen = 0;
+       unsigned int outlen;
+
+       if (lua_type(L, 1) == LUA_TSTRING) { /* argument 1 is a Lua string */
+               s = luaL_checklstring(L, 1, &inlen);
+       }
+       else if (lua_type(L, 1) == LUA_TUSERDATA) { /* otherwise try to read it as a text userdata */
+               t = lua_check_text(L, 1);
+
+               if (t != NULL) {
+                       s = t->start;
+                       inlen = t->len;
+               }
+       }
+
+       if (s == NULL || inlen == 0) { /* unsupported argument type or empty input: the result is nil */
+               lua_pushnil(L);
+       }
+       else {
+               out = lua_newuserdata(L, sizeof(*out));
+               rspamd_lua_setclass(L, rspamd_text_classname, -1);
+               out->start = g_malloc(inlen + 1); /* private copy of the input, one extra byte for the terminator */
+               out->flags = RSPAMD_TEXT_FLAG_OWN; /* NOTE(review): presumably makes the text object responsible for freeing the buffer - confirm */
+               memcpy((char *) out->start, s, inlen);
+               ((char *) out->start)[inlen] = '\0';
+               outlen = rspamd_html_decode_entitles_inplace((char *) out->start, inlen); /* decoding is done inside the copy; the input is not modified */
+               out->len = outlen; /* the returned value becomes the length of the resulting text */
+       }
+
+       return 1; /* exactly one value (text or nil) is returned to Lua */
+}
+
  static int
  lua_util_decode_base64(lua_State *L)
  {
diff --git a/src/plugins/lua/url_suspect.lua b/src/plugins/lua/url_suspect.lua

index b36c93a679f3ae9e0beb326cc085d90fe6a75689..ec65c824250af5a01292dab74d0e43a49036f4c7 100644 (file)
--- a/src/plugins/lua/url_suspect.lua
+++ b/src/plugins/lua/url_suspect.lua
@@ -54,7 +54,9 @@ local symbols = {
    multiple_at = "URL_MULTIPLE_AT_SIGNS",
    backslash = "URL_BACKSLASH_PATH",
    excessive_dots = "URL_EXCESSIVE_DOTS",
-  very_long = "URL_VERY_LONG"
+  very_long = "URL_VERY_LONG",
+  -- Obfuscated text symbol
+  obfuscated_text = "URL_OBFUSCATED_TEXT"
  }
  
  -- Default settings (work without any maps)
@@ -171,8 +173,11 @@ local function normalize_obfuscated_text(text, max_len)
    -- 1. Remove zero-width characters (U+200B, U+200C, U+200D, BOM, soft hyphen)
    text = text:gsub("[\226\128\139\226\128\140\226\128\141\239\187\191\194\173]", "")
  
-  -- 2. HTML entity decode
-  text = rspamd_util.decode_html(text)
+  -- 2. HTML entity decode (using C binding for comprehensive entity support)
+  local decoded = rspamd_util.decode_html_entities(text)
+  if decoded then
+    text = tostring(decoded)
+  end
  
    -- 3. Normalize spaced protocol: h t t p s : / / -> https://
    text = text:gsub("[hH]%s+[tT]%s+[tT]%s+[pP]%s*[sS]?%s*:%s*/%s*/", "https://")
@@ -721,212 +726,180 @@ if settings.enabled then
      flags = 'empty,nice'
    })
  
-  -- Register all symbol names as virtual
-  for _, symbol_name in pairs(symbols) do
-    rspamd_config:register_symbol({
-      name = symbol_name,
-      type = 'virtual',
-      parent = id,
-      group = 'url'
-    })
+  -- Register all symbol names as virtual (except obfuscated_text which is handled separately)
+  for key, symbol_name in pairs(symbols) do
+    if key ~= 'obfuscated_text' then -- that symbol is registered on its own as a prefilter when the obfuscated text check is enabled
+      rspamd_config:register_symbol({
+        name = symbol_name,
+        type = 'virtual',
+        parent = id,
+        group = 'url'
+      })
+    end
    end
  end
  
  -- Obfuscated URL detection in message text
--- Uses Hyperscan for fast pre-filtering, then normalizes and extracts URLs
+-- Uses rspamd_trie (Hyperscan when available) for fast multi-pattern matching
  if settings.enabled and settings.checks.obfuscated_text and settings.checks.obfuscated_text.enabled then
    local obf_cfg = settings.checks.obfuscated_text
-
-  -- Counters for DoS protection (per task)
-  local obf_state = {}
-
-  -- Helper: try to extract and inject URL from matched text
-  local function process_obfuscated_match(task, txt, start_pos, end_pos, obf_type)
-    -- Get or initialize state for this task
-    local task_id = tostring(task)
-    if not obf_state[task_id] then
-      obf_state[task_id] = {
-        match_count = 0,
-        extracted_count = 0
-      }
-    end
-    local state = obf_state[task_id]
-
-    -- Check limits
-    state.match_count = state.match_count + 1
-    if state.match_count > obf_cfg.max_matches_per_message then
-      lua_util.debugm(N, task, 'Reached max matches limit (%d), skipping further checks',
-          obf_cfg.max_matches_per_message)
-      return false
-    end
-
-    if state.extracted_count >= obf_cfg.max_extracted_urls then
-      lua_util.debugm(N, task, 'Reached max extracted URLs limit (%d)',
-          obf_cfg.max_extracted_urls)
-      return false
-    end
-
-    -- Extract context window
-    local window = extract_context_window(txt, start_pos, end_pos, obf_cfg)
-    if #window < obf_cfg.min_match_length then
-      return false
-    end
-
-    lua_util.debugm(N, task, 'Processing %s match at %d-%d, window: %s',
-        obf_type, start_pos, end_pos, window:sub(1, 100))
-
-    -- Normalize
-    local normalized = normalize_obfuscated_text(window, obf_cfg.max_normalize_length)
-    if not normalized or #normalized < obf_cfg.min_match_length then
-      lua_util.debugm(N, task, 'Normalized text too short or empty')
-      return false
-    end
-
-    lua_util.debugm(N, task, 'Normalized text: %s', normalized:sub(1, 100))
-
-    -- Extract URL
-    local extracted_url, url_type = extract_url_from_normalized(normalized)
-    if not extracted_url then
-      lua_util.debugm(N, task, 'Could not extract URL from normalized text')
-      return false
-    end
-
-    lua_util.debugm(N, task, 'Extracted URL: %s (type: %s)', extracted_url, url_type)
-
-    -- Create URL object
-    local url_obj = rspamd_url.create(task:get_mempool(), extracted_url)
-    if not url_obj then
-      lua_util.debugm(N, task, 'Failed to create URL object for: %s', extracted_url)
-      return false
-    end
-
-    -- Set obscured flag
-    url_obj:add_flag('obscured')
-
-    -- Inject URL into task
-    local success = task:inject_url(url_obj)
-    if success then
-      state.extracted_count = state.extracted_count + 1
-
-      -- Insert result symbol with details
-      local original_snippet = window:sub(1, 50):gsub("%s+", " ")
-      task:insert_result(settings.symbols.obfuscated_text, 1.0, {
-        string.format("type=%s", obf_type),
-        string.format("url=%s", extracted_url:sub(1, 50)),
-        string.format("orig=%s", original_snippet)
-      })
-
-      lua_util.debugm(N, task, 'Successfully injected obfuscated URL: %s (obfuscation: %s)',
-          extracted_url, obf_type)
-      return true
-    else
-      lua_util.debugm(N, task, 'Failed to inject URL: %s', extracted_url)
-      return false
-    end
-  end
+  local rspamd_trie = require "rspamd_trie"
  
    -- Helper: check if pattern is enabled
    local function is_pattern_enabled(pattern_name)
-    -- If map is configured, check it
      if maps.obfuscated_patterns then
        return maps.obfuscated_patterns:get_key(pattern_name)
      end
-    -- Otherwise use built-in config
      return obf_cfg.patterns_enabled[pattern_name]
    end
  
-  -- Build regex patterns
-  local patterns = {}
-  local re_conditions = {}
+  -- Build pattern list with metadata
+  local pattern_list = {}   -- array of pattern strings for trie
+  local pattern_meta = {}   -- metadata for each pattern (indexed by pattern position)
  
    if is_pattern_enabled('spaced_protocol') then
-    -- Match spaced protocol: h t t p s : / /
-    local spaced_proto_re = [[/[hH]\s+[tT]\s+[tT]\s+[pP]\s*[sS]?\s*[:\/]/L{sa_body}]]
-    patterns.spaced_proto = spaced_proto_re
-    re_conditions[spaced_proto_re] = function(task, txt, s, e)
-      local len = e - s
-      if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then
-        return false
-      end
-      return process_obfuscated_match(task, txt, s + 1, e, 'spaced_protocol')
-    end
+    table.insert(pattern_list, [=[[hH]\s+[tT]\s+[tT]\s+[pP]\s*[sS]?\s*[:\/]]=])
+    pattern_meta[#pattern_list] = { name = 'spaced_protocol' }
    end
  
    if is_pattern_enabled('hxxp') then
-    -- Match hxxp:// or hXXp://
-    local hxxp_re = [[/[hH][xX][xX][pP][sS]?:\/\//L{sa_body}]]
-    patterns.hxxp = hxxp_re
-    re_conditions[hxxp_re] = function(task, txt, s, e)
-      local len = e - s
-      if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then
-        return false
-      end
-      return process_obfuscated_match(task, txt, s + 1, e, 'hxxp')
-    end
+    table.insert(pattern_list, [=[[hH][xX][xX][pP][sS]?:\/\/]=])
+    pattern_meta[#pattern_list] = { name = 'hxxp' }
    end
  
    if is_pattern_enabled('bracket_dots') then
-    -- Match dots in brackets: [.] (.) {.}
-    local bracket_dots_re = [[/[\[\(\{]\s*\.\s*[\]\)\}]/L{sa_body}]]
-    patterns.bracket_dots = bracket_dots_re
-    re_conditions[bracket_dots_re] = function(task, txt, s, e)
-      local len = e - s
-      if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then
-        return false
-      end
-      return process_obfuscated_match(task, txt, s + 1, e, 'bracket_dots')
-    end
+    table.insert(pattern_list, [=[[\[\(\{]\s*\.\s*[\]\)\}]]=])
+    pattern_meta[#pattern_list] = { name = 'bracket_dots' }
    end
  
    if is_pattern_enabled('word_dots') then
-    -- Match word "dot" between word characters
-    local word_dot_re = [[/\w+\s+[dD][oO][tT]\s+\w+/L{sa_body}]]
-    patterns.word_dot = word_dot_re
-    re_conditions[word_dot_re] = function(task, txt, s, e)
-      local len = e - s
-      if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then
-        return false
-      end
-      return process_obfuscated_match(task, txt, s + 1, e, 'word_dot')
-    end
+    table.insert(pattern_list, [=[\w+\s+[dD][oO][tT]\s+\w+]=])
+    pattern_meta[#pattern_list] = { name = 'word_dot' }
    end
  
    if is_pattern_enabled('html_entities') then
-    -- Match HTML entities that might be dots or slashes
-    local html_entity_re = [[/&#\d{2,3};[^&]{0,20}&#\d{2,3};/L{sa_body}]]
-    patterns.html_entity = html_entity_re
-    re_conditions[html_entity_re] = function(task, txt, s, e)
-      local len = e - s
-      if len < obf_cfg.min_match_length or len > obf_cfg.max_match_length then
-        return false
-      end
-      return process_obfuscated_match(task, txt, s + 1, e, 'html_entity')
-    end
-  end
-
-  -- Build combined regex expression
-  local re_parts = {}
-  for _, pattern_re in pairs(patterns) do
-    table.insert(re_parts, string.format("(%s)", pattern_re))
+    table.insert(pattern_list, [=[&#\d{2,3};[^&]{0,20}&#\d{2,3};]=])
+    pattern_meta[#pattern_list] = { name = 'html_entity' }
    end
  
-  if #re_parts == 0 then
+  if #pattern_list == 0 then
      rspamd_logger.infox(rspamd_config, 'No obfuscated text patterns enabled, skipping registration')
    else
-    local combined_re = table.concat(re_parts, " + ")
-
-    -- Register using config.regexp (like bitcoin.lua)
-    config.regexp[settings.symbols.obfuscated_text] = {
-      description = 'Obfuscated URL found in message text',
-      re = string.format('%s > 0', combined_re),
-      expression_flags = { 'noopt' },
-      re_conditions = re_conditions,
-      score = 5.0,
-      one_shot = true,
-      group = 'url'
-    }
+    -- Create trie with regex support
+    -- flags: re (regex mode) + icase (case insensitive)
+    local trie_flags = rspamd_trie.flags.re + rspamd_trie.flags.icase
+    local obf_trie = rspamd_trie.create(pattern_list, trie_flags)
+
+    if not obf_trie then
+      rspamd_logger.errx(rspamd_config, 'Failed to create obfuscated URL trie')
+    else
+      local has_hs = rspamd_trie.has_hyperscan()
+      rspamd_logger.infox(rspamd_config, 'Created obfuscated URL trie with %d patterns (hyperscan: %s)',
+          #pattern_list, has_hs)
+
+      -- Prefilter callback for obfuscated URL detection
+      local function obfuscated_text_prefilter(task)
+        local text_parts = task:get_text_parts()
+        if not text_parts or #text_parts == 0 then
+          return false
+        end
+
+        -- DoS protection counters
+        local match_count = 0
+        local extracted_count = 0
+
+        -- Process a match
+        local function process_match(txt, start_pos, end_pos, pattern_idx) -- handles one trie match; returns 1 to stop scanning, 0 to go on
+          match_count = match_count + 1 -- every match is counted, including those that yield no URL
+          if match_count > obf_cfg.max_matches_per_message then
+            return 1  -- stop matching
+          end
+
+          if extracted_count >= obf_cfg.max_extracted_urls then
+            return 1  -- stop matching
+          end
+
+          local meta = pattern_meta[pattern_idx]
+          local obf_type = meta and meta.name or 'unknown' -- pattern name is used only for logging and symbol options
+
+          -- Extract context window
+          local window = extract_context_window(txt, start_pos, end_pos, obf_cfg)
+          if #window < obf_cfg.min_match_length then
+            return 0  -- continue matching
+          end
+
+          lua_util.debugm(N, task, 'Processing %s match at %d-%d', obf_type, start_pos, end_pos)
+
+          -- Normalize and extract URL
+          local normalized = normalize_obfuscated_text(window, obf_cfg.max_normalize_length)
+          if not normalized or #normalized < obf_cfg.min_match_length then
+            return 0
+          end
+
+          local extracted_url = extract_url_from_normalized(normalized) -- only the first returned value is used here
+          if not extracted_url then
+            return 0
+          end
+
+          lua_util.debugm(N, task, 'Extracted URL: %s (type: %s)', extracted_url, obf_type)
+
+          -- Create and inject URL with obscured flag
+          local url_obj = rspamd_url.create(task:get_mempool(), extracted_url, {'obscured'})
+          if not url_obj then
+            return 0
+          end
+
+          task:inject_url(url_obj) -- NOTE(review): the result of inject_url is not checked - confirm it cannot fail
+          extracted_count = extracted_count + 1 -- incremented regardless of what inject_url returned
+
+          local snippet = window:sub(1, 50):gsub("%s+", " ") -- gsub also returns a count; only the string is kept
+          task:insert_result(symbols.obfuscated_text, 1.0, { -- options: pattern type, URL cut to 50 bytes, whitespace-collapsed start of the window
+            string.format("type=%s", obf_type),
+            string.format("url=%s", extracted_url:sub(1, 50)),
+            string.format("orig=%s", snippet)
+          })
+          lua_util.debugm(N, task, 'Injected obfuscated URL: %s', extracted_url)
+
+          return 0  -- continue matching
+        end
+
+        -- Search each text part using trie
+        for _, part in ipairs(text_parts) do
+          local content = part:get_content()
+          if content and #content > 0 then -- parts without content are skipped
+            local txt = tostring(content) -- matching and window extraction work on a Lua string
+
+            -- Use trie:match with callback and report_start=true for positions
+            obf_trie:match(txt, function(pattern_idx, match_pos)
+              local start_pos, end_pos
+              if type(match_pos) == 'table' then -- table form is read as {start, end}
+                start_pos, end_pos = match_pos[1], match_pos[2]
+              else
+                -- Only end position provided
+                end_pos = match_pos
+                start_pos = math.max(1, end_pos - obf_cfg.max_match_length) -- estimated start: at most max_match_length before the end, never below 1
+              end
+
+              return process_match(txt, start_pos, end_pos, pattern_idx) -- 0 continues, 1 stops; NOTE(review): a stop ends only this part's scan, the outer loop still visits later parts
+            end, true)  -- report_start = true
+          end
+        end
  
-    rspamd_logger.infox(rspamd_config, 'Registered obfuscated URL detection with %d patterns',
-        #re_parts)
+        return false
+      end
+
+      -- Register as prefilter for early URL injection
+      local prefilter_id = rspamd_config:register_symbol({
+        name = symbols.obfuscated_text,
+        type = 'prefilter',
+        callback = obfuscated_text_prefilter,
+        group = 'url',
+        score = 5.0, -- NOTE(review): the callback inserts results with weight 1.0; presumably combined with this score - confirm
+        description = 'Obfuscated URL found in message text'
+      })
+
+      rspamd_logger.infox(rspamd_config, 'Registered obfuscated URL prefilter (id=%s, hyperscan=%s)',
+          prefilter_id, has_hs) -- has_hs is the value of rspamd_trie.has_hyperscan() taken when the trie was created
+    end
    end
  end
diff --git a/test/functional/cases/001_merged/400_url_suspect.robot b/test/functional/cases/001_merged/400_url_suspect.robot

index b4a06ea92ac49b426964ba83f2cba78495556850..d1d8399532deab35820ca47112c440e36a28c589 100644 (file)
--- a/test/functional/cases/001_merged/400_url_suspect.robot
+++ b/test/functional/cases/001_merged/400_url_suspect.robot
@@ -50,3 +50,27 @@ URL Suspect - Normal URL
    Do Not Expect Symbol  URL_USER_PASSWORD
    Do Not Expect Symbol  URL_NUMERIC_IP
    Do Not Expect Symbol  URL_SUSPICIOUS_TLD
+
+URL Suspect - Obfuscated hxxp
+  # Test hxxp:// obfuscation detection
+  Scan File  ${RSPAMD_TESTDIR}/messages/url_obfuscated_hxxp.eml
+  ...  Settings={symbols_enabled = [URL_OBFUSCATED_TEXT]}
+  Expect Symbol  URL_OBFUSCATED_TEXT
+
+URL Suspect - Obfuscated Bracket Dots
+  # Test bracket dots obfuscation detection: example[.]com
+  Scan File  ${RSPAMD_TESTDIR}/messages/url_obfuscated_bracket_dots.eml
+  ...  Settings={symbols_enabled = [URL_OBFUSCATED_TEXT]}
+  Expect Symbol  URL_OBFUSCATED_TEXT
+
+URL Suspect - Obfuscated Word Dot
+  # Test word dot obfuscation detection: example dot com
+  Scan File  ${RSPAMD_TESTDIR}/messages/url_obfuscated_word_dot.eml
+  ...  Settings={symbols_enabled = [URL_OBFUSCATED_TEXT]}
+  Expect Symbol  URL_OBFUSCATED_TEXT
+
+URL Suspect - Obfuscated Spaced Protocol
+  # Test spaced protocol obfuscation: h t t p s : / /
+  Scan File  ${RSPAMD_TESTDIR}/messages/url_obfuscated_spaced.eml
+  ...  Settings={symbols_enabled = [URL_OBFUSCATED_TEXT]}
+  Expect Symbol  URL_OBFUSCATED_TEXT
diff --git a/test/functional/messages/url_obfuscated_bracket_dots.eml b/test/functional/messages/url_obfuscated_bracket_dots.eml

new file mode 100644 (file)

index 0000000..d4388e5
--- /dev/null
+++ b/test/functional/messages/url_obfuscated_bracket_dots.eml
@@ -0,0 +1,6 @@
+From: sender@example.com
+To: victim@example.com
+Subject: Test bracket dots obfuscation
+Content-Type: text/plain; charset=utf-8
+
+Visit our site at example[.]com/login
diff --git a/test/functional/messages/url_obfuscated_hxxp.eml b/test/functional/messages/url_obfuscated_hxxp.eml

new file mode 100644 (file)

index 0000000..3f1b029
--- /dev/null
+++ b/test/functional/messages/url_obfuscated_hxxp.eml
@@ -0,0 +1,6 @@
+From: sender@example.com
+To: victim@example.com
+Subject: Test hxxp obfuscation
+Content-Type: text/plain; charset=utf-8
+
+Check this link: hxxp://malicious-site.com/phish
diff --git a/test/functional/messages/url_obfuscated_spaced.eml b/test/functional/messages/url_obfuscated_spaced.eml

new file mode 100644 (file)

index 0000000..dd7365f
--- /dev/null
+++ b/test/functional/messages/url_obfuscated_spaced.eml
@@ -0,0 +1,6 @@
+From: sender@example.com
+To: victim@example.com
+Subject: Test spaced protocol obfuscation
+Content-Type: text/plain; charset=utf-8
+
+Visit h t t p s : / / evil-site.com/page for details
diff --git a/test/functional/messages/url_obfuscated_word_dot.eml b/test/functional/messages/url_obfuscated_word_dot.eml

new file mode 100644 (file)

index 0000000..dc0e7dc
--- /dev/null
+++ b/test/functional/messages/url_obfuscated_word_dot.eml
@@ -0,0 +1,6 @@
+From: sender@example.com
+To: victim@example.com
+Subject: Test word dot obfuscation
+Content-Type: text/plain; charset=utf-8
+
+Contact us at secure-login dot net for support
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Sun, 23 Nov 2025 11:38:18 +0000 (11:38 +0000)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Sun, 23 Nov 2025 11:38:18 +0000 (11:38 +0000)
src/lua/lua_util.c		patch \| blob \| blame \| history
src/plugins/lua/url_suspect.lua		patch \| blob \| blame \| history
test/functional/cases/001_merged/400_url_suspect.robot		patch \| blob \| blame \| history
test/functional/messages/url_obfuscated_bracket_dots.eml	[new file with mode: 0644]	patch \| blob
test/functional/messages/url_obfuscated_hxxp.eml	[new file with mode: 0644]	patch \| blob
test/functional/messages/url_obfuscated_spaced.eml	[new file with mode: 0644]	patch \| blob
test/functional/messages/url_obfuscated_word_dot.eml	[new file with mode: 0644]	patch \| blob