]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add URL deep processing architecture
authorVsevolod Stakhov <vsevolod@rspamd.com>
Fri, 14 Nov 2025 12:27:21 +0000 (12:27 +0000)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Fri, 14 Nov 2025 12:27:21 +0000 (12:27 +0000)
This commit implements a two-level URL processing system that addresses
issue #5731 and provides flexible URL analysis with multiple specific symbols.

Core changes:
* Modified src/libserver/url.c to handle oversized user fields (fixes #5731)
* Added lualib/lua_url_filter.lua - Fast library filter during parsing
* Added src/plugins/lua/url_suspect.lua - Deep inspection plugin
* Added conf/modules.d/url_suspect.conf - Plugin configuration
* Added conf/scores.d/url_suspect_group.conf - Symbol scores

Key features:
* No new C flags - uses existing URL flags (has_user, numeric, obscured, etc.)
* Works without maps - built-in logic for common cases
* 15+ specific symbols instead of generic R_SUSPICIOUS_URL
* Backward compatible - keeps R_SUSPICIOUS_URL working
* User extensible - custom filters and checks supported

Optional features:
* Example map files for advanced customization (disabled by default)
* Whitelist, pattern matching, TLD lists

Issue: #5731

conf/maps.d/url_suspect/README.md [new file with mode: 0644]
conf/maps.d/url_suspect/suspicious_ports.map.example [new file with mode: 0644]
conf/maps.d/url_suspect/suspicious_tlds.map.example [new file with mode: 0644]
conf/maps.d/url_suspect/user_patterns.map.example [new file with mode: 0644]
conf/maps.d/url_suspect/whitelist_domains.map.example [new file with mode: 0644]
conf/modules.d/url_suspect.conf [new file with mode: 0644]
conf/scores.d/url_suspect_group.conf [new file with mode: 0644]
local.d/url_filter.lua [new file with mode: 0644]
lualib/lua_url_filter.lua [new file with mode: 0644]
src/libserver/url.c
src/plugins/lua/url_suspect.lua [new file with mode: 0644]

diff --git a/conf/maps.d/url_suspect/README.md b/conf/maps.d/url_suspect/README.md
new file mode 100644 (file)
index 0000000..4968add
--- /dev/null
@@ -0,0 +1,173 @@
+# URL Suspect Optional Maps
+
+This directory contains **optional** map files for the URL Suspect plugin.
+
+**Important**: These maps are **disabled by default**. The plugin works perfectly without them using built-in logic.
+
+## When to Use Maps
+
+Use maps only if you need to:
+- Whitelist specific domains to skip checks
+- Add custom user field patterns beyond built-in checks
+- Blacklist specific user names
+- Define additional suspicious TLDs beyond the built-in list
+- Mark specific IP ranges as suspicious
+- Define unusual ports as suspicious
+
+For most users, the built-in logic is sufficient.
+
+## Available Maps
+
+### 1. whitelist_domains.map
+**Purpose**: Skip all URL suspect checks for trusted domains
+
+**Format**: One domain per line
+```
+google.com
+microsoft.com
+github.com
+```
+
+**Enable in** `local.d/url_suspect.conf`:
+```lua
+url_suspect {
+  use_whitelist = true;
+  whitelist_map = "$LOCAL_CONFDIR/local.d/maps.d/url_suspect/whitelist_domains.map";
+}
+```
+
+### 2. user_patterns.map
+**Purpose**: Regex patterns for suspicious user fields
+
+**Format**: Regex pattern (one per line)
+```
+^admin$
+^root$
+^test$
+^[0-9]{10,}$
+```
+
+**Enable in** `local.d/url_suspect.conf`:
+```lua
+url_suspect {
+  checks {
+    user_password {
+      use_pattern_map = true;
+      pattern_map = "$LOCAL_CONFDIR/local.d/maps.d/url_suspect/user_patterns.map";
+    }
+  }
+}
+```
+
+### 3. user_blacklist.map
+**Purpose**: Exact user names to penalize
+
+**Format**: Exact match (one per line)
+```
+admin
+root
+administrator
+webmaster
+```
+
+**Enable in** `local.d/url_suspect.conf`:
+```lua
+url_suspect {
+  checks {
+    user_password {
+      use_blacklist = true;
+      blacklist_map = "$LOCAL_CONFDIR/local.d/maps.d/url_suspect/user_blacklist.map";
+    }
+  }
+}
+```
+
+### 4. suspicious_tlds.map
+**Purpose**: Additional TLDs beyond built-in list (.tk, .ml, .ga, .cf, .gq)
+
+**Format**: TLD with leading dot (one per line)
+```
+.xyz
+.top
+.work
+.date
+.loan
+```
+
+**Enable in** `local.d/url_suspect.conf`:
+```lua
+url_suspect {
+  checks {
+    tld {
+      use_tld_map = true;
+      tld_map = "$LOCAL_CONFDIR/local.d/maps.d/url_suspect/suspicious_tlds.map";
+    }
+  }
+}
+```
+
+### 5. suspicious_ip_ranges.map
+**Purpose**: IP ranges to mark as suspicious (beyond built-in private IP detection)
+
+**Format**: CIDR notation (one per line)
+```
+203.0.113.0/24
+198.51.100.0/24
+```
+
+**Enable in** `local.d/url_suspect.conf`:
+```lua
+url_suspect {
+  checks {
+    numeric_ip {
+      use_range_map = true;
+      range_map = "$LOCAL_CONFDIR/local.d/maps.d/url_suspect/suspicious_ip_ranges.map";
+    }
+  }
+}
+```
+
+### 6. suspicious_ports.map
+**Purpose**: Unusual ports that indicate suspicious URLs
+
+**Format**: Port number (one per line)
+```
+8080
+8443
+3128
+1080
+```
+
+**Enable in** `local.d/url_suspect.conf`:
+```lua
+url_suspect {
+  checks {
+    structure {
+      use_port_map = true;
+      port_map = "$LOCAL_CONFDIR/local.d/maps.d/url_suspect/suspicious_ports.map";
+    }
+  }
+}
+```
+
+## Map File Locations
+
+You can place map files in:
+1. `$LOCAL_CONFDIR/local.d/maps.d/url_suspect/` (recommended)
+2. `$LOCAL_CONFDIR/local.d/` (also works)
+3. Any absolute path
+4. Remote URL (e.g., `https://example.com/map.txt`)
+
+## Example Files
+
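+See the `.example` files in this directory for templates you can copy and modify. Examples are provided for the whitelist, user pattern, suspicious TLD and suspicious port maps; `user_blacklist.map` and `suspicious_ip_ranges.map` have no example file, so create them yourself using the formats shown above.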
+
+## Performance Note
+
+Maps are loaded once at startup and cached in memory. They don't add significant overhead even when enabled.
+
+## Support
+
+For questions or issues:
+- Documentation: https://rspamd.com/doc/modules/url_suspect.html
+- GitHub: https://github.com/rspamd/rspamd/issues
diff --git a/conf/maps.d/url_suspect/suspicious_ports.map.example b/conf/maps.d/url_suspect/suspicious_ports.map.example
new file mode 100644 (file)
index 0000000..f7f1240
--- /dev/null
@@ -0,0 +1,18 @@
+# Suspicious Ports
+# Unusual ports that may indicate malicious activity
+# Format: Port number (one per line)
+
+# Common proxy ports
+8080
+8443
+3128
+1080
+
+# Unusual web ports
+8888
+9999
+4444
+8000
+
+# Add your suspicious ports below:
+# 12345
diff --git a/conf/maps.d/url_suspect/suspicious_tlds.map.example b/conf/maps.d/url_suspect/suspicious_tlds.map.example
new file mode 100644 (file)
index 0000000..658996f
--- /dev/null
@@ -0,0 +1,19 @@
+# Suspicious TLDs
+# These are in addition to the built-in list: .tk, .ml, .ga, .cf, .gq
+# Format: TLD with leading dot (one per line)
+
+# Frequently abused TLDs
+.xyz
+.top
+.work
+.date
+.loan
+.win
+.download
+.stream
+.click
+.link
+.racing
+
+# Add your suspicious TLDs below:
+# .suspicious
diff --git a/conf/maps.d/url_suspect/user_patterns.map.example b/conf/maps.d/url_suspect/user_patterns.map.example
new file mode 100644 (file)
index 0000000..4e07b3f
--- /dev/null
@@ -0,0 +1,20 @@
+# Suspicious User Field Patterns (Regex)
+# Format: Regex pattern (one per line)
+
+# Common suspicious usernames
+^admin$
+^root$
+^test$
+^user$
+^administrator$
+^webmaster$
+^postmaster$
+
+# Very long numeric usernames (10+ digits)
+^[0-9]{10,}$
+
+# Very long usernames in general
+^.{128,}$
+
+# Add your patterns below:
+# ^mypattern$
diff --git a/conf/maps.d/url_suspect/whitelist_domains.map.example b/conf/maps.d/url_suspect/whitelist_domains.map.example
new file mode 100644 (file)
index 0000000..a81bb5c
--- /dev/null
@@ -0,0 +1,19 @@
+# Whitelist Domains for URL Suspect Plugin
+# URLs from these domains will skip all URL suspect checks
+# Format: One domain per line
+
+# Major tech companies
+google.com
+microsoft.com
+apple.com
+amazon.com
+
+# Development platforms
+github.com
+gitlab.com
+bitbucket.org
+stackoverflow.com
+
+# Add your trusted domains below:
+# example.com
+# internal-cdn.mycompany.com
diff --git a/conf/modules.d/url_suspect.conf b/conf/modules.d/url_suspect.conf
new file mode 100644 (file)
index 0000000..fd198e0
--- /dev/null
@@ -0,0 +1,163 @@
+# URL Suspect Plugin Configuration
+# Module documentation: https://rspamd.com/doc/modules/url_suspect.html
+
+url_suspect {
+  # Enable the plugin
+  enabled = true;
+
+  # Which URL flags trigger inspection (existing flags, no new flags needed)
+  # Available: has_user, numeric, obscured, zw_spaces, no_tld, unnormalised
+  process_flags = ["has_user", "numeric", "obscured", "zw_spaces", "no_tld"];
+
+  # Check configuration
+  checks {
+    # User/password field analysis
+    user_password {
+      enabled = true;
+
+      # Length thresholds for scoring
+      length_thresholds {
+        suspicious = 64;     # Score if user field > 64 chars
+        long = 128;          # Higher score if > 128
+        very_long = 256;     # Even higher if > 256
+      }
+
+      # OPTIONAL: Advanced pattern matching (disabled by default)
+      # Enable only if you need custom user field patterns
+      use_pattern_map = false;
+      # pattern_map = "$LOCAL_CONFDIR/local.d/url_suspect_user_patterns.map";
+
+      # OPTIONAL: User blacklist (disabled by default)
+      use_blacklist = false;
+      # blacklist_map = "$LOCAL_CONFDIR/local.d/url_suspect_user_blacklist.map";
+    }
+
+    # Numeric IP address analysis
+    numeric_ip {
+      enabled = true;
+
+      # Scoring for different scenarios
+      base_score = 1.5;          # Basic numeric IP
+      with_user_score = 4.0;     # Numeric IP + user field
+
+      # Private IP ranges (10.x, 192.168.x, etc.)
+      allow_private_ranges = true;
+      private_score = 0.5;       # Lower score for private IPs
+
+      # OPTIONAL: Suspicious IP ranges map (disabled by default)
+      use_range_map = false;
+      # range_map = "$LOCAL_CONFDIR/local.d/url_suspect_ip_ranges.map";
+    }
+
+    # TLD (Top Level Domain) analysis
+    tld {
+      enabled = true;
+
+      # Built-in suspicious TLDs (no map needed)
+      builtin_suspicious = [".tk", ".ml", ".ga", ".cf", ".gq"];
+      builtin_score = 3.0;
+
+      # Missing TLD score
+      missing_tld_score = 2.0;
+
+      # OPTIONAL: Custom TLD map (disabled by default)
+      # Add this if you have additional TLDs to check
+      use_tld_map = false;
+      # tld_map = "$LOCAL_CONFDIR/local.d/url_suspect_tlds.map";
+    }
+
+    # Unicode and encoding analysis
+    unicode {
+      enabled = true;
+
+      # All checks use built-in logic (no maps needed)
+      check_validity = true;      # Invalid UTF-8 sequences
+      check_homographs = true;    # Mixed script homograph attacks
+      check_rtl_override = true;  # RTL Unicode override tricks
+      check_zero_width = true;    # Zero-width space characters
+    }
+
+    # URL structure analysis
+    structure {
+      enabled = true;
+
+      # Multiple @ signs
+      check_multiple_at = true;
+      max_at_signs = 2;
+
+      # Backslashes in URL
+      check_backslash = true;
+
+      # Excessive dots in hostname
+      check_excessive_dots = true;
+      max_host_dots = 6;
+
+      # URL length
+      check_length = true;
+      max_url_length = 2048;
+
+      # OPTIONAL: Suspicious ports map (disabled by default)
+      use_port_map = false;
+      # port_map = "$LOCAL_CONFDIR/local.d/url_suspect_ports.map";
+    }
+  }
+
+  # Symbol names (can be customized)
+  symbols {
+    # User/password symbols
+    user_password = "URL_USER_PASSWORD";
+    user_long = "URL_USER_LONG";
+    user_very_long = "URL_USER_VERY_LONG";
+
+    # Numeric IP symbols
+    numeric_ip = "URL_NUMERIC_IP";
+    numeric_ip_user = "URL_NUMERIC_IP_USER";
+    numeric_private = "URL_NUMERIC_PRIVATE_IP";
+
+    # TLD symbols
+    no_tld = "URL_NO_TLD";
+    suspicious_tld = "URL_SUSPICIOUS_TLD";
+
+    # Unicode symbols
+    bad_unicode = "URL_BAD_UNICODE";
+    homograph = "URL_HOMOGRAPH_ATTACK";
+    rtl_override = "URL_RTL_OVERRIDE";
+    zero_width = "URL_ZERO_WIDTH_SPACES";
+
+    # Structure symbols
+    multiple_at = "URL_MULTIPLE_AT_SIGNS";
+    backslash = "URL_BACKSLASH_PATH";
+    excessive_dots = "URL_EXCESSIVE_DOTS";
+    very_long = "URL_VERY_LONG";
+  }
+
+  # ADVANCED: Global whitelist (disabled by default)
+  # Use only if you need to skip checks for specific domains
+  use_whitelist = false;
+  # whitelist_map = "$LOCAL_CONFDIR/local.d/url_suspect_whitelist.map";
+
+  # ADVANCED: Custom checks (disabled by default)
+  # Example:
+  # custom_checks {
+  #   my_check = <<EOD
+  #     return function(task, url, settings)
+  #       local host = url:get_host()
+  #       if host and host:match("suspicious") then
+  #         return {
+  #           symbol = "MY_SUSPICIOUS_URL",
+  #           score = 5.0,
+  #           options = {host}
+  #         }
+  #       end
+  #     end
+  # EOD;
+  # }
+
+  # Backward compatibility with R_SUSPICIOUS_URL
+  # When enabled, R_SUSPICIOUS_URL symbol is inserted if any URL_* symbols fire
+  compat_mode = true;
+
+  .include(try=true,priority=5) "${DBDIR}/dynamic/url_suspect.conf"
+  .include(try=true,priority=1,duplicate=merge) "$LOCAL_CONFDIR/local.d/url_suspect.conf"
+  .include(try=true,priority=10) "$LOCAL_CONFDIR/override.d/url_suspect.conf"
+}
diff --git a/conf/scores.d/url_suspect_group.conf b/conf/scores.d/url_suspect_group.conf
new file mode 100644 (file)
index 0000000..b0591c0
--- /dev/null
@@ -0,0 +1,101 @@
+# URL Suspect Plugin Scores
+# These scores are applied when suspicious URLs are detected
+
+symbols = {
+  # User/password in URL
+  "URL_USER_PASSWORD" {
+    weight = 2.0;
+    description = "URL contains user field";
+    one_shot = false;
+  }
+  "URL_USER_LONG" {
+    weight = 3.0;
+    description = "URL user field is long (>128 chars)";
+    one_shot = false;
+  }
+  "URL_USER_VERY_LONG" {
+    weight = 5.0;
+    description = "URL user field is very long (>256 chars)";
+    one_shot = false;
+  }
+
+  # Numeric IP in URL
+  "URL_NUMERIC_IP" {
+    weight = 1.5;
+    description = "URL uses numeric IP address";
+    one_shot = false;
+  }
+  "URL_NUMERIC_IP_USER" {
+    weight = 4.0;
+    description = "URL uses numeric IP with user field";
+    one_shot = false;
+  }
+  "URL_NUMERIC_PRIVATE_IP" {
+    weight = 0.5;
+    description = "URL uses private IP range";
+    one_shot = false;
+  }
+
+  # TLD issues
+  "URL_NO_TLD" {
+    weight = 2.0;
+    description = "URL has no TLD";
+    one_shot = false;
+  }
+  "URL_SUSPICIOUS_TLD" {
+    weight = 3.0;
+    description = "URL uses suspicious TLD";
+    one_shot = false;
+  }
+
+  # Unicode and encoding issues
+  "URL_BAD_UNICODE" {
+    weight = 3.0;
+    description = "URL contains invalid Unicode";
+    one_shot = false;
+  }
+  "URL_HOMOGRAPH_ATTACK" {
+    weight = 5.0;
+    description = "URL uses homograph attack (mixed scripts)";
+    one_shot = false;
+  }
+  "URL_RTL_OVERRIDE" {
+    weight = 6.0;
+    description = "URL uses RTL override character";
+    one_shot = false;
+  }
+  "URL_ZERO_WIDTH_SPACES" {
+    weight = 7.0;
+    description = "URL contains zero-width spaces";
+    one_shot = false;
+  }
+
+  # URL structure issues
+  "URL_MULTIPLE_AT_SIGNS" {
+    weight = 3.0;
+    description = "URL has multiple @ signs";
+    one_shot = false;
+  }
+  "URL_BACKSLASH_PATH" {
+    weight = 2.0;
+    description = "URL uses backslashes";
+    one_shot = false;
+  }
+  "URL_EXCESSIVE_DOTS" {
+    weight = 2.0;
+    description = "URL has excessive dots in hostname";
+    one_shot = false;
+  }
+  "URL_VERY_LONG" {
+    weight = 1.5;
+    description = "URL is very long";
+    one_shot = false;
+  }
+
+  # Legacy symbol (backward compatibility)
+  "R_SUSPICIOUS_URL" {
+    weight = 5.0;
+    description = "Suspicious URL detected (legacy symbol)";
+    one_shot = true;
+  }
+}
diff --git a/local.d/url_filter.lua b/local.d/url_filter.lua
new file mode 100644 (file)
index 0000000..2145927
--- /dev/null
@@ -0,0 +1,51 @@
+--[[
+URL Filter Configuration
+This is a configuration template for the URL filter library.
+
+The URL filter runs during parsing (before URL objects are created).
+It provides fast validation to reject obvious garbage URLs.
+
+Most users don't need to configure this - the defaults work well.
+]]--
+
+-- Enable/disable the filter
+-- enabled = true;
+
+-- Built-in filter configuration
+-- builtin_filters = {
+--   -- Reject URLs with extremely long user fields
+--   oversized_user = {
+--     enabled = true;
+--     max_length = 512;  -- Absolute limit for user field length
+--   };
+--
+--   -- Reject URLs with invalid UTF-8
+--   basic_unicode = {
+--     enabled = true;
+--     reject_invalid_utf8 = true;
+--   };
+--
+--   -- Reject obvious garbage patterns
+--   garbage_pattern = {
+--     enabled = true;
+--     max_at_signs = 20;  -- URLs with >20 @ signs are garbage
+--   };
+-- };
+
+-- ADVANCED: Custom filters
+-- You can add your own filters that run during URL parsing.
+-- Filter function signature: function(url_text, url_obj, flags)
+-- Return: "accept", "suspicious", or "reject"
+--
+-- Example:
+-- custom_filters = {
+--   my_domain_filter = function(url_text, url_obj, flags)
+--     if url_obj then
+--       local host = url_obj:get_host()
+--       if host == "blocked-domain.com" then
+--         return "reject"  -- Don't create URL object
+--       end
+--     end
+--     return "accept"
+--   end;
+-- };
diff --git a/lualib/lua_url_filter.lua b/lualib/lua_url_filter.lua
new file mode 100644 (file)
index 0000000..adb7fac
--- /dev/null
@@ -0,0 +1,180 @@
+--[[
+Copyright (c) 2025, Vsevolod Stakhov <vsevolod@rspamd.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]--
+
+--[[[
+-- @module lua_url_filter
+-- This module provides fast URL filtering during parsing phase.
+-- Called from C code to decide whether to create URL object or reject text.
+--]]
+
+local exports = {}  -- module table; returned at the end of this file
+
+-- Filter result constants: the numeric verdicts returned by filter_url()
+exports.ACCEPT = 0
+exports.SUSPICIOUS = 1  -- only set when a custom filter returns "suspicious"
+exports.REJECT = 2
+
+-- Default settings (work without configuration); init() may override them
+local settings = {
+  enabled = true,  -- when false, filter_url() returns ACCEPT immediately
+  builtin_filters = {
+    oversized_user = {
+      enabled = true,
+      max_length = 512  -- Absolute limit for user field; longer (in bytes) is rejected
+    },
+    basic_unicode = {
+      enabled = true,
+      reject_invalid_utf8 = true
+    },
+    garbage_pattern = {
+      enabled = true,
+      max_at_signs = 20  -- Obvious garbage threshold; more '@' than this is rejected
+    }
+  },
+  custom_filters = {}  -- name -> function(url_text, url_obj, flags); see register_custom_filter()
+}
+
+-- Built-in filter: reject URLs whose user field is longer than
+-- cfg.max_length bytes. Anything else (no URL object, no user field, or an
+-- acceptable length) is accepted.
+local function filter_oversized_user(url_text, url_obj, flags, cfg)
+  local user = url_obj and url_obj:get_user()
+
+  if user and #user > cfg.max_length then
+    -- Obvious garbage: do not let this through
+    return exports.REJECT
+  end
+
+  return exports.ACCEPT
+end
+
+-- Built-in filter: reject URL text that is not valid UTF-8.
+-- The check is skipped (URL accepted) when cfg.reject_invalid_utf8 is off or
+-- when rspamd_util / its is_valid_utf8 function cannot be obtained.
+local function filter_basic_unicode(url_text, url_obj, flags, cfg)
+  if cfg.reject_invalid_utf8 then
+    local loaded, util = pcall(require, "rspamd_util")
+    local validator = loaded and util.is_valid_utf8
+
+    if validator and not validator(url_text) then
+      return exports.REJECT
+    end
+  end
+
+  return exports.ACCEPT
+end
+
+-- Built-in filter: reject URL text containing more '@' characters than
+-- cfg.max_at_signs; accept everything else.
+local function filter_garbage_pattern(url_text, url_obj, flags, cfg)
+  -- gsub's second return value is the number of substitutions, i.e. the
+  -- number of '@' characters found
+  local at_signs = select(2, url_text:gsub("@", ""))
+
+  return at_signs > cfg.max_at_signs and exports.REJECT or exports.ACCEPT
+end
+
+-- Main entry point (called from C).
+-- Runs the enabled built-in filters in a fixed order (oversized user, UTF-8
+-- validity, garbage pattern) followed by all registered custom filters.
+-- Returns exports.REJECT as soon as any filter rejects, exports.SUSPICIOUS
+-- when a custom filter answered "suspicious", and exports.ACCEPT otherwise.
+function exports.filter_url(url_text, url_obj, flags)
+  if not settings.enabled then
+    return exports.ACCEPT
+  end
+
+  local builtin = settings.builtin_filters
+  local cfg
+
+  -- Built-in filters: only a REJECT verdict is acted upon
+  cfg = builtin.oversized_user
+  if cfg and cfg.enabled and
+      filter_oversized_user(url_text, url_obj, flags, cfg) == exports.REJECT then
+    return exports.REJECT
+  end
+
+  cfg = builtin.basic_unicode
+  if cfg and cfg.enabled and
+      filter_basic_unicode(url_text, url_obj, flags, cfg) == exports.REJECT then
+    return exports.REJECT
+  end
+
+  cfg = builtin.garbage_pattern
+  if cfg and cfg.enabled and
+      filter_garbage_pattern(url_text, url_obj, flags, cfg) == exports.REJECT then
+    return exports.REJECT
+  end
+
+  -- Custom filters answer with the strings "accept", "suspicious" or "reject"
+  local verdict = exports.ACCEPT
+  for filter_name, filter_fn in pairs(settings.custom_filters) do
+    local succeeded, outcome = pcall(filter_fn, url_text, url_obj, flags)
+
+    if succeeded then
+      if outcome == "reject" then
+        return exports.REJECT
+      end
+      if outcome == "suspicious" then
+        verdict = exports.SUSPICIOUS
+      end
+    else
+      -- A failing custom filter is logged and otherwise ignored
+      local rspamd_logger = require "rspamd_logger"
+      rspamd_logger.errx("Error in custom URL filter %s: %s", filter_name, outcome)
+    end
+  end
+
+  return verdict
+end
+
+-- Initialize from configuration.
+-- Reads the 'url_filter' options section from `cfg`; when it exists, the
+-- current settings are replaced by the result of
+-- lua_util.override_defaults(settings, options). Logs the enabled state.
+function exports.init(cfg)
+  local util = require "lua_util"
+
+  local section = cfg:get_all_opt('url_filter')
+  if section then
+    settings = util.override_defaults(settings, section)
+  end
+
+  local logger = require "rspamd_logger"
+  logger.infox(cfg, "URL filter initialized (enabled=%s)", settings.enabled)
+end
+
+-- Allow runtime registration of custom filters.
+-- Stores `func` under `name` in settings.custom_filters and returns true;
+-- when `func` is not a function, logs an error and returns false instead.
+function exports.register_custom_filter(name, func)
+  local rspamd_logger = require "rspamd_logger"
+
+  if type(func) == 'function' then
+    settings.custom_filters[name] = func
+    rspamd_logger.infox("Registered custom URL filter: %s", name)
+    return true
+  end
+
+  rspamd_logger.errx("Cannot register custom filter %s: not a function", name)
+  return false
+end
+
+return exports
index 7027fc02d769e65a899794040ca41800c139fe0e..6add598a43703ed86a36c9f706b96602297d3aa6 100644 (file)
@@ -1199,7 +1199,10 @@ rspamd_web_parse(struct http_parser_url *u, const char *str, gsize len,
                                goto out;
                        }
                        else if (p - c > max_email_user) {
-                               goto out;
+                               /* Allow oversized user fields but mark them - fixes #5731 */
+                               /* Don't fail completely, just mark with flag and continue */
+                               *flags |= RSPAMD_URL_FLAG_HAS_USER;
+                               /* Continue parsing - the Lua plugin will handle scoring */
                        }
 
                        p++;
diff --git a/src/plugins/lua/url_suspect.lua b/src/plugins/lua/url_suspect.lua
new file mode 100644 (file)
index 0000000..0b5c82b
--- /dev/null
@@ -0,0 +1,602 @@
+--[[
+Copyright (c) 2025, Vsevolod Stakhov <vsevolod@rspamd.com>
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+]]--
+
+--[[[
+-- @module url_suspect
+-- This module performs deep introspection of suspicious URLs.
+-- Works with existing URL flags, no new flags needed.
+-- Provides multiple specific symbols for different URL issues.
+--]]
+
+if confighelp then
+  return
+end
+
+local N = "url_suspect"
+local rspamd_logger = require "rspamd_logger"
+local lua_util = require "lua_util"
+local rspamd_url = require "rspamd_url"
+local rspamd_util = require "rspamd_util"
+local bit = require "bit"
+
+-- Default settings (work without any maps); notes below describe how the checks in this file read them
+local settings = {
+  enabled = true,
+  process_flags = { 'has_user', 'numeric', 'obscured', 'zw_spaces', 'no_tld' },
+  checks = {
+    user_password = {
+      enabled = true,
+      length_thresholds = {
+        suspicious = 64,  -- NOTE(review): lengths above and below this currently produce the same finding
+        long = 128,  -- user field longer than this -> symbols.user_long
+        very_long = 256  -- user field longer than this -> symbols.user_very_long
+      },
+      use_pattern_map = false,
+      use_blacklist = false
+    },
+    numeric_ip = {
+      enabled = true,
+      base_score = 1.5,  -- score attached to the numeric_ip finding
+      with_user_score = 4.0,  -- score attached to the numeric_ip_user finding
+      allow_private_ranges = true,  -- when true, private addresses get the numeric_private finding instead
+      private_score = 0.5,  -- score attached to the numeric_private finding
+      use_range_map = false
+    },
+    tld = {
+      enabled = true,
+      builtin_suspicious = { ".tk", ".ml", ".ga", ".cf", ".gq" },  -- matched as suffixes of url:get_tld()
+      builtin_score = 3.0,  -- score attached to the suspicious_tld finding
+      missing_tld_score = 2.0,  -- score attached to the no_tld finding
+      use_tld_map = false
+    },
+    unicode = {
+      enabled = true,
+      check_validity = true,  -- rspamd_util.is_valid_utf8 on the URL text
+      check_homographs = true,  -- rspamd_util.is_utf_spoofed on the host
+      check_rtl_override = true,  -- searches the URL text for U+202E
+      check_zero_width = true  -- relies on the zw_spaces URL flag
+    },
+    structure = {
+      enabled = true,
+      check_multiple_at = true,
+      max_at_signs = 2,
+      check_backslash = true,
+      check_excessive_dots = true,
+      max_host_dots = 6,
+      check_length = true,
+      max_url_length = 2048,
+      use_port_map = false
+    }
+  },
+  symbols = {
+    -- User/password symbols
+    user_password = "URL_USER_PASSWORD",
+    user_long = "URL_USER_LONG",
+    user_very_long = "URL_USER_VERY_LONG",
+    -- Numeric IP symbols
+    numeric_ip = "URL_NUMERIC_IP",
+    numeric_ip_user = "URL_NUMERIC_IP_USER",
+    numeric_private = "URL_NUMERIC_PRIVATE_IP",
+    -- TLD symbols
+    no_tld = "URL_NO_TLD",
+    suspicious_tld = "URL_SUSPICIOUS_TLD",
+    -- Unicode symbols
+    bad_unicode = "URL_BAD_UNICODE",
+    homograph = "URL_HOMOGRAPH_ATTACK",
+    rtl_override = "URL_RTL_OVERRIDE",
+    zero_width = "URL_ZERO_WIDTH_SPACES",
+    -- Structure symbols
+    multiple_at = "URL_MULTIPLE_AT_SIGNS",
+    backslash = "URL_BACKSLASH_PATH",
+    excessive_dots = "URL_EXCESSIVE_DOTS",
+    very_long = "URL_VERY_LONG"
+  },
+  use_whitelist = false,
+  custom_checks = {},
+  compat_mode = true
+}
+
+-- Optional maps (only loaded if enabled); every slot stays nil until a map is assigned to it
+local maps = {
+  whitelist = nil,
+  user_patterns = nil,  -- queried with the URL user field in user_password_analysis
+  user_blacklist = nil,  -- queried with the URL user field in user_password_analysis
+  suspicious_ips = nil,  -- queried with the URL host in numeric_ip_analysis
+  suspicious_tlds = nil,  -- queried with url:get_tld() in tld_analysis
+  suspicious_ports = nil
+}
+
+-- Check implementations: each takes (task, url, cfg) and returns a list of findings
+local checks = {}
+
+-- Check: User/password in URL.
+-- Produces at most one finding for a URL that carries the has_user flag and
+-- a user field; the symbol is chosen by the byte length of that field.
+-- The optional pattern and blacklist maps are consulted for debug logging
+-- only and do not add findings.
+function checks.user_password_analysis(task, url, cfg)
+  local findings = {}
+
+  -- Nothing to analyse unless the URL is flagged as having a user part
+  if bit.band(url:get_flags_num(), rspamd_url.flags.has_user) == 0 then
+    return findings
+  end
+
+  local user = url:get_user()
+  if not user then
+    return findings
+  end
+
+  local user_len = #user
+  local host = url:get_host()
+
+  lua_util.debugm(N, task, "Checking user field length: %d chars", user_len)
+
+  -- Length-based scoring (built-in, no map needed)
+  local limits = cfg.length_thresholds
+  local symbol, score, option
+  if user_len > limits.very_long then
+    symbol, score = settings.symbols.user_very_long, 5.0
+    option = string.format("%d", user_len)
+  elseif user_len > limits.long then
+    symbol, score = settings.symbols.user_long, 3.0
+    option = string.format("%d", user_len)
+  elseif user_len > limits.suspicious then
+    -- Reported exactly like a normal-length user field
+    symbol, score = settings.symbols.user_password, 2.0
+    option = host or "unknown"
+  else
+    -- Normal length user
+    symbol, score = settings.symbols.user_password, 2.0
+    option = host or "unknown"
+  end
+
+  table.insert(findings, {
+    symbol = symbol,
+    score = score,
+    options = { option }
+  })
+
+  -- Optional: pattern map lookup (debug output only)
+  if cfg.use_pattern_map and maps.user_patterns and
+      maps.user_patterns:get_key(user) then
+    lua_util.debugm(N, task, "User field matches suspicious pattern")
+  end
+
+  -- Optional: blacklist lookup (debug output only)
+  if cfg.use_blacklist and maps.user_blacklist and
+      maps.user_blacklist:get_key(user) then
+    lua_util.debugm(N, task, "User field is blacklisted")
+  end
+
+  return findings
+end
+
+-- Host prefixes treated as private IPv4 ranges by numeric_ip_analysis
+local private_ip_prefixes = {
+  "^10%.",
+  "^192%.168%.",
+  "^172%.1[6-9]%.",
+  "^172%.2[0-9]%.",
+  "^172%.3[0-1]%.",
+}
+
+-- Check: Numeric IP as hostname.
+-- For a URL carrying the numeric flag, produces exactly one finding:
+-- numeric_private for a private address (when cfg.allow_private_ranges is
+-- set), numeric_ip_user when a user field is also present, otherwise
+-- numeric_ip. The optional range map is consulted for debug logging only.
+function checks.numeric_ip_analysis(task, url, cfg)
+  local findings = {}
+  local known_flags = rspamd_url.flags
+  local flags = url:get_flags_num()
+
+  if bit.band(flags, known_flags.numeric) == 0 then
+    return findings
+  end
+
+  local host = url:get_host()
+  if not host then
+    return findings
+  end
+
+  lua_util.debugm(N, task, "Checking numeric IP: %s", host)
+
+  -- Check if private IP
+  local is_private = false
+  for _, prefix in ipairs(private_ip_prefixes) do
+    if host:match(prefix) then
+      is_private = true
+      break
+    end
+  end
+
+  local symbol, score
+  if is_private and cfg.allow_private_ranges then
+    symbol, score = settings.symbols.numeric_private, cfg.private_score
+  elseif bit.band(flags, known_flags.has_user) ~= 0 then
+    -- A user field on a numeric host is more suspicious
+    symbol, score = settings.symbols.numeric_ip_user, cfg.with_user_score
+  else
+    symbol, score = settings.symbols.numeric_ip, cfg.base_score
+  end
+
+  table.insert(findings, {
+    symbol = symbol,
+    score = score,
+    options = { host }
+  })
+
+  -- Optional: IP range map lookup (debug output only)
+  if cfg.use_range_map and maps.suspicious_ips and
+      maps.suspicious_ips:get_key(host) then
+    lua_util.debugm(N, task, "IP is in suspicious range")
+  end
+
+  return findings
+end
+
+-- Check: TLD validation.
+-- Reports no_tld for a non-numeric URL carrying the no_tld flag; otherwise
+-- reports suspicious_tld when url:get_tld() ends with one of the entries of
+-- cfg.builtin_suspicious. The optional TLD map is consulted for debug
+-- logging only.
+function checks.tld_analysis(task, url, cfg)
+  local findings = {}
+
+  local host = url:get_host()
+  if not host then
+    return findings
+  end
+
+  local known_flags = rspamd_url.flags
+  local flags = url:get_flags_num()
+
+  -- Missing TLD: report it (unless the host is a numeric IP, which has its
+  -- own check) and stop here
+  if bit.band(flags, known_flags.no_tld) ~= 0 then
+    local is_numeric = bit.band(flags, known_flags.numeric) ~= 0
+    if not is_numeric then
+      lua_util.debugm(N, task, "URL has no TLD: %s", host)
+      table.insert(findings, {
+        symbol = settings.symbols.no_tld,
+        score = cfg.missing_tld_score,
+        options = { host }
+      })
+    end
+    return findings
+  end
+
+  local tld = url:get_tld()
+  if not tld then
+    return findings
+  end
+
+  -- Built-in suspicious TLDs (no map needed): first match wins
+  for _, suffix in ipairs(cfg.builtin_suspicious) do
+    local matches = (tld == suffix) or (tld:sub(-#suffix) == suffix)
+    if matches then
+      lua_util.debugm(N, task, "URL uses suspicious TLD: %s", tld)
+      table.insert(findings, {
+        symbol = settings.symbols.suspicious_tld,
+        score = cfg.builtin_score,
+        options = { tld }
+      })
+      break
+    end
+  end
+
+  -- Optional: TLD map lookup (debug output only)
+  if cfg.use_tld_map and maps.suspicious_tlds and
+      maps.suspicious_tlds:get_key(tld) then
+    lua_util.debugm(N, task, "URL TLD in suspicious map: %s", tld)
+  end
+
+  return findings
+end
+
+--- Check: Unicode anomalies.
+-- Runs up to four independent sub-checks, each gated by its own `cfg.check_*`
+-- switch: UTF-8 validity, zero-width spaces flag, homograph host, RTL override.
+-- @param task task object, used for debug logging
+-- @param url URL object to inspect
+-- @param cfg settings of this check
+-- @return array of findings ({ symbol, score, options }), possibly empty
+function checks.unicode_analysis(task, url, cfg)
+  local found = {}
+  local flag_names = rspamd_url.flags
+  local flag_bits = url:get_flags_num()
+
+  local text = url:get_text()
+  local hostname = url:get_host()
+
+  -- Append a single finding carrying one option string
+  local function report(symbol, score, option)
+    found[#found + 1] = {
+      symbol = symbol,
+      score = score,
+      options = { option }
+    }
+  end
+
+  -- Invalid UTF-8 in the URL text
+  if cfg.check_validity and not rspamd_util.is_valid_utf8(text) then
+    lua_util.debugm(N, task, "URL has invalid UTF-8")
+    report(settings.symbols.bad_unicode, 3.0, hostname or "unknown")
+  end
+
+  -- Zero-width spaces, as indicated by the URL flag
+  if cfg.check_zero_width and bit.band(flag_bits, flag_names.zw_spaces) ~= 0 then
+    lua_util.debugm(N, task, "URL contains zero-width spaces")
+    report(settings.symbols.zero_width, 7.0, hostname or "unknown")
+  end
+
+  -- Homograph check needs a host to look at
+  if cfg.check_homographs and hostname and rspamd_util.is_utf_spoofed(hostname) then
+    lua_util.debugm(N, task, "URL uses homograph attack: %s", hostname)
+    report(settings.symbols.homograph, 5.0, hostname)
+  end
+
+  -- RTL override: the byte sequence below is U+202E encoded as UTF-8
+  if cfg.check_rtl_override and url_text_has_rtl == nil and text:find("\226\128\174") then
+    lua_util.debugm(N, task, "URL contains RTL override")
+    report(settings.symbols.rtl_override, 6.0, hostname or "unknown")
+  end
+
+  return found
+end
+
+--- Check: URL structure anomalies.
+-- Sub-checks (each gated by a `cfg.check_*` switch): number of '@' signs,
+-- backslashes in an obscured URL, number of dots in the host, total length.
+-- @param task task object, used for debug logging
+-- @param url URL object to inspect
+-- @param cfg settings of this check (switches plus `max_at_signs`,
+--   `max_host_dots`, `max_url_length`)
+-- @return array of findings ({ symbol, score, options }), possibly empty
+function checks.structure_analysis(task, url, cfg)
+  local found = {}
+  local text = url:get_text()
+  local hostname = url:get_host()
+
+  -- Append a single finding carrying one option string
+  local function report(symbol, score, option)
+    found[#found + 1] = {
+      symbol = symbol,
+      score = score,
+      options = { option }
+    }
+  end
+
+  -- Too many '@' signs (gsub's second result is the number of matches)
+  if cfg.check_multiple_at then
+    local n_at = select(2, text:gsub("@", ""))
+    if n_at > cfg.max_at_signs then
+      lua_util.debugm(N, task, "URL has %d @ signs", n_at)
+      report(settings.symbols.multiple_at, 3.0, string.format("%d", n_at))
+    end
+  end
+
+  -- Backslashes count only when the URL also carries the obscured flag
+  if cfg.check_backslash then
+    local obscured_bit = rspamd_url.flags.obscured
+    local flag_bits = url:get_flags_num()
+    if bit.band(flag_bits, obscured_bit) ~= 0 and text:find("\\") then
+      lua_util.debugm(N, task, "URL contains backslashes")
+      report(settings.symbols.backslash, 2.0, hostname or "unknown")
+    end
+  end
+
+  -- Too many dots in the hostname
+  if cfg.check_excessive_dots and hostname then
+    local n_dots = select(2, hostname:gsub("%.", ""))
+    if n_dots > cfg.max_host_dots then
+      lua_util.debugm(N, task, "URL hostname has %d dots", n_dots)
+      report(settings.symbols.excessive_dots, 2.0, string.format("%d", n_dots))
+    end
+  end
+
+  -- Overall URL length (in bytes of the URL text)
+  if cfg.check_length then
+    local text_len = #text
+    if text_len > cfg.max_url_length then
+      lua_util.debugm(N, task, "URL is very long: %d chars", text_len)
+      report(settings.symbols.very_long, 1.5, string.format("%d", text_len))
+    end
+  end
+
+  return found
+end
+
+-- Built-in checks in execution order:
+-- { key under cfg.checks, name of the function in `checks` }
+local builtin_check_order = {
+  { 'user_password', 'user_password_analysis' },
+  { 'numeric_ip', 'numeric_ip_analysis' },
+  { 'tld', 'tld_analysis' },
+  { 'unicode', 'unicode_analysis' },
+  { 'structure', 'structure_analysis' },
+}
+
+--- Main analysis function: runs every enabled check against one URL.
+-- @param task task object
+-- @param url URL object to inspect
+-- @param cfg plugin settings (`use_whitelist`, `checks`, `custom_checks`)
+-- @return array of findings ({ symbol, score, options }); empty when the host
+--   is whitelisted or nothing matched
+local function analyze_url(task, url, cfg)
+  local all_findings = {}
+
+  -- Optional: check whitelist first
+  if cfg.use_whitelist and maps.whitelist then
+    local host = url:get_host()
+    if host and maps.whitelist:get_key(host) then
+      lua_util.debugm(N, task, "URL host is whitelisted: %s", host)
+      return all_findings
+    end
+  end
+
+  -- Run all enabled checks (using built-in logic, no maps required)
+  local check_settings = cfg.checks or {}
+  for _, spec in ipairs(builtin_check_order) do
+    local check_cfg = check_settings[spec[1]]
+    if check_cfg and check_cfg.enabled then
+      -- Look the function up at call time so replaced checks are honoured
+      local findings = checks[spec[2]](task, url, check_cfg)
+      for _, f in ipairs(findings) do
+        all_findings[#all_findings + 1] = f
+      end
+    end
+  end
+
+  -- Run custom checks (advanced users). A custom check may return nothing
+  -- (no finding), a single finding table, or an array of finding tables.
+  for name, check_func in pairs(cfg.custom_checks or {}) do
+    local ok, res = pcall(check_func, task, url, cfg)
+    if not ok then
+      -- Only a raised error is reported; "no finding" is a normal outcome
+      rspamd_logger.errx(task, "Error in custom check %s: %s", name, res)
+    elseif type(res) == 'table' then
+      if res.symbol then
+        all_findings[#all_findings + 1] = res
+      else
+        for _, f in ipairs(res) do
+          if type(f) == 'table' and f.symbol then
+            all_findings[#all_findings + 1] = f
+          end
+        end
+      end
+    end
+  end
+
+  return all_findings
+end
+
+--- Main callback: analyses URLs selected by `settings.process_flags` and
+-- inserts one result per finding.
+-- @param task task object
+-- @return always false; results are inserted explicitly via task:insert_result
+local function url_suspect_callback(task)
+  -- Get URLs with suspicious flags (using existing flags)
+  local suspect_urls = task:get_urls_filtered(settings.process_flags)
+
+  if not suspect_urls or #suspect_urls == 0 then
+    return false
+  end
+
+  lua_util.debugm(N, task, "Processing %s URLs with suspicious flags", #suspect_urls)
+
+  local total_findings = 0
+
+  for _, url in ipairs(suspect_urls) do
+    local url_findings = analyze_url(task, url, settings)
+
+    for _, finding in ipairs(url_findings) do
+      -- A finding without a symbol name cannot be inserted; skip it
+      if finding.symbol then
+        -- NOTE(review): finding.score is passed as the insertion weight, which
+        -- multiplies any score configured for the symbol; confirm intended.
+        task:insert_result(finding.symbol, finding.score, finding.options or {})
+        total_findings = total_findings + 1
+      end
+    end
+  end
+
+  -- Backward compatibility: R_SUSPICIOUS_URL
+  if settings.compat_mode and total_findings > 0 then
+    -- Fire the legacy symbol only if one of the plugin's own symbols is
+    -- present in the task result
+    local has_findings = false
+    for _, symbol_name in pairs(settings.symbols) do
+      if task:has_symbol(symbol_name) then
+        has_findings = true
+        break
+      end
+    end
+
+    if has_findings then
+      -- The insertion weight is multiplied by the symbol's score (registered
+      -- as 5.0), so use 1.0 here to keep the legacy symbol worth 5.0, not 25.0
+      task:insert_result('R_SUSPICIOUS_URL', 1.0)
+    end
+  end
+
+  return false
+end
+
+--- Initialize maps (only if enabled).
+-- Each map is created only when both its `use_*` switch and its map source
+-- are set; results are stored in the file-level `maps` table.
+-- @param cfg plugin settings
+local function init_maps(cfg)
+  -- Loaded lazily: the module is required only if at least one map is enabled
+  local lua_maps
+
+  -- Create one map and store it under maps[slot]
+  local function add_map(slot, enabled, source, map_type, description)
+    if not (enabled and source) then
+      return
+    end
+
+    lua_maps = lua_maps or require "lua_maps"
+    local map = lua_maps.map_add_from_ucl(source, map_type, description)
+    if not map then
+      -- Checks test `maps.<slot>` before use, so a failed map is simply skipped
+      rspamd_logger.errx(rspamd_config, "cannot add map %s", description)
+    end
+    maps[slot] = map
+  end
+
+  -- Tolerate missing sections instead of raising on a nil index
+  local sections = cfg.checks or {}
+  local user_password = sections.user_password or {}
+  local numeric_ip = sections.numeric_ip or {}
+  local tld = sections.tld or {}
+  local structure = sections.structure or {}
+
+  add_map('whitelist', cfg.use_whitelist, cfg.whitelist_map,
+      'set', 'url_suspect_whitelist')
+  add_map('user_patterns', user_password.use_pattern_map, user_password.pattern_map,
+      'regexp', 'url_suspect_user_patterns')
+  add_map('user_blacklist', user_password.use_blacklist, user_password.blacklist_map,
+      'set', 'url_suspect_user_blacklist')
+  add_map('suspicious_ips', numeric_ip.use_range_map, numeric_ip.range_map,
+      'radix', 'url_suspect_ip_ranges')
+  add_map('suspicious_tlds', tld.use_tld_map, tld.tld_map,
+      'set', 'url_suspect_tlds')
+  add_map('suspicious_ports', structure.use_port_map, structure.port_map,
+      'set', 'url_suspect_ports')
+end
+
+-- Plugin registration: apply user options on top of the defaults, then
+-- register the callback symbol and its virtual children
+local opts = rspamd_config:get_all_opt(N)
+if opts then
+  settings = lua_util.override_defaults(settings, opts)
+end
+
+if settings.enabled then
+  init_maps(settings)
+
+  -- Parent symbol that actually runs the callback
+  local id = rspamd_config:register_symbol({
+    name = 'URL_SUSPECT_CHECK',
+    type = 'callback',
+    callback = url_suspect_callback,
+    priority = 10,
+    group = 'url',
+    flags = 'empty,nice'
+  })
+
+  -- Register `def` as a virtual symbol of the callback in the 'url' group
+  local function register_virtual(def)
+    def.type = 'virtual'
+    def.parent = id
+    def.group = 'url'
+    rspamd_config:register_symbol(def)
+  end
+
+  -- Every configured symbol name becomes a virtual symbol
+  for _, symbol_name in pairs(settings.symbols) do
+    register_virtual({ name = symbol_name })
+  end
+
+  -- Backward compat symbol
+  if settings.compat_mode then
+    register_virtual({
+      name = 'R_SUSPICIOUS_URL',
+      score = 5.0,
+      description = 'Suspicious URL (legacy symbol)'
+    })
+  end
+end