-- @param flags number - URL parsing flags
-- @return number - ACCEPT/SUSPICIOUS/REJECT
function exports.filter_url_string(url_text, flags)
- -- Sanity check: URL length
+ -- Note: this is invoked mid-parse from the C state machine with whatever
+ -- bytes have been seen so far (often just the userinfo span, not a full
+ -- URL). A blanket length REJECT here silently drops real phishing URLs
+ -- such as https://legit.com<lots-of-spaces>@evil.com/... where the
+ -- userinfo is intentionally bloated to obscure the real host. The C parser
+ -- already caps total URL length at G_MAXUINT16/2; this threshold is just a
+ -- DoS guard against catastrophic sizes.
local url_len = url_text:len()
- if url_len > 2048 then
- return exports.REJECT -- Overly long URL
+ if url_len > 16384 then
+ return exports.REJECT -- Catastrophic length, abort
end
-- Build control character set: 0x00-0x08, 0x0B-0x1F, 0x7F
goto out;
}
else if (p - c > max_email_user) {
- /* Oversized user field - consult Lua filter (fixes #5731) */
+ /*
+ * Oversized user field is itself an obfuscation signal
+ * (e.g. https://legit.com<lots-of-spaces>@evil.com/...),
+ * so mark obscured regardless of what the Lua filter says.
+ */
+ *flags |= RSPAMD_URL_FLAG_OBSCURED | RSPAMD_URL_FLAG_HAS_USER;
+
+ /* Consult Lua filter (fixes #5731) */
enum rspamd_url_lua_filter_result lua_decision =
rspamd_url_lua_consult(c, p - c, *flags, (lua_State *) lua_state);
/* REJECT: Lua says this is garbage, abort parsing */
goto out;
}
- else if (lua_decision == RSPAMD_URL_LUA_FILTER_SUSPICIOUS) {
- /* SUSPICIOUS: Mark as obscured for plugin analysis */
- *flags |= RSPAMD_URL_FLAG_OBSCURED;
- }
/* ACCEPT or SUSPICIOUS: continue parsing */
- *flags |= RSPAMD_URL_FLAG_HAS_USER;
}
p++;
{ "http://user@host@example.com", 0, SUSPICIOUS, "multiple @ signs" },
{ "http://" .. string.rep("@", 25) .. "example.com", 0, REJECT, ">20 @ signs (reject)" },
- -- Very long URLs
- { "http://example.com/" .. string.rep("a", 2100), 0, REJECT, ">2048 char URL (reject)" },
+ -- Very long URLs: the legacy 2048 threshold dropped real
+ -- userinfo-obfuscation phishing URLs; only catastrophic sizes
+ -- (>16 KiB) should now reject.
+ { "http://example.com/" .. string.rep("a", 2100), 0, ACCEPT, "2100 char URL (accept)" },
+ { "http://example.com/" .. string.rep("a", 17000), 0, REJECT, ">16384 char URL (reject)" },
-- Control characters (should reject)
{ "http://example.com/\x00test", 0, REJECT, "URL with null byte" },