description = "URL is very long";
one_shot = false;
}
+
+ # Query issues
+ "URL_QUERY_MULTIPLE_URLS" {
+ weight = 2.0;
+ description = "URL query embeds more than one URL";
+ one_shot = true;
+ }
}
/* Propagate source/classification flags from the parent (outer) URL */
url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
+ /*
+ * Link the query URL to its parent so CTA/button weight can propagate to it
+ * (it owns no tag). Only if unset: a later PHISHED/REDIRECTED decision may
+ * override linked_url, and get_phished/get_redirected only follow it then.
+ */
+ if (cbd->url) {
+ if (url->ext == nullptr) {
+ url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
+ }
+ if (url->ext->linked_url == nullptr) {
+ url->ext->linked_url = cbd->url;
+ }
+ }
+
if (rspamd_url_set_add_or_increase(cbd->url_set, url, false)) {
html_part_add_url(cbd->part_urls, url);
}
qcbd.part_urls = part_urls;
qcbd.parent_flags = url->flags;
- rspamd_url_find_multiple(pool,
- rspamd_url_query_unsafe(url), url->querylen,
- RSPAMD_URL_FIND_ALL, NULL,
+ rspamd_url_find_in_query(pool, url, RSPAMD_URL_FIND_ALL,
html_url_query_callback, &qcbd, L);
}
*/
float weight = rspamd_html_url_button_weight(text_part->html, u);
+ /* A query-extracted URL (e.g. a wrapper's real target in ?u=...) owns no
+ * tag, so it has no weight; walk the linked_url chain up to a tagged
+ * ancestor, bounded by the nesting cap, so a leaf several hops deep
+ * still inherits CTA from the original tagged href. */
+ if (weight <= 0.0 && (u->flags & RSPAMD_URL_FLAG_QUERY)) {
+ struct rspamd_url *ancestor = u;
+ for (unsigned int hop = 0; hop < RSPAMD_URL_QUERY_MAX_NESTING; hop++) {
+ if (ancestor->ext == nullptr ||
+ ancestor->ext->linked_url == nullptr ||
+ ancestor->ext->linked_url == ancestor) {
+ break;
+ }
+ ancestor = ancestor->ext->linked_url;
+ weight = rspamd_html_url_button_weight(text_part->html, ancestor);
+ if (weight > 0.0) {
+ break;
+ }
+ }
+ }
+
if (weight > 0.0) {
if (rspamd_heap_size(rspamd_html_heap_storage, heap_ptr) < max_cta) {
rspamd_html_cta_entry entry = {
return (u->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP)) != 0;
}
+struct query_target_scan_cbd {
+ unsigned int count; /* embedded URLs at this level */
+ struct rspamd_url *single; /* the sole embedded URL, when count == 1 */
+};
+
+static gboolean
+html_query_target_cb(struct rspamd_url *url, gsize start_offset,
+ gsize end_offset, gpointer ud)
+{
+ auto *cbd = static_cast<query_target_scan_cbd *>(ud);
+
+ if (!is_transfer_proto(url) || url->tldlen == 0) {
+ return TRUE; /* keep scanning the query for other urls */
+ }
+
+ cbd->count++;
+ cbd->single = url;
+
+ return TRUE;
+}
+
+/*
+ * True if the href is a single-target wrapper/redirector whose (possibly nested)
+ * leaf URL matches the displayed domain - so the host mismatch is not phishing.
+ * Follows one embedded URL per query level (e.g. linkprotect?a=https://disp.tld,
+ * or a wrapper whose target is itself a wrapper); any level holding more than
+ * one embedded URL is ambiguous and never suppresses.
+ */
+static auto
+html_href_query_targets_display(rspamd_mempool_t *pool,
+ struct rspamd_url *href_url,
+ std::string_view disp_tld,
+ lua_State *L) -> bool
+{
+ if (disp_tld.empty()) {
+ return false;
+ }
+
+ struct rspamd_url *leaf = nullptr;
+ struct rspamd_url *cur = href_url;
+ bool fully_consumed = false;
+
+ for (unsigned int depth = 0; depth < RSPAMD_URL_QUERY_MAX_NESTING; depth++) {
+ if (cur->querylen == 0) {
+ fully_consumed = true; /* natural end: no further query */
+ break;
+ }
+
+ query_target_scan_cbd cbd{0, nullptr};
+ rspamd_url_query_foreach_embedded(pool, cur, RSPAMD_URL_FIND_ALL,
+ html_query_target_cb, &cbd, L);
+
+ if (cbd.count == 0) {
+ fully_consumed = true; /* natural end: query carries no embedded url */
+ break;
+ }
+ if (cbd.count != 1) {
+ return false; /* ambiguous: more than one embedded url at this level */
+ }
+
+ leaf = cbd.single; /* descend into the single embedded url */
+ cur = cbd.single;
+ }
+
+ /* Exited via nesting budget exhaustion: the real leaf is deeper than we
+ * are willing to follow, so we cannot decide on a match against an
+ * intermediate hop. Safe default: do not suppress. */
+ if (!fully_consumed || leaf == nullptr) {
+ return false;
+ }
+
+ auto leaf_tld = convert_idna_hostname_maybe(pool, leaf, true);
+ return sv_equals(leaf_tld, disp_tld) ||
+ rspamd_url_is_subdomain(leaf_tld, disp_tld) ||
+ rspamd_url_is_subdomain(disp_tld, leaf_tld);
+}
+
auto html_url_is_phished(rspamd_mempool_t *pool,
struct rspamd_url *href_url,
std::string_view text_data,
if (!sv_equals(disp_tok, href_tok)) {
/* Check if one url is a subdomain for another */
- if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
+ if (!rspamd_url_is_subdomain(disp_tok, href_tok) &&
+ !html_href_query_targets_display(pool, href_url, disp_tok, L)) {
href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
msg->host = g_string_new_len(host, pu.field_data[UF_HOST].len);
msg->url = rspamd_fstring_append(msg->url, path, pathlen);
+ if ((pu.field_set & (1 << UF_PATH)) == 0 &&
+ (pu.field_set & (1 << UF_QUERY)) != 0) {
+ /*
+ * URL has a query but no path (e.g. http://host?a=b): the default
+ * path "/" was used above, which is a literal not pointing into the
+ * query, so the query was not included. Append "?<query>" explicitly,
+ * otherwise the request-target would be just "/" and the query string
+ * would be silently dropped.
+ */
+ msg->url = rspamd_fstring_append(msg->url, "?", 1);
+ msg->url = rspamd_fstring_append(msg->url,
+ url + pu.field_data[UF_QUERY].off,
+ pu.field_data[UF_QUERY].len);
+ }
+
REF_INIT_RETAIN(msg, rspamd_http_message_free);
return msg;
if (url->querylen > 0) {
struct rspamd_url_mimepart_cbdata qcbd = *cbd;
qcbd.parent_flags = url->flags;
- rspamd_url_find_multiple(task->task_pool,
- rspamd_url_query_unsafe(url), url->querylen,
- RSPAMD_URL_FIND_ALL, NULL,
+ rspamd_url_find_in_query(task->task_pool, url,
+ RSPAMD_URL_FIND_ALL,
rspamd_url_query_callback, &qcbd,
task->cfg ? task->cfg->lua_state : NULL);
}
}
}
+static void rspamd_url_find_in_query_depth(rspamd_mempool_t *pool,
+ struct rspamd_url *url,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud,
+ void *lua_state,
+ unsigned int depth);
+
+struct rspamd_url_query_count_cbd {
+ rspamd_mempool_t *pool;
+ enum rspamd_url_find_type how;
+ url_insert_function func;
+ gpointer ud;
+ void *lua_state;
+ unsigned int depth;
+ unsigned int count;
+};
+
+static gboolean
+rspamd_url_query_count_cb(struct rspamd_url *url, gsize start_offset,
+ gsize end_offset, gpointer ud)
+{
+ struct rspamd_url_query_count_cbd *cbd =
+ (struct rspamd_url_query_count_cbd *) ud;
+ gboolean ret;
+
+ cbd->count++;
+ ret = cbd->func(url, start_offset, end_offset, cbd->ud);
+
+ /* Recurse into the extracted URL's own query to unwrap nested (properly
+ * escaped) targets, bounded by the nesting limit. Stop if the insert
+ * callback bailed (e.g. max_urls reached). */
+ if (ret && cbd->depth + 1 < RSPAMD_URL_QUERY_MAX_NESTING &&
+ url->querylen > 0) {
+ rspamd_url_find_in_query_depth(cbd->pool, url, cbd->how, cbd->func,
+ cbd->ud, cbd->lua_state, cbd->depth + 1);
+ }
+
+ return ret;
+}
+
+void rspamd_url_query_foreach_embedded(rspamd_mempool_t *pool,
+ struct rspamd_url *url,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud,
+ void *lua_state)
+{
+ const char *raw, *query, *end, *p, *c;
+
+ if (url->raw == NULL || url->rawlen == 0) {
+ return;
+ }
+
+ /*
+ * Use the raw (percent-encoded) query: an embedded URL's own separators are
+ * %26/%3B there, so splitting on literal '&'/';' bounds it to its parameter
+ * instead of greedily reading into the following parameters.
+ */
+ raw = url->raw;
+ query = memchr(raw, '?', url->rawlen);
+ if (query == NULL) {
+ return;
+ }
+ query++; /* skip '?' */
+ end = raw + url->rawlen;
+
+ {
+ const char *frag = memchr(query, '#', end - query);
+ if (frag != NULL) {
+ end = frag;
+ }
+ }
+
+ /*
+ * Worst case the whole query decodes into one buffer; each parameter writes
+ * into its own non-overlapping slot, so rspamd_url_parse can safely keep
+ * raw pointers into it (the buffer lives for the pool's lifetime). Lazy
+ * alloc keeps queries with no embedded URL paying nothing.
+ */
+ const gsize query_total = (gsize) (end - query);
+ char *scratch = NULL;
+ gsize scratch_off = 0;
+
+ p = query;
+ c = query;
+ while (p <= end) {
+ if (p == end || *p == '&' || *p == ';') {
+ if (p > c) {
+ /* Parameter [c, p): scan the value after the first '=' */
+ const char *eq = memchr(c, '=', p - c);
+ const char *vstart = eq ? eq + 1 : c;
+ gsize vlen = p - vstart;
+
+ if (vlen > 0) {
+ if (scratch == NULL) {
+ scratch = rspamd_mempool_alloc(pool, query_total + 1);
+ }
+ char *slot = scratch + scratch_off;
+ gsize dlen = rspamd_url_decode(slot, vstart, vlen);
+ slot[dlen] = '\0';
+ rspamd_url_find_multiple(pool, slot, dlen, how, NULL,
+ func, ud, lua_state);
+ scratch_off += dlen + 1;
+ }
+ }
+ c = p + 1;
+ }
+ p++;
+ }
+}
+
+static void rspamd_url_find_in_query_depth(rspamd_mempool_t *pool,
+ struct rspamd_url *url,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud,
+ void *lua_state,
+ unsigned int depth)
+{
+ struct rspamd_url_query_count_cbd cbd = {pool, how, func, ud, lua_state,
+ depth, 0};
+
+ rspamd_url_query_foreach_embedded(pool, url, how,
+ rspamd_url_query_count_cb, &cbd, lua_state);
+
+ /* Record how many URLs are embedded in this URL's query. */
+ if (cbd.count > 0) {
+ if (url->ext == NULL) {
+ url->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext);
+ }
+ url->ext->query_embedded_urls =
+ cbd.count > UINT16_MAX ? UINT16_MAX : (uint16_t) cbd.count;
+ }
+}
+
+void rspamd_url_find_in_query(rspamd_mempool_t *pool,
+ struct rspamd_url *url,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud,
+ void *lua_state)
+{
+ rspamd_url_find_in_query_depth(pool, url, how, func, ud, lua_state, 0);
+}
+
void rspamd_url_find_single(rspamd_mempool_t *pool,
const char *in,
gsize inlen,
}
+struct rspamd_url_subject_query_cbd {
+ struct rspamd_task *task;
+ uint32_t parent_flags;
+};
+
+static gboolean
+rspamd_url_subject_query_callback(struct rspamd_url *query_url, gsize start_offset,
+ gsize end_offset, gpointer ud)
+{
+ struct rspamd_url_subject_query_cbd *cbd = ud;
+ struct rspamd_task *task = cbd->task;
+
+ if (query_url->hostlen == 0) {
+ return TRUE;
+ }
+
+ query_url->flags |= RSPAMD_URL_FLAG_QUERY;
+ /* Propagate source/classification flags from the parent URL */
+ query_url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
+
+ if (query_url->protocol == PROTOCOL_MAILTO && query_url->userlen == 0) {
+ return TRUE;
+ }
+
+ rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), query_url, false);
+
+ return TRUE;
+}
+
gboolean
rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset,
gsize end_offset, gpointer ud)
{
struct rspamd_task *task = ud;
- char *url_str = NULL;
- struct rspamd_url *query_url;
- int rc;
- gboolean prefix_added;
/* It is just a displayed URL, we should not check it for certain things */
url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED | RSPAMD_URL_FLAG_SUBJECT;
/* We also search the query for additional url inside */
if (url->querylen > 0) {
- if (rspamd_url_find(task->task_pool, rspamd_url_query_unsafe(url), url->querylen,
- &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
-
- query_url = rspamd_mempool_alloc0(task->task_pool,
- sizeof(struct rspamd_url));
- rc = rspamd_url_parse(query_url,
- url_str,
- strlen(url_str),
- task->task_pool,
- RSPAMD_URL_PARSE_TEXT,
- task->cfg ? task->cfg->lua_state : NULL);
-
- if (rc == URI_ERRNO_OK &&
- query_url->hostlen > 0) {
- msg_debug_task("found url %s in query of url"
- " %*s",
- url_str, url->querylen, rspamd_url_query_unsafe(url));
-
- query_url->flags |= RSPAMD_URL_FLAG_QUERY;
- /* Propagate source/classification flags from the parent URL */
- query_url->flags |= (url->flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
-
- if (prefix_added) {
- query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
- }
-
- if (query_url->protocol == PROTOCOL_MAILTO) {
- if (query_url->userlen == 0) {
- return TRUE;
- }
- }
-
- rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls),
- query_url, false);
- }
- }
+ struct rspamd_url_subject_query_cbd qcbd = {
+ .task = task,
+ .parent_flags = url->flags,
+ };
+ rspamd_url_find_in_query(task->task_pool, url,
+ RSPAMD_URL_FIND_ALL,
+ rspamd_url_subject_query_callback, &qcbd,
+ task->cfg ? task->cfg->lua_state : NULL);
}
return TRUE;
struct rspamd_url *linked_url;
uint16_t port;
+ /* Number of URLs found embedded in this URL's query string */
+ uint16_t query_embedded_urls;
};
#define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL)
url_insert_function func,
gpointer ud);
+/**
+ * How deep to follow URLs nested inside the query of an already query-extracted
+ * URL (a properly escaped wrapper carries one target per encoding layer).
+ */
+#define RSPAMD_URL_QUERY_MAX_NESTING 5
+
+/**
+ * Find URLs embedded in the query parameters of `url`. Unlike
+ * rspamd_url_find_multiple, this walks the url's raw (still percent-encoded)
+ * query, splits it on the parameter separators '&' and ';', and decodes each
+ * value before scanning it. This bounds an embedded URL to its own parameter
+ * (its internal separators are %26/%3B at that point) instead of greedily
+ * swallowing the following parameters. URLs nested inside an extracted URL's
+ * own query are followed up to RSPAMD_URL_QUERY_MAX_NESTING levels.
+ * @param pool
+ * @param url url whose query is scanned
+ * @param how
+ * @param func callback invoked for each url found
+ * @param ud
+ * @param lua_state Lua state for consultation (may be NULL)
+ */
+void rspamd_url_find_in_query(rspamd_mempool_t *pool,
+ struct rspamd_url *url,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud,
+ void *lua_state);
+
+/**
+ * Invoke `func` for every URL embedded in `url`'s query, scanning a single
+ * level only (no nesting). Uses the same raw, per-parameter, decode-each-value
+ * bounding as rspamd_url_find_in_query.
+ * @param pool
+ * @param url url whose query is scanned
+ * @param how
+ * @param func callback invoked for each url found
+ * @param ud
+ * @param lua_state Lua state for consultation (may be NULL)
+ */
+void rspamd_url_query_foreach_embedded(rspamd_mempool_t *pool,
+ struct rspamd_url *url,
+ enum rspamd_url_find_type how,
+ url_insert_function func,
+ gpointer ud,
+ void *lua_state);
+
/**
* Generic callback to insert URLs into rspamd_task
* @param url
inject_url_query(struct rspamd_task *task, struct rspamd_url *url,
GPtrArray *part_urls)
{
- if (url->querylen > 0) {
+ /*
+ * REDIRECTED means url_redirector already resolved this hop to a real target
+ * (a chain hop), so skip query-param guessing to avoid duplicating it;
+ * terminal hops (200/dead-end) are never REDIRECTED, so still scanned.
+ */
+ if (!(url->flags & RSPAMD_URL_FLAG_REDIRECTED) && url->querylen > 0) {
struct rspamd_url_query_to_inject_cbd cbd;
cbd.task = task;
cbd.mpart_urls = part_urls;
cbd.parent_flags = url->flags;
- rspamd_url_find_multiple(task->task_pool,
- rspamd_url_query_unsafe(url), url->querylen,
- RSPAMD_URL_FIND_ALL, NULL,
+ rspamd_url_find_in_query(task->task_pool, url,
+ RSPAMD_URL_FIND_ALL,
inject_url_query_callback, &cbd,
task->cfg ? task->cfg->lua_state : NULL);
}
if (task && task->message && url && url->url) {
rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url->url, false);
- if (mpart && mpart->urls) {
- inject_url_query(task, url->url, mpart->urls);
- }
+ /*
+ * Scan the injected URL's query string for embedded URLs (e.g. a
+ * redirector/wrapper hop that carries its real target in ?u=...). This
+ * must run even without an associated mime part, otherwise URLs
+ * injected by url_redirector and similar consumers miss the query
+ * extraction that MIME-parsed URLs receive.
+ */
+ inject_url_query(task, url->url, (mpart && mpart->urls) ? mpart->urls : NULL);
}
else {
return luaL_error(L, "invalid arguments");
LUA_FUNCTION_DEF(url, get_phished);
LUA_FUNCTION_DEF(url, set_redirected);
LUA_FUNCTION_DEF(url, get_count);
+LUA_FUNCTION_DEF(url, get_query_embedded_urls);
LUA_FUNCTION_DEF(url, get_visible);
LUA_FUNCTION_DEF(url, get_hash);
LUA_FUNCTION_DEF(url, create);
LUA_INTERFACE_DEF(url, get_visible),
LUA_INTERFACE_DEF(url, get_count),
+ LUA_INTERFACE_DEF(url, get_query_embedded_urls),
LUA_INTERFACE_DEF(url, get_hash),
LUA_INTERFACE_DEF(url, get_flags),
LUA_INTERFACE_DEF(url, get_flags_num),
return 1;
}
+/***
+ * @method url:get_query_embedded_urls()
+ * Get the number of URLs found embedded in this URL's query string
+ * @return {number} count of query-embedded URLs (0 if none)
+ */
+static int
+lua_url_get_query_embedded_urls(lua_State *L)
+{
+ LUA_TRACE_POINT;
+ struct rspamd_lua_url *url = lua_check_url(L, 1);
+
+ if (url != NULL && url->url != NULL) {
+ lua_pushinteger(L, url->url->ext ? url->url->ext->query_embedded_urls : 0);
+ }
+ else {
+ lua_pushnil(L);
+ }
+
+ return 1;
+}
+
/***
* @method url:get_visible()
* Get visible part of the url with html tags stripped
return table.concat(hosts, '->')
end
--- Compute the per-URL Redis cache key. Hashing the URL string keeps keys
--- fixed-length and free of URL-unsafe characters; using tostring() (rather
--- than :get_raw()) keeps the hash stable across the write-then-read cycle
--- when chain values are roundtripped through rspamd_url.create.
+-- Mixed into the hashed cache key; bump on incompatible value-format changes
+-- so old-format entries hash elsewhere and get re-resolved, not misread.
+-- v1: value is the raw (percent-encoded) URL, not the decoded text.
+local cache_format_version = 'v1:'
+
+-- Per-URL Redis cache key: hash the URL (fixed-length, URL-safe). tostring()
+-- (not get_raw) keeps the hash stable across the write-then-read round-trip.
local function cache_key_for_url(url_str)
- return settings.key_prefix .. hash.create(url_str):base32():sub(1, 32)
+ return settings.key_prefix ..
+ hash.create(cache_format_version .. url_str):base32():sub(1, 32)
end
-- Whether an intermediate hop should be saved (in cache and task URL set)
local function write_link(prev_url, next_url, marker)
local link_key = cache_key_for_url(tostring(prev_url))
- local next_str = encode_url_for_redirect(next_url:get_text())
+ -- Cache the raw (percent-encoded) form: keeps query boundaries intact so
+ -- cached hops re-parse identically to live ones, and it is already URL-safe.
+ local next_str = next_url:get_raw()
local cache_value
if marker then
cache_value = string.format('^%s:%s', marker, next_str)
-- Mirror the cache walk's cycle guard: a redirector loop A->B->A->B
-- (e.g. login redirector flapping between two hosts) would otherwise
-- chew through nested_limit and bloat the chain with alternating
- -- entries.
+ -- entries. tostring() (not get_raw): the cycle guard, cache key and
+ -- GET-map match need a stable identity that collapses encoding variants;
local url_str = tostring(url)
if seen[url_str] then
lua_util.debugm(N, task, 'cycle in http walk at %s', url_str)
method = 'get'
end
+ -- Request the raw (percent-encoded) URL: the decoded form would let a wrapper
+ -- mis-split its ?u=https%3A%2F%2F... target at the now-literal '&' and truncate.
+ local request_url = url:get_raw()
+
local http_params = {
- url = url_str,
+ url = request_url,
task = task,
method = method,
max_size = settings.max_size,
backslash = "URL_BACKSLASH_PATH",
excessive_dots = "URL_EXCESSIVE_DOTS",
very_long = "URL_VERY_LONG",
+ -- Query symbols
+ query_multiple_urls = "URL_QUERY_MULTIPLE_URLS",
-- Obfuscated text symbol
obfuscated_text = "URL_OBFUSCATED_TEXT"
}
check_length = true,
max_url_length = 2048
},
+ query_urls = {
+ enabled = true,
+ -- More than this many URLs embedded in a single URL's query is anomalous
+ max_query_urls = 1
+ },
obfuscated_text = {
enabled = true,
-- DoS protection limits
backslash = "URL_BACKSLASH_PATH",
excessive_dots = "URL_EXCESSIVE_DOTS",
very_long = "URL_VERY_LONG",
+ -- Query symbols
+ query_multiple_urls = "URL_QUERY_MULTIPLE_URLS",
-- Obfuscated text symbol
obfuscated_text = "URL_OBFUSCATED_TEXT"
},
return findings
end
+-- Check: multiple URLs embedded in the query string
+function checks.query_urls_analysis(task, url, cfg)
+ local findings = {}
+
+ -- One URL embedded in a query is common (redirectors/trackers); more than
+ -- one in a single URL's query is anomalous.
+ local n = url:get_query_embedded_urls()
+
+ if n and n > cfg.max_query_urls then
+ lua_util.debugm(N, task, "URL query embeds %d URLs", n)
+ table.insert(findings, {
+ symbol = symbols.query_multiple_urls,
+ options = { string.format("%d", n) }
+ })
+ end
+
+ return findings
+end
+
-- Main analysis function
local function analyze_url(task, url, cfg)
local all_findings = {}
end
end
+ if cfg.checks.query_urls and cfg.checks.query_urls.enabled then
+ local findings = checks.query_urls_analysis(task, url, cfg.checks.query_urls)
+ for _, f in ipairs(findings) do
+ table.insert(all_findings, f)
+ end
+ end
+
-- Run custom checks (advanced users)
for name, check_func in pairs(cfg.custom_checks) do
local ok, findings = pcall(check_func, task, url, cfg)
-- TLD and structure checks don't have corresponding URL flags, so need all URLs
local need_all_urls = (
(settings.checks.tld and settings.checks.tld.enabled) or
+ (settings.checks.query_urls and settings.checks.query_urls.enabled) or
(settings.checks.structure and settings.checks.structure.enabled and
(settings.checks.structure.check_multiple_at or
settings.checks.structure.check_excessive_dots or
${MESSAGE1} ${RSPAMD_TESTDIR}/messages/phishing1.eml
${MESSAGE2} ${RSPAMD_TESTDIR}/messages/phishing2.eml
${MESSAGE3} ${RSPAMD_TESTDIR}/messages/phishing3.eml
+${MESSAGE_QSAFE} ${RSPAMD_TESTDIR}/messages/phishing_query_safe.eml
+${MESSAGE_QMULTI} ${RSPAMD_TESTDIR}/messages/phishing_query_multi.eml
+${MESSAGE_QNESTED} ${RSPAMD_TESTDIR}/messages/phishing_query_nested.eml
+${MESSAGE_QNESTFIRE} ${RSPAMD_TESTDIR}/messages/phishing_query_nested_fire.eml
+${MESSAGE_QOVERCAP} ${RSPAMD_TESTDIR}/messages/phishing_query_overcap.eml
${SETTINGS_PHISHING} {symbols_enabled = [PHISHING,STRICT_PHISHING,STRICTER_PHISHING]}
*** Test Cases ***
Scan File ${MESSAGE3}
... Settings=${SETTINGS_PHISHING}
Expect Symbol STRICTER_PHISHING
+
+TEST PHISHING NO FP WHEN HREF QUERY DEST EQUALS DISPLAY TEXT
+ # href host differs from the displayed domain, but the href's query embeds a
+ # destination URL equal to the displayed domain (a wrapper/redirector that
+ # points back at the shown domain) -- not phishing.
+ Scan File ${MESSAGE_QSAFE}
+ ... Settings=${SETTINGS_PHISHING}
+ Do Not Expect Symbol PHISHING
+
+TEST PHISHING FIRES WHEN HREF QUERY HAS MULTIPLE URLS
+ # The href query embeds two URLs (one matching the displayed domain, one not).
+ # Multiple embedded URLs are ambiguous, so the single-target wrapper exception
+ # does not apply and phishing must still fire on the host mismatch.
+ Scan File ${MESSAGE_QMULTI}
+ ... Settings=${SETTINGS_PHISHING}
+ Expect Symbol PHISHING
+
+TEST PHISHING NO FP WHEN NESTED QUERY LEAF EQUALS DISPLAY TEXT
+ # The href wraps a wrapper: its query holds one URL (mid) whose query in turn
+ # holds one URL (the displayed domain). Following the single-target chain to
+ # the leaf, it equals the displayed domain -- not phishing.
+ Scan File ${MESSAGE_QNESTED}
+ ... Settings=${SETTINGS_PHISHING}
+ Do Not Expect Symbol PHISHING
+
+TEST PHISHING FIRES WHEN NESTED QUERY LEAF DIFFERS FROM DISPLAY TEXT
+ # The href wraps a wrapper whose own host equals the displayed domain, but its
+ # query wraps a further URL (the leaf) pointing elsewhere. The single-target
+ # chain must be followed to the leaf: an intermediate match does not suppress,
+ # so phishing fires on the leaf vs display mismatch.
+ Scan File ${MESSAGE_QNESTFIRE}
+ ... Settings=${SETTINGS_PHISHING}
+ Expect Symbol PHISHING
+
+TEST PHISHING FIRES WHEN NESTED CHAIN EXCEEDS NESTING CAP
+ # The href wraps a single-target chain deeper than RSPAMD_URL_QUERY_MAX_NESTING.
+ # The deepest URL we are willing to follow happens to match the displayed
+ # domain, but the real leaf is one more level down and points elsewhere.
+ # Since the walk exits via budget exhaustion rather than reaching a natural
+ # end, the intermediate match must not suppress phishing.
+ Scan File ${MESSAGE_QOVERCAP}
+ ... Settings=${SETTINGS_PHISHING}
+ Expect Symbol PHISHING
... Settings={symbols_enabled = [URL_SUSPECT_CHECK, URL_MULTIPLE_AT_SIGNS]}
Expect Symbol URL_MULTIPLE_AT_SIGNS
+URL Suspect - Multiple Query URLs
+ # A single URL whose query embeds more than one URL is anomalous
+ Scan File ${RSPAMD_TESTDIR}/messages/url_suspect_multi_query.eml
+ ... Settings={symbols_enabled = [URL_SUSPECT_CHECK, URL_QUERY_MULTIPLE_URLS]}
+ Expect Symbol With Exact Options URL_QUERY_MULTIPLE_URLS 2
+
+URL Suspect - Single Query URL
+ # One embedded URL in a query is common (redirectors/trackers) and must not fire
+ Scan File ${RSPAMD_TESTDIR}/messages/url_suspect_single_query.eml
+ ... Settings={symbols_enabled = [URL_SUSPECT_CHECK, URL_QUERY_MULTIPLE_URLS]}
+ Do Not Expect Symbol URL_QUERY_MULTIPLE_URLS
+
URL Suspect - Normal URL
# Test that normal URLs don't trigger symbols
Scan File ${RSPAMD_TESTDIR}/messages/url_suspect_normal.eml
--- /dev/null
+*** Settings ***
+Suite Setup Urlredirector Setup
+Suite Teardown Urlredirector Teardown
+Library Process
+Library ${RSPAMD_TESTDIR}/lib/rspamd.py
+Resource ${RSPAMD_TESTDIR}/lib/rspamd.robot
+Test Tags notparallel
+Variables ${RSPAMD_TESTDIR}/lib/vars.py
+
+*** Variables ***
+${CONFIG} ${RSPAMD_TESTDIR}/configs/url_redirector_phishing.conf
+${MESSAGE} ${RSPAMD_TESTDIR}/messages/redir_phishing_safe.eml
+${REDIS_SCOPE} Suite
+${RSPAMD_SCOPE} Suite
+${RSPAMD_URL_TLD} ${RSPAMD_TESTDIR}/../lua/unit/test_tld.dat
+${SETTINGS} {symbols_enabled=[URL_REDIRECTOR_CHECK, PHISHING]}
+
+*** Test Cases ***
+PHISHING NO FP WHEN DISPLAY TEXT EQUALS REDIRECT DEST
+ # t.co (a known redirector, faked to the dummy HTTP server) resolves to the
+ # same domain shown in the anchor text, so the resolved redirect destination
+ # equals the displayed URL -- phishing must not fire.
+ Scan File ${MESSAGE} Settings=${SETTINGS}
+ Expect Symbol URL_REDIRECTOR
+ Do Not Expect Symbol PHISHING
+
+*** Keywords ***
+Urlredirector Setup
+ Run Dummy Http
+ Rspamd Redis Setup
+
+Urlredirector Teardown
+ Rspamd Redis Teardown
+ Dummy Http Teardown
+ Terminate All Processes kill=True
--- /dev/null
+*** Settings ***
+Suite Setup Urlredirector Setup
+Suite Teardown Urlredirector Teardown
+Library Process
+Library ${RSPAMD_TESTDIR}/lib/rspamd.py
+Resource ${RSPAMD_TESTDIR}/lib/rspamd.robot
+Test Tags notparallel
+Variables ${RSPAMD_TESTDIR}/lib/vars.py
+
+*** Variables ***
+${CONFIG} ${RSPAMD_TESTDIR}/configs/url_redirector.conf
+${MESSAGE} ${RSPAMD_TESTDIR}/messages/redir_query_target.eml
+${REDIS_SCOPE} Suite
+${RSPAMD_SCOPE} Suite
+${RSPAMD_URL_TLD} ${RSPAMD_TESTDIR}/../lua/unit/test_tld.dat
+${SETTINGS} {symbols_enabled=[URL_REDIRECTOR_CHECK]}
+
+*** Test Cases ***
+RESOLVE ENCODED QUERY TARGET THROUGH PATH-LESS WRAPPER
+ # /combo_entry redirects to a PATH-LESS wrapper that carries the real target
+ # percent-encoded (with its own &-separated params) in ?u=. Resolving the
+ # full destination requires the request to a path-less URL to keep its query
+ # and to send it percent-encoded; otherwise the wrapper sees a dropped or
+ # &-truncated u and the target is lost.
+ Scan File ${MESSAGE} Flags=ext_urls Settings=${SETTINGS}
+ Expect Extended URL http://dest.com/?a=1&b=2
+ # The followed wrapper also carried &other=...; a hop already resolved to a
+ # real redirect target must not have its query re-extracted, so the extra
+ # URL must not surface.
+ Do Not Expect Extended URL http://other.com/
+
+*** Keywords ***
+Urlredirector Setup
+ Run Dummy Http
+ Rspamd Redis Setup
+
+Urlredirector Teardown
+ Rspamd Redis Teardown
+ Dummy Http Teardown
+ Terminate All Processes kill=True
t.co
bit.ly
127.0.0.1
+redir.example.net
type = a;
replies = ["127.0.0.2"];
},
+ # Redirector test (168_url_redirector_phishing): fake A so the dummy
+ # HTTP server on 127.0.0.1 is reached without real DNS.
+ {
+ name = "redir.example.net";
+ type = a;
+ replies = ["127.0.0.1"];
+ },
# TODO: add IPv6 tests
];
}
--- /dev/null
+.include(duplicate=append,priority=0) "{= env.TESTDIR =}/configs/plugins.conf"
+
+redis {
+ servers = "{= env.REDIS_ADDR =}:{= env.REDIS_PORT =}";
+}
+
+url_redirector {
+ redirector_hosts_map = "{= env.TESTDIR =}/configs/maps/redir.map";
+ redirector_symbol = "URL_REDIRECTOR";
+}
+
+phishing {
+ symbol = "PHISHING";
+}
END
Should Be True ${found_url} msg="Expected URL was not found: ${url}"
+Do Not Expect Extended URL
+ [Arguments] ${url}
+ ${found_url} = Set Variable ${FALSE}
+ ${url_list} = Convert To List ${SCAN_RESULT}[urls]
+ FOR ${item} IN @{url_list}
+ ${d} = Convert To Dictionary ${item}
+ ${found_url} = Evaluate "${d}[url]" == "${url}"
+ Exit For Loop If ${found_url} == ${TRUE}
+ END
+ Should Not Be True ${found_url} msg="URL should not be present: ${url}"
+
Expect Symbol With Exact Options
[Arguments] ${symbol} @{options}
Expect Symbol ${symbol}
local url_text = 'http://example.com?redir=https://another.com'
local url_to_inject = url.create(task:get_mempool(), url_text)
task:inject_url(url_to_inject)
- if #(task:get_urls()) == 2 then
+ -- 3 urls: 1 from the scanned ics.eml (SUMMARY: http://test.com), 1 injected
+ -- (example.com), and 1 extracted from the injected URL's query by the
+ -- bounded query scan inject_url runs (another.com from ?redir=...).
+ if #(task:get_urls()) == 3 then
return true
end
return false
--- /dev/null
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fwww.cnn.com%2F&v=http%3A%2F%2Fevil.org%2F">http://www.cnn.com</a>
--- /dev/null
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fmid.com%2F%3Fv%3Dhttp%253A%252F%252Fwww.cnn.com%252F">http://www.cnn.com</a>
--- /dev/null
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fwww.cnn.com%2F%3Fv%3Dhttp%253A%252F%252Fevil.org%252F">http://www.cnn.com</a>
--- /dev/null
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fa.io%2F%3Fv%3Dhttp%253A%252F%252Fb.io%252F%253Fw%253Dhttp%25253A%25252F%25252Fc.io%25252F%25253Fx%25253Dhttp%2525253A%2525252F%2525252Fd.io%2525252F%2525253Fy%2525253Dhttp%252525253A%252525252F%252525252Fwww.cnn.com%252525252F%252525253Fz%252525253Dhttp%25252525253A%25252525252F%25252525252Fev.io%25252525252F">http://www.cnn.com</a>
--- /dev/null
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fwww.cnn.com%2F">http://www.cnn.com</a>
--- /dev/null
+Content-type: text/html
+
+<a href="http://redir.example.net:18080/redir_to_example">http://www.example.com/</a>
--- /dev/null
+Content-type: text/plain
+
+click http://127.0.0.1:18080/combo_entry
--- /dev/null
+Content-Type: text/html
+
+<html><body><a href="http://wrap.com/r?u=https://a.com/&v=https://b.com/">click</a></body></html>
--- /dev/null
+Content-Type: text/html
+
+<html><body><a href="http://wrap.com/r?u=https://a.com/">click</a></body></html>
elif path == "/mailto_redirect":
# Send an HTTP redirect to a mailto: URL (non-HTTP scheme)
self.redirect("mailto:user@example.net")
+ elif path == "/redir_to_example":
+ # Redirect to a domain matching the displayed text, so the phishing
+ # module sees a 1:1 redirect target and must not fire (no FP).
+ self.redirect("http://www.example.com/")
elif path == "/chain_intermediate_1":
# Intermediate hop to chain_to_tel2
self.redirect(f"{self.request.protocol}://{self.request.host}/chain_intermediate_2")
elif path == "/slow":
# Slow redirect
self.redirect(f"{self.request.protocol}://{self.request.host}/hello")
+ elif path == "/combo_entry":
+ # Redirect to a PATH-LESS wrapper that carries a percent-encoded
+ # target (with its own &-separated params) plus an extra distinct
+ # URL. Resolving the full target needs: keeping the query on a
+ # path-less request (http path-less fix) and sending it
+ # percent-encoded (redirector raw-request fix); and the extra URL
+ # must NOT surface, because the followed wrapper's query is not
+ # re-extracted (REDIRECTED-guard fix).
+ target = "http%3A%2F%2Fdest.com%2F%3Fa%3D1%26b%3D2"
+ other = "http%3A%2F%2Fother.com%2F"
+ self.redirect(f"{self.request.protocol}://{self.request.host}?u={target}&other={other}")
+ elif path == "/" and self.get_query_argument("u", default=None):
+ # Path-less wrapper: redirect to the URL carried (percent-encoded)
+ # in the u query parameter.
+ self.redirect(self.get_query_argument("u"))
else:
self.send_response(200)
self.set_header("Content-Type", "text/plain")
def make_app():
return tornado.web.Application([
- (r"(/[^/]+)", MainHandler),
+ (r"(/.*)", MainHandler),
])
async def main():
task:destroy()
end)
+ test("Query-embedded URL extraction is bounded by its parameter", function()
+ local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html><body>
+<a href="http://wrap.com/r?u=http%3A%2F%2Fdest.com%2F&b=x&c=y">link</a>
+</body></html>
+]]
+ local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+ assert_true(res, "failed to load message")
+
+ task:process_message()
+
+ local found
+ for _, u in ipairs(task:get_urls() or {}) do
+ if u:get_host() == "dest.com" then
+ found = u:get_text()
+ end
+ end
+
+ assert_not_nil(found, "embedded query URL should be extracted")
+ assert_equal("http://dest.com/", found,
+ "embedded URL must stop at the parameter boundary, not swallow &b=x&c=y")
+
+ task:destroy()
+ end)
+
+ test("Query-embedded URL inherits CTA from its parent href", function()
+ local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html><body>
+<a href="http://wrap.com/r?u=http%3A%2F%2Fdest.com%2F">Click here to continue</a>
+</body></html>
+]]
+ local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+ assert_true(res, "failed to load message")
+
+ task:process_message()
+
+ local found_cta = false
+ for _, part in ipairs(task:get_text_parts() or {}) do
+ if part:is_html() then
+ for _, u in ipairs(part:get_cta_urls({ original = true }) or {}) do
+ if u:get_host() == "dest.com" then
+ found_cta = true
+ end
+ end
+ end
+ end
+
+ assert_true(found_cta,
+ "query-extracted destination should inherit CTA from its parent href")
+
+ task:destroy()
+ end)
+
+ test("Nested query-embedded URLs are followed to the leaf", function()
+ -- href wraps mid, whose (escaped) query wraps deep
+ local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html><body>
+<a href="http://wrap.com/r?u=http%3A%2F%2Fmid.com%2F%3Fv%3Dhttp%253A%252F%252Fdeep.com%252F">link</a>
+</body></html>
+]]
+ local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+ assert_true(res, "failed to load message")
+
+ task:process_message()
+
+ local hosts = {}
+ for _, u in ipairs(task:get_urls() or {}) do
+ hosts[u:get_host()] = true
+ end
+
+ assert_true(hosts["mid.com"], "first-level embedded URL should be extracted")
+ assert_true(hosts["deep.com"], "nested embedded URL should be extracted")
+
+ task:destroy()
+ end)
+
+ test("Nested query-embedded URLs stop at RSPAMD_URL_QUERY_MAX_NESTING", function()
+ -- wrap?u=l1?v=l2?w=l3?x=l4?y=l5?z=l6, each level escaped one layer deeper.
+ -- With the nesting cap at 5, l1..l5 are extracted but l6 (a 6th level) is not.
+ local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html><body>
+<a href="http://wrap.com/r?u=http%3A%2F%2Fl1.com%2F%3Fu%3Dhttp%253A%252F%252Fl2.com%252F%253Fv%253Dhttp%25253A%25252F%25252Fl3.com%25252F%25253Fw%25253Dhttp%2525253A%2525252F%2525252Fl4.com%2525252F%2525253Fx%2525253Dhttp%252525253A%252525252F%252525252Fl5.com%252525252F%252525253Fy%252525253Dhttp%25252525253A%25252525252F%25252525252Fl6.com%25252525252F">link</a>
+</body></html>
+]]
+ local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+ assert_true(res, "failed to load message")
+
+ task:process_message()
+
+ local hosts = {}
+ for _, u in ipairs(task:get_urls() or {}) do
+ hosts[u:get_host()] = true
+ end
+
+ assert_true(hosts["l1.com"], "level-1 embedded URL should be extracted")
+ assert_true(hosts["l2.com"], "level-2 embedded URL should be extracted")
+ assert_true(hosts["l3.com"], "level-3 embedded URL should be extracted")
+ assert_true(hosts["l4.com"], "level-4 embedded URL should be extracted")
+ assert_true(hosts["l5.com"], "level-5 embedded URL should be extracted")
+ assert_nil(hosts["l6.com"], "level-6 URL is past the nesting cap and must not be extracted")
+
+ task:destroy()
+ end)
+
end)