]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
Merge pull request #6066 from dragoangel/fix/properly-handle-redirects
authorDmytro Alieksieiev <1865999+dragoangel@users.noreply.github.com>
Fri, 29 May 2026 10:30:13 +0000 (12:30 +0200)
committerGitHub <noreply@github.com>
Fri, 29 May 2026 10:30:13 +0000 (11:30 +0100)
[Fix] Handle query-embedded URL targets in wrappers and redirectors

31 files changed:
conf/scores.d/url_suspect_group.conf
src/libserver/html/html.cxx
src/libserver/html/html_cta.cxx
src/libserver/html/html_url.cxx
src/libserver/http/http_message.c
src/libserver/url.c
src/libserver/url.h
src/lua/lua_task.c
src/lua/lua_url.c
src/plugins/lua/url_redirector.lua
src/plugins/lua/url_suspect.lua
test/functional/cases/001_merged/114_phishing.robot
test/functional/cases/001_merged/400_url_suspect.robot
test/functional/cases/168_url_redirector_phishing.robot [new file with mode: 0644]
test/functional/cases/169_url_redirector_query_target.robot [new file with mode: 0644]
test/functional/configs/maps/redir.map
test/functional/configs/plugins.conf
test/functional/configs/url_redirector_phishing.conf [new file with mode: 0644]
test/functional/lib/rspamd.robot
test/functional/lua/inject_url.lua
test/functional/messages/phishing_query_multi.eml [new file with mode: 0644]
test/functional/messages/phishing_query_nested.eml [new file with mode: 0644]
test/functional/messages/phishing_query_nested_fire.eml [new file with mode: 0644]
test/functional/messages/phishing_query_overcap.eml [new file with mode: 0644]
test/functional/messages/phishing_query_safe.eml [new file with mode: 0644]
test/functional/messages/redir_phishing_safe.eml [new file with mode: 0644]
test/functional/messages/redir_query_target.eml [new file with mode: 0644]
test/functional/messages/url_suspect_multi_query.eml [new file with mode: 0644]
test/functional/messages/url_suspect_single_query.eml [new file with mode: 0644]
test/functional/util/dummy_http.py
test/lua/unit/get_html_urls.lua

index 1de8d256459ee71ee23e5cbffd4b0bf7d64e6b2f..56d57192cf3c42fc7dbcbd87f272a7d87ae24c8c 100644 (file)
@@ -110,4 +110,11 @@ symbols = {
     description = "URL is very long";
     one_shot = false;
   }
+
+  # Query issues
+  "URL_QUERY_MULTIPLE_URLS" {
+    weight = 2.0;
+    description = "URL query embeds more than one URL";
+    one_shot = true;
+  }
 }
index 62aeef3bb6b5dc82bafab260fc4761457d120ed5..9589f64c5aff9e046c6e974232161bd4b708d0ae 100644 (file)
@@ -1434,6 +1434,20 @@ html_url_query_callback(struct rspamd_url *url, gsize start_offset,
        /* Propagate source/classification flags from the parent (outer) URL */
        url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
 
+       /*
+        * Link the query URL to its parent so CTA/button weight can propagate to it
+        * (it owns no tag). Only if unset: a later PHISHED/REDIRECTED decision may
+        * override linked_url, and get_phished/get_redirected only follow it then.
+        */
+       if (cbd->url) {
+               if (url->ext == nullptr) {
+                       url->ext = rspamd_mempool_alloc0_type(pool, rspamd_url_ext);
+               }
+               if (url->ext->linked_url == nullptr) {
+                       url->ext->linked_url = cbd->url;
+               }
+       }
+
        if (rspamd_url_set_add_or_increase(cbd->url_set, url, false)) {
                html_part_add_url(cbd->part_urls, url);
        }
@@ -1456,9 +1470,7 @@ html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
                qcbd.part_urls = part_urls;
                qcbd.parent_flags = url->flags;
 
-               rspamd_url_find_multiple(pool,
-                                                                rspamd_url_query_unsafe(url), url->querylen,
-                                                                RSPAMD_URL_FIND_ALL, NULL,
+               rspamd_url_find_in_query(pool, url, RSPAMD_URL_FIND_ALL,
                                                                 html_url_query_callback, &qcbd, L);
        }
 
index a25103bea426468a1ac3ade2ef8c4c33d5f01a5f..e47015797ce51fc3a385dd2ce835631131ab59bd 100644 (file)
@@ -540,6 +540,26 @@ void rspamd_html_process_cta_urls(struct rspamd_mime_text_part *text_part,
          */
                float weight = rspamd_html_url_button_weight(text_part->html, u);
 
+               /* A query-extracted URL (e.g. a wrapper's real target in ?u=...) owns no
+         * tag, so it has no weight; walk the linked_url chain up to a tagged
+         * ancestor, bounded by the nesting cap, so a leaf several hops deep
+         * still inherits CTA from the original tagged href. */
+               if (weight <= 0.0 && (u->flags & RSPAMD_URL_FLAG_QUERY)) {
+                       struct rspamd_url *ancestor = u;
+                       for (unsigned int hop = 0; hop < RSPAMD_URL_QUERY_MAX_NESTING; hop++) {
+                               if (ancestor->ext == nullptr ||
+                                       ancestor->ext->linked_url == nullptr ||
+                                       ancestor->ext->linked_url == ancestor) {
+                                       break;
+                               }
+                               ancestor = ancestor->ext->linked_url;
+                               weight = rspamd_html_url_button_weight(text_part->html, ancestor);
+                               if (weight > 0.0) {
+                                       break;
+                               }
+                       }
+               }
+
                if (weight > 0.0) {
                        if (rspamd_heap_size(rspamd_html_heap_storage, heap_ptr) < max_cta) {
                                rspamd_html_cta_entry entry = {
index 22db7f66f69748d93de8a4e6b7b2945cc958f717..2cdf03ee33b954e61641b321cd68da203eab76e3 100644 (file)
@@ -131,6 +131,83 @@ is_transfer_proto(struct rspamd_url *u) -> bool
        return (u->protocol & (PROTOCOL_HTTP | PROTOCOL_HTTPS | PROTOCOL_FTP)) != 0;
 }
 
+struct query_target_scan_cbd {
+       unsigned int count;        /* embedded URLs at this level */
+       struct rspamd_url *single; /* the sole embedded URL, when count == 1 */
+};
+
+static gboolean
+html_query_target_cb(struct rspamd_url *url, gsize start_offset,
+                                         gsize end_offset, gpointer ud)
+{
+       auto *cbd = static_cast<query_target_scan_cbd *>(ud);
+
+       if (!is_transfer_proto(url) || url->tldlen == 0) {
+               return TRUE; /* keep scanning the query for other urls */
+       }
+
+       cbd->count++;
+       cbd->single = url;
+
+       return TRUE;
+}
+
+/*
+ * True if the href is a single-target wrapper/redirector whose (possibly nested)
+ * leaf URL matches the displayed domain - so the host mismatch is not phishing.
+ * Follows one embedded URL per query level (e.g. linkprotect?a=https://disp.tld,
+ * or a wrapper whose target is itself a wrapper); any level holding more than
+ * one embedded URL is ambiguous and never suppresses.
+ */
+static auto
+html_href_query_targets_display(rspamd_mempool_t *pool,
+                                                               struct rspamd_url *href_url,
+                                                               std::string_view disp_tld,
+                                                               lua_State *L) -> bool
+{
+       if (disp_tld.empty()) {
+               return false;
+       }
+
+       struct rspamd_url *leaf = nullptr;
+       struct rspamd_url *cur = href_url;
+       bool fully_consumed = false;
+
+       for (unsigned int depth = 0; depth < RSPAMD_URL_QUERY_MAX_NESTING; depth++) {
+               if (cur->querylen == 0) {
+                       fully_consumed = true; /* natural end: no further query */
+                       break;
+               }
+
+               query_target_scan_cbd cbd{0, nullptr};
+               rspamd_url_query_foreach_embedded(pool, cur, RSPAMD_URL_FIND_ALL,
+                                                                                 html_query_target_cb, &cbd, L);
+
+               if (cbd.count == 0) {
+                       fully_consumed = true; /* natural end: query carries no embedded url */
+                       break;
+               }
+               if (cbd.count != 1) {
+                       return false; /* ambiguous: more than one embedded url at this level */
+               }
+
+               leaf = cbd.single; /* descend into the single embedded url */
+               cur = cbd.single;
+       }
+
+       /* Exited via nesting budget exhaustion: the real leaf is deeper than we
+        * are willing to follow, so we cannot decide on a match against an
+        * intermediate hop. Safe default: do not suppress. */
+       if (!fully_consumed || leaf == nullptr) {
+               return false;
+       }
+
+       auto leaf_tld = convert_idna_hostname_maybe(pool, leaf, true);
+       return sv_equals(leaf_tld, disp_tld) ||
+                  rspamd_url_is_subdomain(leaf_tld, disp_tld) ||
+                  rspamd_url_is_subdomain(disp_tld, leaf_tld);
+}
+
 auto html_url_is_phished(rspamd_mempool_t *pool,
                                                 struct rspamd_url *href_url,
                                                 std::string_view text_data,
@@ -182,7 +259,8 @@ auto html_url_is_phished(rspamd_mempool_t *pool,
                                        if (!sv_equals(disp_tok, href_tok)) {
                                                /* Check if one url is a subdomain for another */
 
-                                               if (!rspamd_url_is_subdomain(disp_tok, href_tok)) {
+                                               if (!rspamd_url_is_subdomain(disp_tok, href_tok) &&
+                                                       !html_href_query_targets_display(pool, href_url, disp_tok, L)) {
                                                        href_url->flags |= RSPAMD_URL_FLAG_PHISHED;
                                                        text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
 
index f1aac919af6bfcf8c892130e3c57f8aa4bf319a2..b27b64caaea95feb3dd175698b1ae35451337a50 100644 (file)
@@ -122,6 +122,21 @@ rspamd_http_message_from_url(const char *url)
        msg->host = g_string_new_len(host, pu.field_data[UF_HOST].len);
        msg->url = rspamd_fstring_append(msg->url, path, pathlen);
 
+       if ((pu.field_set & (1 << UF_PATH)) == 0 &&
+               (pu.field_set & (1 << UF_QUERY)) != 0) {
+               /*
+                * URL has a query but no path (e.g. http://host?a=b): the default
+                * path "/" was used above, which is a literal not pointing into the
+                * query, so the query was not included. Append "?<query>" explicitly,
+                * otherwise the request-target would be just "/" and the query string
+                * would be silently dropped.
+                */
+               msg->url = rspamd_fstring_append(msg->url, "?", 1);
+               msg->url = rspamd_fstring_append(msg->url,
+                                                                                url + pu.field_data[UF_QUERY].off,
+                                                                                pu.field_data[UF_QUERY].len);
+       }
+
        REF_INIT_RETAIN(msg, rspamd_http_message_free);
 
        return msg;
index 5ea429395c9afaad03e94dd0f3ae3f4fbadb9f15..dfa3c6879870e5a6129eb7109b0e5b9d1e5d8f9e 100644 (file)
@@ -3771,9 +3771,8 @@ rspamd_url_text_part_callback(struct rspamd_url *url, gsize start_offset,
        if (url->querylen > 0) {
                struct rspamd_url_mimepart_cbdata qcbd = *cbd;
                qcbd.parent_flags = url->flags;
-               rspamd_url_find_multiple(task->task_pool,
-                                                                rspamd_url_query_unsafe(url), url->querylen,
-                                                                RSPAMD_URL_FIND_ALL, NULL,
+               rspamd_url_find_in_query(task->task_pool, url,
+                                                                RSPAMD_URL_FIND_ALL,
                                                                 rspamd_url_query_callback, &qcbd,
                                                                 task->cfg ? task->cfg->lua_state : NULL);
        }
@@ -3856,6 +3855,152 @@ void rspamd_url_find_multiple(rspamd_mempool_t *pool,
        }
 }
 
+static void rspamd_url_find_in_query_depth(rspamd_mempool_t *pool,
+                                                                                  struct rspamd_url *url,
+                                                                                  enum rspamd_url_find_type how,
+                                                                                  url_insert_function func,
+                                                                                  gpointer ud,
+                                                                                  void *lua_state,
+                                                                                  unsigned int depth);
+
+struct rspamd_url_query_count_cbd {
+       rspamd_mempool_t *pool;
+       enum rspamd_url_find_type how;
+       url_insert_function func;
+       gpointer ud;
+       void *lua_state;
+       unsigned int depth;
+       unsigned int count;
+};
+
+static gboolean
+rspamd_url_query_count_cb(struct rspamd_url *url, gsize start_offset,
+                                                 gsize end_offset, gpointer ud)
+{
+       struct rspamd_url_query_count_cbd *cbd =
+               (struct rspamd_url_query_count_cbd *) ud;
+       gboolean ret;
+
+       cbd->count++;
+       ret = cbd->func(url, start_offset, end_offset, cbd->ud);
+
+       /* Recurse into the extracted URL's own query to unwrap nested (properly
+        * escaped) targets, bounded by the nesting limit. Stop if the insert
+        * callback bailed (e.g. max_urls reached). */
+       if (ret && cbd->depth + 1 < RSPAMD_URL_QUERY_MAX_NESTING &&
+               url->querylen > 0) {
+               rspamd_url_find_in_query_depth(cbd->pool, url, cbd->how, cbd->func,
+                                                                          cbd->ud, cbd->lua_state, cbd->depth + 1);
+       }
+
+       return ret;
+}
+
+void rspamd_url_query_foreach_embedded(rspamd_mempool_t *pool,
+                                                                          struct rspamd_url *url,
+                                                                          enum rspamd_url_find_type how,
+                                                                          url_insert_function func,
+                                                                          gpointer ud,
+                                                                          void *lua_state)
+{
+       const char *raw, *query, *end, *p, *c;
+
+       if (url->raw == NULL || url->rawlen == 0) {
+               return;
+       }
+
+       /*
+        * Use the raw (percent-encoded) query: an embedded URL's own separators are
+        * %26/%3B there, so splitting on literal '&'/';' bounds it to its parameter
+        * instead of greedily reading into the following parameters.
+        */
+       raw = url->raw;
+       query = memchr(raw, '?', url->rawlen);
+       if (query == NULL) {
+               return;
+       }
+       query++; /* skip '?' */
+       end = raw + url->rawlen;
+
+       {
+               const char *frag = memchr(query, '#', end - query);
+               if (frag != NULL) {
+                       end = frag;
+               }
+       }
+
+       /*
+        * Worst case the whole query decodes into one buffer; each parameter writes
+        * into its own non-overlapping slot, so rspamd_url_parse can safely keep
+        * raw pointers into it (the buffer lives for the pool's lifetime). Lazy
+        * alloc keeps queries with no embedded URL paying nothing.
+        */
+       const gsize query_total = (gsize) (end - query);
+       char *scratch = NULL;
+       gsize scratch_off = 0;
+
+       p = query;
+       c = query;
+       while (p <= end) {
+               if (p == end || *p == '&' || *p == ';') {
+                       if (p > c) {
+                               /* Parameter [c, p): scan the value after the first '=' */
+                               const char *eq = memchr(c, '=', p - c);
+                               const char *vstart = eq ? eq + 1 : c;
+                               gsize vlen = p - vstart;
+
+                               if (vlen > 0) {
+                                       if (scratch == NULL) {
+                                               scratch = rspamd_mempool_alloc(pool, query_total + 1);
+                                       }
+                                       char *slot = scratch + scratch_off;
+                                       gsize dlen = rspamd_url_decode(slot, vstart, vlen);
+                                       slot[dlen] = '\0';
+                                       rspamd_url_find_multiple(pool, slot, dlen, how, NULL,
+                                                                                        func, ud, lua_state);
+                                       scratch_off += dlen + 1;
+                               }
+                       }
+                       c = p + 1;
+               }
+               p++;
+       }
+}
+
+static void rspamd_url_find_in_query_depth(rspamd_mempool_t *pool,
+                                                                                  struct rspamd_url *url,
+                                                                                  enum rspamd_url_find_type how,
+                                                                                  url_insert_function func,
+                                                                                  gpointer ud,
+                                                                                  void *lua_state,
+                                                                                  unsigned int depth)
+{
+       struct rspamd_url_query_count_cbd cbd = {pool, how, func, ud, lua_state,
+                                                                                        depth, 0};
+
+       rspamd_url_query_foreach_embedded(pool, url, how,
+                                                                         rspamd_url_query_count_cb, &cbd, lua_state);
+
+       /* Record how many URLs are embedded in this URL's query. */
+       if (cbd.count > 0) {
+               if (url->ext == NULL) {
+                       url->ext = rspamd_mempool_alloc0_type(pool, struct rspamd_url_ext);
+               }
+               url->ext->query_embedded_urls =
+                       cbd.count > UINT16_MAX ? UINT16_MAX : (uint16_t) cbd.count;
+       }
+}
+
+void rspamd_url_find_in_query(rspamd_mempool_t *pool,
+                                                         struct rspamd_url *url,
+                                                         enum rspamd_url_find_type how,
+                                                         url_insert_function func,
+                                                         gpointer ud,
+                                                         void *lua_state)
+{
+       rspamd_url_find_in_query_depth(pool, url, how, func, ud, lua_state, 0);
+}
+
 void rspamd_url_find_single(rspamd_mempool_t *pool,
                                                        const char *in,
                                                        gsize inlen,
@@ -3912,15 +4057,40 @@ void rspamd_url_find_single(rspamd_mempool_t *pool,
 }
 
 
+struct rspamd_url_subject_query_cbd {
+       struct rspamd_task *task;
+       uint32_t parent_flags;
+};
+
+static gboolean
+rspamd_url_subject_query_callback(struct rspamd_url *query_url, gsize start_offset,
+                                                                 gsize end_offset, gpointer ud)
+{
+       struct rspamd_url_subject_query_cbd *cbd = ud;
+       struct rspamd_task *task = cbd->task;
+
+       if (query_url->hostlen == 0) {
+               return TRUE;
+       }
+
+       query_url->flags |= RSPAMD_URL_FLAG_QUERY;
+       /* Propagate source/classification flags from the parent URL */
+       query_url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
+
+       if (query_url->protocol == PROTOCOL_MAILTO && query_url->userlen == 0) {
+               return TRUE;
+       }
+
+       rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), query_url, false);
+
+       return TRUE;
+}
+
 gboolean
 rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset,
                                                                 gsize end_offset, gpointer ud)
 {
        struct rspamd_task *task = ud;
-       char *url_str = NULL;
-       struct rspamd_url *query_url;
-       int rc;
-       gboolean prefix_added;
 
        /* It is just a displayed URL, we should not check it for certain things */
        url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED | RSPAMD_URL_FLAG_SUBJECT;
@@ -3935,42 +4105,14 @@ rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset,
 
        /* We also search the query for additional url inside */
        if (url->querylen > 0) {
-               if (rspamd_url_find(task->task_pool, rspamd_url_query_unsafe(url), url->querylen,
-                                                       &url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
-
-                       query_url = rspamd_mempool_alloc0(task->task_pool,
-                                                                                         sizeof(struct rspamd_url));
-                       rc = rspamd_url_parse(query_url,
-                                                                 url_str,
-                                                                 strlen(url_str),
-                                                                 task->task_pool,
-                                                                 RSPAMD_URL_PARSE_TEXT,
-                                                                 task->cfg ? task->cfg->lua_state : NULL);
-
-                       if (rc == URI_ERRNO_OK &&
-                               query_url->hostlen > 0) {
-                               msg_debug_task("found url %s in query of url"
-                                                          " %*s",
-                                                          url_str, url->querylen, rspamd_url_query_unsafe(url));
-
-                               query_url->flags |= RSPAMD_URL_FLAG_QUERY;
-                               /* Propagate source/classification flags from the parent URL */
-                               query_url->flags |= (url->flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
-
-                               if (prefix_added) {
-                                       query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
-                               }
-
-                               if (query_url->protocol == PROTOCOL_MAILTO) {
-                                       if (query_url->userlen == 0) {
-                                               return TRUE;
-                                       }
-                               }
-
-                               rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls),
-                                                                                          query_url, false);
-                       }
-               }
+               struct rspamd_url_subject_query_cbd qcbd = {
+                       .task = task,
+                       .parent_flags = url->flags,
+               };
+               rspamd_url_find_in_query(task->task_pool, url,
+                                                                RSPAMD_URL_FIND_ALL,
+                                                                rspamd_url_subject_query_callback, &qcbd,
+                                                                task->cfg ? task->cfg->lua_state : NULL);
        }
 
        return TRUE;
index d5ed9134efb7d69f16df96a811226f0252d8e879..91b7a75b3757d49515a8a23d22347a4e28af958a 100644 (file)
@@ -123,6 +123,8 @@ struct rspamd_url_ext {
        struct rspamd_url *linked_url;
 
        uint16_t port;
+       /* Number of URLs found embedded in this URL's query string */
+       uint16_t query_embedded_urls;
 };
 
 #define rspamd_url_user(u) ((u)->userlen > 0 ? (u)->string + (u)->usershift : NULL)
@@ -272,6 +274,52 @@ void rspamd_url_find_single(rspamd_mempool_t *pool,
                                                        url_insert_function func,
                                                        gpointer ud);
 
+/**
+ * How deep to follow URLs nested inside the query of an already query-extracted
+ * URL (a properly escaped wrapper carries one target per encoding layer).
+ */
+#define RSPAMD_URL_QUERY_MAX_NESTING 5
+
+/**
+ * Find URLs embedded in the query parameters of `url`. Unlike
+ * rspamd_url_find_multiple, this walks the url's raw (still percent-encoded)
+ * query, splits it on the parameter separators '&' and ';', and decodes each
+ * value before scanning it. This bounds an embedded URL to its own parameter
+ * (its internal separators are %26/%3B at that point) instead of greedily
+ * swallowing the following parameters. URLs nested inside an extracted URL's
+ * own query are followed up to RSPAMD_URL_QUERY_MAX_NESTING levels.
+ * @param pool
+ * @param url url whose query is scanned
+ * @param how
+ * @param func callback invoked for each url found
+ * @param ud
+ * @param lua_state Lua state for consultation (may be NULL)
+ */
+void rspamd_url_find_in_query(rspamd_mempool_t *pool,
+                                                         struct rspamd_url *url,
+                                                         enum rspamd_url_find_type how,
+                                                         url_insert_function func,
+                                                         gpointer ud,
+                                                         void *lua_state);
+
+/**
+ * Invoke `func` for every URL embedded in `url`'s query, scanning a single
+ * level only (no nesting). Uses the same raw, per-parameter, decode-each-value
+ * bounding as rspamd_url_find_in_query.
+ * @param pool
+ * @param url url whose query is scanned
+ * @param how
+ * @param func callback invoked for each url found
+ * @param ud
+ * @param lua_state Lua state for consultation (may be NULL)
+ */
+void rspamd_url_query_foreach_embedded(rspamd_mempool_t *pool,
+                                                                          struct rspamd_url *url,
+                                                                          enum rspamd_url_find_type how,
+                                                                          url_insert_function func,
+                                                                          gpointer ud,
+                                                                          void *lua_state);
+
 /**
  * Generic callback to insert URLs into rspamd_task
  * @param url
index a29be75fd4d0ee818a3f70672e2fb506205fbaab..7d70ff168f52cf4b149c2b808d169e409df4ba14 100644 (file)
@@ -3021,7 +3021,12 @@ static void
 inject_url_query(struct rspamd_task *task, struct rspamd_url *url,
                                 GPtrArray *part_urls)
 {
-       if (url->querylen > 0) {
+       /*
+        * REDIRECTED means url_redirector already resolved this hop to a real target
+        * (a chain hop), so skip query-param guessing to avoid duplicating it;
+        * terminal hops (200/dead-end) are never REDIRECTED, so still scanned.
+        */
+       if (!(url->flags & RSPAMD_URL_FLAG_REDIRECTED) && url->querylen > 0) {
                struct rspamd_url_query_to_inject_cbd cbd;
 
                cbd.task = task;
@@ -3029,9 +3034,8 @@ inject_url_query(struct rspamd_task *task, struct rspamd_url *url,
                cbd.mpart_urls = part_urls;
                cbd.parent_flags = url->flags;
 
-               rspamd_url_find_multiple(task->task_pool,
-                                                                rspamd_url_query_unsafe(url), url->querylen,
-                                                                RSPAMD_URL_FIND_ALL, NULL,
+               rspamd_url_find_in_query(task->task_pool, url,
+                                                                RSPAMD_URL_FIND_ALL,
                                                                 inject_url_query_callback, &cbd,
                                                                 task->cfg ? task->cfg->lua_state : NULL);
        }
@@ -3057,9 +3061,14 @@ lua_task_inject_url(lua_State *L)
        if (task && task->message && url && url->url) {
                rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url->url, false);
 
-               if (mpart && mpart->urls) {
-                       inject_url_query(task, url->url, mpart->urls);
-               }
+               /*
+                * Scan the injected URL's query string for embedded URLs (e.g. a
+                * redirector/wrapper hop that carries its real target in ?u=...). This
+                * must run even without an associated mime part, otherwise URLs
+                * injected by url_redirector and similar consumers miss the query
+                * extraction that MIME-parsed URLs receive.
+                */
+               inject_url_query(task, url->url, (mpart && mpart->urls) ? mpart->urls : NULL);
        }
        else {
                return luaL_error(L, "invalid arguments");
index b3299ead36fcf6275b2a0339f4555411ae3ed2eb..788338c92545e4d3cbbc4e072e40a3d6fb5d3464 100644 (file)
@@ -66,6 +66,7 @@ LUA_FUNCTION_DEF(url, is_subject);
 LUA_FUNCTION_DEF(url, get_phished);
 LUA_FUNCTION_DEF(url, set_redirected);
 LUA_FUNCTION_DEF(url, get_count);
+LUA_FUNCTION_DEF(url, get_query_embedded_urls);
 LUA_FUNCTION_DEF(url, get_visible);
 LUA_FUNCTION_DEF(url, get_hash);
 LUA_FUNCTION_DEF(url, create);
@@ -99,6 +100,7 @@ static const struct luaL_reg urllib_m[] = {
 
        LUA_INTERFACE_DEF(url, get_visible),
        LUA_INTERFACE_DEF(url, get_count),
+       LUA_INTERFACE_DEF(url, get_query_embedded_urls),
        LUA_INTERFACE_DEF(url, get_hash),
        LUA_INTERFACE_DEF(url, get_flags),
        LUA_INTERFACE_DEF(url, get_flags_num),
@@ -788,6 +790,27 @@ lua_url_get_count(lua_State *L)
        return 1;
 }
 
+/***
+ * @method url:get_query_embedded_urls()
+ * Get the number of URLs found embedded in this URL's query string
+ * @return {number} count of query-embedded URLs (0 if none)
+ */
+static int
+lua_url_get_query_embedded_urls(lua_State *L)
+{
+       LUA_TRACE_POINT;
+       struct rspamd_lua_url *url = lua_check_url(L, 1);
+
+       if (url != NULL && url->url != NULL) {
+               lua_pushinteger(L, url->url->ext ? url->url->ext->query_embedded_urls : 0);
+       }
+       else {
+               lua_pushnil(L);
+       }
+
+       return 1;
+}
+
 /***
 * @method url:get_visible()
 * Get visible part of the url with html tags stripped
index 0377b226d17445cca4148fd1b31110265497715f..262ad59f8b8c528b63f4c866e70c31116cb19d1b 100644 (file)
@@ -233,12 +233,16 @@ local function chain_hosts_string(chain)
   return table.concat(hosts, '->')
 end
 
--- Compute the per-URL Redis cache key. Hashing the URL string keeps keys
--- fixed-length and free of URL-unsafe characters; using tostring() (rather
--- than :get_raw()) keeps the hash stable across the write-then-read cycle
--- when chain values are roundtripped through rspamd_url.create.
+-- Mixed into the hashed cache key; bump on incompatible value-format changes
+-- so old-format entries hash elsewhere and get re-resolved, not misread.
+-- v1: value is the raw (percent-encoded) URL, not the decoded text.
+local cache_format_version = 'v1:'
+
+-- Per-URL Redis cache key: hash the URL (fixed-length, URL-safe). tostring()
+-- (not get_raw) keeps the hash stable across the write-then-read round-trip.
 local function cache_key_for_url(url_str)
-  return settings.key_prefix .. hash.create(url_str):base32():sub(1, 32)
+  return settings.key_prefix ..
+      hash.create(cache_format_version .. url_str):base32():sub(1, 32)
 end
 
 -- Whether an intermediate hop should be saved (in cache and task URL set)
@@ -347,7 +351,9 @@ local function cache_chain_to_redis(task, chain, terminal_prefix)
 
   local function write_link(prev_url, next_url, marker)
     local link_key = cache_key_for_url(tostring(prev_url))
-    local next_str = encode_url_for_redirect(next_url:get_text())
+    -- Cache the raw (percent-encoded) form: keeps query boundaries intact so
+    -- cached hops re-parse identically to live ones, and it is already URL-safe.
+    local next_str = next_url:get_raw()
     local cache_value
     if marker then
       cache_value = string.format('^%s:%s', marker, next_str)
@@ -565,7 +571,8 @@ http_walk = function(task, orig_url, url, ntries, chain, seen)
   -- Mirror the cache walk's cycle guard: a redirector loop A->B->A->B
   -- (e.g. login redirector flapping between two hosts) would otherwise
   -- chew through nested_limit and bloat the chain with alternating
-  -- entries.
+  -- entries. tostring() (not get_raw): the cycle guard, cache key and
+  -- GET-map match need a stable identity that collapses encoding variants;
   local url_str = tostring(url)
   if seen[url_str] then
     lua_util.debugm(N, task, 'cycle in http walk at %s', url_str)
@@ -719,8 +726,12 @@ http_walk = function(task, orig_url, url, ntries, chain, seen)
     method = 'get'
   end
 
+  -- Request the raw (percent-encoded) URL: the decoded form would let a wrapper
+  -- mis-split its ?u=https%3A%2F%2F... target at the now-literal '&' and truncate.
+  local request_url = url:get_raw()
+
   local http_params = {
-    url = url_str,
+    url = request_url,
     task = task,
     method = method,
     max_size = settings.max_size,
index 9fc9e5adc859b3984cf071111c10fe1cdfca3578..ecad86ceb4762091d9f0fa3cf32658d45e0315c6 100644 (file)
@@ -55,6 +55,8 @@ local symbols = {
   backslash = "URL_BACKSLASH_PATH",
   excessive_dots = "URL_EXCESSIVE_DOTS",
   very_long = "URL_VERY_LONG",
+  -- Query symbols
+  query_multiple_urls = "URL_QUERY_MULTIPLE_URLS",
   -- Obfuscated text symbol
   obfuscated_text = "URL_OBFUSCATED_TEXT"
 }
@@ -99,6 +101,11 @@ local settings = {
       check_length = true,
       max_url_length = 2048
     },
+    query_urls = {
+      enabled = true,
+      -- More than this many URLs embedded in a single URL's query is anomalous
+      max_query_urls = 1
+    },
     obfuscated_text = {
       enabled = true,
       -- DoS protection limits
@@ -142,6 +149,8 @@ local settings = {
     backslash = "URL_BACKSLASH_PATH",
     excessive_dots = "URL_EXCESSIVE_DOTS",
     very_long = "URL_VERY_LONG",
+    -- Query symbols
+    query_multiple_urls = "URL_QUERY_MULTIPLE_URLS",
     -- Obfuscated text symbol
     obfuscated_text = "URL_OBFUSCATED_TEXT"
   },
@@ -559,6 +568,25 @@ function checks.structure_analysis(task, url, cfg)
   return findings
 end
 
+-- Check: multiple URLs embedded in the query string
+function checks.query_urls_analysis(task, url, cfg)
+  local findings = {}
+
+  -- One URL embedded in a query is common (redirectors/trackers); more than
+  -- one in a single URL's query is anomalous.
+  local n = url:get_query_embedded_urls()
+
+  if n and n > cfg.max_query_urls then
+    lua_util.debugm(N, task, "URL query embeds %d URLs", n)
+    table.insert(findings, {
+      symbol = symbols.query_multiple_urls,
+      options = { string.format("%d", n) }
+    })
+  end
+
+  return findings
+end
+
 -- Main analysis function
 local function analyze_url(task, url, cfg)
   local all_findings = {}
@@ -608,6 +636,13 @@ local function analyze_url(task, url, cfg)
     end
   end
 
+  if cfg.checks.query_urls and cfg.checks.query_urls.enabled then
+    local findings = checks.query_urls_analysis(task, url, cfg.checks.query_urls)
+    for _, f in ipairs(findings) do
+      table.insert(all_findings, f)
+    end
+  end
+
   -- Run custom checks (advanced users)
   for name, check_func in pairs(cfg.custom_checks) do
     local ok, findings = pcall(check_func, task, url, cfg)
@@ -631,6 +666,7 @@ local function url_suspect_callback(task)
   -- TLD and structure checks don't have corresponding URL flags, so need all URLs
   local need_all_urls = (
       (settings.checks.tld and settings.checks.tld.enabled) or
+          (settings.checks.query_urls and settings.checks.query_urls.enabled) or
           (settings.checks.structure and settings.checks.structure.enabled and
               (settings.checks.structure.check_multiple_at or
                   settings.checks.structure.check_excessive_dots or
index bc7f3981ef7e9cc73b4e8adc46195e0d4392f2aa..c553cea627862d131e990fbac12b3c1a8c247ef4 100644 (file)
@@ -7,6 +7,11 @@ Variables       ${RSPAMD_TESTDIR}/lib/vars.py
 ${MESSAGE1}           ${RSPAMD_TESTDIR}/messages/phishing1.eml
 ${MESSAGE2}           ${RSPAMD_TESTDIR}/messages/phishing2.eml
 ${MESSAGE3}           ${RSPAMD_TESTDIR}/messages/phishing3.eml
+${MESSAGE_QSAFE}      ${RSPAMD_TESTDIR}/messages/phishing_query_safe.eml
+${MESSAGE_QMULTI}     ${RSPAMD_TESTDIR}/messages/phishing_query_multi.eml
+${MESSAGE_QNESTED}    ${RSPAMD_TESTDIR}/messages/phishing_query_nested.eml
+${MESSAGE_QNESTFIRE}  ${RSPAMD_TESTDIR}/messages/phishing_query_nested_fire.eml
+${MESSAGE_QOVERCAP}   ${RSPAMD_TESTDIR}/messages/phishing_query_overcap.eml
 ${SETTINGS_PHISHING}  {symbols_enabled = [PHISHING,STRICT_PHISHING,STRICTER_PHISHING]}
 
 *** Test Cases ***
@@ -24,3 +29,46 @@ TEST PHISHING STRICT TWO
   Scan File  ${MESSAGE3}
   ...  Settings=${SETTINGS_PHISHING}
   Expect Symbol  STRICTER_PHISHING
+
+TEST PHISHING NO FP WHEN HREF QUERY DEST EQUALS DISPLAY TEXT
+  # href host differs from the displayed domain, but the href's query embeds a
+  # destination URL equal to the displayed domain (a wrapper/redirector that
+  # points back at the shown domain) -- not phishing.
+  Scan File  ${MESSAGE_QSAFE}
+  ...  Settings=${SETTINGS_PHISHING}
+  Do Not Expect Symbol  PHISHING
+
+TEST PHISHING FIRES WHEN HREF QUERY HAS MULTIPLE URLS
+  # The href query embeds two URLs (one matching the displayed domain, one not).
+  # Multiple embedded URLs are ambiguous, so the single-target wrapper exception
+  # does not apply and phishing must still fire on the host mismatch.
+  Scan File  ${MESSAGE_QMULTI}
+  ...  Settings=${SETTINGS_PHISHING}
+  Expect Symbol  PHISHING
+
+TEST PHISHING NO FP WHEN NESTED QUERY LEAF EQUALS DISPLAY TEXT
+  # The href wraps a wrapper: its query holds one URL (mid) whose query in turn
+  # holds one URL (the displayed domain). Following the single-target chain to
+  # the leaf, it equals the displayed domain -- not phishing.
+  Scan File  ${MESSAGE_QNESTED}
+  ...  Settings=${SETTINGS_PHISHING}
+  Do Not Expect Symbol  PHISHING
+
+TEST PHISHING FIRES WHEN NESTED QUERY LEAF DIFFERS FROM DISPLAY TEXT
+  # The href wraps a wrapper whose own host equals the displayed domain, but its
+  # query wraps a further URL (the leaf) pointing elsewhere. The single-target
+  # chain must be followed to the leaf: an intermediate match does not suppress,
+  # so phishing fires on the leaf vs display mismatch.
+  Scan File  ${MESSAGE_QNESTFIRE}
+  ...  Settings=${SETTINGS_PHISHING}
+  Expect Symbol  PHISHING
+
+TEST PHISHING FIRES WHEN NESTED CHAIN EXCEEDS NESTING CAP
+  # The href wraps a single-target chain deeper than RSPAMD_URL_QUERY_MAX_NESTING.
+  # The deepest URL we are willing to follow happens to match the displayed
+  # domain, but the real leaf is one more level down and points elsewhere.
+  # Since the walk exits via budget exhaustion rather than reaching a natural
+  # end, the intermediate match must not suppress phishing.
+  Scan File  ${MESSAGE_QOVERCAP}
+  ...  Settings=${SETTINGS_PHISHING}
+  Expect Symbol  PHISHING
index d1d8399532deab35820ca47112c440e36a28c589..52c7efae0574ba30fe417246feb05fde23908792 100644 (file)
@@ -43,6 +43,18 @@ URL Suspect - Multiple At Signs
   ...  Settings={symbols_enabled = [URL_SUSPECT_CHECK, URL_MULTIPLE_AT_SIGNS]}
   Expect Symbol  URL_MULTIPLE_AT_SIGNS
 
+URL Suspect - Multiple Query URLs
+  # A single URL whose query embeds more than one URL is anomalous
+  Scan File  ${RSPAMD_TESTDIR}/messages/url_suspect_multi_query.eml
+  ...  Settings={symbols_enabled = [URL_SUSPECT_CHECK, URL_QUERY_MULTIPLE_URLS]}
+  Expect Symbol With Exact Options  URL_QUERY_MULTIPLE_URLS  2
+
+URL Suspect - Single Query URL
+  # One embedded URL in a query is common (redirectors/trackers) and must not fire
+  Scan File  ${RSPAMD_TESTDIR}/messages/url_suspect_single_query.eml
+  ...  Settings={symbols_enabled = [URL_SUSPECT_CHECK, URL_QUERY_MULTIPLE_URLS]}
+  Do Not Expect Symbol  URL_QUERY_MULTIPLE_URLS
+
 URL Suspect - Normal URL
   # Test that normal URLs don't trigger symbols
   Scan File  ${RSPAMD_TESTDIR}/messages/url_suspect_normal.eml
diff --git a/test/functional/cases/168_url_redirector_phishing.robot b/test/functional/cases/168_url_redirector_phishing.robot
new file mode 100644 (file)
index 0000000..de77baf
--- /dev/null
@@ -0,0 +1,35 @@
+*** Settings ***
+Suite Setup     Urlredirector Setup
+Suite Teardown  Urlredirector Teardown
+Library         Process
+Library         ${RSPAMD_TESTDIR}/lib/rspamd.py
+Resource        ${RSPAMD_TESTDIR}/lib/rspamd.robot
+Test Tags       notparallel
+Variables       ${RSPAMD_TESTDIR}/lib/vars.py
+
+*** Variables ***
+${CONFIG}          ${RSPAMD_TESTDIR}/configs/url_redirector_phishing.conf
+${MESSAGE}         ${RSPAMD_TESTDIR}/messages/redir_phishing_safe.eml
+${REDIS_SCOPE}     Suite
+${RSPAMD_SCOPE}    Suite
+${RSPAMD_URL_TLD}  ${RSPAMD_TESTDIR}/../lua/unit/test_tld.dat
+${SETTINGS}        {symbols_enabled=[URL_REDIRECTOR_CHECK, PHISHING]}
+
+*** Test Cases ***
+PHISHING NO FP WHEN DISPLAY TEXT EQUALS REDIRECT DEST
+  # t.co (a known redirector, faked to the dummy HTTP server) resolves to the
+  # same domain shown in the anchor text, so the resolved redirect destination
+  # equals the displayed URL -- phishing must not fire.
+  Scan File  ${MESSAGE}  Settings=${SETTINGS}
+  Expect Symbol  URL_REDIRECTOR
+  Do Not Expect Symbol  PHISHING
+
+*** Keywords ***
+Urlredirector Setup
+  Run Dummy Http
+  Rspamd Redis Setup
+
+Urlredirector Teardown
+  Rspamd Redis Teardown
+  Dummy Http Teardown
+  Terminate All Processes    kill=True
diff --git a/test/functional/cases/169_url_redirector_query_target.robot b/test/functional/cases/169_url_redirector_query_target.robot
new file mode 100644 (file)
index 0000000..cc02631
--- /dev/null
@@ -0,0 +1,40 @@
+*** Settings ***
+Suite Setup     Urlredirector Setup
+Suite Teardown  Urlredirector Teardown
+Library         Process
+Library         ${RSPAMD_TESTDIR}/lib/rspamd.py
+Resource        ${RSPAMD_TESTDIR}/lib/rspamd.robot
+Test Tags       notparallel
+Variables       ${RSPAMD_TESTDIR}/lib/vars.py
+
+*** Variables ***
+${CONFIG}          ${RSPAMD_TESTDIR}/configs/url_redirector.conf
+${MESSAGE}         ${RSPAMD_TESTDIR}/messages/redir_query_target.eml
+${REDIS_SCOPE}     Suite
+${RSPAMD_SCOPE}    Suite
+${RSPAMD_URL_TLD}  ${RSPAMD_TESTDIR}/../lua/unit/test_tld.dat
+${SETTINGS}        {symbols_enabled=[URL_REDIRECTOR_CHECK]}
+
+*** Test Cases ***
+RESOLVE ENCODED QUERY TARGET THROUGH PATH-LESS WRAPPER
+  # /combo_entry redirects to a PATH-LESS wrapper that carries the real target
+  # percent-encoded (with its own &-separated params) in ?u=. Resolving the
+  # full destination requires the request to a path-less URL to keep its query
+  # and to send it percent-encoded; otherwise the wrapper sees a dropped or
+  # &-truncated u and the target is lost.
+  Scan File  ${MESSAGE}  Flags=ext_urls  Settings=${SETTINGS}
+  Expect Extended URL  http://dest.com/?a=1&b=2
+  # The followed wrapper also carried &other=...; a hop already resolved to a
+  # real redirect target must not have its query re-extracted, so the extra
+  # URL must not surface.
+  Do Not Expect Extended URL  http://other.com/
+
+*** Keywords ***
+Urlredirector Setup
+  Run Dummy Http
+  Rspamd Redis Setup
+
+Urlredirector Teardown
+  Rspamd Redis Teardown
+  Dummy Http Teardown
+  Terminate All Processes    kill=True
index 5b7eb38bab862b37a4146a276f2f97913b04735c..90035e1b724aa79d9071e9400f3b496c5de7de40 100644 (file)
@@ -1,3 +1,4 @@
 t.co
 bit.ly
 127.0.0.1
+redir.example.net
index 71d82c717aaccb7ffe779d2d97340d2a73297122..940766070e3b9473c0ef494623635683e5445d6e 100644 (file)
@@ -754,6 +754,13 @@ options = {
           type = a;
           replies = ["127.0.0.2"];
         },
+        # Redirector test (168_url_redirector_phishing): fake A so the dummy
+        # HTTP server on 127.0.0.1 is reached without real DNS.
+        {
+          name = "redir.example.net";
+          type = a;
+          replies = ["127.0.0.1"];
+        },
         # TODO: add IPv6 tests
         ];
   }
diff --git a/test/functional/configs/url_redirector_phishing.conf b/test/functional/configs/url_redirector_phishing.conf
new file mode 100644 (file)
index 0000000..5feefe1
--- /dev/null
@@ -0,0 +1,14 @@
+.include(duplicate=append,priority=0) "{= env.TESTDIR =}/configs/plugins.conf"
+
+redis {
+  servers = "{= env.REDIS_ADDR =}:{= env.REDIS_PORT =}";
+}
+
+url_redirector {
+  redirector_hosts_map = "{= env.TESTDIR =}/configs/maps/redir.map";
+  redirector_symbol = "URL_REDIRECTOR";
+}
+
+phishing {
+  symbol = "PHISHING";
+}
index c04c8330ed4e3009025c1c2220bd2dc3944e9cb2..25c63e8c8190d407f24d9221c7905157736545b6 100644 (file)
@@ -175,6 +175,17 @@ Expect Extended URL
   END
   Should Be True  ${found_url}  msg="Expected URL was not found: ${url}"
 
+Do Not Expect Extended URL
+  [Arguments]  ${url}
+  ${found_url} =  Set Variable  ${FALSE}
+  ${url_list} =  Convert To List  ${SCAN_RESULT}[urls]
+  FOR  ${item}  IN  @{url_list}
+    ${d} =  Convert To Dictionary  ${item}
+    ${found_url} =  Evaluate  "${d}[url]" == "${url}"
+    Exit For Loop If  ${found_url} == ${TRUE}
+  END
+  Should Not Be True  ${found_url}  msg="URL should not be present: ${url}"
+
 Expect Symbol With Exact Options
   [Arguments]  ${symbol}  @{options}
   Expect Symbol  ${symbol}
index f8764b2134bcef66ace6fd593fc15dfe44266b1e..ab1e40417e324010a4565aceeb1a44d335b33109 100644 (file)
@@ -4,7 +4,10 @@ local function task_inject_cb (task)
     local url_text = 'http://example.com?redir=https://another.com'
     local url_to_inject = url.create(task:get_mempool(), url_text)
     task:inject_url(url_to_inject)
-    if #(task:get_urls()) == 2 then
+    -- 3 urls: 1 from the scanned ics.eml (SUMMARY: http://test.com), 1 injected
+    -- (example.com), and 1 extracted from the injected URL's query by the
+    -- bounded query scan inject_url runs (another.com from ?redir=...).
+    if #(task:get_urls()) == 3 then
         return true
     end
     return false
diff --git a/test/functional/messages/phishing_query_multi.eml b/test/functional/messages/phishing_query_multi.eml
new file mode 100644 (file)
index 0000000..76a1619
--- /dev/null
@@ -0,0 +1,3 @@
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fwww.cnn.com%2F&v=http%3A%2F%2Fevil.org%2F">http://www.cnn.com</a>
diff --git a/test/functional/messages/phishing_query_nested.eml b/test/functional/messages/phishing_query_nested.eml
new file mode 100644 (file)
index 0000000..b591063
--- /dev/null
@@ -0,0 +1,3 @@
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fmid.com%2F%3Fv%3Dhttp%253A%252F%252Fwww.cnn.com%252F">http://www.cnn.com</a>
diff --git a/test/functional/messages/phishing_query_nested_fire.eml b/test/functional/messages/phishing_query_nested_fire.eml
new file mode 100644 (file)
index 0000000..ef3fc7d
--- /dev/null
@@ -0,0 +1,3 @@
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fwww.cnn.com%2F%3Fv%3Dhttp%253A%252F%252Fevil.org%252F">http://www.cnn.com</a>
diff --git a/test/functional/messages/phishing_query_overcap.eml b/test/functional/messages/phishing_query_overcap.eml
new file mode 100644 (file)
index 0000000..664f32a
--- /dev/null
@@ -0,0 +1,3 @@
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fa.io%2F%3Fv%3Dhttp%253A%252F%252Fb.io%252F%253Fw%253Dhttp%25253A%25252F%25252Fc.io%25252F%25253Fx%25253Dhttp%2525253A%2525252F%2525252Fd.io%2525252F%2525253Fy%2525253Dhttp%252525253A%252525252F%252525252Fwww.cnn.com%252525252F%252525253Fz%252525253Dhttp%25252525253A%25252525252F%25252525252Fev.io%25252525252F">http://www.cnn.com</a>
diff --git a/test/functional/messages/phishing_query_safe.eml b/test/functional/messages/phishing_query_safe.eml
new file mode 100644 (file)
index 0000000..5c1ac2b
--- /dev/null
@@ -0,0 +1,3 @@
+Content-type: text/html
+
+lol <a href="http://www.example.net/r?u=http%3A%2F%2Fwww.cnn.com%2F">http://www.cnn.com</a>
diff --git a/test/functional/messages/redir_phishing_safe.eml b/test/functional/messages/redir_phishing_safe.eml
new file mode 100644 (file)
index 0000000..6c254d1
--- /dev/null
@@ -0,0 +1,3 @@
+Content-type: text/html
+
+<a href="http://redir.example.net:18080/redir_to_example">http://www.example.com/</a>
diff --git a/test/functional/messages/redir_query_target.eml b/test/functional/messages/redir_query_target.eml
new file mode 100644 (file)
index 0000000..77a1799
--- /dev/null
@@ -0,0 +1,3 @@
+Content-type: text/plain
+
+click http://127.0.0.1:18080/combo_entry
diff --git a/test/functional/messages/url_suspect_multi_query.eml b/test/functional/messages/url_suspect_multi_query.eml
new file mode 100644 (file)
index 0000000..f283c0d
--- /dev/null
@@ -0,0 +1,3 @@
+Content-Type: text/html
+
+<html><body><a href="http://wrap.com/r?u=https://a.com/&v=https://b.com/">click</a></body></html>
diff --git a/test/functional/messages/url_suspect_single_query.eml b/test/functional/messages/url_suspect_single_query.eml
new file mode 100644 (file)
index 0000000..db89ec9
--- /dev/null
@@ -0,0 +1,3 @@
+Content-Type: text/html
+
+<html><body><a href="http://wrap.com/r?u=https://a.com/">click</a></body></html>
index 5626885ab988999845be12d4344fe3ac0bfe95f8..e2077ae65fdb7dc629d9045ed14875395c22ff33 100755 (executable)
@@ -113,6 +113,10 @@ class MainHandler(tornado.web.RequestHandler):
         elif path == "/mailto_redirect":
             # Send an HTTP redirect to a mailto: URL (non-HTTP scheme)
             self.redirect("mailto:user@example.net")
+        elif path == "/redir_to_example":
+            # Redirect to a domain matching the displayed text, so the phishing
+            # module sees a 1:1 redirect target and must not fire (no FP).
+            self.redirect("http://www.example.com/")
         elif path == "/chain_intermediate_1":
             # Intermediate hop to chain_to_tel2
             self.redirect(f"{self.request.protocol}://{self.request.host}/chain_intermediate_2")
@@ -131,13 +135,28 @@ class MainHandler(tornado.web.RequestHandler):
         elif path == "/slow":
             # Slow redirect
             self.redirect(f"{self.request.protocol}://{self.request.host}/hello")
+        elif path == "/combo_entry":
+            # Redirect to a PATH-LESS wrapper that carries a percent-encoded
+            # target (with its own &-separated params) plus an extra distinct
+            # URL. Resolving the full target needs: keeping the query on a
+            # path-less request (http path-less fix) and sending it
+            # percent-encoded (redirector raw-request fix); and the extra URL
+            # must NOT surface, because the followed wrapper's query is not
+            # re-extracted (REDIRECTED-guard fix).
+            target = "http%3A%2F%2Fdest.com%2F%3Fa%3D1%26b%3D2"
+            other = "http%3A%2F%2Fother.com%2F"
+            self.redirect(f"{self.request.protocol}://{self.request.host}?u={target}&other={other}")
+        elif path == "/" and self.get_query_argument("u", default=None):
+            # Path-less wrapper: redirect to the URL carried (percent-encoded)
+            # in the u query parameter.
+            self.redirect(self.get_query_argument("u"))
         else:
             self.send_response(200)
         self.set_header("Content-Type", "text/plain")
 
 def make_app():
     return tornado.web.Application([
-        (r"(/[^/]+)", MainHandler),
+        (r"(/.*)", MainHandler),
     ])
 
 async def main():
index 862fea4fcb06bc9c6f934fa2a6c9dc303a6333b2..69f747b4284859a63be40a3a4eadb021944231c2 100644 (file)
@@ -335,4 +335,128 @@ Content-Type: text/html
     task:destroy()
   end)
 
+  test("Query-embedded URL extraction is bounded by its parameter", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html><body>
+<a href="http://wrap.com/r?u=http%3A%2F%2Fdest.com%2F&b=x&c=y">link</a>
+</body></html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local found
+    for _, u in ipairs(task:get_urls() or {}) do
+      if u:get_host() == "dest.com" then
+        found = u:get_text()
+      end
+    end
+
+    assert_not_nil(found, "embedded query URL should be extracted")
+    assert_equal("http://dest.com/", found,
+        "embedded URL must stop at the parameter boundary, not swallow &b=x&c=y")
+
+    task:destroy()
+  end)
+
+  test("Query-embedded URL inherits CTA from its parent href", function()
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html><body>
+<a href="http://wrap.com/r?u=http%3A%2F%2Fdest.com%2F">Click here to continue</a>
+</body></html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local found_cta = false
+    for _, part in ipairs(task:get_text_parts() or {}) do
+      if part:is_html() then
+        for _, u in ipairs(part:get_cta_urls({ original = true }) or {}) do
+          if u:get_host() == "dest.com" then
+            found_cta = true
+          end
+        end
+      end
+    end
+
+    assert_true(found_cta,
+        "query-extracted destination should inherit CTA from its parent href")
+
+    task:destroy()
+  end)
+
+  test("Nested query-embedded URLs are followed to the leaf", function()
+    -- href wraps mid, whose (escaped) query wraps deep
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html><body>
+<a href="http://wrap.com/r?u=http%3A%2F%2Fmid.com%2F%3Fv%3Dhttp%253A%252F%252Fdeep.com%252F">link</a>
+</body></html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local hosts = {}
+    for _, u in ipairs(task:get_urls() or {}) do
+      hosts[u:get_host()] = true
+    end
+
+    assert_true(hosts["mid.com"], "first-level embedded URL should be extracted")
+    assert_true(hosts["deep.com"], "nested embedded URL should be extracted")
+
+    task:destroy()
+  end)
+
+  test("Nested query-embedded URLs stop at RSPAMD_URL_QUERY_MAX_NESTING", function()
+    -- wrap?u=l1?v=l2?w=l3?x=l4?y=l5?z=l6, each level escaped one layer deeper.
+    -- With the nesting cap at 5, l1..l5 are extracted but l6 (a 6th level) is not.
+    local msg = [[
+From: test@example.com
+To: nobody@example.com
+Subject: test
+Content-Type: text/html
+
+<html><body>
+<a href="http://wrap.com/r?u=http%3A%2F%2Fl1.com%2F%3Fu%3Dhttp%253A%252F%252Fl2.com%252F%253Fv%253Dhttp%25253A%25252F%25252Fl3.com%25252F%25253Fw%25253Dhttp%2525253A%2525252F%2525252Fl4.com%2525252F%2525253Fx%2525253Dhttp%252525253A%252525252F%252525252Fl5.com%252525252F%252525253Fy%252525253Dhttp%25252525253A%25252525252F%25252525252Fl6.com%25252525252F">link</a>
+</body></html>
+]]
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+
+    task:process_message()
+
+    local hosts = {}
+    for _, u in ipairs(task:get_urls() or {}) do
+      hosts[u:get_host()] = true
+    end
+
+    assert_true(hosts["l1.com"], "level-1 embedded URL should be extracted")
+    assert_true(hosts["l2.com"], "level-2 embedded URL should be extracted")
+    assert_true(hosts["l3.com"], "level-3 embedded URL should be extracted")
+    assert_true(hosts["l4.com"], "level-4 embedded URL should be extracted")
+    assert_true(hosts["l5.com"], "level-5 embedded URL should be extracted")
+    assert_nil(hosts["l6.com"], "level-6 URL is past the nesting cap and must not be extracted")
+
+    task:destroy()
+  end)
+
 end)