git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Propagate source/classification URL flags to query-extracted URLs
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 24 Feb 2026 21:42:03 +0000 (21:42 +0000)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Tue, 24 Feb 2026 21:42:03 +0000 (21:42 +0000)
When a URL is found inside the query string of another URL (e.g.
http://redir.com/?q=http://target.com), the inner URL now inherits
source/classification flags (FROM_TEXT, CONTENT, SUBJECT, INVISIBLE)
from the outer URL via RSPAMD_URL_FLAG_PROPAGATE_MASK.

Previously, inner URLs only received the QUERY flag, losing all context
about where the parent URL was found. This caused inconsistencies in
plugins that filter URLs by source flags (e.g. RBL content URL filtering).

Also fixes two bugs in the subject path (rspamd_url_task_subject_callback):
- hostlen check used outer URL instead of inner query URL
- QUERY flag was not set on URLs extracted from subject URL queries

src/libserver/html/html.cxx
src/libserver/url.c
src/libserver/url.h

index 3604df09e0dcf4a7a84929b3cdf3379953bf6740..a67d35bf72398d832fadc3442599df3145be8cc5 100644 (file)
@@ -1397,6 +1397,7 @@ struct rspamd_html_url_query_cbd {
        khash_t(rspamd_url_hash) * url_set;
        struct rspamd_url *url;
        GPtrArray *part_urls;
+       uint32_t parent_flags; /* Flags from outer URL to propagate */
 };
 
 static gboolean
@@ -1422,6 +1423,9 @@ html_url_query_callback(struct rspamd_url *url, gsize start_offset,
 
        url->flags |= RSPAMD_URL_FLAG_QUERY;
 
+       /* Propagate source/classification flags from the parent (outer) URL */
+       url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
+
        if (rspamd_url_set_add_or_increase(cbd->url_set, url, false) && cbd->part_urls) {
                g_ptr_array_add(cbd->part_urls, url);
        }
@@ -1442,6 +1446,7 @@ html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
                qcbd.url_set = url_set;
                qcbd.url = url;
                qcbd.part_urls = part_urls;
+               qcbd.parent_flags = url->flags;
 
                rspamd_url_find_multiple(pool,
                                                                 rspamd_url_query_unsafe(url), url->querylen,
index ef1b55db2089ed718ce1b98660a975a0e1705df0..5034bd5e56111c4cdeaf08e247b89cddc01314c0 100644 (file)
@@ -3621,6 +3621,7 @@ struct rspamd_url_mimepart_cbdata {
        gsize url_len;
        uint16_t *cur_url_order; /* Global ordering */
        uint16_t cur_part_order; /* Per part ordering */
+       uint32_t parent_flags;   /* Flags from outer URL to propagate to query URLs */
 };
 
 static gboolean
@@ -3651,10 +3652,8 @@ rspamd_url_query_callback(struct rspamd_url *url, gsize start_offset,
 
        url->flags |= RSPAMD_URL_FLAG_QUERY;
 
-       /* For computed parts (e.g., PDF extracted text), also mark as content URL */
-       if (cbd->part && (cbd->part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED)) {
-               url->flags |= RSPAMD_URL_FLAG_CONTENT;
-       }
+       /* Propagate source/classification flags from the parent (outer) URL */
+       url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
 
        if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
                if (cbd->part && cbd->part->mime_part->urls) {
@@ -3744,10 +3743,12 @@ rspamd_url_text_part_callback(struct rspamd_url *url, gsize start_offset,
 
        /* We also search the query for additional url inside */
        if (url->querylen > 0) {
+               struct rspamd_url_mimepart_cbdata qcbd = *cbd;
+               qcbd.parent_flags = url->flags;
                rspamd_url_find_multiple(task->task_pool,
                                                                 rspamd_url_query_unsafe(url), url->querylen,
                                                                 RSPAMD_URL_FIND_ALL, NULL,
-                                                                rspamd_url_query_callback, cbd,
+                                                                rspamd_url_query_callback, &qcbd,
                                                                 task->cfg ? task->cfg->lua_state : NULL);
        }
 
@@ -3921,11 +3922,15 @@ rspamd_url_task_subject_callback(struct rspamd_url *url, gsize start_offset,
                                                                  task->cfg ? task->cfg->lua_state : NULL);
 
                        if (rc == URI_ERRNO_OK &&
-                               url->hostlen > 0) {
+                               query_url->hostlen > 0) {
                                msg_debug_task("found url %s in query of url"
                                                           " %*s",
                                                           url_str, url->querylen, rspamd_url_query_unsafe(url));
 
+                               query_url->flags |= RSPAMD_URL_FLAG_QUERY;
+                               /* Propagate source/classification flags from the parent URL */
+                               query_url->flags |= (url->flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
+
                                if (prefix_added) {
                                        query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
                                }
index 7c248316a3a8d674c3f99acfd7bed5a885ce521d..d5ed9134efb7d69f16df96a811226f0252d8e879 100644 (file)
@@ -63,6 +63,16 @@ enum rspamd_url_flags {
 };
 #define RSPAMD_URL_MAX_FLAG_SHIFT (26u)
 
+/*
+ * Flags that should propagate from an outer (parent) URL to an inner URL
+ * extracted from its query string. These are source/classification flags
+ * that describe where the URL was found, not structural properties of the
+ * URL itself.
+ */
+#define RSPAMD_URL_FLAG_PROPAGATE_MASK                     \
+       (RSPAMD_URL_FLAG_FROM_TEXT | RSPAMD_URL_FLAG_CONTENT | \
+        RSPAMD_URL_FLAG_SUBJECT | RSPAMD_URL_FLAG_INVISIBLE)
+
 struct rspamd_url_tag {
        const char *data;
        struct rspamd_url_tag *prev, *next;