git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Fix] Preserve duplicate URLs across MIME parts
author: Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 6 Mar 2026 08:44:21 +0000 (08:44 +0000)
committer: Vsevolod Stakhov <vsevolod@rspamd.com>
Fri, 6 Mar 2026 08:46:32 +0000 (08:46 +0000)
Do not suppress URLs from mime_part:get_urls() when the same URL was already seen in another MIME part. This restores per-part URL visibility for multipart/alternative messages and keeps text/plain URLs available even when text/html contains the same links.

src/libserver/html/html.cxx
src/libserver/url.c
src/lua/lua_task.c
test/lua/unit/task.lua

index a67d35bf72398d832fadc3442599df3145be8cc5..62aeef3bb6b5dc82bafab260fc4761457d120ed5 100644 (file)
@@ -1400,6 +1400,14 @@ struct rspamd_html_url_query_cbd {
        uint32_t parent_flags; /* Flags from outer URL to propagate */
 };
 
+static inline auto /* Append url to part_urls; does nothing when part_urls is NULL */
+html_part_add_url(GPtrArray *part_urls, struct rspamd_url *url) -> void
+{
+       if (part_urls) {
+               g_ptr_array_add(part_urls, url);
+       }
+}
+
 static gboolean
 html_url_query_callback(struct rspamd_url *url, gsize start_offset,
                                                gsize end_offset, gpointer ud)
@@ -1426,8 +1434,8 @@ html_url_query_callback(struct rspamd_url *url, gsize start_offset,
        /* Propagate source/classification flags from the parent (outer) URL */
        url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
 
-       if (rspamd_url_set_add_or_increase(cbd->url_set, url, false) && cbd->part_urls) {
-               g_ptr_array_add(cbd->part_urls, url);
+       if (rspamd_url_set_add_or_increase(cbd->url_set, url, false)) {
+               html_part_add_url(cbd->part_urls, url);
        }
 
        return TRUE;
@@ -1454,9 +1462,7 @@ html_process_query_url(rspamd_mempool_t *pool, struct rspamd_url *url,
                                                                 html_url_query_callback, &qcbd, L);
        }
 
-       if (part_urls) {
-               g_ptr_array_add(part_urls, url);
-       }
+       html_part_add_url(part_urls, url);
 }
 
 static auto
@@ -1576,9 +1582,8 @@ html_process_img_tag(rspamd_mempool_t *pool,
                                                        existing->flags |= img->url->flags;
                                                        existing->count++;
                                                }
-                                               else if (part_urls) {
-                                                       /* New url */
-                                                       g_ptr_array_add(part_urls, img->url);
+                                               else {
+                                                       html_part_add_url(part_urls, img->url);
                                                }
                                        }
                                }
@@ -2378,9 +2383,7 @@ auto html_process_input(struct rspamd_task *task,
                                                url->count++;
                                        }
                                }
-                               if (part_urls) {
-                                       g_ptr_array_add(part_urls, url);
-                               }
+                               html_part_add_url(part_urls, url);
 
                                /* Minimal link features collection */
                                hc->features.links.total_links++;
index 5034bd5e56111c4cdeaf08e247b89cddc01314c0..4c2871070b342cd4302bcff349735e639a13ba33 100644 (file)
@@ -3624,6 +3624,14 @@ struct rspamd_url_mimepart_cbdata {
        uint32_t parent_flags;   /* Flags from outer URL to propagate to query URLs */
 };
 
+static inline void /* Append url to part->urls; does nothing when part or part->urls is NULL */
+rspamd_mime_part_add_url(struct rspamd_mime_part *part, struct rspamd_url *url)
+{
+       if (part && part->urls) {
+               g_ptr_array_add(part->urls, url);
+       }
+}
+
 static gboolean
 rspamd_url_query_callback(struct rspamd_url *url, gsize start_offset,
                                                  gsize end_offset, gpointer ud)
@@ -3655,10 +3663,9 @@ rspamd_url_query_callback(struct rspamd_url *url, gsize start_offset,
        /* Propagate source/classification flags from the parent (outer) URL */
        url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
 
+       rspamd_mime_part_add_url(cbd->part ? cbd->part->mime_part : NULL, url);
+
        if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
-               if (cbd->part && cbd->part->mime_part->urls) {
-                       g_ptr_array_add(cbd->part->mime_part->urls, url);
-               }
 
                url->part_order = cbd->cur_part_order++;
 
@@ -3727,14 +3734,16 @@ rspamd_url_text_part_callback(struct rspamd_url *url, gsize start_offset,
                url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
        }
 
-       if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) &&
-               cbd->part->mime_part->urls) {
+       rspamd_mime_part_add_url(cbd->part ? cbd->part->mime_part : NULL, url);
+
+       if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
+               rspamd_mime_part_add_url(cbd->part ? cbd->part->mime_part : NULL, url);
+
                url->part_order = cbd->cur_part_order++;
 
                if (cbd->cur_url_order) {
                        url->order = (*cbd->cur_url_order)++;
                }
-               g_ptr_array_add(cbd->part->mime_part->urls, url);
        }
 
        cbd->part->exceptions = g_list_prepend(
index 239bd896e79db85c6f69a40e9ab0239e74b0d298..441618b545afedb0c9c6f42257824be7a409015c 100644 (file)
@@ -2953,10 +2953,12 @@ inject_url_query_callback(struct rspamd_url *url, gsize start_offset,
 
        url->flags |= RSPAMD_URL_FLAG_QUERY;
 
-       if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) && cbd->mpart_urls) {
+       if (cbd->mpart_urls) {
                g_ptr_array_add(cbd->mpart_urls, url);
        }
 
+       rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false);
+
        return TRUE;
 }
 
@@ -2997,10 +2999,10 @@ lua_task_inject_url(lua_State *L)
                                          rspamd_lua_check_udata_maybe(L, 3, rspamd_mimepart_classname));
        }
        if (task && task->message && url && url->url) {
-               if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url->url, false)) {
-                       if (mpart && mpart->urls) {
-                               inject_url_query(task, url->url, mpart->urls);
-                       }
+               rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url->url, false);
+
+               if (mpart && mpart->urls) {
+                       inject_url_query(task, url->url, mpart->urls);
                }
        }
        else {
index ec2a1cd4c20ea04002bb3a47895d1436e2f3b499..9a0b98122b2c027d2330f5f2d190ec9f01648653 100644 (file)
@@ -167,4 +167,63 @@ Thank you,
 
     task:destroy()
   end)
+
+  test("Part URLs are not deduplicated across MIME parts", function() -- both alternatives below carry the same two links; each part must still report them
+    local msg = table.concat {
+      hdrs,
+      'Content-Type: multipart/alternative; boundary=XXX\n',
+      '\n',
+      '--XXX\n',
+      'Content-Type: text/plain\n',
+      '\n',
+      'Visit <http://example.com/a> and <http://example.com/b>\n',
+      '\n',
+      '--XXX\n',
+      'Content-Type: text/html\n',
+      '\n',
+      '<html><body>' ..
+        '<a href="http://example.com/a">A</a>' ..
+        '<a href="http://example.com/b">B</a>' ..
+      '</body></html>\n',
+      '\n',
+      '--XXX--\n',
+    }
+    local res, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(res, "failed to load message")
+    task:process_message()
+
+    local parts = task:get_parts()
+    assert_true(#parts >= 2, "should have at least two MIME parts")
+
+    local function uniq_urls(part) -- returns the part's URLs as "host/path" strings with repeats inside the same part dropped
+      local seen = {} -- "host/path" strings already emitted for this part
+
+      return fun.totable(fun.filter(function(v)
+        if seen[v] then
+          return false
+        end
+
+        seen[v] = true
+        return true
+      end, fun.map(function(u)
+        return u:get_host() .. '/' .. u:get_path() -- comparison key, e.g. 'example.com/a' as used in the expectations below
+      end, part:get_urls())))
+    end
+
+    assert_rspamd_table_eq_sorted({
+      actual = uniq_urls(parts[#parts - 1]), -- second-to-last part; presumably the text/plain alternative (TODO confirm get_parts() ordering)
+      expect = {
+        'example.com/a', 'example.com/b'
+      }
+    })
+
+    assert_rspamd_table_eq_sorted({
+      actual = uniq_urls(parts[#parts]), -- last part; presumably the text/html alternative (TODO confirm get_parts() ordering)
+      expect = {
+        'example.com/a', 'example.com/b'
+      }
+    })
+
+    task:destroy()
+  end)
 end)