Do not suppress URLs from mime_part:get_urls() when the same URL was already seen in another MIME part. This restores per-part URL visibility for multipart/alternative messages and keeps text/plain URLs available even when text/html contains the same links.
uint32_t parent_flags; /* Flags from outer URL to propagate */
};
+/**
+ * Append `url` to the per-part URL array when the caller supplied one.
+ * A null `part_urls` means no per-part list is being collected, so the
+ * call is a no-op. The pointer is stored as-is.
+ */
+static inline auto
+html_part_add_url(GPtrArray *part_urls, struct rspamd_url *url) -> void
+{
+	if (!part_urls) {
+		return;
+	}
+
+	g_ptr_array_add(part_urls, url);
+}
+
static gboolean
html_url_query_callback(struct rspamd_url *url, gsize start_offset,
gsize end_offset, gpointer ud)
/* Propagate source/classification flags from the parent (outer) URL */
url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
- if (rspamd_url_set_add_or_increase(cbd->url_set, url, false) && cbd->part_urls) {
- g_ptr_array_add(cbd->part_urls, url);
+ if (rspamd_url_set_add_or_increase(cbd->url_set, url, false)) {
+ html_part_add_url(cbd->part_urls, url);
}
return TRUE;
html_url_query_callback, &qcbd, L);
}
- if (part_urls) {
- g_ptr_array_add(part_urls, url);
- }
+ html_part_add_url(part_urls, url);
}
static auto
existing->flags |= img->url->flags;
existing->count++;
}
- else if (part_urls) {
- /* New url */
- g_ptr_array_add(part_urls, img->url);
+ else {
+ html_part_add_url(part_urls, img->url);
}
}
}
url->count++;
}
}
- if (part_urls) {
- g_ptr_array_add(part_urls, url);
- }
+ html_part_add_url(part_urls, url);
/* Minimal link features collection */
hc->features.links.total_links++;
uint32_t parent_flags; /* Flags from outer URL to propagate to query URLs */
};
+/**
+ * Append `url` to the URL array of a MIME part.
+ * Does nothing when `part` is NULL or the part has no `urls` array.
+ */
+static inline void
+rspamd_mime_part_add_url(struct rspamd_mime_part *part, struct rspamd_url *url)
+{
+	if (!part || !part->urls) {
+		return;
+	}
+
+	g_ptr_array_add(part->urls, url);
+}
+
static gboolean
rspamd_url_query_callback(struct rspamd_url *url, gsize start_offset,
gsize end_offset, gpointer ud)
/* Propagate source/classification flags from the parent (outer) URL */
url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
+ rspamd_mime_part_add_url(cbd->part ? cbd->part->mime_part : NULL, url);
+
if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
- if (cbd->part && cbd->part->mime_part->urls) {
- g_ptr_array_add(cbd->part->mime_part->urls, url);
- }
url->part_order = cbd->cur_part_order++;
url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
}
- if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) &&
- cbd->part->mime_part->urls) {
+ /* Record the URL on its MIME part exactly once, whether or not it is new to the task-wide set */
+ rspamd_mime_part_add_url(cbd->part ? cbd->part->mime_part : NULL, url);
+
+ if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
url->part_order = cbd->cur_part_order++;
if (cbd->cur_url_order) {
url->order = (*cbd->cur_url_order)++;
}
- g_ptr_array_add(cbd->part->mime_part->urls, url);
}
cbd->part->exceptions = g_list_prepend(
url->flags |= RSPAMD_URL_FLAG_QUERY;
- if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) && cbd->mpart_urls) {
+ if (cbd->mpart_urls) {
g_ptr_array_add(cbd->mpart_urls, url);
}
+ rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false);
+
return TRUE;
}
rspamd_lua_check_udata_maybe(L, 3, rspamd_mimepart_classname));
}
if (task && task->message && url && url->url) {
- if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url->url, false)) {
- if (mpart && mpart->urls) {
- inject_url_query(task, url->url, mpart->urls);
- }
+ rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url->url, false);
+
+ if (mpart && mpart->urls) {
+ inject_url_query(task, url->url, mpart->urls);
}
}
else {
task:destroy()
end)
+
+  -- The same two links appear in both the text/plain and the text/html
+  -- alternative; each part must still report both of them.
+  test("Part URLs are not deduplicated across MIME parts", function()
+    local msg = table.concat {
+      hdrs,
+      'Content-Type: multipart/alternative; boundary=XXX\n',
+      '\n',
+      '--XXX\n',
+      'Content-Type: text/plain\n',
+      '\n',
+      'Visit <http://example.com/a> and <http://example.com/b>\n',
+      '\n',
+      '--XXX\n',
+      'Content-Type: text/html\n',
+      '\n',
+      '<html><body>' ..
+          '<a href="http://example.com/a">A</a>' ..
+          '<a href="http://example.com/b">B</a>' ..
+          '</body></html>\n',
+      '\n',
+      '--XXX--\n',
+    }
+    local ok, task = rspamd_task.load_from_string(msg, rspamd_config)
+    assert_true(ok, "failed to load message")
+    task:process_message()
+
+    local parts = task:get_parts()
+    assert_true(#parts >= 2, "should have at least two MIME parts")
+
+    -- Returns the distinct "host/path" keys of a part's URLs, in first-seen order.
+    local function uniq_urls(part)
+      local seen = {}
+
+      local function host_and_path(u)
+        return u:get_host() .. '/' .. u:get_path()
+      end
+
+      local function first_occurrence(key)
+        if seen[key] then
+          return false
+        end
+
+        seen[key] = true
+        return true
+      end
+
+      local keys = fun.map(host_and_path, part:get_urls())
+
+      return fun.totable(fun.filter(first_occurrence, keys))
+    end
+
+    -- Check the last two parts in message order.
+    for _, part in ipairs({ parts[#parts - 1], parts[#parts] }) do
+      assert_rspamd_table_eq_sorted({
+        actual = uniq_urls(part),
+        expect = {
+          'example.com/a', 'example.com/b'
+        }
+      })
+    end
+
+    task:destroy()
+  end)
end)