khash_t(rspamd_url_hash) * url_set;
struct rspamd_url *url;
GPtrArray *part_urls;
+ uint32_t parent_flags; /* Flags from outer URL to propagate */
};
static gboolean
url->flags |= RSPAMD_URL_FLAG_QUERY;
+ /* Propagate source/classification flags from the parent (outer) URL */
+ url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
+
if (rspamd_url_set_add_or_increase(cbd->url_set, url, false) && cbd->part_urls) {
g_ptr_array_add(cbd->part_urls, url);
}
qcbd.url_set = url_set;
qcbd.url = url;
qcbd.part_urls = part_urls;
+ qcbd.parent_flags = url->flags;
rspamd_url_find_multiple(pool,
rspamd_url_query_unsafe(url), url->querylen,
gsize url_len;
uint16_t *cur_url_order; /* Global ordering */
uint16_t cur_part_order; /* Per part ordering */
+ uint32_t parent_flags; /* Flags from outer URL to propagate to query URLs */
};
static gboolean
url->flags |= RSPAMD_URL_FLAG_QUERY;
- /* For computed parts (e.g., PDF extracted text), also mark as content URL */
- if (cbd->part && (cbd->part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED)) {
- url->flags |= RSPAMD_URL_FLAG_CONTENT;
- }
+ /* Propagate source/classification flags from the parent (outer) URL */
+ url->flags |= (cbd->parent_flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
if (cbd->part && cbd->part->mime_part->urls) {
/* We also search the query for additional url inside */
if (url->querylen > 0) {
+ struct rspamd_url_mimepart_cbdata qcbd = *cbd;
+ qcbd.parent_flags = url->flags;
rspamd_url_find_multiple(task->task_pool,
rspamd_url_query_unsafe(url), url->querylen,
RSPAMD_URL_FIND_ALL, NULL,
- rspamd_url_query_callback, cbd,
+ rspamd_url_query_callback, &qcbd,
task->cfg ? task->cfg->lua_state : NULL);
}
task->cfg ? task->cfg->lua_state : NULL);
if (rc == URI_ERRNO_OK &&
- url->hostlen > 0) {
+ query_url->hostlen > 0) {
msg_debug_task("found url %s in query of url"
" %*s",
url_str, url->querylen, rspamd_url_query_unsafe(url));
+ query_url->flags |= RSPAMD_URL_FLAG_QUERY;
+ /* Propagate source/classification flags from the parent URL */
+ query_url->flags |= (url->flags & RSPAMD_URL_FLAG_PROPAGATE_MASK);
+
if (prefix_added) {
query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
}
};
#define RSPAMD_URL_MAX_FLAG_SHIFT (26u)
+/*
+ * Flags that should propagate from an outer (parent) URL to an inner URL
+ * extracted from its query string. These are source/classification flags
+ * that describe where the URL was found, not structural properties of the
+ * URL itself.
+ */
+#define RSPAMD_URL_FLAG_PROPAGATE_MASK \
+ (RSPAMD_URL_FLAG_FROM_TEXT | RSPAMD_URL_FLAG_CONTENT | \
+ RSPAMD_URL_FLAG_SUBJECT | RSPAMD_URL_FLAG_INVISIBLE)
+
struct rspamd_url_tag {
const char *data;
struct rspamd_url_tag *prev, *next;