max_urls = 10240;
max_recipients = 1024;
+# Include content URLs (extracted from PDF, etc.) in URL API calls by default
+# These are URLs found in document content that may be clickable links
+# Set to false to exclude them from task:get_urls and similar methods
+# include_content_urls = true;
+
dns {
timeout = 1s;
sockets = 16;
- - prefix <string> cache prefix (default = nil)
- - ignore_redirected <bool> (default = false)
- - need_images <bool> (default = false)
-- - need_content <bool> (default = false)
+- - need_content <bool> (default = nil, uses global include_content_urls config which defaults to true)
-- }
-- Apply heuristic in extracting of urls from task, this function
-- tries its best to extract specific number of urls from a task based on
esld_limit = 9999,
need_emails = false,
need_images = false,
- need_content = false,
+ need_content = nil, -- nil means use global include_content_urls config (default: true)
filter = nil,
prefix = nil,
ignore_ip = false,
if params.flags then
cache_key_suffix = table.concat(params.flags) .. (params.flags_mode or '')
else
+ -- Use tostring directly to distinguish nil (config default) from false (explicit exclude)
cache_key_suffix = string.format('%s%s%s',
tostring(params.need_emails or false),
tostring(params.need_images or false),
- tostring(params.need_content or false))
+ tostring(params.need_content)) -- nil = config default, false = explicit exclude
end
cache_key = string.format('sp_urls_%d%s', params.limit, cache_key_suffix)
end
gboolean enable_css_parser; /**< Enable css parsing in HTML */
gboolean enable_mime_utf; /**< Enable utf8 mime parsing */
gboolean enable_url_rewrite; /**< Enable HTML URL rewriting */
+ gboolean include_content_urls; /**< Include content URLs (from PDF etc) in API calls */
gboolean composites_inverted_index; /**< Use inverted index for composite lookup */
gboolean composites_stats_always; /**< Always collect composite stats (not sampled) */
G_STRUCT_OFFSET(struct rspamd_config, enable_url_rewrite),
0,
"Enable HTML URL rewriting");
+ rspamd_rcl_add_default_handler(sub,
+ "include_content_urls",
+ rspamd_rcl_parse_struct_boolean,
+ G_STRUCT_OFFSET(struct rspamd_config, include_content_urls),
+ 0,
+ "Include URLs extracted from content (PDF, etc.) in URL API calls (default: true)");
rspamd_rcl_add_default_handler(sub,
"composites_inverted_index",
rspamd_rcl_parse_struct_boolean,
cfg->enable_css_parser = true;
cfg->enable_mime_utf = false;
cfg->enable_url_rewrite = false;
+ cfg->include_content_urls = true; /* Include URLs from PDF/content by default */
cfg->url_rewrite_lua_func = nullptr;
cfg->composites_inverted_index = true; /* Enable inverted index by default */
cfg->composites_stats_always = false; /* Use probabilistic sampling by default */
url->flags |= RSPAMD_URL_FLAG_QUERY;
+ /* For computed parts (e.g., PDF extracted text), also mark as content URL */
+ if (cbd->part && (cbd->part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED)) {
+ url->flags |= RSPAMD_URL_FLAG_CONTENT;
+ }
if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
if (cbd->part && cbd->part->mime_part->urls) {
}
}
- url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
+ /*
+ * For computed/virtual parts (e.g., text extracted from PDF), use CONTENT flag
+ * instead of FROM_TEXT. These URLs may be clickable links in the original document
+ * rather than plain text URLs.
+ */
+ if (cbd->part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED) {
+ url->flags |= RSPAMD_URL_FLAG_CONTENT;
+ }
+ else {
+ url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
+ }
if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) &&
cbd->part->mime_part->urls) {
/***
* @method task:get_urls([need_emails|list_protos][, need_images])
* Get all URLs found in a message. Telephone urls and emails are not included unless explicitly asked in `list_protos`
+ * Content URLs (extracted from PDF and other content types) are included by default unless
+ * `include_content_urls` global option is set to false.
* @param {boolean} need_emails if `true` then also return email urls; this can instead be a comma separated string of the desired protocols or a table (e.g. `mailto` or `telephone`)
* @param {boolean} need_images return urls from images (<img src=...>) as well
* @return {table rspamd_url} list of all urls found
return 1;
}
- /* Exclude RSPAMD_URL_FLAG_CONTENT to preserve backward compatibility */
+ /*
+ * Content URLs are included unless the include_content_urls config
+ * option is disabled (it defaults to true).
+ * Image URLs are always excluded unless explicitly requested.
+ */
+ unsigned int default_flags = ~RSPAMD_URL_FLAG_IMAGE;
+ if (task->cfg && !task->cfg->include_content_urls) {
+ default_flags &= ~RSPAMD_URL_FLAG_CONTENT;
+ }
+
if (!lua_url_cbdata_fill(L, 2, &cb, default_protocols_mask,
- ~(RSPAMD_URL_FLAG_CONTENT | RSPAMD_URL_FLAG_IMAGE),
+ default_flags,
max_urls)) {
return luaL_error(L, "invalid arguments");
}
max_urls = task->cfg->max_lua_urls;
}
+ /*
+ * Content URLs are included unless the include_content_urls config
+ * option is disabled (it defaults to true).
+ * Image URLs are always excluded unless explicitly requested.
+ */
+ unsigned int default_flags = ~RSPAMD_URL_FLAG_IMAGE;
+ if (task->cfg && !task->cfg->include_content_urls) {
+ default_flags &= ~RSPAMD_URL_FLAG_CONTENT;
+ }
+
if (!lua_url_cbdata_fill(L, 2, &cb, PROTOCOL_MAILTO,
- ~(RSPAMD_URL_FLAG_CONTENT | RSPAMD_URL_FLAG_IMAGE),
+ default_flags,
max_urls)) {
return luaL_error(L, "invalid arguments");
}
flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
}
}
- else {
- flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
- }
+ /* If content is nil/not specified, leave flags_mask untouched so that the
+ * default mask supplied to this function (which respects the
+ * include_content_urls config option) stays in effect */
lua_pop(L, 1);
}