[Feature] Include content URLs by default in URL API calls

author Vsevolod Stakhov <vsevolod@rspamd.com>

Wed, 21 Jan 2026 09:57:54 +0000 (09:57 +0000)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Wed, 21 Jan 2026 09:57:54 +0000 (09:57 +0000)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 21 Jan 2026 09:57:54 +0000 (09:57 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 21 Jan 2026 09:57:54 +0000 (09:57 +0000)
diff --git a/conf/options.inc b/conf/options.inc

index fb6c6afd1ff81e5b6fdf353e991c756f741b3a8b..414fcb7d1cbeee3604b3edc06aee498912dee565 100644 (file)
--- a/conf/options.inc
+++ b/conf/options.inc
@@ -30,6 +30,11 @@ max_lua_urls = 1024;
  max_urls = 10240;
  max_recipients = 1024;
  
+# Include content URLs (extracted from PDF, etc.) in URL API calls by default
+# These are URLs found in document content that may be clickable links
+# Set to false to exclude them from task:get_urls and similar methods
+# include_content_urls = true;
+
  dns {
         timeout = 1s;
         sockets = 16;
diff --git a/lualib/lua_util.lua b/lualib/lua_util.lua

index 06571e638da82826f2f3fcda30b886bcadb41593..e936019acd591cd75a6335b883af3c467b799c55 100644 (file)
--- a/lualib/lua_util.lua
+++ b/lualib/lua_util.lua
@@ -1170,7 +1170,7 @@ end
  -- - prefix <string> cache prefix (default = nil)
  -- - ignore_redirected <bool> (default = false)
  -- - need_images <bool> (default = false)
--- - need_content <bool> (default = false)
+-- - need_content <bool> (default = nil, uses global include_content_urls config which defaults to true)
  -- }
  -- Apply heuristic in extracting of urls from task, this function
  -- tries its best to extract specific number of urls from a task based on
@@ -1183,7 +1183,7 @@ exports.extract_specific_urls = function(params_or_task, lim, need_emails, filte
      esld_limit = 9999,
      need_emails = false,
      need_images = false,
-    need_content = false,
+    need_content = nil, -- nil means use global include_content_urls config (default: true)
      filter = nil,
      prefix = nil,
      ignore_ip = false,
@@ -1227,10 +1227,11 @@ exports.extract_specific_urls = function(params_or_task, lim, need_emails, filte
        if params.flags then
          cache_key_suffix = table.concat(params.flags) .. (params.flags_mode or '')
        else
+        -- Use tostring directly to distinguish nil (config default) from false (explicit exclude)
          cache_key_suffix = string.format('%s%s%s',
            tostring(params.need_emails or false),
            tostring(params.need_images or false),
-          tostring(params.need_content or false))
+          tostring(params.need_content)) -- nil = config default, false = explicit exclude
        end
        cache_key = string.format('sp_urls_%d%s', params.limit, cache_key_suffix)
      end
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h

index 8794004be79de661ce9ccca9dba51c49e2d38d75..ed0c1f1670c79fffd6eb5eec740bbc2cd53f4813 100644 (file)
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -377,6 +377,7 @@ struct rspamd_config {
         gboolean enable_css_parser;                              /**< Enable css parsing in HTML                                                        */
         gboolean enable_mime_utf;                                /**< Enable utf8 mime parsing                                                  */
         gboolean enable_url_rewrite;                             /**< Enable HTML URL rewriting                                                 */
+       gboolean include_content_urls;                           /**< Include content URLs (from PDF etc) in API calls  */
  
         gboolean composites_inverted_index; /**< Use inverted index for composite lookup                        */
         gboolean composites_stats_always;   /**< Always collect composite stats (not sampled)           */
diff --git a/src/libserver/cfg_rcl.cxx b/src/libserver/cfg_rcl.cxx

index 59f498bb2ac1d2ba68f420ef35e70c31a6c2268e..5045624490dc52ab0109d7608b1576e24d16856a 100644 (file)
--- a/src/libserver/cfg_rcl.cxx
+++ b/src/libserver/cfg_rcl.cxx
@@ -2117,6 +2117,12 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections)
                                                                            G_STRUCT_OFFSET(struct rspamd_config, enable_url_rewrite),
                                                                            0,
                                                                            "Enable HTML URL rewriting");
+               rspamd_rcl_add_default_handler(sub,
+                                                                          "include_content_urls",
+                                                                          rspamd_rcl_parse_struct_boolean,
+                                                                          G_STRUCT_OFFSET(struct rspamd_config, include_content_urls),
+                                                                          0,
+                                                                          "Include URLs extracted from content (PDF, etc.) in URL API calls (default: true)");
                 rspamd_rcl_add_default_handler(sub,
                                                                            "composites_inverted_index",
                                                                            rspamd_rcl_parse_struct_boolean,
diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx

index c772c1f57883a2753dd88642a1b6a9da9763ef36..fcfab6ae9cc4b855484416d82854eee631aaed46 100644 (file)
--- a/src/libserver/cfg_utils.cxx
+++ b/src/libserver/cfg_utils.cxx
@@ -352,6 +352,7 @@ rspamd_config_new(enum rspamd_config_init_flags flags)
         cfg->enable_css_parser = true;
         cfg->enable_mime_utf = false;
         cfg->enable_url_rewrite = false;
+       cfg->include_content_urls = true; /* Include URLs from PDF/content by default */
         cfg->url_rewrite_lua_func = nullptr;
         cfg->composites_inverted_index = true; /* Enable inverted index by default */
         cfg->composites_stats_always = false;  /* Use probabilistic sampling by default */
diff --git a/src/libserver/url.c b/src/libserver/url.c

index 5e6c6f37c74c6857a7de527cad1dae5c8d5375e5..90d64af9501612db77ac8bb38a0972db60e1da89 100644 (file)
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -3651,6 +3651,10 @@ rspamd_url_query_callback(struct rspamd_url *url, gsize start_offset,
  
         url->flags |= RSPAMD_URL_FLAG_QUERY;
  
+       /* For computed parts (e.g., PDF extracted text), also mark as content URL */
+       if (cbd->part && (cbd->part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED)) {
+               url->flags |= RSPAMD_URL_FLAG_CONTENT;
+       }
  
         if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
                 if (cbd->part && cbd->part->mime_part->urls) {
@@ -3712,7 +3716,17 @@ rspamd_url_text_part_callback(struct rspamd_url *url, gsize start_offset,
                 }
         }
  
-       url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
+       /*
+        * For computed/virtual parts (e.g., text extracted from PDF), use CONTENT flag
+        * instead of FROM_TEXT. These URLs may be clickable links in the original document
+        * rather than plain text URLs.
+        */
+       if (cbd->part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED) {
+               url->flags |= RSPAMD_URL_FLAG_CONTENT;
+       }
+       else {
+               url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
+       }
  
         if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) &&
                 cbd->part->mime_part->urls) {
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c

index d47575aa35f4b0455c0379a180fe4e996e854e44..0043ba3f29caa18c2952a8510bba15d45460c7f5 100644 (file)
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -248,6 +248,8 @@ LUA_FUNCTION_DEF(task, append_message);
  /***
   * @method task:get_urls([need_emails|list_protos][, need_images])
   * Get all URLs found in a message. Telephone urls and emails are not included unless explicitly asked in `list_protos`
+ * Content URLs (extracted from PDF and other content types) are included by default unless
+ * `include_content_urls` global option is set to false.
   * @param {boolean} need_emails if `true` then return also email urls, this can be a comma separated string of protocols desired or a table (e.g. `mailto` or `telephone`)
   * @param {boolean} need_images return urls from images (<img src=...>) as well
   * @return {table rspamd_url} list of all urls found
@@ -2632,9 +2634,17 @@ lua_task_get_urls(lua_State *L)
                         return 1;
                 }
  
-               /* Exclude RSPAMD_URL_FLAG_CONTENT to preserve backward compatibility */
+               /*
+                * By default, include content URLs if configured (default: true).
+                * Always exclude image URLs unless explicitly requested.
+                */
+               unsigned int default_flags = ~RSPAMD_URL_FLAG_IMAGE;
+               if (task->cfg && !task->cfg->include_content_urls) {
+                       default_flags &= ~RSPAMD_URL_FLAG_CONTENT;
+               }
+
                 if (!lua_url_cbdata_fill(L, 2, &cb, default_protocols_mask,
-                                                                ~(RSPAMD_URL_FLAG_CONTENT | RSPAMD_URL_FLAG_IMAGE),
+                                                                default_flags,
                                                                  max_urls)) {
                         return luaL_error(L, "invalid arguments");
                 }
@@ -3095,8 +3105,17 @@ lua_task_get_emails(lua_State *L)
                                 max_urls = task->cfg->max_lua_urls;
                         }
  
+                       /*
+                        * By default, include content URLs if configured (default: true).
+                        * Always exclude image URLs unless explicitly requested.
+                        */
+                       unsigned int default_flags = ~RSPAMD_URL_FLAG_IMAGE;
+                       if (task->cfg && !task->cfg->include_content_urls) {
+                               default_flags &= ~RSPAMD_URL_FLAG_CONTENT;
+                       }
+
                         if (!lua_url_cbdata_fill(L, 2, &cb, PROTOCOL_MAILTO,
-                                                                        ~(RSPAMD_URL_FLAG_CONTENT | RSPAMD_URL_FLAG_IMAGE),
+                                                                        default_flags,
                                                                          max_urls)) {
                                 return luaL_error(L, "invalid arguments");
                         }
diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c

index f23a6323734dde6be9a824775edd60d9fef20c44..b123630fafeb318b00d0ae70d222ed1a7b0ae9cb 100644 (file)
--- a/src/lua/lua_url.c
+++ b/src/lua/lua_url.c
@@ -1311,9 +1311,8 @@ lua_url_cbdata_fill(lua_State *L,
                                                 flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
                                         }
                                 }
-                               else {
-                                       flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
-                               }
+                               /* If content is nil/not specified, keep the default_flags as-is
+                                * (which respects the include_content_urls config option) */
                                 lua_pop(L, 1);
                         }
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Wed, 21 Jan 2026 09:57:54 +0000 (09:57 +0000)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Wed, 21 Jan 2026 09:57:54 +0000 (09:57 +0000)
conf/options.inc		patch \| blob \| blame \| history
lualib/lua_util.lua		patch \| blob \| blame \| history
src/libserver/cfg_file.h		patch \| blob \| blame \| history
src/libserver/cfg_rcl.cxx		patch \| blob \| blame \| history
src/libserver/cfg_utils.cxx		patch \| blob \| blame \| history
src/libserver/url.c		patch \| blob \| blame \| history
src/lua/lua_task.c		patch \| blob \| blame \| history
src/lua/lua_url.c		patch \| blob \| blame \| history