git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Include content URLs by default in URL API calls
author Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 21 Jan 2026 09:57:54 +0000 (09:57 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Wed, 21 Jan 2026 09:57:54 +0000 (09:57 +0000)
- Add `include_content_urls` global option (default: true) to control
  whether URLs extracted from content (PDF, etc.) are included in API calls
- Update task:get_urls() and task:get_emails() to include content URLs by default
- Update lua_util.extract_specific_urls() to use config default when
  need_content is not explicitly specified
- Mark URLs extracted from computed/virtual parts (PDF text) with CONTENT
  flag instead of FROM_TEXT flag, since they may be clickable links
- Add commented documentation in conf/options.inc

Users who want the old behavior can set `include_content_urls = false`
in their options configuration.

conf/options.inc
lualib/lua_util.lua
src/libserver/cfg_file.h
src/libserver/cfg_rcl.cxx
src/libserver/cfg_utils.cxx
src/libserver/url.c
src/lua/lua_task.c
src/lua/lua_url.c

index fb6c6afd1ff81e5b6fdf353e991c756f741b3a8b..414fcb7d1cbeee3604b3edc06aee498912dee565 100644 (file)
@@ -30,6 +30,11 @@ max_lua_urls = 1024;
 max_urls = 10240;
 max_recipients = 1024;
 
+# Include content URLs (extracted from PDF, etc.) in URL API calls by default
+# These are URLs found in document content that may be clickable links
+# Set to false to exclude them from task:get_urls and similar methods
+# include_content_urls = true;
+
 dns {
        timeout = 1s;
        sockets = 16;
index 06571e638da82826f2f3fcda30b886bcadb41593..e936019acd591cd75a6335b883af3c467b799c55 100644 (file)
@@ -1170,7 +1170,7 @@ end
 - - prefix <string> cache prefix (default = nil)
 - - ignore_redirected <bool> (default = false)
 - - need_images <bool> (default = false)
-- - need_content <bool> (default = false)
+- - need_content <bool> (default = nil, uses global include_content_urls config which defaults to true)
 -- }
 -- Apply heuristic in extracting of urls from task, this function
 -- tries its best to extract specific number of urls from a task based on
@@ -1183,7 +1183,7 @@ exports.extract_specific_urls = function(params_or_task, lim, need_emails, filte
     esld_limit = 9999,
     need_emails = false,
     need_images = false,
-    need_content = false,
+    need_content = nil, -- nil means use global include_content_urls config (default: true)
     filter = nil,
     prefix = nil,
     ignore_ip = false,
@@ -1227,10 +1227,11 @@ exports.extract_specific_urls = function(params_or_task, lim, need_emails, filte
       if params.flags then
         cache_key_suffix = table.concat(params.flags) .. (params.flags_mode or '')
       else
+        -- Use tostring directly to distinguish nil (config default) from false (explicit exclude)
         cache_key_suffix = string.format('%s%s%s',
           tostring(params.need_emails or false),
           tostring(params.need_images or false),
-          tostring(params.need_content or false))
+          tostring(params.need_content)) -- nil = config default, false = explicit exclude
       end
       cache_key = string.format('sp_urls_%d%s', params.limit, cache_key_suffix)
     end
index 8794004be79de661ce9ccca9dba51c49e2d38d75..ed0c1f1670c79fffd6eb5eec740bbc2cd53f4813 100644 (file)
@@ -377,6 +377,7 @@ struct rspamd_config {
        gboolean enable_css_parser;                              /**< Enable css parsing in HTML                                                        */
        gboolean enable_mime_utf;                                /**< Enable utf8 mime parsing                                                  */
        gboolean enable_url_rewrite;                             /**< Enable HTML URL rewriting                                                 */
+       gboolean include_content_urls;                           /**< Include content URLs (from PDF etc) in API calls  */
 
        gboolean composites_inverted_index; /**< Use inverted index for composite lookup                        */
        gboolean composites_stats_always;   /**< Always collect composite stats (not sampled)           */
index 59f498bb2ac1d2ba68f420ef35e70c31a6c2268e..5045624490dc52ab0109d7608b1576e24d16856a 100644 (file)
@@ -2117,6 +2117,12 @@ rspamd_rcl_config_init(struct rspamd_config *cfg, GHashTable *skip_sections)
                                                                           G_STRUCT_OFFSET(struct rspamd_config, enable_url_rewrite),
                                                                           0,
                                                                           "Enable HTML URL rewriting");
+               rspamd_rcl_add_default_handler(sub,
+                                                                          "include_content_urls",
+                                                                          rspamd_rcl_parse_struct_boolean,
+                                                                          G_STRUCT_OFFSET(struct rspamd_config, include_content_urls),
+                                                                          0,
+                                                                          "Include URLs extracted from content (PDF, etc.) in URL API calls (default: true)");
                rspamd_rcl_add_default_handler(sub,
                                                                           "composites_inverted_index",
                                                                           rspamd_rcl_parse_struct_boolean,
index c772c1f57883a2753dd88642a1b6a9da9763ef36..fcfab6ae9cc4b855484416d82854eee631aaed46 100644 (file)
@@ -352,6 +352,7 @@ rspamd_config_new(enum rspamd_config_init_flags flags)
        cfg->enable_css_parser = true;
        cfg->enable_mime_utf = false;
        cfg->enable_url_rewrite = false;
+       cfg->include_content_urls = true; /* Include URLs from PDF/content by default */
        cfg->url_rewrite_lua_func = nullptr;
        cfg->composites_inverted_index = true; /* Enable inverted index by default */
        cfg->composites_stats_always = false;  /* Use probabilistic sampling by default */
index 5e6c6f37c74c6857a7de527cad1dae5c8d5375e5..90d64af9501612db77ac8bb38a0972db60e1da89 100644 (file)
@@ -3651,6 +3651,10 @@ rspamd_url_query_callback(struct rspamd_url *url, gsize start_offset,
 
        url->flags |= RSPAMD_URL_FLAG_QUERY;
 
+       /* For computed parts (e.g., PDF extracted text), also mark as content URL */
+       if (cbd->part && (cbd->part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED)) {
+               url->flags |= RSPAMD_URL_FLAG_CONTENT;
+       }
 
        if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false)) {
                if (cbd->part && cbd->part->mime_part->urls) {
@@ -3712,7 +3716,17 @@ rspamd_url_text_part_callback(struct rspamd_url *url, gsize start_offset,
                }
        }
 
-       url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
+       /*
+        * For computed/virtual parts (e.g., text extracted from PDF), use CONTENT flag
+        * instead of FROM_TEXT. These URLs may be clickable links in the original document
+        * rather than plain text URLs.
+        */
+       if (cbd->part->mime_part->flags & RSPAMD_MIME_PART_COMPUTED) {
+               url->flags |= RSPAMD_URL_FLAG_CONTENT;
+       }
+       else {
+               url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
+       }
 
        if (rspamd_url_set_add_or_increase(MESSAGE_FIELD(task, urls), url, false) &&
                cbd->part->mime_part->urls) {
index d47575aa35f4b0455c0379a180fe4e996e854e44..0043ba3f29caa18c2952a8510bba15d45460c7f5 100644 (file)
@@ -248,6 +248,8 @@ LUA_FUNCTION_DEF(task, append_message);
 /***
  * @method task:get_urls([need_emails|list_protos][, need_images])
  * Get all URLs found in a message. Telephone urls and emails are not included unless explicitly asked in `list_protos`
+ * Content URLs (extracted from PDF and other content types) are included by default unless
+ * `include_content_urls` global option is set to false.
  * @param {boolean} need_emails if `true` then return also email urls, this can be a comma separated string of protocols desired or a table (e.g. `mailto` or `telephone`)
  * @param {boolean} need_images return urls from images (<img src=...>) as well
  * @return {table rspamd_url} list of all urls found
@@ -2632,9 +2634,17 @@ lua_task_get_urls(lua_State *L)
                        return 1;
                }
 
-               /* Exclude RSPAMD_URL_FLAG_CONTENT to preserve backward compatibility */
+               /*
+                * By default, include content URLs if configured (default: true).
+                * Always exclude image URLs unless explicitly requested.
+                */
+               unsigned int default_flags = ~RSPAMD_URL_FLAG_IMAGE;
+               if (task->cfg && !task->cfg->include_content_urls) {
+                       default_flags &= ~RSPAMD_URL_FLAG_CONTENT;
+               }
+
                if (!lua_url_cbdata_fill(L, 2, &cb, default_protocols_mask,
-                                                                ~(RSPAMD_URL_FLAG_CONTENT | RSPAMD_URL_FLAG_IMAGE),
+                                                                default_flags,
                                                                 max_urls)) {
                        return luaL_error(L, "invalid arguments");
                }
@@ -3095,8 +3105,17 @@ lua_task_get_emails(lua_State *L)
                                max_urls = task->cfg->max_lua_urls;
                        }
 
+                       /*
+                        * By default, include content URLs if configured (default: true).
+                        * Always exclude image URLs unless explicitly requested.
+                        */
+                       unsigned int default_flags = ~RSPAMD_URL_FLAG_IMAGE;
+                       if (task->cfg && !task->cfg->include_content_urls) {
+                               default_flags &= ~RSPAMD_URL_FLAG_CONTENT;
+                       }
+
                        if (!lua_url_cbdata_fill(L, 2, &cb, PROTOCOL_MAILTO,
-                                                                        ~(RSPAMD_URL_FLAG_CONTENT | RSPAMD_URL_FLAG_IMAGE),
+                                                                        default_flags,
                                                                         max_urls)) {
                                return luaL_error(L, "invalid arguments");
                        }
index f23a6323734dde6be9a824775edd60d9fef20c44..b123630fafeb318b00d0ae70d222ed1a7b0ae9cb 100644 (file)
@@ -1311,9 +1311,8 @@ lua_url_cbdata_fill(lua_State *L,
                                                flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
                                        }
                                }
-                               else {
-                                       flags_mask &= ~RSPAMD_URL_FLAG_CONTENT;
-                               }
+                               /* If content is nil/not specified, keep the default_flags as-is
+                                * (which respects the include_content_urls config option) */
                                lua_pop(L, 1);
                        }