From: Vsevolod Stakhov <vsevolod@rspamd.com>
Date: Thu, 6 Nov 2025 13:46:50 +0000 (+0000)
Subject: [Rework] Move CTA processing into dedicated module
X-Git-Tag: 3.14.0~12^2~5
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c6d0ee62795e5ad7d63965e99455144b4f1f68f0;p=thirdparty%2Frspamd.git

[Rework] Move CTA processing into dedicated module

Also refactor the absurd logic it previously had
---

diff --git a/lualib/llm_search_context.lua b/lualib/llm_search_context.lua
index e75199b4fe..a691de523b 100644
--- a/lualib/llm_search_context.lua
+++ b/lualib/llm_search_context.lua
@@ -45,6 +45,7 @@ local rspamd_http = require "rspamd_http"
 local rspamd_logger = require "rspamd_logger"
 local lua_util = require "lua_util"
 local lua_cache = require "lua_cache"
+local lua_mime = require "lua_mime"
 local ucl = require "ucl"
 
 local DEFAULTS = {
@@ -77,11 +78,18 @@ local function extract_domains(task, max_domains, debug_module)
 
   -- First, try to get CTA URLs from HTML (most relevant for spam detection)
   -- Uses button weight and HTML structure analysis from C code
-  local cta_urls = task:get_cta_urls(max_domains * 2) or {}
-  lua_util.debugm(Np, task, "CTA analysis found %d URLs", #cta_urls)
+  local cta_urls = {}
+  local sel_part = lua_mime.get_displayed_text_part(task)
+  if sel_part then
+    cta_urls = sel_part:get_cta_urls(max_domains * 2) or {} -- same cap as the old task-level call; tolerate a nil result
+  end
+  lua_util.debugm(Np, task,
+      "CTA analysis found %d URLs", #cta_urls)
 
   for _, url in ipairs(cta_urls) do
-    if #domains >= max_domains then break end
+    if #domains >= max_domains then
+      break
+    end
 
     local host = url:get_host()
     if host and not skip_domains[host:lower()] and not seen[host] then
@@ -94,20 +102,22 @@ local function extract_domains(task, max_domains, debug_module)
   -- If we don't have enough domains from CTA, get more from content URLs
   if #domains < max_domains then
     lua_util.debugm(Np, task, "need more domains (%d/%d), extracting from content URLs",
-      #domains, max_domains)
+        #domains, max_domains)
 
     local urls = lua_util.extract_specific_urls({
       task = task,
       limit = max_domains * 3,
       esld_limit = max_domains,
-      need_content = true,      -- Content URLs (buttons, links in text)
+      need_content = true, -- Content URLs (buttons, links in text)
       need_images = false,
     }) or {}
 
     lua_util.debugm(Np, task, "extracted %d content URLs", #urls)
 
     for _, url in ipairs(urls) do
-      if #domains >= max_domains then break end
+      if #domains >= max_domains then
+        break
+      end
 
       local host = url:get_host()
       if host and not seen[host] and not skip_domains[host:lower()] then
@@ -121,7 +131,7 @@ local function extract_domains(task, max_domains, debug_module)
   -- Still need more? Get from any URLs
   if #domains < max_domains then
     lua_util.debugm(Np, task, "still need more domains (%d/%d), extracting from all URLs",
-      #domains, max_domains)
+        #domains, max_domains)
 
     local urls = lua_util.extract_specific_urls({
       task = task,
@@ -132,7 +142,9 @@ local function extract_domains(task, max_domains, debug_module)
     lua_util.debugm(Np, task, "extracted %d all URLs", #urls)
 
     for _, url in ipairs(urls) do
-      if #domains >= max_domains then break end
+      if #domains >= max_domains then
+        break
+      end
 
       local host = url:get_host()
       if host and not seen[host] and not skip_domains[host:lower()] then
@@ -176,7 +188,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
 
     if code ~= 200 then
       rspamd_logger.infox(task, "search API returned code %s for domain '%s', url: %s, body: %s",
-        code, domain, full_url, body and body:sub(1, 200) or 'nil')
+          code, domain, full_url, body and body:sub(1, 200) or 'nil')
       callback(nil, domain, string.format("HTTP %s", code))
       return
     end
@@ -188,7 +200,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
     local ok, parse_err = parser:parse_string(body)
     if not ok then
       rspamd_logger.errx(task, "%s: failed to parse search API response for %s: %s",
-        Np, domain, parse_err)
+          Np, domain, parse_err)
       callback(nil, domain, parse_err)
       return
     end
@@ -208,7 +220,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
         local metadata = flat_data[1]
 
         lua_util.debugm(Np, task, "parsing domain '%s': flat_data has %d elements, metadata type: %s",
-          domain, #flat_data, type(metadata))
+            domain, #flat_data, type(metadata))
 
         if metadata and metadata.items and type(metadata.items) == 'number' then
           -- metadata.items is a 0-indexed pointer, add 1 for Lua
@@ -217,7 +229,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
 
           if items and type(items) == 'table' then
             lua_util.debugm(Np, task, "found %d item indices for domain '%s', items_idx=%d",
-              #items, domain, items_idx)
+                #items, domain, items_idx)
 
             local count = 0
 
@@ -237,8 +249,8 @@ local function query_search_api(task, domain, opts, callback, debug_module)
                 local title = result_template.title and flat_data[result_template.title + 1]
 
                 lua_util.debugm(Np, task, "result %d template: link_idx=%s, snippet_idx=%s, title_idx=%s",
-                  count + 1, tostring(result_template.link), tostring(result_template.snippet),
-                  tostring(result_template.title))
+                    count + 1, tostring(result_template.link), tostring(result_template.snippet),
+                    tostring(result_template.title))
 
                 if link or title or snippet then
                   table.insert(search_results.results, {
@@ -248,16 +260,16 @@ local function query_search_api(task, domain, opts, callback, debug_module)
                   })
                   count = count + 1
                   lua_util.debugm(Np, task, "extracted result %d: title='%s', snippet_len=%d",
-                    count, title or "nil", snippet and #snippet or 0)
+                      count, title or "nil", snippet and #snippet or 0)
                 end
               else
                 lua_util.debugm(Np, task, "result_template at idx %d is not a table: %s",
-                  result_template_idx, type(result_template))
+                    result_template_idx, type(result_template))
               end
             end
           else
             lua_util.debugm(Np, task, "items is not a table for domain '%s', type: %s",
-              domain, type(items))
+                domain, type(items))
           end
         else
           lua_util.debugm(Np, task, "no valid metadata.items for domain '%s'", domain)
@@ -266,7 +278,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
     end
 
     lua_util.debugm(Np, task, "extracted %d search results for domain '%s'",
-      #search_results.results, domain)
+        #search_results.results, domain)
     callback(search_results, domain, nil)
   end
 
@@ -342,7 +354,7 @@ function M.fetch_and_format(task, redis_params, opts, callback, debug_module)
   end
 
   lua_util.debugm(Np, task, "final domain list (%d domains) for search: %s",
-    #domains, table.concat(domains, ", "))
+      #domains, table.concat(domains, ", "))
 
   -- Create cache context
   local cache_ctx = nil
@@ -378,7 +390,7 @@ function M.fetch_and_format(task, redis_params, opts, callback, debug_module)
       else
         local context_snippet = format_search_results(all_results, opts)
         lua_util.debugm(Np, task, "search context formatted (%s bytes)",
-          context_snippet and #context_snippet or 0)
+            context_snippet and #context_snippet or 0)
         callback(task, true, context_snippet)
       end
     end
@@ -391,29 +403,29 @@ function M.fetch_and_format(task, redis_params, opts, callback, debug_module)
     if cache_ctx then
       -- Use lua_cache for caching
       lua_cache.cache_get(task, cache_key, cache_ctx, opts.timeout,
-        function()
-          -- Cache miss - query API
-          query_search_api(task, domain, opts, function(api_results, d, api_err)
-            if api_results then
-              lua_cache.cache_set(task, cache_key, api_results, cache_ctx)
-              domain_complete(d, api_results)
-            else
-              lua_util.debugm(Np, task, "search failed for domain %s: %s", d, api_err)
-              domain_complete(d, nil)
+          function()
+            -- Cache miss - query API
+            query_search_api(task, domain, opts, function(api_results, d, api_err)
+              if api_results then
+                lua_cache.cache_set(task, cache_key, api_results, cache_ctx)
+                domain_complete(d, api_results)
+              else
+                lua_util.debugm(Np, task, "search failed for domain %s: %s", d, api_err)
+                domain_complete(d, nil)
+              end
+            end, Np)
+          end,
+          function(_, err, data)
+            -- Cache hit or after miss callback
+            if data and type(data) == 'table' then
+              lua_util.debugm(Np, task, "cache hit for domain %s", domain)
+              domain_complete(domain, data)
+              -- If no data and no error, the miss callback was already invoked
+            elseif err then
+              lua_util.debugm(Np, task, "cache error for domain %s: %s", domain, err)
+              domain_complete(domain, nil)
             end
-          end, Np)
-        end,
-        function(_, err, data)
-          -- Cache hit or after miss callback
-          if data and type(data) == 'table' then
-            lua_util.debugm(Np, task, "cache hit for domain %s", domain)
-            domain_complete(domain, data)
-          -- If no data and no error, the miss callback was already invoked
-          elseif err then
-            lua_util.debugm(Np, task, "cache error for domain %s: %s", domain, err)
-            domain_complete(domain, nil)
-          end
-        end)
+          end)
     else
       -- No Redis, query directly
       query_search_api(task, domain, opts, function(api_results, d, api_err)
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 84ea137113..910fe2082e 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -59,6 +59,9 @@ static const char gtube_pattern_no_action[] = "AJS*C4JDBQADN1.NSBN3*2IDNEN*"
 struct rspamd_multipattern *gtube_matcher = NULL;
 static const uint64_t words_hash_seed = 0xdeadbabe;
 
+/* CTA URL configuration */
+#define MAX_CTA_URLS_PER_PART 25
+
 static void
 free_byte_array_callback(void *pointer)
 {
@@ -127,16 +130,16 @@ rspamd_mime_part_extract_words(struct rspamd_task *task,
 				*avg_len_p += total_len;
 			}
 
-		short_len_p = rspamd_mempool_get_variable(task->task_pool,
-												  RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
+			short_len_p = rspamd_mempool_get_variable(task->task_pool,
+													  RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
 
-		if (short_len_p == NULL) {
-			short_len_p = rspamd_mempool_alloc(task->task_pool,
-											   sizeof(double));
-			*short_len_p = short_len;
-			rspamd_mempool_set_variable(task->task_pool,
-										RSPAMD_MEMPOOL_SHORT_WORDS_CNT, short_len_p, NULL);
-		}
+			if (short_len_p == NULL) {
+				short_len_p = rspamd_mempool_alloc(task->task_pool,
+												   sizeof(double));
+				*short_len_p = short_len;
+				rspamd_mempool_set_variable(task->task_pool,
+											RSPAMD_MEMPOOL_SHORT_WORDS_CNT, short_len_p, NULL);
+			}
 			else {
 				*short_len_p += short_len;
 			}
@@ -795,6 +798,11 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
 	/* Wire aggregated HTML features */
 	text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html);
 
+	/* Collect top CTA URLs for this HTML part */
+	if (text_part->html && text_part->mime_part && text_part->mime_part->urls) {
+		rspamd_html_process_cta_urls(text_part, task, MAX_CTA_URLS_PER_PART);
+	}
+
 	/* Optionally call CTA/affiliation Lua hook with capped candidates */
 	if (task->cfg && task->cfg->lua_state) {
 		lua_State *L = task->cfg->lua_state;
@@ -944,55 +952,6 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
 
 			lua_settop(L, old_top);
 		}
-
-		/* Store top CTA URLs for LLM and other use cases */
-		if (text_part->html && text_part->mime_part && text_part->mime_part->urls) {
-			/* Simple approach: just store URLs sorted by button weight */
-			/* Use task-wide array to aggregate across all HTML parts */
-			GPtrArray *cta_urls = rspamd_mempool_get_variable(task->task_pool, "html_cta_urls");
-			if (!cta_urls) {
-				cta_urls = g_ptr_array_new();
-				rspamd_mempool_add_destructor(task->task_pool,
-											  (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
-											  cta_urls);
-				rspamd_mempool_set_variable(task->task_pool, "html_cta_urls", cta_urls, NULL);
-			}
-
-			/* Find best URLs by button weight in this HTML part */
-			float best_weights[5] = {0.0, 0.0, 0.0, 0.0, 0.0};
-			struct rspamd_url *best_urls[5] = {NULL, NULL, NULL, NULL, NULL};
-			unsigned int max_cta_per_part = 5;
-
-			for (unsigned int i = 0; i < text_part->mime_part->urls->len; i++) {
-				struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, i);
-				if (!u) continue;
-				if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
-				if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
-
-				float weight = rspamd_html_url_button_weight(text_part->html, u);
-
-				/* Insert into best list if weight is high enough */
-				for (unsigned int j = 0; j < max_cta_per_part; j++) {
-					if (weight > best_weights[j]) {
-						/* Shift lower entries down */
-						for (unsigned int k = max_cta_per_part - 1; k > j; k--) {
-							best_weights[k] = best_weights[k - 1];
-							best_urls[k] = best_urls[k - 1];
-						}
-						best_weights[j] = weight;
-						best_urls[j] = u;
-						break;
-					}
-				}
-			}
-
-			/* Add to task-wide array */
-			for (unsigned int i = 0; i < max_cta_per_part; i++) {
-				if (best_urls[i] && best_weights[i] > 0.0) {
-					g_ptr_array_add(cta_urls, best_urls[i]);
-				}
-			}
-		}
 	}
 	rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
 
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 83f36ff192..dc9987d01f 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -16,6 +16,7 @@
 #include "libserver/url.h"
 #include "libutil/ref.h"
 #include "libutil/str_util.h"
+#include "libutil/heap.h"
 #include "libserver/word.h"
 
 #include <unicode/uchar.h>
@@ -126,6 +127,15 @@ struct rspamd_mime_part {
 #define IS_TEXT_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML)
 #define IS_TEXT_PART_ATTACHMENT(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_ATTACHMENT)
 
+/* CTA (call-to-action) URL heap entry structure */
+struct rspamd_html_cta_entry {
+	unsigned int pri;       /* Priority for heap (weight * scale) */
+	unsigned int idx;       /* Heap index (managed by heap) */
+	struct rspamd_url *url; /* URL pointer */
+	float weight;           /* Original button weight */
+};
+
+RSPAMD_HEAP_DECLARE(rspamd_html_heap_storage, struct rspamd_html_cta_entry);
 
 struct rspamd_mime_text_part {
 	const char *language;
@@ -148,7 +158,9 @@ struct rspamd_mime_text_part {
 	void *html;
 	/* Optional HTML features collected during parsing */
 	struct rspamd_html_features *html_features;
-	GList *exceptions; /**< list of offsets of urls						*/
+	/* CTA (call-to-action) URLs extracted from HTML with weights */
+	rspamd_html_heap_storage_t *cta_urls; /**< cta_heap_t* for HTML parts, NULL for plain text */
+	GList *exceptions;                    /**< list of offsets of urls						*/
 	struct rspamd_mime_part *mime_part;
 
 	unsigned int flags;
diff --git a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt
index 721e09a65c..24deff707a 100644
--- a/src/libserver/CMakeLists.txt
+++ b/src/libserver/CMakeLists.txt
@@ -41,6 +41,7 @@ SET(LIBRSPAMDSERVERSRC
         ${CMAKE_CURRENT_SOURCE_DIR}/maps/map_helpers.c
         ${CMAKE_CURRENT_SOURCE_DIR}/html/html_entities.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url.cxx
+        ${CMAKE_CURRENT_SOURCE_DIR}/html/html_cta.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/html/html.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite.cxx
         ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite_c.cxx
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 1e982236d1..e66ba35652 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -29,6 +29,7 @@
 #include "contrib/libucl/khash.h"
 #include "libmime/images.h"
 #include "libutil/cxx/utf8_util.h"
+#include "libserver/html/html_cta.hxx"
 
 #include "html_tag_defs.hxx"
 #include "html_entities.hxx"
@@ -40,6 +41,8 @@
 #include "contrib/fmt/include/fmt/core.h"
 
 #include <functional>
+#include <algorithm>
+#include <string>
 #include <unicode/uversion.h>
 
 namespace rspamd::html {
@@ -834,6 +837,7 @@ static const auto component_extractors = frozen::make_unordered_map<frozen::stri
 		 }},
 	});
 
+
 auto html_tag::find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>
 {
 	auto it = component_extractors.find(attr_name);
@@ -890,7 +894,7 @@ enum tag_parser_state {
 struct tag_content_parser_state {
 	tag_parser_state cur_state = parse_start;
 	std::string buf;
-	std::string attr_name;// Store current attribute name
+	std::string attr_name;            // Store current attribute name
 	const char *value_start = nullptr;// Track where attribute value starts in input
 	const char *html_start = nullptr; // Base pointer to HTML buffer start
 
@@ -2406,26 +2410,6 @@ auto html_process_input(struct rspamd_task *task,
 					if (cnt > hc->features.links.max_links_single_domain) {
 						hc->features.links.max_links_single_domain = cnt;
 					}
-					/* Heuristic button weight */
-					float w = 0.0f;
-					if (url->ext && url->ext->linked_url && url->ext->linked_url != url) {
-						w += 0.5f; /* display mismatch bonus */
-					}
-					w += 0.2f * (url->order == 0 ? 1.0f : 1.0f / (float) url->order);
-					if (cur_tag->block && cur_tag->block->is_visible()) {
-						if (cur_tag->block->has_display()) {
-							w += 0.1f;
-						}
-						if (cur_tag->block->width > 0 && cur_tag->block->height > 0) {
-							w += std::min(0.2f, (cur_tag->block->width * cur_tag->block->height) / 100000.0f);
-						}
-						if (cur_tag->block->font_size >= 14) {
-							w += 0.1f;
-						}
-					}
-					if (w > 0) {
-						hc->url_button_weights[url] += w;
-					}
 					/* same eTLD+1 as first-party? */
 					if (!hc->first_party_etld1.empty()) {
 						rspamd_ftok_t tld2;
@@ -3180,6 +3164,8 @@ auto html_process_input(struct rspamd_task *task,
 		}
 	}
 
+	html_compute_cta_weights(*hc);
+
 	return hc;
 }
 
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 368a22b08c..f256aae9dc 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -140,6 +140,17 @@ float rspamd_html_url_button_weight(void *html_content, struct rspamd_url *u);
  */
 const struct rspamd_html_features *rspamd_html_get_features(void *html_content);
 
+/**
+ * Creates CTA (call-to-action) URLs heap for a text part
+ * Collects top-K URLs by button weight using min-heap (O(n log k))
+ * @param text_part text part to fill cta_urls for
+ * @param task task for mempool allocation
+ * @param max_cta maximum number of CTA URLs to collect
+ */
+void rspamd_html_process_cta_urls(struct rspamd_mime_text_part *text_part,
+								  struct rspamd_task *task,
+								  unsigned int max_cta);
+
 
 #ifdef __cplusplus
 }
diff --git a/src/libserver/html/html_cta.cxx b/src/libserver/html/html_cta.cxx
new file mode 100644
index 0000000000..8646b8b50a
--- /dev/null
+++ b/src/libserver/html/html_cta.cxx
@@ -0,0 +1,570 @@
+/*-
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "libserver/html/html_cta.hxx"
+
+#include "util.h"
+#include "message.h"
+#include "libserver/html/html.hxx"
+#include "libserver/html/html_block.hxx"
+#include "libserver/html/html_tag.hxx"
+#include "libserver/css/css.hxx"
+#include "libserver/url.h"
+#include "libserver/task.h"
+#include "libutil/cxx/util.hxx"
+#include "libutil/heap.h"
+
+#include <algorithm>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <variant>
+
+#include <glib.h>
+
+static constexpr unsigned int CTA_WEIGHT_SCALE = 1000;
+
+namespace rspamd::html {
+namespace {
+
+using namespace std::string_view_literals;
+
+/* Returns `input` with leading and trailing ASCII whitespace removed */
+static auto trim_ascii(std::string_view input) -> std::string_view
+{
+	const auto is_space = [](char c) -> bool { return g_ascii_isspace(static_cast<gchar>(c)); };
+	std::size_t first = 0;
+	std::size_t last = input.size();
+
+	for (; first < last && is_space(input[first]); first++) {}
+	for (; last > first && is_space(input[last - 1]); last--) {}
+
+	return input.substr(first, last - first);
+}
+
+/* True when some whitespace-delimited chunk of `attr` equals `token`, or, with
+ * `allow_partial`, contains it as a substring. A blank `attr` never matches. */
+static auto space_separated_token_match(std::string_view attr,
+										std::string_view token,
+										bool allow_partial) -> bool
+{
+	const auto is_space = [](char c) -> bool {
+		return g_ascii_isspace(static_cast<gchar>(c));
+	};
+	const auto chunk_matches = [&](std::string_view chunk) -> bool {
+		if (allow_partial) {
+			return chunk.find(token) != std::string_view::npos;
+		}
+		return chunk == token;
+	};
+
+	const auto len = attr.size();
+	std::size_t cursor = 0;
+
+	for (;;) {
+		/* Skip the separator run in front of the next chunk */
+		while (cursor < len && is_space(attr[cursor])) {
+			cursor++;
+		}
+		if (cursor >= len) {
+			return false;
+		}
+
+		/* The chunk extends up to the next whitespace or the end of input */
+		auto chunk_end = cursor;
+		while (chunk_end < len && !is_space(attr[chunk_end])) {
+			chunk_end++;
+		}
+
+		if (chunk_matches(attr.substr(cursor, chunk_end - cursor))) {
+			return true;
+		}
+		cursor = chunk_end;
+	}
+}
+
+/* Token match against an optional attribute value. An attribute that is
+ * absent never matches; otherwise this defers to
+ * space_separated_token_match(). */
+static auto optional_attr_contains(const std::optional<std::string_view> &attr,
+								   std::string_view token,
+								   bool allow_partial = false) -> bool
+{
+	return attr.has_value() &&
+		   space_separated_token_match(*attr, token, allow_partial);
+}
+
+/* Like optional_attr_contains(), but succeeds if any token of `tokens` matches */
+template<typename Range>
+static auto optional_attr_contains_any(const std::optional<std::string_view> &attr,
+									   const Range &tokens,
+									   bool allow_partial = false) -> bool
+{
+	if (!attr.has_value()) {
+		return false;
+	}
+
+	const std::string_view haystack = *attr;
+
+	return std::any_of(std::begin(tokens), std::end(tokens),
+					   [&](std::string_view candidate) {
+						   return space_separated_token_match(haystack, candidate, allow_partial);
+					   });
+}
+
+/* Copies `input`, lowercasing ASCII letters only; other bytes pass through */
+static auto to_lower_ascii(std::string_view input) -> std::string
+{
+	std::string lowered(input.size(), '\0');
+	std::transform(input.begin(), input.end(), lowered.begin(), [](char ch) {
+		return static_cast<char>(g_ascii_tolower(static_cast<guchar>(ch)));
+	});
+	return lowered;
+}
+
+/* Picks the visible label of a tag: trimmed inner content first, then the
+ * `title`, `aria-label` and `alt` attributes; the first non-blank one wins.
+ * Returns an empty string when none of them carries text. */
+static auto get_cta_label(const html_tag &tag, const html_content &hc) -> std::string
+{
+	std::string_view label = trim_ascii(tag.get_content(&hc));
+
+	/* Fallback 1: the title attribute */
+	if (label.empty()) {
+		if (auto title = tag.find_component<html_component_title>()) {
+			label = trim_ascii(title.value()->value);
+		}
+	}
+
+	/* Fallback 2: the aria-label attribute */
+	if (label.empty()) {
+		if (auto aria_label = tag.find_component_by_name("aria-label"sv)) {
+			label = trim_ascii(aria_label.value());
+		}
+	}
+
+	/* Fallback 3: the alt attribute */
+	if (label.empty()) {
+		if (auto alt = tag.find_component<html_component_alt>()) {
+			label = trim_ascii(alt.value()->value);
+		}
+	}
+
+	return std::string{label};
+}
+
+/* True if `tag` or any ancestor has an invisible block or the FL_IGNORE flag */
+static auto tag_is_effectively_hidden(const html_tag *tag) -> bool
+{
+	const html_tag *node = tag;
+	while (node != nullptr) {
+		const bool invisible = node->block && !node->block->is_visible();
+		if (invisible || (node->flags & FL_IGNORE)) {
+			return true;
+		}
+		node = node->parent;
+	}
+	return false;
+}
+
+static constexpr auto buttonish_class_tokens = rspamd::array_of<std::string_view>( /* class/id fragments of button-like anchors */
+	"btn", "button", "cta", "call-to-action", "submit", "primary",
+	"confirm", "action", "purchase", "buy", "signup", "sign-up", "apply");
+/* class/id fragments that add a penalty in compute_penalty() */
+static constexpr auto negative_context_tokens = rspamd::array_of<std::string_view>(
+	"logo", "footer", "header", "nav", "menu", "social",
+	"tracking", "pixel", "unsubscribe", "legal", "copyright");
+/* `rel` tokens that make is_service_link_tag() reject an <a> tag */
+static constexpr auto service_rel_tokens = rspamd::array_of<std::string_view>(
+	"alternate", "canonical", "dns-prefetch", "icon", "manifest",
+	"preconnect", "prefetch", "preload", "stylesheet");
+/* Lowercase label substrings that earn the keyword bonus in compute_text_bonus() */
+static constexpr auto cta_keywords = rspamd::array_of<std::string_view>(
+	"buy", "purchase", "order", "checkout", "pay", "confirm", "verify",
+	"update", "login", "log in", "sign in", "sign up", "signup", "register",
+	"download", "upgrade", "continue", "next", "open", "submit", "apply",
+	"approve", "activate", "subscribe");
+/* Returns true for tag/URL pairs that must never count as a CTA; compute_cta_weight() gives them weight 0 */
+static auto is_service_link_tag(const html_tag &tag, const rspamd_url &url) -> bool
+{
+	if (tag.flags & (FL_XML | FL_VIRTUAL | FL_COMMENT | FL_IGNORE | CM_HEAD)) {
+		return true;
+	}
+	/* These tag kinds are rejected outright, whatever their attributes are */
+	switch (tag.id) {
+	case Tag_LINK:
+	case Tag_SCRIPT:
+	case Tag_STYLE:
+	case Tag_META:
+	case Tag_BASE:
+	case Tag_IMG:
+		return true;
+	default:
+		break;
+	}
+	/* The tag's own block is not visible */
+	if (tag.block && !tag.block->is_visible()) {
+		return true;
+	}
+	/* The URL itself is flagged as an image URL */
+	if (url.flags & RSPAMD_URL_FLAG_IMAGE) {
+		return true;
+	}
+	/* Anchors: reject on an exact service `rel` token or a parent flagged CM_HEAD */
+	if (tag.id == Tag_A) {
+		if (optional_attr_contains_any(tag.find_rel(), service_rel_tokens, false)) {
+			return true;
+		}
+		if (tag.parent && (tag.parent->flags & CM_HEAD)) {
+			return true;
+		}
+	}
+
+	return false;
+}
+/* Base score derived from the tag kind and a few attributes; compute_cta_weight() treats a result <= 0 as "not a CTA" */
+static auto compute_semantic_base_score(const html_tag &tag, const rspamd_url &url) -> float
+{
+	switch (tag.id) {
+	case Tag_BUTTON:
+		return 0.9f;
+	case Tag_INPUT: {
+		float base = 0.35f; /* used when the type is absent or not listed below */
+		if (auto type_comp = tag.find_component<html_component_type>()) {
+			auto lowered = to_lower_ascii(trim_ascii(type_comp.value()->get_string_value()));
+			if (lowered == "submit" || lowered == "button" || lowered == "send") {
+				base = 0.85f;
+			}
+			else if (lowered == "image") {
+				base = 0.75f;
+			}
+			else if (lowered == "reset") {
+				base = 0.25f;
+			}
+		}
+		return base;
+	}
+	case Tag_FORM:
+		return 0.8f;
+	case Tag_A: {
+		float base = 0.35f; /* plain anchor without button-like hints */
+		if (optional_attr_contains_any(tag.find_class(), buttonish_class_tokens, true) || /* substring match on class */
+			optional_attr_contains_any(tag.find_id(), buttonish_class_tokens, true)) { /* ... or on id */
+			base = 0.75f;
+		}
+		if (auto role_comp = tag.find_component<html_component_role>()) {
+			auto lowered = to_lower_ascii(trim_ascii(role_comp.value()->value));
+			if (lowered == "button" || lowered == "tab" || lowered == "menuitem") {
+				base = std::max(base, 0.7f); /* never lowers a class/id based score */
+			}
+		}
+		if (url.protocol == PROTOCOL_MAILTO) {
+			base = std::min(base, 0.4f); /* mailto anchors are capped at 0.4 */
+		}
+		return base;
+	}
+	case Tag_AREA:
+		return 0.3f;
+	default:
+		if (tag.flags & FL_HREF) {
+			return 0.2f;
+		}
+		break;
+	}
+	/* Reached only from the default branch when FL_HREF is not set */
+	return 0.0f;
+}
+/* Bonus for visually prominent tags; 0 when the tag has no block or its block is not visible */
+static auto compute_visual_bonus(const html_tag &tag) -> float
+{
+	if (!tag.block || !tag.block->is_visible()) {
+		return 0.0f;
+	}
+	/* Three additive parts follow: display type, block area, font size */
+	float bonus = 0.0f;
+	const auto &block = *tag.block;
+	/* Part 1: display type */
+	switch (block.display) {
+	case css::css_display_value::DISPLAY_BLOCK:
+		bonus += 0.12f;
+		break;
+	case css::css_display_value::DISPLAY_TABLE_ROW:
+		bonus += 0.05f;
+		break;
+	default:
+		break;
+	}
+	/* Part 2: width * height tiers; presumably pixels — TODO confirm against html_block */
+	if (block.width > 0 && block.height > 0) {
+		const auto area = static_cast<int>(block.width) * static_cast<int>(block.height);
+		if (area >= 6000) {
+			bonus += 0.2f;
+		}
+		else if (area >= 2000) {
+			bonus += 0.12f;
+		}
+		else if (area >= 400) {
+			bonus += 0.06f;
+		}
+	}
+	/* Part 3: font size tiers */
+	if (block.font_size >= 16) {
+		bonus += 0.08f;
+	}
+	else if (block.font_size >= 13) {
+		bonus += 0.04f;
+	}
+
+	return bonus;
+}
+
+/* Bonus for labels that read like a call to action: +0.18 if any CTA keyword
+ * occurs, +0.03 for an exclamation mark, +0.04 for a 3..18 byte label.
+ * Callers pass an already lowercased label. */
+static auto compute_text_bonus(std::string_view text_lower) -> float
+{
+	if (text_lower.empty()) {
+		return 0.0f;
+	}
+
+	constexpr auto npos = std::string_view::npos;
+	const bool has_keyword =
+		std::any_of(std::begin(cta_keywords), std::end(cta_keywords), [&](std::string_view kw) {
+			return text_lower.find(kw) != npos;
+		});
+	const bool has_bang = text_lower.find('!') != npos;
+	const bool is_short = text_lower.size() >= 3 && text_lower.size() <= 18;
+
+	float bonus = 0.0f;
+	bonus += has_keyword ? 0.18f : 0.0f;
+	bonus += has_bang ? 0.03f : 0.0f;
+	bonus += is_short ? 0.04f : 0.0f;
+
+	return bonus;
+}
+
+/* Sums the deductions for a CTA candidate: missing, letter-less or overlong label,
+ * tiny/transparent block, small font, negative class/id, invisible URL, mailto/ftp. */
+static auto compute_penalty(const html_tag &tag,
+							const rspamd_url &url,
+							std::string_view text_lower,
+							std::string_view text_original) -> float
+{
+	float penalty = 0.0f;
+
+	if (text_lower.empty()) {
+		penalty += 0.35f;
+	}
+	else {
+		unsigned int alpha = 0;
+		unsigned int graph = 0;
+		for (auto ch: text_lower) {
+			if (g_ascii_isspace(static_cast<gchar>(ch))) {
+				continue;
+			}
+			graph++;
+			/* Bytes >= 0x80 are parts of multi-byte sequences (label assumed UTF-8):
+			 * count them as letters so non-Latin labels are not seen as symbol-only */
+			if (g_ascii_isalpha(static_cast<gchar>(ch)) || (static_cast<guchar>(ch) & 0x80)) {
+				alpha++;
+			}
+		}
+		if (graph > 0 && alpha == 0) {
+			penalty += 0.25f;
+		}
+		if (text_original.size() > 80) {
+			penalty += 0.1f;
+		}
+	}
+	if (tag.block) {
+		const auto &block = *tag.block;
+		if (block.width > 0 && block.height > 0) {
+			const auto area = static_cast<int>(block.width) * static_cast<int>(block.height);
+			if (area <= 64) {
+				penalty += 0.25f;
+			}
+			else if (area <= 150) {
+				penalty += 0.15f;
+			}
+		}
+		if (block.font_size > 0 && block.font_size <= 9) {
+			penalty += 0.08f;
+		}
+		if (block.is_transparent()) {
+			penalty += 0.2f;
+		}
+	}
+	if (optional_attr_contains_any(tag.find_class(), negative_context_tokens, true) ||
+		optional_attr_contains_any(tag.find_id(), negative_context_tokens, true)) {
+		penalty += 0.2f;
+	}
+	if (url.flags & RSPAMD_URL_FLAG_INVISIBLE) {
+		penalty += 0.3f;
+	}
+	if (url.protocol == PROTOCOL_MAILTO || url.protocol == PROTOCOL_FTP) {
+		penalty += 0.05f;
+	}
+
+	return penalty;
+}
+
+/* Final CTA weight of one tag/URL pair, clamped to [0, 1].
+ * Service links, effectively hidden tags and tags without a positive
+ * semantic base score get 0; otherwise the weight is
+ * base + visual + text + order/mismatch bonuses - penalties. */
+static auto compute_cta_weight(const html_tag &tag,
+							   const rspamd_url &url,
+							   const html_content &hc) -> float
+{
+	if (is_service_link_tag(tag, url) || tag_is_effectively_hidden(&tag)) {
+		return 0.0f;
+	}
+
+	const float base = compute_semantic_base_score(tag, url);
+	if (base <= 0.0f) {
+		return 0.0f;
+	}
+
+	/* Keep both the trimmed label and its lowercased copy: the text bonus and
+	 * the penalty look at the lowercased form, the length check at the original */
+	const auto label = get_cta_label(tag, hc);
+	const std::string_view label_view = trim_ascii(label);
+	const std::string lowered = to_lower_ascii(label_view);
+
+	/* Smaller url.order values score higher; order 0 gets the full 0.1 */
+	float order_bonus = 0.1f;
+	if (url.order != 0) {
+		order_bonus = std::max(0.0f, 0.06f / (1.0f + static_cast<float>(url.order)));
+	}
+
+	/* Extra credit when the URL is linked to a different URL object */
+	const bool has_other_linked_url =
+		url.ext && url.ext->linked_url && url.ext->linked_url != &url;
+	if (has_other_linked_url) {
+		order_bonus += 0.12f;
+	}
+
+	const float visual = compute_visual_bonus(tag);
+	const float text_bonus = compute_text_bonus(lowered);
+	const float penalty = compute_penalty(tag, url, lowered, label_view);
+
+	/* Sum strictly left to right, then clamp the result into [0, 1] */
+	const float weight = base + visual + text_bonus + order_bonus - penalty;
+
+	return std::clamp(weight, 0.0f, 1.0f);
+}
+
+}// namespace
+
+/* Rebuilds hc.url_button_weights from scratch: every tag whose `extra` holds
+ * a non-null URL is scored, and each URL keeps the highest positive weight
+ * seen among its tags. URLs that never score above 0 are left out of the map. */
+void html_compute_cta_weights(html_content &hc)
+{
+	auto &weights = hc.url_button_weights;
+	weights.clear();
+
+	for (const auto &tag_ptr: hc.all_tags) {
+		const auto *url_slot = std::get_if<rspamd_url *>(&tag_ptr->extra);
+		if (url_slot == nullptr || *url_slot == nullptr) {
+			continue;
+		}
+
+		auto *url = *url_slot;
+		const float weight = compute_cta_weight(*tag_ptr, *url, hc);
+		if (weight <= 0.0f) {
+			continue;
+		}
+
+		auto found = weights.find(url);
+		if (found != weights.end()) {
+			found->second = std::max(found->second, weight);
+		}
+		else {
+			weights.emplace(url, weight);
+		}
+	}
+}
+
+}// namespace rspamd::html
+
+extern "C" {
+
+/* Collect up to max_cta top-weighted HTTP(S) URLs of an HTML part into the
+ * pool-owned min-heap text_part->cta_urls (lowest kept weight sits at index 0). */
+void rspamd_html_process_cta_urls(struct rspamd_mime_text_part *text_part,
+								  struct rspamd_task *task,
+								  unsigned int max_cta)
+{
+	using namespace rspamd::html;
+
+	/* max_cta == 0 would make the replacement branch peek into an empty heap */
+	if (!text_part || !task || max_cta == 0 || !text_part->html ||
+		!text_part->mime_part || !text_part->mime_part->urls) {
+		return;
+	}
+	auto *part_urls = text_part->mime_part->urls;
+	unsigned int i;
+	rspamd_url *u;
+
+	auto *heap_ptr = rspamd_mempool_alloc_type(task->task_pool, rspamd_html_heap_storage_t);
+	rspamd_heap_init(rspamd_html_heap_storage, heap_ptr);
+	text_part->cta_urls = heap_ptr;
+	/* Heap storage is released by the task pool destructor registered here */
+	rspamd_mempool_add_destructor(task->task_pool, [](void *ptr) {
+                auto *h = static_cast<rspamd_html_heap_storage_t *>(ptr);
+                rspamd_heap_destroy(rspamd_html_heap_storage, h); }, heap_ptr);
+	PTR_ARRAY_FOREACH(part_urls, i, u)
+	{
+		if (!u) continue;
+		if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
+		if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
+		if (u->flags & RSPAMD_URL_FLAG_IMAGE) continue;
+
+		/* Use button_weight to filter CTA URLs vs technical URLs
+         * Technical tags like <link rel>, <script src> have weight=0
+         * Only actual content URLs (buttons, links) have weight > 0
+         */
+		float weight = rspamd_html_url_button_weight(text_part->html, u);
+		if (!(weight > 0.0)) continue;
+
+		if (rspamd_heap_size(rspamd_html_heap_storage, heap_ptr) >= max_cta) {
+			/* Heap is full: keep this URL only if it beats the current minimum */
+			auto *min = rspamd_heap_index(rspamd_html_heap_storage, heap_ptr, 0);
+			if (!(weight > min->weight)) continue;
+			rspamd_heap_pop(rspamd_html_heap_storage, heap_ptr);
+		}
+
+		/* Keep the scale positive: it is unsigned, so negating it would wrap around */
+		rspamd_html_cta_entry entry = {
+			.pri = static_cast<unsigned int>(weight * CTA_WEIGHT_SCALE),
+			.idx = 0,
+			.url = u,
+			.weight = weight};
+		rspamd_heap_push_safe(rspamd_html_heap_storage, heap_ptr, &entry, heap_error);
+	}
+
+	return;
+
+heap_error:
+	/* Re-init after destroy so the pool destructor above cannot free twice */
+	rspamd_heap_destroy(rspamd_html_heap_storage, heap_ptr);
+	rspamd_heap_init(rspamd_html_heap_storage, heap_ptr);
+	text_part->cta_urls = nullptr;
+}
+
+}// extern "C"
diff --git a/src/libserver/html/html_cta.hxx b/src/libserver/html/html_cta.hxx
new file mode 100644
index 0000000000..cd02714e55
--- /dev/null
+++ b/src/libserver/html/html_cta.hxx
@@ -0,0 +1,31 @@
+/*-
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_CTA_HXX
+#define RSPAMD_HTML_CTA_HXX
+
+namespace rspamd::html {
+
+struct html_content;
+
+/** Recompute CTA weights for all URLs present in the HTML document.
+ * @param hc HTML content to process; taken by mutable reference, nothing is returned
+ */
+void html_compute_cta_weights(html_content &hc);
+
+}// namespace rspamd::html
+
+#endif//RSPAMD_HTML_CTA_HXX
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index 941703d756..21b3f6bbe7 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -127,6 +127,13 @@ LUA_FUNCTION_DEF(textpart, get_lines_count);
  * @return {table} table of stats
  */
 LUA_FUNCTION_DEF(textpart, get_stats);
+/***
+ * @method text_part:get_cta_urls([max_urls])
+ * Get CTA (call-to-action) URLs from HTML part sorted by button weight
+ * @param {number} max_urls optional maximum number of URLs to return (0 or omitted: all)
+ * @return {table} array of URL objects sorted by importance (descending)
+ */
+LUA_FUNCTION_DEF(textpart, get_cta_urls);
 /***
  * @method mime_part:get_words_count()
  * Get words number in the part
@@ -254,6 +261,7 @@ static const struct luaL_reg textpartlib_m[] = {
 	LUA_INTERFACE_DEF(textpart, get_stats),
 	LUA_INTERFACE_DEF(textpart, get_fuzzy_hashes),
 	LUA_INTERFACE_DEF(textpart, get_html_fuzzy_hashes),
+	LUA_INTERFACE_DEF(textpart, get_cta_urls),
 	{"__tostring", rspamd_lua_class_tostring},
 	{NULL, NULL}};
 
@@ -1420,6 +1428,58 @@ lua_textpart_get_html_fuzzy_hashes(lua_State *L)
 	return 2;
 }
 
+/***
+ * @method text_part:get_cta_urls([max_urls])
+ * Get CTA (call-to-action) URLs from HTML part sorted by button weight
+ * @param {number} max_urls optional maximum number of URLs to return (omitted or <= 0: all)
+ * @return {table} array of URL objects sorted by importance (descending)
+ */
+static int
+lua_textpart_get_cta_urls(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_mime_text_part *part = lua_check_textpart(L);
+	unsigned int max_urls = 0;
+
+	if (part == NULL) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/* A non-positive limit is treated like a missing one instead of wrapping around */
+	if (lua_gettop(L) >= 2 && lua_isnumber(L, 2) && lua_tointeger(L, 2) > 0) {
+		max_urls = lua_tointeger(L, 2);
+	}
+
+	if (!part->cta_urls) {
+		lua_newtable(L);
+		return 1;
+	}
+
+	rspamd_html_heap_storage_t *heap = part->cta_urls;
+	unsigned int result_size = max_urls > 0 ? MIN(max_urls, heap->n) : heap->n;
+	lua_createtable(L, result_size, 0);
+
+	/* The heap array is only partially ordered, so reading it backwards is not a sort:
+	 * slot i goes to its rank in (weight descending, slot ascending) order instead */
+	for (unsigned int i = 0; i < heap->n; i++) {
+		unsigned int rank = 0;
+		for (unsigned int j = 0; j < heap->n; j++) {
+			if (heap->a[j].url != NULL && (heap->a[j].weight > heap->a[i].weight ||
+				(heap->a[j].weight == heap->a[i].weight && j < i))) {
+				rank++;
+			}
+		}
+		if (heap->a[i].url != NULL && rank < result_size) {
+			struct rspamd_lua_url *lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
+			rspamd_lua_setclass(L, rspamd_url_classname, -1);
+			lua_url->url = heap->a[i].url;
+			lua_rawseti(L, -2, rank + 1);
+		}
+	}
+
+	return 1;
+}
+
 static int
 lua_textpart_get_mimepart(lua_State *L)
 {
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index b7e42530c3..e10c7e089b 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -278,14 +278,6 @@ LUA_FUNCTION_DEF(task, get_urls);
  * @return {table rspamd_url} list of urls matching conditions
  */
 LUA_FUNCTION_DEF(task, get_urls_filtered);
-/***
- * @method task:get_cta_urls([max_urls])
- * Get call-to-action URLs from HTML content, prioritized by button weight
- * These are URLs that users are likely to click (buttons, prominent links, etc.)
- * @param {number} max_urls maximum number of URLs to return (default: all)
- * @return {table rspamd_url} list of CTA urls sorted by importance
- */
-LUA_FUNCTION_DEF(task, get_cta_urls);
 /***
  * @method task:has_urls([need_emails])
  * Returns 'true' if a task has urls listed
@@ -1333,7 +1325,6 @@ static const struct luaL_reg tasklib_m[] = {
 	LUA_INTERFACE_DEF(task, has_urls),
 	LUA_INTERFACE_DEF(task, get_urls),
 	LUA_INTERFACE_DEF(task, get_urls_filtered),
-	LUA_INTERFACE_DEF(task, get_cta_urls),
 	LUA_INTERFACE_DEF(task, inject_url),
 	LUA_INTERFACE_DEF(task, get_content),
 	LUA_INTERFACE_DEF(task, get_filename),
@@ -2733,61 +2724,6 @@ lua_task_get_urls_filtered(lua_State *L)
 	return 1;
 }
 
-static int
-lua_task_get_cta_urls(lua_State *L)
-{
-	LUA_TRACE_POINT;
-	struct rspamd_task *task = lua_check_task(L, 1);
-	GPtrArray *cta_urls;
-	unsigned int max_urls = 0;
-	unsigned int nret = 0;
-
-	if (task == NULL) {
-		return luaL_error(L, "invalid arguments, no task");
-	}
-
-	if (task->message == NULL) {
-		lua_newtable(L);
-		return 1;
-	}
-
-	/* Get optional max_urls parameter */
-	if (lua_gettop(L) >= 2 && lua_isnumber(L, 2)) {
-		max_urls = lua_tointeger(L, 2);
-	}
-
-	/* Retrieve CTA URLs from mempool */
-	cta_urls = rspamd_mempool_get_variable(task->task_pool, "html_cta_urls");
-
-	if (cta_urls == NULL || cta_urls->len == 0) {
-		lua_newtable(L);
-		return 1;
-	}
-
-	/* Create result table */
-	unsigned int result_size = max_urls > 0 ? MIN(max_urls, cta_urls->len) : cta_urls->len;
-	lua_createtable(L, result_size, 0);
-
-	/* Add URLs to result */
-	for (unsigned int i = 0; i < cta_urls->len; i++) {
-		struct rspamd_url *u = g_ptr_array_index(cta_urls, i);
-		if (u) {
-			struct rspamd_lua_url *lua_url;
-
-			lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
-			rspamd_lua_setclass(L, rspamd_url_classname, -1);
-			lua_url->url = u;
-			lua_rawseti(L, -2, ++nret);
-
-			if (max_urls > 0 && nret >= max_urls) {
-				break;
-			}
-		}
-	}
-
-	return 1;
-}
-
 static int
 lua_task_has_urls(lua_State *L)
 {
diff --git a/test/rspamd_cxx_unit.cxx b/test/rspamd_cxx_unit.cxx
index 3335f7dd02..e660edc867 100644
--- a/test/rspamd_cxx_unit.cxx
+++ b/test/rspamd_cxx_unit.cxx
@@ -27,6 +27,7 @@
 #include "rspamd_cxx_unit_cryptobox.hxx"
 #include "rspamd_cxx_unit_rfc2047.hxx"
 #include "rspamd_cxx_unit_html_url_rewrite.hxx"
+#include "rspamd_cxx_unit_html_cta.hxx"
 
 static gboolean verbose = false;
 static const GOptionEntry entries[] =
diff --git a/test/rspamd_cxx_unit_html_cta.hxx b/test/rspamd_cxx_unit_html_cta.hxx
new file mode 100644
index 0000000000..36ee8def27
--- /dev/null
+++ b/test/rspamd_cxx_unit_html_cta.hxx
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_RSPAMD_CXX_UNIT_HTML_CTA_HXX
+#define RSPAMD_RSPAMD_CXX_UNIT_HTML_CTA_HXX
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+#include "libutil/mem_pool.h"
+#include "libserver/html/html.hxx"
+#include "libserver/html/html.h"
+#include "libserver/url.h"
+
+#include <string>
+#include <string_view>
+
+using namespace rspamd::html;
+
+namespace {
+
+/* Test helper: parses an HTML snippet into a private mempool that is released with the fixture */
+struct html_fixture {
+	rspamd_mempool_t *pool = nullptr;
+	html_content *hc = nullptr;
+
+	explicit html_fixture(std::string_view html)
+	{
+		pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), nullptr, 0);
+		auto *input = g_byte_array_sized_new(html.size());
+		g_byte_array_append(input, reinterpret_cast<const guint8 *>(html.data()), html.size());
+		hc = html_content::from_ptr(rspamd_html_process_part(pool, input));
+		g_byte_array_free(input, TRUE);
+	}
+	html_fixture(const html_fixture &) = delete;// a copy would delete `pool` a second time
+	html_fixture &operator=(const html_fixture &) = delete;
+	~html_fixture()
+	{
+		if (pool != nullptr) {
+			rspamd_mempool_delete(pool);
+		}
+	}
+
+	/* Weight stored for the URL whose text equals `url`; 0 if there is none or nothing was parsed */
+	[[nodiscard]] auto weight_for(std::string_view url) const -> float
+	{
+		if (hc != nullptr) {
+			for (const auto &[u, weight]: hc->url_button_weights) {
+				if (u && u->string && std::string_view(u->string, u->urllen) == url) {
+					return weight;
+				}
+			}
+		}
+		return 0.0f;
+	}
+};
+
+}// namespace
+
+TEST_SUITE("html_cta_scoring")
+{
+	TEST_CASE("button-like anchors outrank technical resources")
+	{
+		/* A styled anchor, a stylesheet <link> and a mailto: anchor share one document */
+		html_fixture fx{R"HTML(
+		<html><body>
+		<a href="https://example.com/cta" class="btn primary">Click now</a>
+		<link rel="stylesheet" href="https://cdn.example.com/style.css" />
+		<a href="mailto:info@example.com">Email us</a>
+		</body></html>
+	)HTML"};
+
+		CHECK(fx.weight_for("https://example.com/cta") > 0.6f);
+		CHECK(fx.weight_for("https://cdn.example.com/style.css") == doctest::Approx(0.0f));
+		CHECK(fx.weight_for("mailto:info@example.com") < 0.3f);
+	}
+
+	TEST_CASE("footer links and hidden buttons are de-emphasised")
+	{
+		/* A visible button, a link inside a footer div and a button inside a display:none div */
+		html_fixture fx{R"HTML(
+		<html><body>
+		<a href="https://shop.example.com/buy" class="cta-button">BUY NOW!</a>
+		<div class="footer">
+			<a href="https://shop.example.com/privacy" class="footer-link">Privacy policy</a>
+		</div>
+		<div style="display:none">
+			<a href="https://shop.example.com/hidden" class="btn">Hidden CTA</a>
+		</div>
+		</body></html>
+	)HTML"};
+
+		CHECK(fx.weight_for("https://shop.example.com/buy") > 0.6f);
+		CHECK(fx.weight_for("https://shop.example.com/privacy") < 0.2f);
+		CHECK(fx.weight_for("https://shop.example.com/hidden") == doctest::Approx(0.0f));
+	}
+}
+
+#endif