[Rework] Move CTA processing into dedicated module

author Vsevolod Stakhov <vsevolod@rspamd.com>

Thu, 6 Nov 2025 13:46:50 +0000 (13:46 +0000)

committer Vsevolod Stakhov <vsevolod@rspamd.com>

Thu, 6 Nov 2025 13:46:50 +0000 (13:46 +0000)
author Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 6 Nov 2025 13:46:50 +0000 (13:46 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Thu, 6 Nov 2025 13:46:50 +0000 (13:46 +0000)
diff --git a/lualib/llm_search_context.lua b/lualib/llm_search_context.lua

index e75199b4fee79fcef63c28efa649a95163a04419..a691de523b7659f85b92d08bb37c3ce26f423719 100644 (file)
--- a/lualib/llm_search_context.lua
+++ b/lualib/llm_search_context.lua
@@ -45,6 +45,7 @@ local rspamd_http = require "rspamd_http"
  local rspamd_logger = require "rspamd_logger"
  local lua_util = require "lua_util"
  local lua_cache = require "lua_cache"
+local lua_mime = require "lua_mime"
  local ucl = require "ucl"
  
  local DEFAULTS = {
@@ -77,11 +78,18 @@ local function extract_domains(task, max_domains, debug_module)
  
    -- First, try to get CTA URLs from HTML (most relevant for spam detection)
    -- Uses button weight and HTML structure analysis from C code
-  local cta_urls = task:get_cta_urls(max_domains * 2) or {}
-  lua_util.debugm(Np, task, "CTA analysis found %d URLs", #cta_urls)
+  local cta_urls = {}
+  local sel_part = lua_mime.get_displayed_text_part(task)
+  if sel_part then
+    cta_urls = sel_part:get_cta_urls() or {} -- keep a table so '#cta_urls' and ipairs() below stay valid
+  end
+  lua_util.debugm(Np, task,
+      "CTA analysis found %d URLs", #cta_urls)
  
    for _, url in ipairs(cta_urls) do
-    if #domains >= max_domains then break end
+    if #domains >= max_domains then
+      break
+    end
  
      local host = url:get_host()
      if host and not skip_domains[host:lower()] and not seen[host] then
@@ -94,20 +102,22 @@ local function extract_domains(task, max_domains, debug_module)
    -- If we don't have enough domains from CTA, get more from content URLs
    if #domains < max_domains then
      lua_util.debugm(Np, task, "need more domains (%d/%d), extracting from content URLs",
-      #domains, max_domains)
+        #domains, max_domains)
  
      local urls = lua_util.extract_specific_urls({
        task = task,
        limit = max_domains * 3,
        esld_limit = max_domains,
-      need_content = true,      -- Content URLs (buttons, links in text)
+      need_content = true, -- Content URLs (buttons, links in text)
        need_images = false,
      }) or {}
  
      lua_util.debugm(Np, task, "extracted %d content URLs", #urls)
  
      for _, url in ipairs(urls) do
-      if #domains >= max_domains then break end
+      if #domains >= max_domains then
+        break
+      end
  
        local host = url:get_host()
        if host and not seen[host] and not skip_domains[host:lower()] then
@@ -121,7 +131,7 @@ local function extract_domains(task, max_domains, debug_module)
    -- Still need more? Get from any URLs
    if #domains < max_domains then
      lua_util.debugm(Np, task, "still need more domains (%d/%d), extracting from all URLs",
-      #domains, max_domains)
+        #domains, max_domains)
  
      local urls = lua_util.extract_specific_urls({
        task = task,
@@ -132,7 +142,9 @@ local function extract_domains(task, max_domains, debug_module)
      lua_util.debugm(Np, task, "extracted %d all URLs", #urls)
  
      for _, url in ipairs(urls) do
-      if #domains >= max_domains then break end
+      if #domains >= max_domains then
+        break
+      end
  
        local host = url:get_host()
        if host and not seen[host] and not skip_domains[host:lower()] then
@@ -176,7 +188,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
  
      if code ~= 200 then
        rspamd_logger.infox(task, "search API returned code %s for domain '%s', url: %s, body: %s",
-        code, domain, full_url, body and body:sub(1, 200) or 'nil')
+          code, domain, full_url, body and body:sub(1, 200) or 'nil')
        callback(nil, domain, string.format("HTTP %s", code))
        return
      end
@@ -188,7 +200,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
      local ok, parse_err = parser:parse_string(body)
      if not ok then
        rspamd_logger.errx(task, "%s: failed to parse search API response for %s: %s",
-        Np, domain, parse_err)
+          Np, domain, parse_err)
        callback(nil, domain, parse_err)
        return
      end
@@ -208,7 +220,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
          local metadata = flat_data[1]
  
          lua_util.debugm(Np, task, "parsing domain '%s': flat_data has %d elements, metadata type: %s",
-          domain, #flat_data, type(metadata))
+            domain, #flat_data, type(metadata))
  
          if metadata and metadata.items and type(metadata.items) == 'number' then
            -- metadata.items is a 0-indexed pointer, add 1 for Lua
@@ -217,7 +229,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
  
            if items and type(items) == 'table' then
              lua_util.debugm(Np, task, "found %d item indices for domain '%s', items_idx=%d",
-              #items, domain, items_idx)
+                #items, domain, items_idx)
  
              local count = 0
  
@@ -237,8 +249,8 @@ local function query_search_api(task, domain, opts, callback, debug_module)
                  local title = result_template.title and flat_data[result_template.title + 1]
  
                  lua_util.debugm(Np, task, "result %d template: link_idx=%s, snippet_idx=%s, title_idx=%s",
-                  count + 1, tostring(result_template.link), tostring(result_template.snippet),
-                  tostring(result_template.title))
+                    count + 1, tostring(result_template.link), tostring(result_template.snippet),
+                    tostring(result_template.title))
  
                  if link or title or snippet then
                    table.insert(search_results.results, {
@@ -248,16 +260,16 @@ local function query_search_api(task, domain, opts, callback, debug_module)
                    })
                    count = count + 1
                    lua_util.debugm(Np, task, "extracted result %d: title='%s', snippet_len=%d",
-                    count, title or "nil", snippet and #snippet or 0)
+                      count, title or "nil", snippet and #snippet or 0)
                  end
                else
                  lua_util.debugm(Np, task, "result_template at idx %d is not a table: %s",
-                  result_template_idx, type(result_template))
+                    result_template_idx, type(result_template))
                end
              end
            else
              lua_util.debugm(Np, task, "items is not a table for domain '%s', type: %s",
-              domain, type(items))
+                domain, type(items))
            end
          else
            lua_util.debugm(Np, task, "no valid metadata.items for domain '%s'", domain)
@@ -266,7 +278,7 @@ local function query_search_api(task, domain, opts, callback, debug_module)
      end
  
      lua_util.debugm(Np, task, "extracted %d search results for domain '%s'",
-      #search_results.results, domain)
+        #search_results.results, domain)
      callback(search_results, domain, nil)
    end
  
@@ -342,7 +354,7 @@ function M.fetch_and_format(task, redis_params, opts, callback, debug_module)
    end
  
    lua_util.debugm(Np, task, "final domain list (%d domains) for search: %s",
-    #domains, table.concat(domains, ", "))
+      #domains, table.concat(domains, ", "))
  
    -- Create cache context
    local cache_ctx = nil
@@ -378,7 +390,7 @@ function M.fetch_and_format(task, redis_params, opts, callback, debug_module)
        else
          local context_snippet = format_search_results(all_results, opts)
          lua_util.debugm(Np, task, "search context formatted (%s bytes)",
-          context_snippet and #context_snippet or 0)
+            context_snippet and #context_snippet or 0)
          callback(task, true, context_snippet)
        end
      end
@@ -391,29 +403,29 @@ function M.fetch_and_format(task, redis_params, opts, callback, debug_module)
      if cache_ctx then
        -- Use lua_cache for caching
        lua_cache.cache_get(task, cache_key, cache_ctx, opts.timeout,
-        function()
-          -- Cache miss - query API
-          query_search_api(task, domain, opts, function(api_results, d, api_err)
-            if api_results then
-              lua_cache.cache_set(task, cache_key, api_results, cache_ctx)
-              domain_complete(d, api_results)
-            else
-              lua_util.debugm(Np, task, "search failed for domain %s: %s", d, api_err)
-              domain_complete(d, nil)
+          function()
+            -- Cache miss - query API
+            query_search_api(task, domain, opts, function(api_results, d, api_err)
+              if api_results then
+                lua_cache.cache_set(task, cache_key, api_results, cache_ctx)
+                domain_complete(d, api_results)
+              else
+                lua_util.debugm(Np, task, "search failed for domain %s: %s", d, api_err)
+                domain_complete(d, nil)
+              end
+            end, Np)
+          end,
+          function(_, err, data)
+            -- Cache hit or after miss callback
+            if data and type(data) == 'table' then
+              lua_util.debugm(Np, task, "cache hit for domain %s", domain)
+              domain_complete(domain, data)
+              -- If no data and no error, the miss callback was already invoked
+            elseif err then
+              lua_util.debugm(Np, task, "cache error for domain %s: %s", domain, err)
+              domain_complete(domain, nil)
              end
-          end, Np)
-        end,
-        function(_, err, data)
-          -- Cache hit or after miss callback
-          if data and type(data) == 'table' then
-            lua_util.debugm(Np, task, "cache hit for domain %s", domain)
-            domain_complete(domain, data)
-          -- If no data and no error, the miss callback was already invoked
-          elseif err then
-            lua_util.debugm(Np, task, "cache error for domain %s: %s", domain, err)
-            domain_complete(domain, nil)
-          end
-        end)
+          end)
      else
        -- No Redis, query directly
        query_search_api(task, domain, opts, function(api_results, d, api_err)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index 84ea13711371aaea38023248c6b433ccc0fda6a3..910fe2082e472e3c27a24050261e4bf1153d3509 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -59,6 +59,9 @@ static const char gtube_pattern_no_action[] = "AJS*C4JDBQADN1.NSBN3*2IDNEN*"
  struct rspamd_multipattern *gtube_matcher = NULL;
  static const uint64_t words_hash_seed = 0xdeadbabe;
  
+/* CTA URL configuration */
+#define MAX_CTA_URLS_PER_PART 25
+
  static void
  free_byte_array_callback(void *pointer)
  {
@@ -127,16 +130,16 @@ rspamd_mime_part_extract_words(struct rspamd_task *task,
                                 *avg_len_p += total_len;
                         }
  
-               short_len_p = rspamd_mempool_get_variable(task->task_pool,
-                                                                                                 RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
+                       short_len_p = rspamd_mempool_get_variable(task->task_pool,
+                                                                                                         RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
  
-               if (short_len_p == NULL) {
-                       short_len_p = rspamd_mempool_alloc(task->task_pool,
-                                                                                          sizeof(double));
-                       *short_len_p = short_len;
-                       rspamd_mempool_set_variable(task->task_pool,
-                                                                               RSPAMD_MEMPOOL_SHORT_WORDS_CNT, short_len_p, NULL);
-               }
+                       if (short_len_p == NULL) {
+                               short_len_p = rspamd_mempool_alloc(task->task_pool,
+                                                                                                  sizeof(double));
+                               *short_len_p = short_len;
+                               rspamd_mempool_set_variable(task->task_pool,
+                                                                                       RSPAMD_MEMPOOL_SHORT_WORDS_CNT, short_len_p, NULL);
+                       }
                         else {
                                 *short_len_p += short_len;
                         }
@@ -795,6 +798,11 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
         /* Wire aggregated HTML features */
         text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html);
  
+       /* Collect top CTA URLs for this HTML part */
+       if (text_part->html && text_part->mime_part && text_part->mime_part->urls) {
+               rspamd_html_process_cta_urls(text_part, task, MAX_CTA_URLS_PER_PART);
+       }
+
         /* Optionally call CTA/affiliation Lua hook with capped candidates */
         if (task->cfg && task->cfg->lua_state) {
                 lua_State *L = task->cfg->lua_state;
@@ -944,55 +952,6 @@ rspamd_message_process_html_text_part(struct rspamd_task *task,
  
                         lua_settop(L, old_top);
                 }
-
-               /* Store top CTA URLs for LLM and other use cases */
-               if (text_part->html && text_part->mime_part && text_part->mime_part->urls) {
-                       /* Simple approach: just store URLs sorted by button weight */
-                       /* Use task-wide array to aggregate across all HTML parts */
-                       GPtrArray *cta_urls = rspamd_mempool_get_variable(task->task_pool, "html_cta_urls");
-                       if (!cta_urls) {
-                               cta_urls = g_ptr_array_new();
-                               rspamd_mempool_add_destructor(task->task_pool,
-                                                                                         (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
-                                                                                         cta_urls);
-                               rspamd_mempool_set_variable(task->task_pool, "html_cta_urls", cta_urls, NULL);
-                       }
-
-                       /* Find best URLs by button weight in this HTML part */
-                       float best_weights[5] = {0.0, 0.0, 0.0, 0.0, 0.0};
-                       struct rspamd_url *best_urls[5] = {NULL, NULL, NULL, NULL, NULL};
-                       unsigned int max_cta_per_part = 5;
-
-                       for (unsigned int i = 0; i < text_part->mime_part->urls->len; i++) {
-                               struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, i);
-                               if (!u) continue;
-                               if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
-                               if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
-
-                               float weight = rspamd_html_url_button_weight(text_part->html, u);
-
-                               /* Insert into best list if weight is high enough */
-                               for (unsigned int j = 0; j < max_cta_per_part; j++) {
-                                       if (weight > best_weights[j]) {
-                                               /* Shift lower entries down */
-                                               for (unsigned int k = max_cta_per_part - 1; k > j; k--) {
-                                                       best_weights[k] = best_weights[k - 1];
-                                                       best_urls[k] = best_urls[k - 1];
-                                               }
-                                               best_weights[j] = weight;
-                                               best_urls[j] = u;
-                                               break;
-                                       }
-                               }
-                       }
-
-                       /* Add to task-wide array */
-                       for (unsigned int i = 0; i < max_cta_per_part; i++) {
-                               if (best_urls[i] && best_weights[i] > 0.0) {
-                                       g_ptr_array_add(cta_urls, best_urls[i]);
-                               }
-                       }
-               }
         }
         rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
  
diff --git a/src/libmime/message.h b/src/libmime/message.h

index 83f36ff19203c485c8f4e3a23a211a751cae9b6f..dc9987d01fea9325a46f37eec04b02d4b8a32faf 100644 (file)
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -16,6 +16,7 @@
  #include "libserver/url.h"
  #include "libutil/ref.h"
  #include "libutil/str_util.h"
+#include "libutil/heap.h"
  #include "libserver/word.h"
  
  #include <unicode/uchar.h>
@@ -126,6 +127,15 @@ struct rspamd_mime_part {
  #define IS_TEXT_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML)
  #define IS_TEXT_PART_ATTACHMENT(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_ATTACHMENT)
  
+/* CTA (call-to-action) URL heap entry structure */
+struct rspamd_html_cta_entry {
+       unsigned int pri;       /* Priority for heap (weight * scale) */
+       unsigned int idx;       /* Heap index (managed by heap) */
+       struct rspamd_url *url; /* URL pointer */
+       float weight;           /* Original button weight */
+};
+
+RSPAMD_HEAP_DECLARE(rspamd_html_heap_storage, struct rspamd_html_cta_entry);
  
  struct rspamd_mime_text_part {
         const char *language;
@@ -148,7 +158,9 @@ struct rspamd_mime_text_part {
         void *html;
         /* Optional HTML features collected during parsing */
         struct rspamd_html_features *html_features;
-       GList *exceptions; /**< list of offsets of urls                                         */
+       /* CTA (call-to-action) URLs extracted from HTML with weights */
+       rspamd_html_heap_storage_t *cta_urls; /**< heap of struct rspamd_html_cta_entry for HTML parts, NULL for plain text */
+       GList *exceptions;                    /**< list of offsets of urls                                              */
         struct rspamd_mime_part *mime_part;
  
         unsigned int flags;
diff --git a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt

index 721e09a65c47bd54630847b95bbe848c5118e993..24deff707aff35b26e02f13f36a1eb67ec124a2f 100644 (file)
--- a/src/libserver/CMakeLists.txt
+++ b/src/libserver/CMakeLists.txt
@@ -41,6 +41,7 @@ SET(LIBRSPAMDSERVERSRC
          ${CMAKE_CURRENT_SOURCE_DIR}/maps/map_helpers.c
          ${CMAKE_CURRENT_SOURCE_DIR}/html/html_entities.cxx
          ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url.cxx
+        ${CMAKE_CURRENT_SOURCE_DIR}/html/html_cta.cxx
          ${CMAKE_CURRENT_SOURCE_DIR}/html/html.cxx
          ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite.cxx
          ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite_c.cxx
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index 1e982236d16a6780a86d36b3b764fac0988ae763..e66ba356522169aae6e51d9ce86ff6bf7d479251 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -29,6 +29,7 @@
  #include "contrib/libucl/khash.h"
  #include "libmime/images.h"
  #include "libutil/cxx/utf8_util.h"
+#include "libserver/html/html_cta.hxx"
  
  #include "html_tag_defs.hxx"
  #include "html_entities.hxx"
@@ -40,6 +41,8 @@
  #include "contrib/fmt/include/fmt/core.h"
  
  #include <functional>
+#include <algorithm>
+#include <string>
  #include <unicode/uversion.h>
  
  namespace rspamd::html {
@@ -834,6 +837,7 @@ static const auto component_extractors = frozen::make_unordered_map<frozen::stri
                  }},
         });
  
+
  auto html_tag::find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>
  {
         auto it = component_extractors.find(attr_name);
@@ -890,7 +894,7 @@ enum tag_parser_state {
  struct tag_content_parser_state {
         tag_parser_state cur_state = parse_start;
         std::string buf;
-       std::string attr_name;// Store current attribute name
+       std::string attr_name;            // Store current attribute name
         const char *value_start = nullptr;// Track where attribute value starts in input
         const char *html_start = nullptr; // Base pointer to HTML buffer start
  
@@ -2406,26 +2410,6 @@ auto html_process_input(struct rspamd_task *task,
                                         if (cnt > hc->features.links.max_links_single_domain) {
                                                 hc->features.links.max_links_single_domain = cnt;
                                         }
-                                       /* Heuristic button weight */
-                                       float w = 0.0f;
-                                       if (url->ext && url->ext->linked_url && url->ext->linked_url != url) {
-                                               w += 0.5f; /* display mismatch bonus */
-                                       }
-                                       w += 0.2f * (url->order == 0 ? 1.0f : 1.0f / (float) url->order);
-                                       if (cur_tag->block && cur_tag->block->is_visible()) {
-                                               if (cur_tag->block->has_display()) {
-                                                       w += 0.1f;
-                                               }
-                                               if (cur_tag->block->width > 0 && cur_tag->block->height > 0) {
-                                                       w += std::min(0.2f, (cur_tag->block->width * cur_tag->block->height) / 100000.0f);
-                                               }
-                                               if (cur_tag->block->font_size >= 14) {
-                                                       w += 0.1f;
-                                               }
-                                       }
-                                       if (w > 0) {
-                                               hc->url_button_weights[url] += w;
-                                       }
                                         /* same eTLD+1 as first-party? */
                                         if (!hc->first_party_etld1.empty()) {
                                                 rspamd_ftok_t tld2;
@@ -3180,6 +3164,8 @@ auto html_process_input(struct rspamd_task *task,
                 }
         }
  
+       html_compute_cta_weights(*hc);
+
         return hc;
  }
  
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h

index 368a22b08c97c32aa2080677a39998ae7791cbff..f256aae9dc800c2e673fb74a81a528bad9e6eaad 100644 (file)
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -140,6 +140,17 @@ float rspamd_html_url_button_weight(void *html_content, struct rspamd_url *u);
   */
  const struct rspamd_html_features *rspamd_html_get_features(void *html_content);
  
+/**
+ * Creates CTA (call-to-action) URLs heap for a text part
+ * Collects top-K URLs by button weight using min-heap (O(n log k))
+ * @param text_part text part to fill cta_urls for
+ * @param task task for mempool allocation
+ * @param max_cta maximum number of CTA URLs to collect
+ */
+void rspamd_html_process_cta_urls(struct rspamd_mime_text_part *text_part,
+                                                                 struct rspamd_task *task,
+                                                                 unsigned int max_cta);
+
  
  #ifdef __cplusplus
  }
diff --git a/src/libserver/html/html_cta.cxx b/src/libserver/html/html_cta.cxx

new file mode 100644 (file)

index 0000000..8646b8b
--- /dev/null
+++ b/src/libserver/html/html_cta.cxx
@@ -0,0 +1,570 @@
+/*-
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "libserver/html/html_cta.hxx"
+
+#include "util.h"
+#include "message.h"
+#include "libserver/html/html.hxx"
+#include "libserver/html/html_block.hxx"
+#include "libserver/html/html_tag.hxx"
+#include "libserver/css/css.hxx"
+#include "libserver/url.h"
+#include "libserver/task.h"
+#include "libutil/cxx/util.hxx"
+#include "libutil/heap.h"
+
+#include <algorithm>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <variant>
+
+#include <glib.h>
+
+static constexpr unsigned int CTA_WEIGHT_SCALE = 1000;
+
+namespace rspamd::html {
+namespace {
+
+using namespace std::string_view_literals;
+
+/*
+ * Returns `input` narrowed so that it neither starts nor ends with a
+ * character accepted by g_ascii_isspace(). Only the view is adjusted;
+ * the underlying characters are not copied or modified.
+ */
+static auto trim_ascii(std::string_view input) -> std::string_view
+{
+       /* Drop leading whitespace */
+       while (!input.empty() && g_ascii_isspace(static_cast<gchar>(input.front()))) {
+               input.remove_prefix(1);
+       }
+
+       /* Drop trailing whitespace */
+       while (!input.empty() && g_ascii_isspace(static_cast<gchar>(input.back()))) {
+               input.remove_suffix(1);
+       }
+
+       return input;
+}
+
+/*
+ * Splits `attr` into whitespace-separated chunks and checks each chunk
+ * against `token`.
+ *
+ * @param attr attribute value (e.g. a list of space-separated names)
+ * @param token token to look for
+ * @param allow_partial if true, a chunk matches when it contains `token`
+ *        as a substring; if false, the chunk must be equal to `token`
+ * @return true on the first matching chunk; false if none matches or if
+ *         `attr` is empty after trimming
+ *
+ * The comparison is byte-wise (std::string_view find / ==), so it is
+ * case-sensitive.
+ */
+static auto space_separated_token_match(std::string_view attr,
+                                                                               std::string_view token,
+                                                                               bool allow_partial) -> bool
+{
+       attr = trim_ascii(attr);
+       if (attr.empty()) {
+               return false;
+       }
+
+       std::size_t pos = 0;
+       while (pos < attr.size()) {
+               /* Skip the whitespace run before the next chunk */
+               while (pos < attr.size() && g_ascii_isspace(static_cast<gchar>(attr[pos]))) {
+                       pos++;
+               }
+               if (pos >= attr.size()) {
+                       break;
+               }
+
+               /* Find the end of the current chunk: [pos, end) */
+               auto end = pos;
+               while (end < attr.size() && !g_ascii_isspace(static_cast<gchar>(attr[end]))) {
+                       end++;
+               }
+
+               auto chunk = attr.substr(pos, end - pos);
+               if (allow_partial) {
+                       if (chunk.find(token) != std::string_view::npos) {
+                               return true;
+                       }
+               }
+               else {
+                       if (chunk == token) {
+                               return true;
+                       }
+               }
+
+               /* Step over the delimiter; if end == attr.size() this ends the outer loop */
+               pos = end + 1;
+       }
+
+       return false;
+}
+
+/*
+ * Optional-aware wrapper around space_separated_token_match():
+ * returns false when `attr` holds no value, otherwise forwards the
+ * contained string together with `token` and `allow_partial`.
+ */
+static auto optional_attr_contains(const std::optional<std::string_view> &attr,
+                                                                  std::string_view token,
+                                                                  bool allow_partial = false) -> bool
+{
+       if (!attr) {
+               return false;
+       }
+
+       return space_separated_token_match(attr.value(), token, allow_partial);
+}
+
+/*
+ * Returns true if at least one element of `tokens` matches `attr` according
+ * to space_separated_token_match(); false when `attr` holds no value or
+ * no token matches. `tokens` is any range iterable with a range-for whose
+ * elements can be passed as std::string_view.
+ */
+template<typename Range>
+static auto optional_attr_contains_any(const std::optional<std::string_view> &attr,
+                                                                          const Range &tokens,
+                                                                          bool allow_partial = false) -> bool
+{
+       if (!attr) {
+               return false;
+       }
+
+       /* Stop at the first token that matches */
+       for (auto token: tokens) {
+               if (space_separated_token_match(attr.value(), token, allow_partial)) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Returns a newly allocated std::string of the same length as `input`,
+ * with every character passed through g_ascii_tolower().
+ */
+static auto to_lower_ascii(std::string_view input) -> std::string
+{
+       std::string out;
+       out.reserve(input.size());
+       for (auto ch: input) {
+               out.push_back(static_cast<char>(g_ascii_tolower(static_cast<guchar>(ch))));
+       }
+       return out;
+}
+
+/*
+ * Picks a textual label for `tag`. Candidates are tried in this order and
+ * the first one that is non-empty after trim_ascii() is returned as a copy:
+ *   1. the tag content obtained via tag.get_content(&hc)
+ *   2. the title component
+ *   3. the "aria-label" attribute (looked up by name)
+ *   4. the alt component
+ * Returns an empty string when none of them yields text.
+ */
+static auto get_cta_label(const html_tag &tag, const html_content &hc) -> std::string
+{
+       auto content = trim_ascii(tag.get_content(&hc));
+       if (!content.empty()) {
+               return std::string{content};
+       }
+
+       if (auto title = tag.find_component<html_component_title>()) {
+               auto value = trim_ascii(title.value()->value);
+               if (!value.empty()) {
+                       return std::string{value};
+               }
+       }
+
+       if (auto aria_label = tag.find_component_by_name("aria-label"sv)) {
+               auto value = trim_ascii(aria_label.value());
+               if (!value.empty()) {
+                       return std::string{value};
+               }
+       }
+
+       if (auto alt = tag.find_component<html_component_alt>()) {
+               auto value = trim_ascii(alt.value()->value);
+               if (!value.empty()) {
+                       return std::string{value};
+               }
+       }
+
+       return {};
+}
+
+/*
+ * Walks from `tag` up the chain of `parent` pointers and returns true as
+ * soon as any tag on that path either has a block that reports
+ * !is_visible() or carries the FL_IGNORE flag. Returns false if the chain
+ * ends without such a tag (including when `tag` itself is nullptr).
+ */
+static auto tag_is_effectively_hidden(const html_tag *tag) -> bool
+{
+       for (auto current = tag; current != nullptr; current = current->parent) {
+               if (current->block && !current->block->is_visible()) {
+                       return true;
+               }
+               if (current->flags & FL_IGNORE) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Substrings that mark a class/id value as button-like; matched with
+ * allow_partial = true against <a> class and id in
+ * compute_semantic_base_score().
+ */
+static constexpr auto buttonish_class_tokens = rspamd::array_of<std::string_view>(
+       "btn", "button", "cta", "call-to-action", "submit", "primary",
+       "confirm", "action", "purchase", "buy", "signup", "sign-up", "apply");
+
+/*
+ * NOTE(review): not referenced in the visible part of this file; presumably
+ * names of page areas whose links are unlikely to be a CTA - confirm against
+ * the users further down.
+ */
+static constexpr auto negative_context_tokens = rspamd::array_of<std::string_view>(
+       "logo", "footer", "header", "nav", "menu", "social",
+       "tracking", "pixel", "unsubscribe", "legal", "copyright");
+
+/* rel values that make is_service_link_tag() treat an <a> as a service link (exact token match) */
+static constexpr auto service_rel_tokens = rspamd::array_of<std::string_view>(
+       "alternate", "canonical", "dns-prefetch", "icon", "manifest",
+       "preconnect", "prefetch", "preload", "stylesheet");
+
+/*
+ * NOTE(review): not referenced in the visible part of this file; presumably
+ * matched against link labels - confirm against the users further down.
+ * All entries are lower case and some contain spaces ("log in", "sign in").
+ */
+static constexpr auto cta_keywords = rspamd::array_of<std::string_view>(
+       "buy", "purchase", "order", "checkout", "pay", "confirm", "verify",
+       "update", "login", "log in", "sign in", "sign up", "signup", "register",
+       "download", "upgrade", "continue", "next", "open", "submit", "apply",
+       "approve", "activate", "subscribe");
+
+/*
+ * Returns true when the tag/URL pair is classified as a "service" link.
+ * Any of the following is sufficient:
+ *   - the tag has one of FL_XML, FL_VIRTUAL, FL_COMMENT, FL_IGNORE or CM_HEAD set
+ *   - the tag is LINK, SCRIPT, STYLE, META, BASE or IMG
+ *   - the tag has a block that is not visible
+ *   - the URL has RSPAMD_URL_FLAG_IMAGE set
+ *   - the tag is <a> and either its rel contains one of service_rel_tokens
+ *     (exact token match) or its direct parent has CM_HEAD set
+ * NOTE(review): callers are outside the visible part of this file; presumably
+ * such links are excluded from CTA scoring - confirm.
+ */
+static auto is_service_link_tag(const html_tag &tag, const rspamd_url &url) -> bool
+{
+       if (tag.flags & (FL_XML | FL_VIRTUAL | FL_COMMENT | FL_IGNORE | CM_HEAD)) {
+               return true;
+       }
+
+       switch (tag.id) {
+       case Tag_LINK:
+       case Tag_SCRIPT:
+       case Tag_STYLE:
+       case Tag_META:
+       case Tag_BASE:
+       case Tag_IMG:
+               return true;
+       default:
+               break;
+       }
+
+       /* Only the tag's own block is checked here, not its ancestors */
+       if (tag.block && !tag.block->is_visible()) {
+               return true;
+       }
+
+       if (url.flags & RSPAMD_URL_FLAG_IMAGE) {
+               return true;
+       }
+
+       if (tag.id == Tag_A) {
+               if (optional_attr_contains_any(tag.find_rel(), service_rel_tokens, false)) {
+                       return true;
+               }
+               if (tag.parent && (tag.parent->flags & CM_HEAD)) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+static auto compute_semantic_base_score(const html_tag &tag, const rspamd_url &url) -> float
+{
+       switch (tag.id) {
+       case Tag_BUTTON:
+               return 0.9f;
+       case Tag_INPUT: {
+               float base = 0.35f;
+               if (auto type_comp = tag.find_component<html_component_type>()) {
+                       auto lowered = to_lower_ascii(trim_ascii(type_comp.value()->get_string_value()));
+                       if (lowered == "submit" || lowered == "button" || lowered == "send") {
+                               base = 0.85f;
+                       }
+                       else if (lowered == "image") {
+                               base = 0.75f;
+                       }
+                       else if (lowered == "reset") {
+                               base = 0.25f;
+                       }
+               }
+               return base;
+       }
+       case Tag_FORM:
+               return 0.8f;
+       case Tag_A: {
+               float base = 0.35f;
+               if (optional_attr_contains_any(tag.find_class(), buttonish_class_tokens, true) ||
+                       optional_attr_contains_any(tag.find_id(), buttonish_class_tokens, true)) {
+                       base = 0.75f;
+               }
+               if (auto role_comp = tag.find_component<html_component_role>()) {
+                       auto lowered = to_lower_ascii(trim_ascii(role_comp.value()->value));
+                       if (lowered == "button" || lowered == "tab" || lowered == "menuitem") {
+                               base = std::max(base, 0.7f);
+                       }
+               }
+               if (url.protocol == PROTOCOL_MAILTO) {
+                       base = std::min(base, 0.4f);
+               }
+               return base;
+       }
+       case Tag_AREA:
+               return 0.3f;
+       default:
+               if (tag.flags & FL_HREF) {
+                       return 0.2f;
+               }
+               break;
+       }
+
+       return 0.0f;
+}
+
+static auto compute_visual_bonus(const html_tag &tag) -> float
+{
+       if (!tag.block || !tag.block->is_visible()) {
+               return 0.0f;
+       }
+
+       float bonus = 0.0f;
+       const auto &block = *tag.block;
+
+       switch (block.display) {
+       case css::css_display_value::DISPLAY_BLOCK:
+               bonus += 0.12f;
+               break;
+       case css::css_display_value::DISPLAY_TABLE_ROW:
+               bonus += 0.05f;
+               break;
+       default:
+               break;
+       }
+
+       if (block.width > 0 && block.height > 0) {
+               const auto area = static_cast<int>(block.width) * static_cast<int>(block.height);
+               if (area >= 6000) {
+                       bonus += 0.2f;
+               }
+               else if (area >= 2000) {
+                       bonus += 0.12f;
+               }
+               else if (area >= 400) {
+                       bonus += 0.06f;
+               }
+       }
+
+       if (block.font_size >= 16) {
+               bonus += 0.08f;
+       }
+       else if (block.font_size >= 13) {
+               bonus += 0.04f;
+       }
+
+       return bonus;
+}
+
+static auto compute_text_bonus(std::string_view text_lower) -> float
+{
+       if (text_lower.empty()) {
+               return 0.0f;
+       }
+
+       float bonus = 0.0f;
+       for (auto kw: cta_keywords) {
+               if (text_lower.find(kw) != std::string_view::npos) {
+                       bonus += 0.18f;
+                       break;
+               }
+       }
+
+       if (text_lower.find('!') != std::string_view::npos) {
+               bonus += 0.03f;
+       }
+
+       if (text_lower.size() <= 18 && text_lower.size() >= 3) {
+               bonus += 0.04f;
+       }
+
+       return bonus;
+}
+
+static auto compute_penalty(const html_tag &tag,
+                                                       const rspamd_url &url,
+                                                       std::string_view text_lower,
+                                                       std::string_view text_original) -> float
+{
+       float penalty = 0.0f;
+
+       if (text_lower.empty()) {
+               penalty += 0.35f;
+       }
+       else {
+               unsigned int alpha = 0;
+               unsigned int graph = 0;
+               for (auto ch: text_lower) {
+                       if (g_ascii_isspace(static_cast<gchar>(ch))) {
+                               continue;
+                       }
+                       graph++;
+                       if (g_ascii_isalpha(static_cast<gchar>(ch))) {
+                               alpha++;
+                       }
+               }
+               if (graph > 0 && alpha == 0) {
+                       penalty += 0.25f;
+               }
+               if (text_original.size() > 80) {
+                       penalty += 0.1f;
+               }
+       }
+
+       if (tag.block) {
+               const auto &block = *tag.block;
+               if (block.width > 0 && block.height > 0) {
+                       const auto area = static_cast<int>(block.width) * static_cast<int>(block.height);
+                       if (area <= 64) {
+                               penalty += 0.25f;
+                       }
+                       else if (area <= 150) {
+                               penalty += 0.15f;
+                       }
+               }
+               if (block.font_size > 0 && block.font_size <= 9) {
+                       penalty += 0.08f;
+               }
+               if (block.is_transparent()) {
+                       penalty += 0.2f;
+               }
+       }
+
+       if (optional_attr_contains_any(tag.find_class(), negative_context_tokens, true) ||
+               optional_attr_contains_any(tag.find_id(), negative_context_tokens, true)) {
+               penalty += 0.2f;
+       }
+
+       if (url.flags & RSPAMD_URL_FLAG_INVISIBLE) {
+               penalty += 0.3f;
+       }
+
+       if (url.protocol == PROTOCOL_MAILTO || url.protocol == PROTOCOL_FTP) {
+               penalty += 0.05f;
+       }
+
+       return penalty;
+}
+
+static auto compute_cta_weight(const html_tag &tag,
+                                                          const rspamd_url &url,
+                                                          const html_content &hc) -> float
+{
+       if (is_service_link_tag(tag, url)) {
+               return 0.0f;
+       }
+
+       if (tag_is_effectively_hidden(&tag)) {
+               return 0.0f;
+       }
+
+       float base = compute_semantic_base_score(tag, url);
+       if (base <= 0.0f) {
+               return 0.0f;
+       }
+
+       auto label = get_cta_label(tag, hc);
+       std::string_view label_view = trim_ascii(label);
+       std::string lowered = to_lower_ascii(label_view);
+
+       float visual = compute_visual_bonus(tag);
+       float text_bonus = compute_text_bonus(lowered);
+       float order_bonus = 0.0f;
+       if (url.order == 0) {
+               order_bonus = 0.1f;
+       }
+       else {
+               order_bonus = std::max(0.0f, 0.06f / (1.0f + static_cast<float>(url.order)));
+       }
+       if (url.ext && url.ext->linked_url && url.ext->linked_url != &url) {
+               order_bonus += 0.12f;
+       }
+       float penalty = compute_penalty(tag, url, lowered, label_view);
+
+       float weight = base + visual + text_bonus + order_bonus - penalty;
+       if (weight < 0.0f) {
+               weight = 0.0f;
+       }
+       else if (weight > 1.0f) {
+               weight = 1.0f;
+       }
+
+       return weight;
+}
+
+}// namespace
+
+void html_compute_cta_weights(html_content &hc)
+{
+       hc.url_button_weights.clear();
+
+       for (const auto &tag_ptr: hc.all_tags) {
+               const auto &tag = *tag_ptr;
+               if (!std::holds_alternative<rspamd_url *>(tag.extra)) {
+                       continue;
+               }
+
+               auto *url = std::get<rspamd_url *>(tag.extra);
+               if (!url) {
+                       continue;
+               }
+
+               float weight = compute_cta_weight(tag, *url, hc);
+               if (weight <= 0.0f) {
+                       continue;
+               }
+
+               auto it = hc.url_button_weights.find(url);
+               if (it == hc.url_button_weights.end()) {
+                       hc.url_button_weights.emplace(url, weight);
+               }
+               else {
+                       it->second = std::max(it->second, weight);
+               }
+       }
+}
+
+}// namespace rspamd::html
+
+extern "C" {
+
+void rspamd_html_process_cta_urls(struct rspamd_mime_text_part *text_part,
+                                                                 struct rspamd_task *task,
+                                                                 unsigned int max_cta)
+{
+       using namespace rspamd::html;
+
+       if (!text_part || !text_part->html || !text_part->mime_part || !text_part->mime_part->urls) {
+               return;
+       }
+       auto *part_urls = text_part->mime_part->urls;
+       unsigned int i;
+       rspamd_url *u;
+
+       auto *heap_ptr = rspamd_mempool_alloc_type(task->task_pool, rspamd_html_heap_storage_t);
+       rspamd_heap_init(rspamd_html_heap_storage, heap_ptr);
+       text_part->cta_urls = heap_ptr;
+       rspamd_mempool_add_destructor(task->task_pool, [](void *ptr) {
+                auto *h = static_cast<rspamd_html_heap_storage_t *>(ptr);
+                rspamd_heap_destroy(rspamd_html_heap_storage, h); }, heap_ptr);
+       PTR_ARRAY_FOREACH(part_urls, i, u)
+       {
+               if (!u) continue;
+               if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
+               if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
+               if (u->flags & RSPAMD_URL_FLAG_IMAGE) continue;
+
+               /* Use button_weight to filter CTA URLs vs technical URLs
+         * Technical tags like <link rel>, <script src> have weight=0
+         * Only actual content URLs (buttons, links) have weight > 0
+         */
+               float weight = rspamd_html_url_button_weight(text_part->html, u);
+
+               if (weight > 0.0) {
+                       if (rspamd_heap_size(rspamd_html_heap_storage, heap_ptr) < max_cta) {
+                               rspamd_html_cta_entry entry = {
+                                       .pri = static_cast<unsigned int>(weight * -CTA_WEIGHT_SCALE),
+                                       .idx = 0,
+                                       .url = u,
+                                       .weight = weight};
+                               rspamd_heap_push_safe(rspamd_html_heap_storage, heap_ptr, &entry, heap_error);
+                       }
+                       else {
+                               auto *min = rspamd_heap_index(rspamd_html_heap_storage, heap_ptr, 0);
+                               if (weight > min->weight) {
+                                       rspamd_heap_pop(rspamd_html_heap_storage, heap_ptr);
+                                       rspamd_html_cta_entry entry = {
+                                               .pri = static_cast<unsigned int>(weight * -CTA_WEIGHT_SCALE),
+                                               .idx = 0,
+                                               .url = u,
+                                               .weight = weight};
+                                       rspamd_heap_push_safe(rspamd_html_heap_storage, heap_ptr, &entry, heap_error);
+                               }
+                       }
+               }
+       }
+
+       return;
+
+heap_error:
+       rspamd_heap_destroy(rspamd_html_heap_storage, heap_ptr);
+       text_part->cta_urls = nullptr;
+}
+
+}// extern "C"
diff --git a/src/libserver/html/html_cta.hxx b/src/libserver/html/html_cta.hxx

new file mode 100644 (file)

index 0000000..cd02714
--- /dev/null
+++ b/src/libserver/html/html_cta.hxx
@@ -0,0 +1,31 @@
+/*-
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_CTA_HXX
+#define RSPAMD_HTML_CTA_HXX
+
+namespace rspamd::html {
+
+struct html_content;
+
+/**
+ * Recompute CTA weights for all URLs present in the HTML document.
+ *
+ * Clears hc.url_button_weights and refills it: every tag that carries a URL is
+ * scored, URLs with a positive weight are stored, and when several tags refer
+ * to the same URL the highest weight is kept.
+ *
+ * @param hc parsed HTML content whose url_button_weights map is rebuilt
+ */
+void html_compute_cta_weights(html_content &hc);
+
+}// namespace rspamd::html
+
+#endif//RSPAMD_HTML_CTA_HXX
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c

index 941703d756667914c4296747ca3353f506f0406c..21b3f6bbe7585dceb92fc6ecb6d3824d411dea46 100644 (file)
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -127,6 +127,13 @@ LUA_FUNCTION_DEF(textpart, get_lines_count);
   * @return {table} table of stats
   */
  LUA_FUNCTION_DEF(textpart, get_stats);
+/***
+ * @method text_part:get_cta_urls([max_urls])
+ * Get CTA (call-to-action) URLs from HTML part sorted by button weight
+ * @param {number} max_urls optional maximum number of URLs to return
+ * @return {table} array of URL objects sorted by importance (descending)
+ */
+LUA_FUNCTION_DEF(textpart, get_cta_urls);
  /***
   * @method mime_part:get_words_count()
   * Get words number in the part
@@ -254,6 +261,7 @@ static const struct luaL_reg textpartlib_m[] = {
         LUA_INTERFACE_DEF(textpart, get_stats),
         LUA_INTERFACE_DEF(textpart, get_fuzzy_hashes),
         LUA_INTERFACE_DEF(textpart, get_html_fuzzy_hashes),
+       LUA_INTERFACE_DEF(textpart, get_cta_urls),
         {"__tostring", rspamd_lua_class_tostring},
         {NULL, NULL}};
  
@@ -1420,6 +1428,58 @@ lua_textpart_get_html_fuzzy_hashes(lua_State *L)
         return 2;
  }
  
+/***
+ * @method text_part:get_cta_urls([max_urls])
+ * Get CTA (call-to-action) URLs from HTML part sorted by button weight
+ * @param {number} max_urls optional maximum number of URLs to return
+ * (absent or non-positive means no limit)
+ * @return {table} array of URL objects sorted by importance (descending)
+ */
+static int
+lua_textpart_get_cta_urls(lua_State *L)
+{
+       LUA_TRACE_POINT;
+       struct rspamd_mime_text_part *part = lua_check_textpart(L);
+       lua_Integer max_urls = 0;
+
+       if (part == NULL) {
+               return luaL_error(L, "invalid arguments");
+       }
+
+       /* Get optional max_urls parameter */
+       if (lua_gettop(L) >= 2 && lua_isnumber(L, 2)) {
+               max_urls = lua_tointeger(L, 2);
+       }
+
+       /* Check if this HTML part has CTA URLs */
+       if (!part->cta_urls || part->cta_urls->n == 0) {
+               lua_newtable(L);
+               return 1;
+       }
+
+       /* Access heap structure from html.h */
+       rspamd_html_heap_storage_t *heap = part->cta_urls;
+       size_t total = heap->n;
+
+       /*
+        * A heap array is only partially ordered, so walking it backwards does
+        * not yield a sorted sequence. Sort pointers to the entries explicitly.
+        * The scratch array is a Lua userdata: it stays anchored on the stack
+        * below the result table and is reclaimed by the GC, so nothing leaks
+        * even if a later Lua allocation raises an error.
+        */
+       struct rspamd_html_cta_entry **sorted =
+               lua_newuserdata(L, total * sizeof(*sorted));
+       size_t nsorted = 0;
+
+       /* Insertion sort by weight, descending; the heap is filled with at most max_cta entries */
+       for (size_t i = 0; i < total; i++) {
+               struct rspamd_html_cta_entry *entry = &heap->a[i];
+
+               if (entry->url == NULL) {
+                       continue;
+               }
+
+               size_t pos = nsorted++;
+               while (pos > 0 && sorted[pos - 1]->weight < entry->weight) {
+                       sorted[pos] = sorted[pos - 1];
+                       pos--;
+               }
+               sorted[pos] = entry;
+       }
+
+       size_t result_size = nsorted;
+       if (max_urls > 0 && (size_t) max_urls < result_size) {
+               result_size = (size_t) max_urls;
+       }
+
+       lua_createtable(L, (int) result_size, 0);
+
+       for (size_t i = 0; i < result_size; i++) {
+               struct rspamd_lua_url *lua_url;
+
+               lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
+               rspamd_lua_setclass(L, rspamd_url_classname, -1);
+               lua_url->url = sorted[i]->url;
+               lua_rawseti(L, -2, (int) (i + 1));
+       }
+
+       /* The result table is on top of the stack; the scratch userdata below it is dropped */
+       return 1;
+}
+
  static int
  lua_textpart_get_mimepart(lua_State *L)
  {
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c

index b7e42530c3e1198b21a8cc2524456df2547664a8..e10c7e089ba6c8a2c696f6d8707e8ff658da74e5 100644 (file)
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -278,14 +278,6 @@ LUA_FUNCTION_DEF(task, get_urls);
   * @return {table rspamd_url} list of urls matching conditions
   */
  LUA_FUNCTION_DEF(task, get_urls_filtered);
-/***
- * @method task:get_cta_urls([max_urls])
- * Get call-to-action URLs from HTML content, prioritized by button weight
- * These are URLs that users are likely to click (buttons, prominent links, etc.)
- * @param {number} max_urls maximum number of URLs to return (default: all)
- * @return {table rspamd_url} list of CTA urls sorted by importance
- */
-LUA_FUNCTION_DEF(task, get_cta_urls);
  /***
   * @method task:has_urls([need_emails])
   * Returns 'true' if a task has urls listed
@@ -1333,7 +1325,6 @@ static const struct luaL_reg tasklib_m[] = {
         LUA_INTERFACE_DEF(task, has_urls),
         LUA_INTERFACE_DEF(task, get_urls),
         LUA_INTERFACE_DEF(task, get_urls_filtered),
-       LUA_INTERFACE_DEF(task, get_cta_urls),
         LUA_INTERFACE_DEF(task, inject_url),
         LUA_INTERFACE_DEF(task, get_content),
         LUA_INTERFACE_DEF(task, get_filename),
@@ -2733,61 +2724,6 @@ lua_task_get_urls_filtered(lua_State *L)
         return 1;
  }
  
-static int
-lua_task_get_cta_urls(lua_State *L)
-{
-       LUA_TRACE_POINT;
-       struct rspamd_task *task = lua_check_task(L, 1);
-       GPtrArray *cta_urls;
-       unsigned int max_urls = 0;
-       unsigned int nret = 0;
-
-       if (task == NULL) {
-               return luaL_error(L, "invalid arguments, no task");
-       }
-
-       if (task->message == NULL) {
-               lua_newtable(L);
-               return 1;
-       }
-
-       /* Get optional max_urls parameter */
-       if (lua_gettop(L) >= 2 && lua_isnumber(L, 2)) {
-               max_urls = lua_tointeger(L, 2);
-       }
-
-       /* Retrieve CTA URLs from mempool */
-       cta_urls = rspamd_mempool_get_variable(task->task_pool, "html_cta_urls");
-
-       if (cta_urls == NULL || cta_urls->len == 0) {
-               lua_newtable(L);
-               return 1;
-       }
-
-       /* Create result table */
-       unsigned int result_size = max_urls > 0 ? MIN(max_urls, cta_urls->len) : cta_urls->len;
-       lua_createtable(L, result_size, 0);
-
-       /* Add URLs to result */
-       for (unsigned int i = 0; i < cta_urls->len; i++) {
-               struct rspamd_url *u = g_ptr_array_index(cta_urls, i);
-               if (u) {
-                       struct rspamd_lua_url *lua_url;
-
-                       lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
-                       rspamd_lua_setclass(L, rspamd_url_classname, -1);
-                       lua_url->url = u;
-                       lua_rawseti(L, -2, ++nret);
-
-                       if (max_urls > 0 && nret >= max_urls) {
-                               break;
-                       }
-               }
-       }
-
-       return 1;
-}
-
  static int
  lua_task_has_urls(lua_State *L)
  {
diff --git a/test/rspamd_cxx_unit.cxx b/test/rspamd_cxx_unit.cxx

index 3335f7dd029e06dd7d1c821b1b5abc2b5eed58fe..e660edc8674d1f5a415633a0f7157a5a0045c13a 100644 (file)
--- a/test/rspamd_cxx_unit.cxx
+++ b/test/rspamd_cxx_unit.cxx
@@ -27,6 +27,7 @@
  #include "rspamd_cxx_unit_cryptobox.hxx"
  #include "rspamd_cxx_unit_rfc2047.hxx"
  #include "rspamd_cxx_unit_html_url_rewrite.hxx"
+#include "rspamd_cxx_unit_html_cta.hxx"
  
  static gboolean verbose = false;
  static const GOptionEntry entries[] =
diff --git a/test/rspamd_cxx_unit_html_cta.hxx b/test/rspamd_cxx_unit_html_cta.hxx

new file mode 100644 (file)

index 0000000..36ee8de
--- /dev/null
+++ b/test/rspamd_cxx_unit_html_cta.hxx
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_RSPAMD_CXX_UNIT_HTML_CTA_HXX
+#define RSPAMD_RSPAMD_CXX_UNIT_HTML_CTA_HXX
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+#include "libutil/mem_pool.h"
+#include "libserver/html/html.hxx"
+#include "libserver/html/html.h"
+#include "libserver/url.h"
+
+#include <string>
+#include <string_view>
+
+using namespace rspamd::html;
+
+namespace {
+
+struct html_fixture {
+       rspamd_mempool_t *pool = nullptr;
+       html_content *hc = nullptr;
+
+       explicit html_fixture(std::string_view html)
+       {
+               pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), nullptr, 0);
+               auto *input = g_byte_array_sized_new(html.size());
+               g_byte_array_append(input, reinterpret_cast<const guint8 *>(html.data()), html.size());
+               auto *parsed = rspamd_html_process_part(pool, input);
+               g_byte_array_free(input, TRUE);
+               hc = html_content::from_ptr(parsed);
+       }
+
+       ~html_fixture()
+       {
+               if (pool != nullptr) {
+                       rspamd_mempool_delete(pool);
+               }
+       }
+
+       [[nodiscard]] auto weight_for(std::string_view url) const -> float
+       {
+               for (const auto &[u, weight]: hc->url_button_weights) {
+                       if (!u || !u->string) {
+                               continue;
+                       }
+                       std::string_view current(u->string, u->urllen);
+                       if (current == url) {
+                               return weight;
+                       }
+               }
+               return 0.0f;
+       }
+};
+
+}// namespace
+
+/* Checks of the CTA weights stored in html_content::url_button_weights after parsing */
+TEST_SUITE("html_cta_scoring")
+{
+       /*
+        * An anchor with button-like classes must score above 0.6, a <link>
+        * stylesheet URL must have no weight, and a mailto: anchor must stay
+        * below 0.3.
+        */
+       TEST_CASE("button-like anchors outrank technical resources")
+       {
+               const auto html = R"HTML(
+               <html><body>
+               <a href="https://example.com/cta" class="btn primary">Click now</a>
+               <link rel="stylesheet" href="https://cdn.example.com/style.css" />
+               <a href="mailto:info@example.com">Email us</a>
+               </body></html>
+       )HTML";
+               html_fixture fx{html};
+
+               CHECK(fx.weight_for("https://example.com/cta") > 0.6f);
+               CHECK(fx.weight_for("https://cdn.example.com/style.css") == doctest::Approx(0.0f));
+               CHECK(fx.weight_for("mailto:info@example.com") < 0.3f);
+       }
+
+       /*
+        * A prominent "buy" anchor must score above 0.6, a footer-classed link
+        * below 0.2, and an anchor inside a display:none container must have
+        * no weight.
+        */
+       TEST_CASE("footer links and hidden buttons are de-emphasised")
+       {
+               const auto html = R"HTML(
+               <html><body>
+               <a href="https://shop.example.com/buy" class="cta-button">BUY NOW!</a>
+               <div class="footer">
+                       <a href="https://shop.example.com/privacy" class="footer-link">Privacy policy</a>
+               </div>
+               <div style="display:none">
+                       <a href="https://shop.example.com/hidden" class="btn">Hidden CTA</a>
+               </div>
+               </body></html>
+       )HTML";
+               html_fixture fx{html};
+
+               CHECK(fx.weight_for("https://shop.example.com/buy") > 0.6f);
+               CHECK(fx.weight_for("https://shop.example.com/privacy") < 0.2f);
+               CHECK(fx.weight_for("https://shop.example.com/hidden") == doctest::Approx(0.0f));
+       }
+}
+
+#endif
author	Vsevolod Stakhov <vsevolod@rspamd.com>
	Thu, 6 Nov 2025 13:46:50 +0000 (13:46 +0000)
committer	Vsevolod Stakhov <vsevolod@rspamd.com>
	Thu, 6 Nov 2025 13:46:50 +0000 (13:46 +0000)
lualib/llm_search_context.lua		patch \| blob \| blame \| history
src/libmime/message.c		patch \| blob \| blame \| history
src/libmime/message.h		patch \| blob \| blame \| history
src/libserver/CMakeLists.txt		patch \| blob \| blame \| history
src/libserver/html/html.cxx		patch \| blob \| blame \| history
src/libserver/html/html.h		patch \| blob \| blame \| history
src/libserver/html/html_cta.cxx	[new file with mode: 0644]	patch \| blob
src/libserver/html/html_cta.hxx	[new file with mode: 0644]	patch \| blob
src/lua/lua_mimepart.c		patch \| blob \| blame \| history
src/lua/lua_task.c		patch \| blob \| blame \| history
test/rspamd_cxx_unit.cxx		patch \| blob \| blame \| history
test/rspamd_cxx_unit_html_cta.hxx	[new file with mode: 0644]	patch \| blob