local rspamd_logger = require "rspamd_logger"
local lua_util = require "lua_util"
local lua_cache = require "lua_cache"
+local lua_mime = require "lua_mime"
local ucl = require "ucl"
local DEFAULTS = {
-- First, try to get CTA URLs from HTML (most relevant for spam detection)
-- Uses button weight and HTML structure analysis from C code
- local cta_urls = task:get_cta_urls(max_domains * 2) or {}
- lua_util.debugm(Np, task, "CTA analysis found %d URLs", #cta_urls)
+ local cta_urls = {}
+ local sel_part = lua_mime.get_displayed_text_part(task)
+ if sel_part then
+ cta_urls = sel_part:get_cta_urls()
+ end
+ lua_util.debugm(Np, task,
+ "CTA analysis found %d URLs across", #cta_urls)
for _, url in ipairs(cta_urls) do
- if #domains >= max_domains then break end
+ if #domains >= max_domains then
+ break
+ end
local host = url:get_host()
if host and not skip_domains[host:lower()] and not seen[host] then
-- If we don't have enough domains from CTA, get more from content URLs
if #domains < max_domains then
lua_util.debugm(Np, task, "need more domains (%d/%d), extracting from content URLs",
- #domains, max_domains)
+ #domains, max_domains)
local urls = lua_util.extract_specific_urls({
task = task,
limit = max_domains * 3,
esld_limit = max_domains,
- need_content = true, -- Content URLs (buttons, links in text)
+ need_content = true, -- Content URLs (buttons, links in text)
need_images = false,
}) or {}
lua_util.debugm(Np, task, "extracted %d content URLs", #urls)
for _, url in ipairs(urls) do
- if #domains >= max_domains then break end
+ if #domains >= max_domains then
+ break
+ end
local host = url:get_host()
if host and not seen[host] and not skip_domains[host:lower()] then
-- Still need more? Get from any URLs
if #domains < max_domains then
lua_util.debugm(Np, task, "still need more domains (%d/%d), extracting from all URLs",
- #domains, max_domains)
+ #domains, max_domains)
local urls = lua_util.extract_specific_urls({
task = task,
lua_util.debugm(Np, task, "extracted %d all URLs", #urls)
for _, url in ipairs(urls) do
- if #domains >= max_domains then break end
+ if #domains >= max_domains then
+ break
+ end
local host = url:get_host()
if host and not seen[host] and not skip_domains[host:lower()] then
if code ~= 200 then
rspamd_logger.infox(task, "search API returned code %s for domain '%s', url: %s, body: %s",
- code, domain, full_url, body and body:sub(1, 200) or 'nil')
+ code, domain, full_url, body and body:sub(1, 200) or 'nil')
callback(nil, domain, string.format("HTTP %s", code))
return
end
local ok, parse_err = parser:parse_string(body)
if not ok then
rspamd_logger.errx(task, "%s: failed to parse search API response for %s: %s",
- Np, domain, parse_err)
+ Np, domain, parse_err)
callback(nil, domain, parse_err)
return
end
local metadata = flat_data[1]
lua_util.debugm(Np, task, "parsing domain '%s': flat_data has %d elements, metadata type: %s",
- domain, #flat_data, type(metadata))
+ domain, #flat_data, type(metadata))
if metadata and metadata.items and type(metadata.items) == 'number' then
-- metadata.items is a 0-indexed pointer, add 1 for Lua
if items and type(items) == 'table' then
lua_util.debugm(Np, task, "found %d item indices for domain '%s', items_idx=%d",
- #items, domain, items_idx)
+ #items, domain, items_idx)
local count = 0
local title = result_template.title and flat_data[result_template.title + 1]
lua_util.debugm(Np, task, "result %d template: link_idx=%s, snippet_idx=%s, title_idx=%s",
- count + 1, tostring(result_template.link), tostring(result_template.snippet),
- tostring(result_template.title))
+ count + 1, tostring(result_template.link), tostring(result_template.snippet),
+ tostring(result_template.title))
if link or title or snippet then
table.insert(search_results.results, {
})
count = count + 1
lua_util.debugm(Np, task, "extracted result %d: title='%s', snippet_len=%d",
- count, title or "nil", snippet and #snippet or 0)
+ count, title or "nil", snippet and #snippet or 0)
end
else
lua_util.debugm(Np, task, "result_template at idx %d is not a table: %s",
- result_template_idx, type(result_template))
+ result_template_idx, type(result_template))
end
end
else
lua_util.debugm(Np, task, "items is not a table for domain '%s', type: %s",
- domain, type(items))
+ domain, type(items))
end
else
lua_util.debugm(Np, task, "no valid metadata.items for domain '%s'", domain)
end
lua_util.debugm(Np, task, "extracted %d search results for domain '%s'",
- #search_results.results, domain)
+ #search_results.results, domain)
callback(search_results, domain, nil)
end
end
lua_util.debugm(Np, task, "final domain list (%d domains) for search: %s",
- #domains, table.concat(domains, ", "))
+ #domains, table.concat(domains, ", "))
-- Create cache context
local cache_ctx = nil
else
local context_snippet = format_search_results(all_results, opts)
lua_util.debugm(Np, task, "search context formatted (%s bytes)",
- context_snippet and #context_snippet or 0)
+ context_snippet and #context_snippet or 0)
callback(task, true, context_snippet)
end
end
if cache_ctx then
-- Use lua_cache for caching
lua_cache.cache_get(task, cache_key, cache_ctx, opts.timeout,
- function()
- -- Cache miss - query API
- query_search_api(task, domain, opts, function(api_results, d, api_err)
- if api_results then
- lua_cache.cache_set(task, cache_key, api_results, cache_ctx)
- domain_complete(d, api_results)
- else
- lua_util.debugm(Np, task, "search failed for domain %s: %s", d, api_err)
- domain_complete(d, nil)
+ function()
+ -- Cache miss - query API
+ query_search_api(task, domain, opts, function(api_results, d, api_err)
+ if api_results then
+ lua_cache.cache_set(task, cache_key, api_results, cache_ctx)
+ domain_complete(d, api_results)
+ else
+ lua_util.debugm(Np, task, "search failed for domain %s: %s", d, api_err)
+ domain_complete(d, nil)
+ end
+ end, Np)
+ end,
+ function(_, err, data)
+ -- Cache hit or after miss callback
+ if data and type(data) == 'table' then
+ lua_util.debugm(Np, task, "cache hit for domain %s", domain)
+ domain_complete(domain, data)
+ -- If no data and no error, the miss callback was already invoked
+ elseif err then
+ lua_util.debugm(Np, task, "cache error for domain %s: %s", domain, err)
+ domain_complete(domain, nil)
end
- end, Np)
- end,
- function(_, err, data)
- -- Cache hit or after miss callback
- if data and type(data) == 'table' then
- lua_util.debugm(Np, task, "cache hit for domain %s", domain)
- domain_complete(domain, data)
- -- If no data and no error, the miss callback was already invoked
- elseif err then
- lua_util.debugm(Np, task, "cache error for domain %s: %s", domain, err)
- domain_complete(domain, nil)
- end
- end)
+ end)
else
-- No Redis, query directly
query_search_api(task, domain, opts, function(api_results, d, api_err)
struct rspamd_multipattern *gtube_matcher = NULL;
static const uint64_t words_hash_seed = 0xdeadbabe;
+/* CTA URL configuration */
+#define MAX_CTA_URLS_PER_PART 25
+
static void
free_byte_array_callback(void *pointer)
{
*avg_len_p += total_len;
}
- short_len_p = rspamd_mempool_get_variable(task->task_pool,
- RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
+ short_len_p = rspamd_mempool_get_variable(task->task_pool,
+ RSPAMD_MEMPOOL_SHORT_WORDS_CNT);
- if (short_len_p == NULL) {
- short_len_p = rspamd_mempool_alloc(task->task_pool,
- sizeof(double));
- *short_len_p = short_len;
- rspamd_mempool_set_variable(task->task_pool,
- RSPAMD_MEMPOOL_SHORT_WORDS_CNT, short_len_p, NULL);
- }
+ if (short_len_p == NULL) {
+ short_len_p = rspamd_mempool_alloc(task->task_pool,
+ sizeof(double));
+ *short_len_p = short_len;
+ rspamd_mempool_set_variable(task->task_pool,
+ RSPAMD_MEMPOOL_SHORT_WORDS_CNT, short_len_p, NULL);
+ }
else {
*short_len_p += short_len;
}
/* Wire aggregated HTML features */
text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html);
+ /* Collect top CTA URLs for this HTML part */
+ if (text_part->html && text_part->mime_part && text_part->mime_part->urls) {
+ rspamd_html_process_cta_urls(text_part, task, MAX_CTA_URLS_PER_PART);
+ }
+
/* Optionally call CTA/affiliation Lua hook with capped candidates */
if (task->cfg && task->cfg->lua_state) {
lua_State *L = task->cfg->lua_state;
lua_settop(L, old_top);
}
-
- /* Store top CTA URLs for LLM and other use cases */
- if (text_part->html && text_part->mime_part && text_part->mime_part->urls) {
- /* Simple approach: just store URLs sorted by button weight */
- /* Use task-wide array to aggregate across all HTML parts */
- GPtrArray *cta_urls = rspamd_mempool_get_variable(task->task_pool, "html_cta_urls");
- if (!cta_urls) {
- cta_urls = g_ptr_array_new();
- rspamd_mempool_add_destructor(task->task_pool,
- (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard,
- cta_urls);
- rspamd_mempool_set_variable(task->task_pool, "html_cta_urls", cta_urls, NULL);
- }
-
- /* Find best URLs by button weight in this HTML part */
- float best_weights[5] = {0.0, 0.0, 0.0, 0.0, 0.0};
- struct rspamd_url *best_urls[5] = {NULL, NULL, NULL, NULL, NULL};
- unsigned int max_cta_per_part = 5;
-
- for (unsigned int i = 0; i < text_part->mime_part->urls->len; i++) {
- struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, i);
- if (!u) continue;
- if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
- if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
-
- float weight = rspamd_html_url_button_weight(text_part->html, u);
-
- /* Insert into best list if weight is high enough */
- for (unsigned int j = 0; j < max_cta_per_part; j++) {
- if (weight > best_weights[j]) {
- /* Shift lower entries down */
- for (unsigned int k = max_cta_per_part - 1; k > j; k--) {
- best_weights[k] = best_weights[k - 1];
- best_urls[k] = best_urls[k - 1];
- }
- best_weights[j] = weight;
- best_urls[j] = u;
- break;
- }
- }
- }
-
- /* Add to task-wide array */
- for (unsigned int i = 0; i < max_cta_per_part; i++) {
- if (best_urls[i] && best_weights[i] > 0.0) {
- g_ptr_array_add(cta_urls, best_urls[i]);
- }
- }
- }
}
rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
#include "libserver/url.h"
#include "libutil/ref.h"
#include "libutil/str_util.h"
+#include "libutil/heap.h"
#include "libserver/word.h"
#include <unicode/uchar.h>
#define IS_TEXT_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML)
#define IS_TEXT_PART_ATTACHMENT(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_ATTACHMENT)
+/*
+ * CTA (call-to-action) URL heap entry structure.
+ * One entry describes a single candidate URL together with the weight it was
+ * given; entries are stored in the heap type declared right after this struct.
+ */
+struct rspamd_html_cta_entry {
+ unsigned int pri; /* Integer priority used for heap ordering, derived from the weight by the code filling the heap */
+ unsigned int idx; /* Heap index (managed by heap), callers initialise it to 0 */
+ struct rspamd_url *url; /* URL pointer (not owned by the entry) */
+ float weight; /* Original (unscaled) button weight */
+};
+
+RSPAMD_HEAP_DECLARE(rspamd_html_heap_storage, struct rspamd_html_cta_entry);
struct rspamd_mime_text_part {
const char *language;
void *html;
/* Optional HTML features collected during parsing */
struct rspamd_html_features *html_features;
- GList *exceptions; /**< list of offsets of urls */
+ /* CTA (call-to-action) URLs extracted from HTML with weights */
+ rspamd_html_heap_storage_t *cta_urls; /**< cta_heap_t* for HTML parts, NULL for plain text */
+ GList *exceptions; /**< list of offsets of urls */
struct rspamd_mime_part *mime_part;
unsigned int flags;
${CMAKE_CURRENT_SOURCE_DIR}/maps/map_helpers.c
${CMAKE_CURRENT_SOURCE_DIR}/html/html_entities.cxx
${CMAKE_CURRENT_SOURCE_DIR}/html/html_url.cxx
+ ${CMAKE_CURRENT_SOURCE_DIR}/html/html_cta.cxx
${CMAKE_CURRENT_SOURCE_DIR}/html/html.cxx
${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite.cxx
${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite_c.cxx
#include "contrib/libucl/khash.h"
#include "libmime/images.h"
#include "libutil/cxx/utf8_util.h"
+#include "libserver/html/html_cta.hxx"
#include "html_tag_defs.hxx"
#include "html_entities.hxx"
#include "contrib/fmt/include/fmt/core.h"
#include <functional>
+#include <algorithm>
+#include <string>
#include <unicode/uversion.h>
namespace rspamd::html {
}},
});
+
auto html_tag::find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>
{
auto it = component_extractors.find(attr_name);
struct tag_content_parser_state {
tag_parser_state cur_state = parse_start;
std::string buf;
- std::string attr_name;// Store current attribute name
+ std::string attr_name; // Store current attribute name
const char *value_start = nullptr;// Track where attribute value starts in input
const char *html_start = nullptr; // Base pointer to HTML buffer start
if (cnt > hc->features.links.max_links_single_domain) {
hc->features.links.max_links_single_domain = cnt;
}
- /* Heuristic button weight */
- float w = 0.0f;
- if (url->ext && url->ext->linked_url && url->ext->linked_url != url) {
- w += 0.5f; /* display mismatch bonus */
- }
- w += 0.2f * (url->order == 0 ? 1.0f : 1.0f / (float) url->order);
- if (cur_tag->block && cur_tag->block->is_visible()) {
- if (cur_tag->block->has_display()) {
- w += 0.1f;
- }
- if (cur_tag->block->width > 0 && cur_tag->block->height > 0) {
- w += std::min(0.2f, (cur_tag->block->width * cur_tag->block->height) / 100000.0f);
- }
- if (cur_tag->block->font_size >= 14) {
- w += 0.1f;
- }
- }
- if (w > 0) {
- hc->url_button_weights[url] += w;
- }
/* same eTLD+1 as first-party? */
if (!hc->first_party_etld1.empty()) {
rspamd_ftok_t tld2;
}
}
+ html_compute_cta_weights(*hc);
+
return hc;
}
*/
const struct rspamd_html_features *rspamd_html_get_features(void *html_content);
+/**
+ * Creates CTA (call-to-action) URLs heap for a text part
+ * Collects top-K URLs by button weight using min-heap (O(n log k))
+ * @param text_part text part to fill cta_urls for
+ * @param task task for mempool allocation
+ * @param max_cta maximum number of CTA URLs to collect
+ */
+void rspamd_html_process_cta_urls(struct rspamd_mime_text_part *text_part,
+ struct rspamd_task *task,
+ unsigned int max_cta);
+
#ifdef __cplusplus
}
--- /dev/null
+/*-
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "config.h"
+#include "libserver/html/html_cta.hxx"
+
+#include "util.h"
+#include "message.h"
+#include "libserver/html/html.hxx"
+#include "libserver/html/html_block.hxx"
+#include "libserver/html/html_tag.hxx"
+#include "libserver/css/css.hxx"
+#include "libserver/url.h"
+#include "libserver/task.h"
+#include "libutil/cxx/util.hxx"
+#include "libutil/heap.h"
+
+#include <algorithm>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <variant>
+
+#include <glib.h>
+
+static constexpr unsigned int CTA_WEIGHT_SCALE = 1000;
+
+namespace rspamd::html {
+namespace {
+
+using namespace std::string_view_literals;
+
+/**
+ * Returns the part of `input` that remains after dropping leading and
+ * trailing ASCII whitespace. No data is copied: the result is a sub-view.
+ */
+static auto trim_ascii(std::string_view input) -> std::string_view
+{
+	const auto is_blank = [](char c) -> bool {
+		return g_ascii_isspace(static_cast<gchar>(c)) != 0;
+	};
+
+	std::size_t head = 0;
+	std::size_t tail = input.size();
+
+	/* Advance past the leading blanks */
+	while (head < tail && is_blank(input[head])) {
+		++head;
+	}
+
+	/* Retreat over the trailing blanks */
+	while (tail > head && is_blank(input[tail - 1])) {
+		--tail;
+	}
+
+	return input.substr(head, tail - head);
+}
+
+/**
+ * Checks a whitespace-separated attribute value (e.g. `class`) for a token.
+ * @param attr attribute value, may contain several words
+ * @param token token to look for (compared case-sensitively)
+ * @param allow_partial if true a word matches when it merely contains `token`,
+ *        otherwise the word must be equal to `token`
+ * @return true as soon as one word matches
+ */
+static auto space_separated_token_match(std::string_view attr,
+										std::string_view token,
+										bool allow_partial) -> bool
+{
+	const auto is_blank = [](char c) -> bool {
+		return g_ascii_isspace(static_cast<gchar>(c)) != 0;
+	};
+
+	/* After trimming, `rest` is either empty or starts with a word */
+	auto rest = trim_ascii(attr);
+
+	while (!rest.empty()) {
+		/* Carve the next word off the front */
+		const auto word_end = std::find_if(rest.begin(), rest.end(), is_blank);
+		const auto word = rest.substr(0, static_cast<std::size_t>(word_end - rest.begin()));
+
+		const bool matched = allow_partial
+								 ? word.find(token) != std::string_view::npos
+								 : word == token;
+		if (matched) {
+			return true;
+		}
+
+		/* Drop the word and the separators following it */
+		rest.remove_prefix(word.size());
+		while (!rest.empty() && is_blank(rest.front())) {
+			rest.remove_prefix(1);
+		}
+	}
+
+	return false;
+}
+
+/**
+ * Optional-aware wrapper around space_separated_token_match():
+ * a missing attribute never matches.
+ */
+static auto optional_attr_contains(const std::optional<std::string_view> &attr,
+								   std::string_view token,
+								   bool allow_partial = false) -> bool
+{
+	return attr.has_value() &&
+		   space_separated_token_match(*attr, token, allow_partial);
+}
+
+/**
+ * Tells whether the (optional) attribute value matches at least one token
+ * of `tokens`; see space_separated_token_match() for the matching rules.
+ * A missing attribute never matches.
+ */
+template<typename Range>
+static auto optional_attr_contains_any(const std::optional<std::string_view> &attr,
+									   const Range &tokens,
+									   bool allow_partial = false) -> bool
+{
+	if (!attr.has_value()) {
+		return false;
+	}
+
+	const std::string_view haystack = *attr;
+
+	return std::any_of(std::begin(tokens), std::end(tokens),
+					   [&](const auto &candidate) {
+						   return space_separated_token_match(haystack, candidate, allow_partial);
+					   });
+}
+
+/**
+ * Produces an owned copy of `input` with every byte passed through
+ * g_ascii_tolower().
+ */
+static auto to_lower_ascii(std::string_view input) -> std::string
+{
+	std::string lowered(input.size(), '\0');
+
+	std::transform(input.begin(), input.end(), lowered.begin(),
+				   [](char c) -> char {
+					   return static_cast<char>(g_ascii_tolower(static_cast<guchar>(c)));
+				   });
+
+	return lowered;
+}
+
+/**
+ * Picks the text that labels a tag. Sources are tried in a fixed order and
+ * the first one that is non-empty after trimming wins:
+ * tag content, `title`, `aria-label`, `alt`.
+ * @return the trimmed label or an empty string if no source provides one
+ */
+static auto get_cta_label(const html_tag &tag, const html_content &hc) -> std::string
+{
+	/* Trims a candidate and turns it into an owned string unless it is blank */
+	const auto non_blank = [](std::string_view raw) -> std::optional<std::string> {
+		const auto trimmed = trim_ascii(raw);
+		if (trimmed.empty()) {
+			return std::nullopt;
+		}
+		return std::string{trimmed};
+	};
+
+	if (auto from_content = non_blank(tag.get_content(&hc))) {
+		return *from_content;
+	}
+
+	if (auto title = tag.find_component<html_component_title>()) {
+		if (auto from_title = non_blank(title.value()->value)) {
+			return *from_title;
+		}
+	}
+
+	if (auto aria_label = tag.find_component_by_name("aria-label"sv)) {
+		if (auto from_aria = non_blank(aria_label.value())) {
+			return *from_aria;
+		}
+	}
+
+	if (auto alt = tag.find_component<html_component_alt>()) {
+		if (auto from_alt = non_blank(alt.value()->value)) {
+			return *from_alt;
+		}
+	}
+
+	return {};
+}
+
+/**
+ * Walks from `tag` up through its ancestors and reports whether any of them
+ * (the tag itself included) has a block that is not visible or carries the
+ * FL_IGNORE flag.
+ */
+static auto tag_is_effectively_hidden(const html_tag *tag) -> bool
+{
+	const html_tag *node = tag;
+
+	while (node != nullptr) {
+		const bool invisible_block = node->block && !node->block->is_visible();
+
+		if (invisible_block || (node->flags & FL_IGNORE)) {
+			return true;
+		}
+
+		node = node->parent;
+	}
+
+	return false;
+}
+
+/*
+ * Substrings that make an <a> look like a button: matched (partially, case-sensitively)
+ * against the words of the `class` and `id` attributes in compute_semantic_base_score().
+ */
+static constexpr auto buttonish_class_tokens = rspamd::array_of<std::string_view>(
+ "btn", "button", "cta", "call-to-action", "submit", "primary",
+ "confirm", "action", "purchase", "buy", "signup", "sign-up", "apply");
+
+/*
+ * Substrings hinting at a non-CTA context: matched (partially) against the words of
+ * the `class` and `id` attributes in compute_penalty().
+ */
+static constexpr auto negative_context_tokens = rspamd::array_of<std::string_view>(
+ "logo", "footer", "header", "nav", "menu", "social",
+ "tracking", "pixel", "unsubscribe", "legal", "copyright");
+
+/*
+ * `rel` values of an <a> tag that mark it as a service link: matched as whole words
+ * in is_service_link_tag().
+ */
+static constexpr auto service_rel_tokens = rspamd::array_of<std::string_view>(
+ "alternate", "canonical", "dns-prefetch", "icon", "manifest",
+ "preconnect", "prefetch", "preload", "stylesheet");
+
+/*
+ * Action keywords searched as plain substrings of the lowercased label in
+ * compute_text_bonus(); entries must therefore be lowercase.
+ */
+static constexpr auto cta_keywords = rspamd::array_of<std::string_view>(
+ "buy", "purchase", "order", "checkout", "pay", "confirm", "verify",
+ "update", "login", "log in", "sign in", "sign up", "signup", "register",
+ "download", "upgrade", "continue", "next", "open", "submit", "apply",
+ "approve", "activate", "subscribe");
+
+/**
+ * Decides whether a tag/URL pair must never be treated as a call-to-action.
+ * That is the case for tags carrying one of the excluded flags, for
+ * link/script/style/meta/base/img elements, for tags whose block is not
+ * visible, for image URLs, and for <a> tags that either have a service `rel`
+ * value or whose parent carries CM_HEAD.
+ */
+static auto is_service_link_tag(const html_tag &tag, const rspamd_url &url) -> bool
+{
+	const auto excluded_flags = FL_XML | FL_VIRTUAL | FL_COMMENT | FL_IGNORE | CM_HEAD;
+	if (tag.flags & excluded_flags) {
+		return true;
+	}
+
+	const auto id = tag.id;
+	const bool non_clickable_element = id == Tag_LINK || id == Tag_SCRIPT ||
+									   id == Tag_STYLE || id == Tag_META ||
+									   id == Tag_BASE || id == Tag_IMG;
+	if (non_clickable_element) {
+		return true;
+	}
+
+	const bool hidden_block = tag.block && !tag.block->is_visible();
+	if (hidden_block || (url.flags & RSPAMD_URL_FLAG_IMAGE)) {
+		return true;
+	}
+
+	/* The remaining checks only concern anchors */
+	if (id != Tag_A) {
+		return false;
+	}
+
+	if (optional_attr_contains_any(tag.find_rel(), service_rel_tokens, false)) {
+		return true;
+	}
+
+	return tag.parent && (tag.parent->flags & CM_HEAD);
+}
+
+/**
+ * Base CTA score derived purely from what kind of element carries the URL.
+ * @param tag tag owning the URL
+ * @param url URL attached to the tag (only its protocol is inspected)
+ * @return score in [0, 0.9]; 0 means the tag is not a CTA candidate at all
+ */
+static auto compute_semantic_base_score(const html_tag &tag, const rspamd_url &url) -> float
+{
+ switch (tag.id) {
+ case Tag_BUTTON:
+ return 0.9f;
+ case Tag_INPUT: {
+ /* Generic input; refined by the `type` attribute when it is present */
+ float base = 0.35f;
+ if (auto type_comp = tag.find_component<html_component_type>()) {
+ auto lowered = to_lower_ascii(trim_ascii(type_comp.value()->get_string_value()));
+ if (lowered == "submit" || lowered == "button" || lowered == "send") {
+ base = 0.85f;
+ }
+ else if (lowered == "image") {
+ base = 0.75f;
+ }
+ else if (lowered == "reset") {
+ base = 0.25f;
+ }
+ }
+ return base;
+ }
+ case Tag_FORM:
+ return 0.8f;
+ case Tag_A: {
+ /* Plain link by default */
+ float base = 0.35f;
+ /* Button-like class or id (substring match within each word) */
+ if (optional_attr_contains_any(tag.find_class(), buttonish_class_tokens, true) ||
+ optional_attr_contains_any(tag.find_id(), buttonish_class_tokens, true)) {
+ base = 0.75f;
+ }
+ /* An interactive ARIA role lifts the score to at least 0.7 */
+ if (auto role_comp = tag.find_component<html_component_role>()) {
+ auto lowered = to_lower_ascii(trim_ascii(role_comp.value()->value));
+ if (lowered == "button" || lowered == "tab" || lowered == "menuitem") {
+ base = std::max(base, 0.7f);
+ }
+ }
+ /* mailto: links are capped at 0.4, applied after the boosts above */
+ if (url.protocol == PROTOCOL_MAILTO) {
+ base = std::min(base, 0.4f);
+ }
+ return base;
+ }
+ case Tag_AREA:
+ return 0.3f;
+ default:
+ /* Any other tag flagged with FL_HREF gets a small score */
+ if (tag.flags & FL_HREF) {
+ return 0.2f;
+ }
+ break;
+ }
+
+ return 0.0f;
+}
+
+/**
+ * Bonus for visually prominent elements, built from three independent parts
+ * added in this order: display mode, rendered area (width * height) and font
+ * size. Tags without a block or with an invisible block get no bonus.
+ */
+static auto compute_visual_bonus(const html_tag &tag) -> float
+{
+	if (!tag.block || !tag.block->is_visible()) {
+		return 0.0f;
+	}
+
+	const auto &block = *tag.block;
+	float bonus = 0.0f;
+
+	/* Display mode */
+	if (block.display == css::css_display_value::DISPLAY_BLOCK) {
+		bonus += 0.12f;
+	}
+	else if (block.display == css::css_display_value::DISPLAY_TABLE_ROW) {
+		bonus += 0.05f;
+	}
+
+	/* Area, only when both dimensions are known */
+	if (block.width > 0 && block.height > 0) {
+		const auto area = static_cast<int>(block.width) * static_cast<int>(block.height);
+
+		if (area >= 6000) {
+			bonus += 0.2f;
+		}
+		else if (area >= 2000) {
+			bonus += 0.12f;
+		}
+		else if (area >= 400) {
+			bonus += 0.06f;
+		}
+	}
+
+	/* Font size */
+	if (block.font_size >= 16) {
+		bonus += 0.08f;
+	}
+	else if (block.font_size >= 13) {
+		bonus += 0.04f;
+	}
+
+	return bonus;
+}
+
+/**
+ * Bonus derived from the (already lowercased) label text:
+ * +0.18 if it contains any CTA keyword, +0.03 if it contains '!',
+ * +0.04 if its length is between 3 and 18 bytes. Empty text gets nothing.
+ */
+static auto compute_text_bonus(std::string_view text_lower) -> float
+{
+	if (text_lower.empty()) {
+		return 0.0f;
+	}
+
+	const bool has_keyword = std::any_of(
+		std::begin(cta_keywords), std::end(cta_keywords),
+		[&](std::string_view kw) {
+			return text_lower.find(kw) != std::string_view::npos;
+		});
+
+	float bonus = has_keyword ? 0.18f : 0.0f;
+
+	if (text_lower.find('!') != std::string_view::npos) {
+		bonus += 0.03f;
+	}
+
+	const auto len = text_lower.size();
+	if (len >= 3 && len <= 18) {
+		bonus += 0.04f;
+	}
+
+	return bonus;
+}
+
+/**
+ * Sums up everything that speaks against a tag being a real call-to-action.
+ * @param tag tag owning the URL (block geometry, class and id are inspected)
+ * @param url URL attached to the tag (flags and protocol are inspected)
+ * @param text_lower lowercased label, used for the character-class analysis
+ * @param text_original label used for the length check
+ * @return non-negative penalty to subtract from the score
+ */
+static auto compute_penalty(const html_tag &tag,
+ const rspamd_url &url,
+ std::string_view text_lower,
+ std::string_view text_original) -> float
+{
+ float penalty = 0.0f;
+
+ /* No label at all is the strongest text-related penalty */
+ if (text_lower.empty()) {
+ penalty += 0.35f;
+ }
+ else {
+ /* Count non-space bytes and how many of them are ASCII letters */
+ unsigned int alpha = 0;
+ unsigned int graph = 0;
+ for (auto ch: text_lower) {
+ if (g_ascii_isspace(static_cast<gchar>(ch))) {
+ continue;
+ }
+ graph++;
+ if (g_ascii_isalpha(static_cast<gchar>(ch))) {
+ alpha++;
+ }
+ }
+ /* Label without a single ASCII letter (digits, punctuation or non-ASCII bytes only) */
+ if (graph > 0 && alpha == 0) {
+ penalty += 0.25f;
+ }
+ /* Labels longer than 80 bytes are penalised */
+ if (text_original.size() > 80) {
+ penalty += 0.1f;
+ }
+ }
+
+ if (tag.block) {
+ const auto &block = *tag.block;
+ /* Very small rendered area (only when both dimensions are known) */
+ if (block.width > 0 && block.height > 0) {
+ const auto area = static_cast<int>(block.width) * static_cast<int>(block.height);
+ if (area <= 64) {
+ penalty += 0.25f;
+ }
+ else if (area <= 150) {
+ penalty += 0.15f;
+ }
+ }
+ /* Small font (a font size of 0 is not penalised) */
+ if (block.font_size > 0 && block.font_size <= 9) {
+ penalty += 0.08f;
+ }
+ if (block.is_transparent()) {
+ penalty += 0.2f;
+ }
+ }
+
+ /* class/id containing one of the negative context tokens (substring match per word) */
+ if (optional_attr_contains_any(tag.find_class(), negative_context_tokens, true) ||
+ optional_attr_contains_any(tag.find_id(), negative_context_tokens, true)) {
+ penalty += 0.2f;
+ }
+
+ if (url.flags & RSPAMD_URL_FLAG_INVISIBLE) {
+ penalty += 0.3f;
+ }
+
+ /* mailto: and ftp: URLs get a small extra penalty */
+ if (url.protocol == PROTOCOL_MAILTO || url.protocol == PROTOCOL_FTP) {
+ penalty += 0.05f;
+ }
+
+ return penalty;
+}
+
+/**
+ * Final CTA weight of a tag/URL pair.
+ * Service links, hidden tags and tags without a semantic base score get 0.
+ * Otherwise the weight is
+ *   base + visual bonus + text bonus + position/mismatch bonus - penalty
+ * clamped to [0, 1].
+ */
+static auto compute_cta_weight(const html_tag &tag,
+							   const rspamd_url &url,
+							   const html_content &hc) -> float
+{
+	if (is_service_link_tag(tag, url) || tag_is_effectively_hidden(&tag)) {
+		return 0.0f;
+	}
+
+	const float semantic = compute_semantic_base_score(tag, url);
+	if (semantic <= 0.0f) {
+		return 0.0f;
+	}
+
+	const std::string raw_label = get_cta_label(tag, hc);
+	const std::string_view label = trim_ascii(raw_label);
+	const std::string label_lower = to_lower_ascii(label);
+
+	const float visual = compute_visual_bonus(tag);
+	const float textual = compute_text_bonus(label_lower);
+
+	/* The first URL gets a fixed bonus, later ones a decaying one */
+	float position = (url.order == 0)
+						 ? 0.1f
+						 : std::max(0.0f, 0.06f / (1.0f + static_cast<float>(url.order)));
+
+	/* Extra bonus when the URL has a linked URL that is not the URL itself */
+	const bool has_other_linked_url = url.ext && url.ext->linked_url &&
+									  url.ext->linked_url != &url;
+	if (has_other_linked_url) {
+		position += 0.12f;
+	}
+
+	const float penalty = compute_penalty(tag, url, label_lower, label);
+
+	return std::clamp(semantic + visual + textual + position - penalty, 0.0f, 1.0f);
+}
+
+}// namespace
+
+/**
+ * Rebuilds `hc.url_button_weights` from scratch: every tag that carries a
+ * URL is scored with compute_cta_weight() and, for each URL with a positive
+ * score, the highest score seen across its tags is stored.
+ */
+void html_compute_cta_weights(html_content &hc)
+{
+	auto &weights = hc.url_button_weights;
+	weights.clear();
+
+	for (const auto &tag_ptr: hc.all_tags) {
+		const auto &tag = *tag_ptr;
+
+		/* Only tags whose extra payload is a non-null URL are of interest */
+		const auto *slot = std::get_if<rspamd_url *>(&tag.extra);
+		if (slot == nullptr || *slot == nullptr) {
+			continue;
+		}
+		auto *url = *slot;
+
+		const float weight = compute_cta_weight(tag, *url, hc);
+		if (weight <= 0.0f) {
+			continue;
+		}
+
+		if (auto found = weights.find(url); found != weights.end()) {
+			/* The URL is referenced by several tags: keep the best score */
+			found->second = std::max(found->second, weight);
+		}
+		else {
+			weights.emplace(url, weight);
+		}
+	}
+}
+
+}// namespace rspamd::html
+
+extern "C" {
+
+/**
+ * Fills `text_part->cta_urls` with at most `max_cta` URLs of the part that
+ * have the highest button weight.
+ *
+ * Only visible, non-image HTTP(S) URLs with a positive button weight are
+ * considered. The heap keeps the lowest-weighted kept URL at index 0, so a
+ * new candidate only has to be compared with that entry once the heap is full.
+ *
+ * The heap storage is allocated from the task pool and released by a pool
+ * destructor. On a heap push failure `text_part->cta_urls` is reset to NULL.
+ *
+ * @param text_part text part to fill cta_urls for (no-op if it has no HTML or URLs)
+ * @param task task providing the memory pool
+ * @param max_cta maximum number of CTA URLs to keep; 0 makes the call a no-op
+ */
+void rspamd_html_process_cta_urls(struct rspamd_mime_text_part *text_part,
+								  struct rspamd_task *task,
+								  unsigned int max_cta)
+{
+	using namespace rspamd::html;
+
+	/* max_cta == 0 would otherwise make the loop read index 0 of an empty heap */
+	if (!text_part || !task || max_cta == 0 ||
+		!text_part->html || !text_part->mime_part || !text_part->mime_part->urls) {
+		return;
+	}
+
+	auto *part_urls = text_part->mime_part->urls;
+	unsigned int i;
+	rspamd_url *u;
+
+	auto *heap_ptr = rspamd_mempool_alloc_type(task->task_pool, rspamd_html_heap_storage_t);
+	rspamd_heap_init(rspamd_html_heap_storage, heap_ptr);
+	text_part->cta_urls = heap_ptr;
+	/* From here on the pool destructor is the single owner of the heap storage */
+	rspamd_mempool_add_destructor(task->task_pool, [](void *ptr) {
+		auto *h = static_cast<rspamd_html_heap_storage_t *>(ptr);
+		rspamd_heap_destroy(rspamd_html_heap_storage, h); }, heap_ptr);
+
+	PTR_ARRAY_FOREACH(part_urls, i, u)
+	{
+		if (!u) continue;
+		if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue;
+		if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue;
+		if (u->flags & RSPAMD_URL_FLAG_IMAGE) continue;
+
+		/* Use button_weight to filter CTA URLs vs technical URLs
+		 * Technical tags like <link rel>, <script src> have weight=0
+		 * Only actual content URLs (buttons, links) have weight > 0
+		 */
+		const float weight = rspamd_html_url_button_weight(text_part->html, u);
+		if (!(weight > 0.0f)) continue;
+
+		if (rspamd_heap_size(rspamd_html_heap_storage, heap_ptr) >= max_cta) {
+			/* Heap is full: the candidate must beat the weakest kept entry */
+			auto *min = rspamd_heap_index(rspamd_html_heap_storage, heap_ptr, 0);
+			if (weight <= min->weight) continue;
+			rspamd_heap_pop(rspamd_html_heap_storage, heap_ptr);
+		}
+
+		/* Priority grows with the weight, so the lowest weight stays on top.
+		 * The scale is applied as is: negating the unsigned constant only
+		 * worked because of unsigned wraparound.
+		 */
+		rspamd_html_cta_entry entry = {
+			.pri = static_cast<unsigned int>(weight * CTA_WEIGHT_SCALE),
+			.idx = 0,
+			.url = u,
+			.weight = weight};
+		rspamd_heap_push_safe(rspamd_html_heap_storage, heap_ptr, &entry, heap_error);
+	}
+
+	return;
+
+heap_error:
+	/* Do not destroy the heap here: the pool destructor registered above
+	 * will do it, and destroying it twice would presumably free the storage
+	 * twice (NOTE(review): confirm against rspamd_heap_destroy).
+	 */
+	text_part->cta_urls = nullptr;
+}
+
+}// extern "C"
--- /dev/null
+/*-
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_HTML_CTA_HXX
+#define RSPAMD_HTML_CTA_HXX
+
+namespace rspamd::html {
+
+struct html_content;
+
+/**
+ * Recompute CTA (call-to-action) weights for all URLs present in the HTML document.
+ * Any previously stored weights in `hc` are discarded first; afterwards each URL
+ * that got a positive weight is stored with the highest weight among the tags
+ * referencing it.
+ * @param hc parsed HTML content to update in place
+ */
+void html_compute_cta_weights(html_content &hc);
+
+}// namespace rspamd::html
+
+#endif//RSPAMD_HTML_CTA_HXX
* @return {table} table of stats
*/
LUA_FUNCTION_DEF(textpart, get_stats);
+/***
+ * @method text_part:get_cta_urls([max_urls])
+ * Get CTA (call-to-action) URLs from HTML part sorted by button weight
+ * @param {number} max_urls optional maximum number of URLs to return
+ * @return {table} array of URL objects sorted by importance (descending)
+ */
+LUA_FUNCTION_DEF(textpart, get_cta_urls);
/***
* @method mime_part:get_words_count()
* Get words number in the part
LUA_INTERFACE_DEF(textpart, get_stats),
LUA_INTERFACE_DEF(textpart, get_fuzzy_hashes),
LUA_INTERFACE_DEF(textpart, get_html_fuzzy_hashes),
+ LUA_INTERFACE_DEF(textpart, get_cta_urls),
{"__tostring", rspamd_lua_class_tostring},
{NULL, NULL}};
return 2;
}
+/***
+ * @method text_part:get_cta_urls([max_urls])
+ * Get CTA (call-to-action) URLs from HTML part sorted by button weight
+ * @param {number} max_urls optional maximum number of URLs to return (values <= 0 mean no limit)
+ * @return {table} array of URL objects sorted by importance (descending); empty table if the part has no CTA URLs
+ */
+static int
+lua_textpart_get_cta_urls(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_mime_text_part *part = lua_check_textpart(L);
+	unsigned int max_urls = 0;
+	unsigned int nret = 0;
+
+	if (part == NULL) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/* Get optional max_urls parameter; non-positive values keep "no limit" */
+	if (lua_gettop(L) >= 2 && lua_isnumber(L, 2)) {
+		lua_Integer requested = lua_tointeger(L, 2);
+
+		if (requested > 0) {
+			max_urls = (unsigned int) requested;
+		}
+	}
+
+	/* Check if this HTML part has CTA URLs */
+	if (!part->cta_urls) {
+		lua_newtable(L);
+		return 1;
+	}
+
+	rspamd_html_heap_storage_t *heap = part->cta_urls;
+	unsigned int result_size = max_urls > 0 ? MIN(max_urls, heap->n) : heap->n;
+	lua_createtable(L, result_size, 0);
+
+	/*
+	 * The heap array only guarantees that the smallest element is at index 0,
+	 * it is NOT sorted. Hence, emit entries by repeated selection of the best
+	 * remaining one, using (weight descending, index ascending) as a strict
+	 * total order. The heap is small (top-K), so O(n * k) is fine and no
+	 * temporary allocation is needed.
+	 */
+	int last_idx = -1;
+	float last_weight = 0.0f;
+
+	while (nret < result_size) {
+		int best = -1;
+
+		for (int i = 0; i < (int) heap->n; i++) {
+			const struct rspamd_html_cta_entry *cand = &heap->a[i];
+
+			if (cand->url == NULL) {
+				continue;
+			}
+
+			/* Skip everything that is not strictly after the last emitted entry */
+			if (last_idx >= 0 &&
+				(cand->weight > last_weight ||
+				 (cand->weight == last_weight && i <= last_idx))) {
+				continue;
+			}
+
+			if (best < 0 || cand->weight > heap->a[best].weight) {
+				best = i;
+			}
+		}
+
+		if (best < 0) {
+			/* Nothing left to emit */
+			break;
+		}
+
+		struct rspamd_lua_url *lua_url;
+
+		lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
+		rspamd_lua_setclass(L, rspamd_url_classname, -1);
+		lua_url->url = heap->a[best].url;
+		lua_rawseti(L, -2, ++nret);
+
+		last_idx = best;
+		last_weight = heap->a[best].weight;
+	}
+
+	return 1;
+}
+
static int
lua_textpart_get_mimepart(lua_State *L)
{
* @return {table rspamd_url} list of urls matching conditions
*/
LUA_FUNCTION_DEF(task, get_urls_filtered);
-/***
- * @method task:get_cta_urls([max_urls])
- * Get call-to-action URLs from HTML content, prioritized by button weight
- * These are URLs that users are likely to click (buttons, prominent links, etc.)
- * @param {number} max_urls maximum number of URLs to return (default: all)
- * @return {table rspamd_url} list of CTA urls sorted by importance
- */
-LUA_FUNCTION_DEF(task, get_cta_urls);
/***
* @method task:has_urls([need_emails])
* Returns 'true' if a task has urls listed
LUA_INTERFACE_DEF(task, has_urls),
LUA_INTERFACE_DEF(task, get_urls),
LUA_INTERFACE_DEF(task, get_urls_filtered),
- LUA_INTERFACE_DEF(task, get_cta_urls),
LUA_INTERFACE_DEF(task, inject_url),
LUA_INTERFACE_DEF(task, get_content),
LUA_INTERFACE_DEF(task, get_filename),
return 1;
}
-static int
-lua_task_get_cta_urls(lua_State *L)
-{
- LUA_TRACE_POINT;
- struct rspamd_task *task = lua_check_task(L, 1);
- GPtrArray *cta_urls;
- unsigned int max_urls = 0;
- unsigned int nret = 0;
-
- if (task == NULL) {
- return luaL_error(L, "invalid arguments, no task");
- }
-
- if (task->message == NULL) {
- lua_newtable(L);
- return 1;
- }
-
- /* Get optional max_urls parameter */
- if (lua_gettop(L) >= 2 && lua_isnumber(L, 2)) {
- max_urls = lua_tointeger(L, 2);
- }
-
- /* Retrieve CTA URLs from mempool */
- cta_urls = rspamd_mempool_get_variable(task->task_pool, "html_cta_urls");
-
- if (cta_urls == NULL || cta_urls->len == 0) {
- lua_newtable(L);
- return 1;
- }
-
- /* Create result table */
- unsigned int result_size = max_urls > 0 ? MIN(max_urls, cta_urls->len) : cta_urls->len;
- lua_createtable(L, result_size, 0);
-
- /* Add URLs to result */
- for (unsigned int i = 0; i < cta_urls->len; i++) {
- struct rspamd_url *u = g_ptr_array_index(cta_urls, i);
- if (u) {
- struct rspamd_lua_url *lua_url;
-
- lua_url = lua_newuserdata(L, sizeof(struct rspamd_lua_url));
- rspamd_lua_setclass(L, rspamd_url_classname, -1);
- lua_url->url = u;
- lua_rawseti(L, -2, ++nret);
-
- if (max_urls > 0 && nret >= max_urls) {
- break;
- }
- }
- }
-
- return 1;
-}
-
static int
lua_task_has_urls(lua_State *L)
{
#include "rspamd_cxx_unit_cryptobox.hxx"
#include "rspamd_cxx_unit_rfc2047.hxx"
#include "rspamd_cxx_unit_html_url_rewrite.hxx"
+#include "rspamd_cxx_unit_html_cta.hxx"
static gboolean verbose = false;
static const GOptionEntry entries[] =
--- /dev/null
+/*
+ * Copyright 2025 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_RSPAMD_CXX_UNIT_HTML_CTA_HXX
+#define RSPAMD_RSPAMD_CXX_UNIT_HTML_CTA_HXX
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+#include "libutil/mem_pool.h"
+#include "libserver/html/html.hxx"
+#include "libserver/html/html.h"
+#include "libserver/url.h"
+
+#include <string>
+#include <string_view>
+
+using namespace rspamd::html;
+
+namespace {
+
+/*
+ * Test helper: parses an HTML snippet with rspamd_html_process_part() using a
+ * private memory pool and exposes the per-URL button weights of the result.
+ * The pool is destroyed together with the fixture.
+ */
+struct html_fixture {
+	rspamd_mempool_t *pool = nullptr;
+	/* Parsed document; presumably owned by `pool` - TODO confirm */
+	html_content *hc = nullptr;
+
+	/* Parse `html`; the temporary GByteArray copy is released right after parsing */
+	explicit html_fixture(std::string_view html)
+	{
+		pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), nullptr, 0);
+		auto *input = g_byte_array_sized_new(html.size());
+		g_byte_array_append(input, reinterpret_cast<const guint8 *>(html.data()), html.size());
+		auto *parsed = rspamd_html_process_part(pool, input);
+		g_byte_array_free(input, TRUE);
+		hc = html_content::from_ptr(parsed);
+	}
+
+	/* The fixture owns `pool`; copying it would delete the same pool twice */
+	html_fixture(const html_fixture &) = delete;
+	html_fixture &operator=(const html_fixture &) = delete;
+
+	~html_fixture()
+	{
+		if (pool != nullptr) {
+			rspamd_mempool_delete(pool);
+		}
+	}
+
+	/*
+	 * Return the button weight recorded for the URL whose string equals `url`
+	 * exactly, or 0.0f when no such URL was recorded (or nothing was parsed).
+	 */
+	[[nodiscard]] auto weight_for(std::string_view url) const -> float
+	{
+		/* A failed parse must not crash the whole test binary */
+		if (hc == nullptr) {
+			return 0.0f;
+		}
+
+		for (const auto &[u, weight]: hc->url_button_weights) {
+			if (!u || !u->string) {
+				continue;
+			}
+			std::string_view current(u->string, u->urllen);
+			if (current == url) {
+				return weight;
+			}
+		}
+		return 0.0f;
+	}
+};
+
+}// namespace
+
+// Checks of the per-URL button weights exposed through
+// html_content::url_button_weights after parsing small HTML snippets.
+TEST_SUITE("html_cta_scoring")
+{
+ // One button-styled anchor, one stylesheet <link> and one mailto: anchor.
+ TEST_CASE("button-like anchors outrank technical resources")
+ {
+ const auto html = R"HTML(
+ <html><body>
+ <a href="https://example.com/cta" class="btn primary">Click now</a>
+ <link rel="stylesheet" href="https://cdn.example.com/style.css" />
+ <a href="mailto:info@example.com">Email us</a>
+ </body></html>
+ )HTML";
+ html_fixture fx{html};
+
+ // Expected: button anchor above 0.6, stylesheet at ~0, mailto below 0.3.
+ CHECK(fx.weight_for("https://example.com/cta") > 0.6f);
+ CHECK(fx.weight_for("https://cdn.example.com/style.css") == doctest::Approx(0.0f));
+ CHECK(fx.weight_for("mailto:info@example.com") < 0.3f);
+ }
+
+ // A visible CTA button, a link inside a "footer" div and a button inside a
+ // display:none container.
+ TEST_CASE("footer links and hidden buttons are de-emphasised")
+ {
+ const auto html = R"HTML(
+ <html><body>
+ <a href="https://shop.example.com/buy" class="cta-button">BUY NOW!</a>
+ <div class="footer">
+ <a href="https://shop.example.com/privacy" class="footer-link">Privacy policy</a>
+ </div>
+ <div style="display:none">
+ <a href="https://shop.example.com/hidden" class="btn">Hidden CTA</a>
+ </div>
+ </body></html>
+ )HTML";
+ html_fixture fx{html};
+
+ // Expected: visible button above 0.6, footer link below 0.2, hidden button at ~0.
+ CHECK(fx.weight_for("https://shop.example.com/buy") > 0.6f);
+ CHECK(fx.weight_for("https://shop.example.com/privacy") < 0.2f);
+ CHECK(fx.weight_for("https://shop.example.com/hidden") == doctest::Approx(0.0f));
+ }
+}
+
+#endif