From: Vsevolod Stakhov Date: Thu, 6 Nov 2025 13:46:50 +0000 (+0000) Subject: [Rework] Move CTA processing into dedicated module X-Git-Tag: 3.14.0~12^2~5 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c6d0ee62795e5ad7d63965e99455144b4f1f68f0;p=thirdparty%2Frspamd.git [Rework] Move CTA processing into dedicated module Also refactor all the absurd logic it had previously --- diff --git a/lualib/llm_search_context.lua b/lualib/llm_search_context.lua index e75199b4fe..a691de523b 100644 --- a/lualib/llm_search_context.lua +++ b/lualib/llm_search_context.lua @@ -45,6 +45,7 @@ local rspamd_http = require "rspamd_http" local rspamd_logger = require "rspamd_logger" local lua_util = require "lua_util" local lua_cache = require "lua_cache" +local lua_mime = require "lua_mime" local ucl = require "ucl" local DEFAULTS = { @@ -77,11 +78,18 @@ local function extract_domains(task, max_domains, debug_module) -- First, try to get CTA URLs from HTML (most relevant for spam detection) -- Uses button weight and HTML structure analysis from C code - local cta_urls = task:get_cta_urls(max_domains * 2) or {} - lua_util.debugm(Np, task, "CTA analysis found %d URLs", #cta_urls) + local cta_urls = {} + local sel_part = lua_mime.get_displayed_text_part(task) + if sel_part then + cta_urls = sel_part:get_cta_urls() + end + lua_util.debugm(Np, task, + "CTA analysis found %d URLs across", #cta_urls) for _, url in ipairs(cta_urls) do - if #domains >= max_domains then break end + if #domains >= max_domains then + break + end local host = url:get_host() if host and not skip_domains[host:lower()] and not seen[host] then @@ -94,20 +102,22 @@ local function extract_domains(task, max_domains, debug_module) -- If we don't have enough domains from CTA, get more from content URLs if #domains < max_domains then lua_util.debugm(Np, task, "need more domains (%d/%d), extracting from content URLs", - #domains, max_domains) + #domains, max_domains) local urls = lua_util.extract_specific_urls({ task = task, 
limit = max_domains * 3, esld_limit = max_domains, - need_content = true, -- Content URLs (buttons, links in text) + need_content = true, -- Content URLs (buttons, links in text) need_images = false, }) or {} lua_util.debugm(Np, task, "extracted %d content URLs", #urls) for _, url in ipairs(urls) do - if #domains >= max_domains then break end + if #domains >= max_domains then + break + end local host = url:get_host() if host and not seen[host] and not skip_domains[host:lower()] then @@ -121,7 +131,7 @@ local function extract_domains(task, max_domains, debug_module) -- Still need more? Get from any URLs if #domains < max_domains then lua_util.debugm(Np, task, "still need more domains (%d/%d), extracting from all URLs", - #domains, max_domains) + #domains, max_domains) local urls = lua_util.extract_specific_urls({ task = task, @@ -132,7 +142,9 @@ local function extract_domains(task, max_domains, debug_module) lua_util.debugm(Np, task, "extracted %d all URLs", #urls) for _, url in ipairs(urls) do - if #domains >= max_domains then break end + if #domains >= max_domains then + break + end local host = url:get_host() if host and not seen[host] and not skip_domains[host:lower()] then @@ -176,7 +188,7 @@ local function query_search_api(task, domain, opts, callback, debug_module) if code ~= 200 then rspamd_logger.infox(task, "search API returned code %s for domain '%s', url: %s, body: %s", - code, domain, full_url, body and body:sub(1, 200) or 'nil') + code, domain, full_url, body and body:sub(1, 200) or 'nil') callback(nil, domain, string.format("HTTP %s", code)) return end @@ -188,7 +200,7 @@ local function query_search_api(task, domain, opts, callback, debug_module) local ok, parse_err = parser:parse_string(body) if not ok then rspamd_logger.errx(task, "%s: failed to parse search API response for %s: %s", - Np, domain, parse_err) + Np, domain, parse_err) callback(nil, domain, parse_err) return end @@ -208,7 +220,7 @@ local function query_search_api(task, domain, opts, 
callback, debug_module) local metadata = flat_data[1] lua_util.debugm(Np, task, "parsing domain '%s': flat_data has %d elements, metadata type: %s", - domain, #flat_data, type(metadata)) + domain, #flat_data, type(metadata)) if metadata and metadata.items and type(metadata.items) == 'number' then -- metadata.items is a 0-indexed pointer, add 1 for Lua @@ -217,7 +229,7 @@ local function query_search_api(task, domain, opts, callback, debug_module) if items and type(items) == 'table' then lua_util.debugm(Np, task, "found %d item indices for domain '%s', items_idx=%d", - #items, domain, items_idx) + #items, domain, items_idx) local count = 0 @@ -237,8 +249,8 @@ local function query_search_api(task, domain, opts, callback, debug_module) local title = result_template.title and flat_data[result_template.title + 1] lua_util.debugm(Np, task, "result %d template: link_idx=%s, snippet_idx=%s, title_idx=%s", - count + 1, tostring(result_template.link), tostring(result_template.snippet), - tostring(result_template.title)) + count + 1, tostring(result_template.link), tostring(result_template.snippet), + tostring(result_template.title)) if link or title or snippet then table.insert(search_results.results, { @@ -248,16 +260,16 @@ local function query_search_api(task, domain, opts, callback, debug_module) }) count = count + 1 lua_util.debugm(Np, task, "extracted result %d: title='%s', snippet_len=%d", - count, title or "nil", snippet and #snippet or 0) + count, title or "nil", snippet and #snippet or 0) end else lua_util.debugm(Np, task, "result_template at idx %d is not a table: %s", - result_template_idx, type(result_template)) + result_template_idx, type(result_template)) end end else lua_util.debugm(Np, task, "items is not a table for domain '%s', type: %s", - domain, type(items)) + domain, type(items)) end else lua_util.debugm(Np, task, "no valid metadata.items for domain '%s'", domain) @@ -266,7 +278,7 @@ local function query_search_api(task, domain, opts, callback, 
debug_module) end lua_util.debugm(Np, task, "extracted %d search results for domain '%s'", - #search_results.results, domain) + #search_results.results, domain) callback(search_results, domain, nil) end @@ -342,7 +354,7 @@ function M.fetch_and_format(task, redis_params, opts, callback, debug_module) end lua_util.debugm(Np, task, "final domain list (%d domains) for search: %s", - #domains, table.concat(domains, ", ")) + #domains, table.concat(domains, ", ")) -- Create cache context local cache_ctx = nil @@ -378,7 +390,7 @@ function M.fetch_and_format(task, redis_params, opts, callback, debug_module) else local context_snippet = format_search_results(all_results, opts) lua_util.debugm(Np, task, "search context formatted (%s bytes)", - context_snippet and #context_snippet or 0) + context_snippet and #context_snippet or 0) callback(task, true, context_snippet) end end @@ -391,29 +403,29 @@ function M.fetch_and_format(task, redis_params, opts, callback, debug_module) if cache_ctx then -- Use lua_cache for caching lua_cache.cache_get(task, cache_key, cache_ctx, opts.timeout, - function() - -- Cache miss - query API - query_search_api(task, domain, opts, function(api_results, d, api_err) - if api_results then - lua_cache.cache_set(task, cache_key, api_results, cache_ctx) - domain_complete(d, api_results) - else - lua_util.debugm(Np, task, "search failed for domain %s: %s", d, api_err) - domain_complete(d, nil) + function() + -- Cache miss - query API + query_search_api(task, domain, opts, function(api_results, d, api_err) + if api_results then + lua_cache.cache_set(task, cache_key, api_results, cache_ctx) + domain_complete(d, api_results) + else + lua_util.debugm(Np, task, "search failed for domain %s: %s", d, api_err) + domain_complete(d, nil) + end + end, Np) + end, + function(_, err, data) + -- Cache hit or after miss callback + if data and type(data) == 'table' then + lua_util.debugm(Np, task, "cache hit for domain %s", domain) + domain_complete(domain, data) + -- If 
no data and no error, the miss callback was already invoked + elseif err then + lua_util.debugm(Np, task, "cache error for domain %s: %s", domain, err) + domain_complete(domain, nil) end - end, Np) - end, - function(_, err, data) - -- Cache hit or after miss callback - if data and type(data) == 'table' then - lua_util.debugm(Np, task, "cache hit for domain %s", domain) - domain_complete(domain, data) - -- If no data and no error, the miss callback was already invoked - elseif err then - lua_util.debugm(Np, task, "cache error for domain %s: %s", domain, err) - domain_complete(domain, nil) - end - end) + end) else -- No Redis, query directly query_search_api(task, domain, opts, function(api_results, d, api_err) diff --git a/src/libmime/message.c b/src/libmime/message.c index 84ea137113..910fe2082e 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -59,6 +59,9 @@ static const char gtube_pattern_no_action[] = "AJS*C4JDBQADN1.NSBN3*2IDNEN*" struct rspamd_multipattern *gtube_matcher = NULL; static const uint64_t words_hash_seed = 0xdeadbabe; +/* CTA URL configuration */ +#define MAX_CTA_URLS_PER_PART 25 + static void free_byte_array_callback(void *pointer) { @@ -127,16 +130,16 @@ rspamd_mime_part_extract_words(struct rspamd_task *task, *avg_len_p += total_len; } - short_len_p = rspamd_mempool_get_variable(task->task_pool, - RSPAMD_MEMPOOL_SHORT_WORDS_CNT); + short_len_p = rspamd_mempool_get_variable(task->task_pool, + RSPAMD_MEMPOOL_SHORT_WORDS_CNT); - if (short_len_p == NULL) { - short_len_p = rspamd_mempool_alloc(task->task_pool, - sizeof(double)); - *short_len_p = short_len; - rspamd_mempool_set_variable(task->task_pool, - RSPAMD_MEMPOOL_SHORT_WORDS_CNT, short_len_p, NULL); - } + if (short_len_p == NULL) { + short_len_p = rspamd_mempool_alloc(task->task_pool, + sizeof(double)); + *short_len_p = short_len; + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_SHORT_WORDS_CNT, short_len_p, NULL); + } else { *short_len_p += short_len; } @@ -795,6 
+798,11 @@ rspamd_message_process_html_text_part(struct rspamd_task *task, /* Wire aggregated HTML features */ text_part->html_features = (struct rspamd_html_features *) rspamd_html_get_features(text_part->html); + /* Collect top CTA URLs for this HTML part */ + if (text_part->html && text_part->mime_part && text_part->mime_part->urls) { + rspamd_html_process_cta_urls(text_part, task, MAX_CTA_URLS_PER_PART); + } + /* Optionally call CTA/affiliation Lua hook with capped candidates */ if (task->cfg && task->cfg->lua_state) { lua_State *L = task->cfg->lua_state; @@ -944,55 +952,6 @@ rspamd_message_process_html_text_part(struct rspamd_task *task, lua_settop(L, old_top); } - - /* Store top CTA URLs for LLM and other use cases */ - if (text_part->html && text_part->mime_part && text_part->mime_part->urls) { - /* Simple approach: just store URLs sorted by button weight */ - /* Use task-wide array to aggregate across all HTML parts */ - GPtrArray *cta_urls = rspamd_mempool_get_variable(task->task_pool, "html_cta_urls"); - if (!cta_urls) { - cta_urls = g_ptr_array_new(); - rspamd_mempool_add_destructor(task->task_pool, - (rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, - cta_urls); - rspamd_mempool_set_variable(task->task_pool, "html_cta_urls", cta_urls, NULL); - } - - /* Find best URLs by button weight in this HTML part */ - float best_weights[5] = {0.0, 0.0, 0.0, 0.0, 0.0}; - struct rspamd_url *best_urls[5] = {NULL, NULL, NULL, NULL, NULL}; - unsigned int max_cta_per_part = 5; - - for (unsigned int i = 0; i < text_part->mime_part->urls->len; i++) { - struct rspamd_url *u = g_ptr_array_index(text_part->mime_part->urls, i); - if (!u) continue; - if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue; - if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue; - - float weight = rspamd_html_url_button_weight(text_part->html, u); - - /* Insert into best list if weight is high enough */ - for (unsigned int j = 0; j < max_cta_per_part; j++) { - if 
(weight > best_weights[j]) { - /* Shift lower entries down */ - for (unsigned int k = max_cta_per_part - 1; k > j; k--) { - best_weights[k] = best_weights[k - 1]; - best_urls[k] = best_urls[k - 1]; - } - best_weights[j] = weight; - best_urls[j] = u; - break; - } - } - } - - /* Add to task-wide array */ - for (unsigned int i = 0; i < max_cta_per_part; i++) { - if (best_urls[i] && best_weights[i] > 0.0) { - g_ptr_array_add(cta_urls, best_urls[i]); - } - } - } } rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content); diff --git a/src/libmime/message.h b/src/libmime/message.h index 83f36ff192..dc9987d01f 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -16,6 +16,7 @@ #include "libserver/url.h" #include "libutil/ref.h" #include "libutil/str_util.h" +#include "libutil/heap.h" #include "libserver/word.h" #include @@ -126,6 +127,15 @@ struct rspamd_mime_part { #define IS_TEXT_PART_HTML(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_HTML) #define IS_TEXT_PART_ATTACHMENT(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_ATTACHMENT) +/* CTA (call-to-action) URL heap entry structure */ +struct rspamd_html_cta_entry { + unsigned int pri; /* Priority for heap (weight * scale) */ + unsigned int idx; /* Heap index (managed by heap) */ + struct rspamd_url *url; /* URL pointer */ + float weight; /* Original button weight */ +}; + +RSPAMD_HEAP_DECLARE(rspamd_html_heap_storage, struct rspamd_html_cta_entry); struct rspamd_mime_text_part { const char *language; @@ -148,7 +158,9 @@ struct rspamd_mime_text_part { void *html; /* Optional HTML features collected during parsing */ struct rspamd_html_features *html_features; - GList *exceptions; /**< list of offsets of urls */ + /* CTA (call-to-action) URLs extracted from HTML with weights */ + rspamd_html_heap_storage_t *cta_urls; /**< cta_heap_t* for HTML parts, NULL for plain text */ + GList *exceptions; /**< list of offsets of urls */ struct rspamd_mime_part *mime_part; unsigned int flags; diff --git 
a/src/libserver/CMakeLists.txt b/src/libserver/CMakeLists.txt index 721e09a65c..24deff707a 100644 --- a/src/libserver/CMakeLists.txt +++ b/src/libserver/CMakeLists.txt @@ -41,6 +41,7 @@ SET(LIBRSPAMDSERVERSRC ${CMAKE_CURRENT_SOURCE_DIR}/maps/map_helpers.c ${CMAKE_CURRENT_SOURCE_DIR}/html/html_entities.cxx ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/html/html_cta.cxx ${CMAKE_CURRENT_SOURCE_DIR}/html/html.cxx ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite.cxx ${CMAKE_CURRENT_SOURCE_DIR}/html/html_url_rewrite_c.cxx diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 1e982236d1..e66ba35652 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -29,6 +29,7 @@ #include "contrib/libucl/khash.h" #include "libmime/images.h" #include "libutil/cxx/utf8_util.h" +#include "libserver/html/html_cta.hxx" #include "html_tag_defs.hxx" #include "html_entities.hxx" @@ -40,6 +41,8 @@ #include "contrib/fmt/include/fmt/core.h" #include +#include +#include #include namespace rspamd::html { @@ -834,6 +837,7 @@ static const auto component_extractors = frozen::make_unordered_map std::optional { auto it = component_extractors.find(attr_name); @@ -890,7 +894,7 @@ enum tag_parser_state { struct tag_content_parser_state { tag_parser_state cur_state = parse_start; std::string buf; - std::string attr_name;// Store current attribute name + std::string attr_name; // Store current attribute name const char *value_start = nullptr;// Track where attribute value starts in input const char *html_start = nullptr; // Base pointer to HTML buffer start @@ -2406,26 +2410,6 @@ auto html_process_input(struct rspamd_task *task, if (cnt > hc->features.links.max_links_single_domain) { hc->features.links.max_links_single_domain = cnt; } - /* Heuristic button weight */ - float w = 0.0f; - if (url->ext && url->ext->linked_url && url->ext->linked_url != url) { - w += 0.5f; /* display mismatch bonus */ - } - w += 0.2f * (url->order == 0 
? 1.0f : 1.0f / (float) url->order); - if (cur_tag->block && cur_tag->block->is_visible()) { - if (cur_tag->block->has_display()) { - w += 0.1f; - } - if (cur_tag->block->width > 0 && cur_tag->block->height > 0) { - w += std::min(0.2f, (cur_tag->block->width * cur_tag->block->height) / 100000.0f); - } - if (cur_tag->block->font_size >= 14) { - w += 0.1f; - } - } - if (w > 0) { - hc->url_button_weights[url] += w; - } /* same eTLD+1 as first-party? */ if (!hc->first_party_etld1.empty()) { rspamd_ftok_t tld2; @@ -3180,6 +3164,8 @@ auto html_process_input(struct rspamd_task *task, } } + html_compute_cta_weights(*hc); + return hc; } diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h index 368a22b08c..f256aae9dc 100644 --- a/src/libserver/html/html.h +++ b/src/libserver/html/html.h @@ -140,6 +140,17 @@ float rspamd_html_url_button_weight(void *html_content, struct rspamd_url *u); */ const struct rspamd_html_features *rspamd_html_get_features(void *html_content); +/** + * Creates CTA (call-to-action) URLs heap for a text part + * Collects top-K URLs by button weight using min-heap (O(n log k)) + * @param text_part text part to fill cta_urls for + * @param task task for mempool allocation + * @param max_cta maximum number of CTA URLs to collect + */ +void rspamd_html_process_cta_urls(struct rspamd_mime_text_part *text_part, + struct rspamd_task *task, + unsigned int max_cta); + #ifdef __cplusplus } diff --git a/src/libserver/html/html_cta.cxx b/src/libserver/html/html_cta.cxx new file mode 100644 index 0000000000..8646b8b50a --- /dev/null +++ b/src/libserver/html/html_cta.cxx @@ -0,0 +1,570 @@ +/*- + * Copyright 2025 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "config.h" +#include "libserver/html/html_cta.hxx" + +#include "util.h" +#include "message.h" +#include "libserver/html/html.hxx" +#include "libserver/html/html_block.hxx" +#include "libserver/html/html_tag.hxx" +#include "libserver/css/css.hxx" +#include "libserver/url.h" +#include "libserver/task.h" +#include "libutil/cxx/util.hxx" +#include "libutil/heap.h" + +#include +#include +#include +#include +#include + +#include + +static constexpr unsigned int CTA_WEIGHT_SCALE = 1000; + +namespace rspamd::html { +namespace { + +using namespace std::string_view_literals; + +static auto trim_ascii(std::string_view input) -> std::string_view +{ + while (!input.empty() && g_ascii_isspace(static_cast(input.front()))) { + input.remove_prefix(1); + } + + while (!input.empty() && g_ascii_isspace(static_cast(input.back()))) { + input.remove_suffix(1); + } + + return input; +} + +static auto space_separated_token_match(std::string_view attr, + std::string_view token, + bool allow_partial) -> bool +{ + attr = trim_ascii(attr); + if (attr.empty()) { + return false; + } + + std::size_t pos = 0; + while (pos < attr.size()) { + while (pos < attr.size() && g_ascii_isspace(static_cast(attr[pos]))) { + pos++; + } + if (pos >= attr.size()) { + break; + } + + auto end = pos; + while (end < attr.size() && !g_ascii_isspace(static_cast(attr[end]))) { + end++; + } + + auto chunk = attr.substr(pos, end - pos); + if (allow_partial) { + if (chunk.find(token) != std::string_view::npos) { + return true; + } + } + else { + if (chunk == token) { + return true; + } + 
} + + pos = end + 1; + } + + return false; +} + +static auto optional_attr_contains(const std::optional &attr, + std::string_view token, + bool allow_partial = false) -> bool +{ + if (!attr) { + return false; + } + + return space_separated_token_match(attr.value(), token, allow_partial); +} + +template +static auto optional_attr_contains_any(const std::optional &attr, + const Range &tokens, + bool allow_partial = false) -> bool +{ + if (!attr) { + return false; + } + + for (auto token: tokens) { + if (space_separated_token_match(attr.value(), token, allow_partial)) { + return true; + } + } + + return false; +} + +static auto to_lower_ascii(std::string_view input) -> std::string +{ + std::string out; + out.reserve(input.size()); + for (auto ch: input) { + out.push_back(static_cast(g_ascii_tolower(static_cast(ch)))); + } + return out; +} + +static auto get_cta_label(const html_tag &tag, const html_content &hc) -> std::string +{ + auto content = trim_ascii(tag.get_content(&hc)); + if (!content.empty()) { + return std::string{content}; + } + + if (auto title = tag.find_component()) { + auto value = trim_ascii(title.value()->value); + if (!value.empty()) { + return std::string{value}; + } + } + + if (auto aria_label = tag.find_component_by_name("aria-label"sv)) { + auto value = trim_ascii(aria_label.value()); + if (!value.empty()) { + return std::string{value}; + } + } + + if (auto alt = tag.find_component()) { + auto value = trim_ascii(alt.value()->value); + if (!value.empty()) { + return std::string{value}; + } + } + + return {}; +} + +static auto tag_is_effectively_hidden(const html_tag *tag) -> bool +{ + for (auto current = tag; current != nullptr; current = current->parent) { + if (current->block && !current->block->is_visible()) { + return true; + } + if (current->flags & FL_IGNORE) { + return true; + } + } + + return false; +} + +static constexpr auto buttonish_class_tokens = rspamd::array_of( + "btn", "button", "cta", "call-to-action", "submit", "primary", + 
"confirm", "action", "purchase", "buy", "signup", "sign-up", "apply"); + +static constexpr auto negative_context_tokens = rspamd::array_of( + "logo", "footer", "header", "nav", "menu", "social", + "tracking", "pixel", "unsubscribe", "legal", "copyright"); + +static constexpr auto service_rel_tokens = rspamd::array_of( + "alternate", "canonical", "dns-prefetch", "icon", "manifest", + "preconnect", "prefetch", "preload", "stylesheet"); + +static constexpr auto cta_keywords = rspamd::array_of( + "buy", "purchase", "order", "checkout", "pay", "confirm", "verify", + "update", "login", "log in", "sign in", "sign up", "signup", "register", + "download", "upgrade", "continue", "next", "open", "submit", "apply", + "approve", "activate", "subscribe"); + +static auto is_service_link_tag(const html_tag &tag, const rspamd_url &url) -> bool +{ + if (tag.flags & (FL_XML | FL_VIRTUAL | FL_COMMENT | FL_IGNORE | CM_HEAD)) { + return true; + } + + switch (tag.id) { + case Tag_LINK: + case Tag_SCRIPT: + case Tag_STYLE: + case Tag_META: + case Tag_BASE: + case Tag_IMG: + return true; + default: + break; + } + + if (tag.block && !tag.block->is_visible()) { + return true; + } + + if (url.flags & RSPAMD_URL_FLAG_IMAGE) { + return true; + } + + if (tag.id == Tag_A) { + if (optional_attr_contains_any(tag.find_rel(), service_rel_tokens, false)) { + return true; + } + if (tag.parent && (tag.parent->flags & CM_HEAD)) { + return true; + } + } + + return false; +} + +static auto compute_semantic_base_score(const html_tag &tag, const rspamd_url &url) -> float +{ + switch (tag.id) { + case Tag_BUTTON: + return 0.9f; + case Tag_INPUT: { + float base = 0.35f; + if (auto type_comp = tag.find_component()) { + auto lowered = to_lower_ascii(trim_ascii(type_comp.value()->get_string_value())); + if (lowered == "submit" || lowered == "button" || lowered == "send") { + base = 0.85f; + } + else if (lowered == "image") { + base = 0.75f; + } + else if (lowered == "reset") { + base = 0.25f; + } + } + return 
base; + } + case Tag_FORM: + return 0.8f; + case Tag_A: { + float base = 0.35f; + if (optional_attr_contains_any(tag.find_class(), buttonish_class_tokens, true) || + optional_attr_contains_any(tag.find_id(), buttonish_class_tokens, true)) { + base = 0.75f; + } + if (auto role_comp = tag.find_component()) { + auto lowered = to_lower_ascii(trim_ascii(role_comp.value()->value)); + if (lowered == "button" || lowered == "tab" || lowered == "menuitem") { + base = std::max(base, 0.7f); + } + } + if (url.protocol == PROTOCOL_MAILTO) { + base = std::min(base, 0.4f); + } + return base; + } + case Tag_AREA: + return 0.3f; + default: + if (tag.flags & FL_HREF) { + return 0.2f; + } + break; + } + + return 0.0f; +} + +static auto compute_visual_bonus(const html_tag &tag) -> float +{ + if (!tag.block || !tag.block->is_visible()) { + return 0.0f; + } + + float bonus = 0.0f; + const auto &block = *tag.block; + + switch (block.display) { + case css::css_display_value::DISPLAY_BLOCK: + bonus += 0.12f; + break; + case css::css_display_value::DISPLAY_TABLE_ROW: + bonus += 0.05f; + break; + default: + break; + } + + if (block.width > 0 && block.height > 0) { + const auto area = static_cast(block.width) * static_cast(block.height); + if (area >= 6000) { + bonus += 0.2f; + } + else if (area >= 2000) { + bonus += 0.12f; + } + else if (area >= 400) { + bonus += 0.06f; + } + } + + if (block.font_size >= 16) { + bonus += 0.08f; + } + else if (block.font_size >= 13) { + bonus += 0.04f; + } + + return bonus; +} + +static auto compute_text_bonus(std::string_view text_lower) -> float +{ + if (text_lower.empty()) { + return 0.0f; + } + + float bonus = 0.0f; + for (auto kw: cta_keywords) { + if (text_lower.find(kw) != std::string_view::npos) { + bonus += 0.18f; + break; + } + } + + if (text_lower.find('!') != std::string_view::npos) { + bonus += 0.03f; + } + + if (text_lower.size() <= 18 && text_lower.size() >= 3) { + bonus += 0.04f; + } + + return bonus; +} + +static auto compute_penalty(const 
html_tag &tag, + const rspamd_url &url, + std::string_view text_lower, + std::string_view text_original) -> float +{ + float penalty = 0.0f; + + if (text_lower.empty()) { + penalty += 0.35f; + } + else { + unsigned int alpha = 0; + unsigned int graph = 0; + for (auto ch: text_lower) { + if (g_ascii_isspace(static_cast(ch))) { + continue; + } + graph++; + if (g_ascii_isalpha(static_cast(ch))) { + alpha++; + } + } + if (graph > 0 && alpha == 0) { + penalty += 0.25f; + } + if (text_original.size() > 80) { + penalty += 0.1f; + } + } + + if (tag.block) { + const auto &block = *tag.block; + if (block.width > 0 && block.height > 0) { + const auto area = static_cast(block.width) * static_cast(block.height); + if (area <= 64) { + penalty += 0.25f; + } + else if (area <= 150) { + penalty += 0.15f; + } + } + if (block.font_size > 0 && block.font_size <= 9) { + penalty += 0.08f; + } + if (block.is_transparent()) { + penalty += 0.2f; + } + } + + if (optional_attr_contains_any(tag.find_class(), negative_context_tokens, true) || + optional_attr_contains_any(tag.find_id(), negative_context_tokens, true)) { + penalty += 0.2f; + } + + if (url.flags & RSPAMD_URL_FLAG_INVISIBLE) { + penalty += 0.3f; + } + + if (url.protocol == PROTOCOL_MAILTO || url.protocol == PROTOCOL_FTP) { + penalty += 0.05f; + } + + return penalty; +} + +static auto compute_cta_weight(const html_tag &tag, + const rspamd_url &url, + const html_content &hc) -> float +{ + if (is_service_link_tag(tag, url)) { + return 0.0f; + } + + if (tag_is_effectively_hidden(&tag)) { + return 0.0f; + } + + float base = compute_semantic_base_score(tag, url); + if (base <= 0.0f) { + return 0.0f; + } + + auto label = get_cta_label(tag, hc); + std::string_view label_view = trim_ascii(label); + std::string lowered = to_lower_ascii(label_view); + + float visual = compute_visual_bonus(tag); + float text_bonus = compute_text_bonus(lowered); + float order_bonus = 0.0f; + if (url.order == 0) { + order_bonus = 0.1f; + } + else { + 
order_bonus = std::max(0.0f, 0.06f / (1.0f + static_cast(url.order))); + } + if (url.ext && url.ext->linked_url && url.ext->linked_url != &url) { + order_bonus += 0.12f; + } + float penalty = compute_penalty(tag, url, lowered, label_view); + + float weight = base + visual + text_bonus + order_bonus - penalty; + if (weight < 0.0f) { + weight = 0.0f; + } + else if (weight > 1.0f) { + weight = 1.0f; + } + + return weight; +} + +}// namespace + +void html_compute_cta_weights(html_content &hc) +{ + hc.url_button_weights.clear(); + + for (const auto &tag_ptr: hc.all_tags) { + const auto &tag = *tag_ptr; + if (!std::holds_alternative(tag.extra)) { + continue; + } + + auto *url = std::get(tag.extra); + if (!url) { + continue; + } + + float weight = compute_cta_weight(tag, *url, hc); + if (weight <= 0.0f) { + continue; + } + + auto it = hc.url_button_weights.find(url); + if (it == hc.url_button_weights.end()) { + hc.url_button_weights.emplace(url, weight); + } + else { + it->second = std::max(it->second, weight); + } + } +} + +}// namespace rspamd::html + +extern "C" { + +void rspamd_html_process_cta_urls(struct rspamd_mime_text_part *text_part, + struct rspamd_task *task, + unsigned int max_cta) +{ + using namespace rspamd::html; + + if (!text_part || !text_part->html || !text_part->mime_part || !text_part->mime_part->urls) { + return; + } + auto *part_urls = text_part->mime_part->urls; + unsigned int i; + rspamd_url *u; + + auto *heap_ptr = rspamd_mempool_alloc_type(task->task_pool, rspamd_html_heap_storage_t); + rspamd_heap_init(rspamd_html_heap_storage, heap_ptr); + text_part->cta_urls = heap_ptr; + rspamd_mempool_add_destructor(task->task_pool, [](void *ptr) { + auto *h = static_cast(ptr); + rspamd_heap_destroy(rspamd_html_heap_storage, h); }, heap_ptr); + PTR_ARRAY_FOREACH(part_urls, i, u) + { + if (!u) continue; + if (!(u->protocol == PROTOCOL_HTTP || u->protocol == PROTOCOL_HTTPS)) continue; + if (u->flags & RSPAMD_URL_FLAG_INVISIBLE) continue; + if (u->flags & 
RSPAMD_URL_FLAG_IMAGE) continue; + + /* Use button_weight to filter CTA URLs vs technical URLs + * Technical tags like ,