rspamd_mempool_t *pool,
std::vector<std::string> &tokens,
std::vector<std::string_view> &cta_domains,
- std::vector<std::string_view> &all_domains)
+ std::vector<std::string_view> &all_domains,
+ bool ignore_link_domains)
{
tokens.reserve(hc->all_tags.size());
cta_domains.reserve(16);
auto etld1 = extract_etld1_from_url(url);
if (!etld1.empty()) {
- token += '@';
- token += etld1;
+ if (!ignore_link_domains) {
+ token += '@';
+ token += etld1;
+ }
- /* Add to all_domains */
+ /* Domain collection for separate hashes still happens unconditionally */
all_domains.push_back(etld1);
/* Check if this is a CTA link using button weights */
rspamd_mempool_t *pool,
rspamd_shingles_filter filter,
gpointer filterd,
- enum rspamd_shingle_alg alg)
+ enum rspamd_shingle_alg alg,
+ gboolean ignore_link_domains)
{
if (!html_content) {
return nullptr;
/* 1. Extract structural tokens and domain lists using modern C++ */
std::vector<std::string> tokens;
std::vector<std::string_view> cta_domains, all_domains;
- html_extract_structural_tokens(hc_ptr, pool, tokens, cta_domains, all_domains);
+ html_extract_structural_tokens(hc_ptr, pool, tokens, cta_domains, all_domains, ignore_link_domains);
if (tokens.empty()) {
/* Empty HTML structure after filtering */
gboolean html_shingles; /* Enable HTML fuzzy hashing */
gboolean text_hashes; /* Enable/disable generation of text hashes */
unsigned int min_html_tags; /* Minimum tags for HTML hash */
+ gboolean html_ignore_domains; /* Ignore link domains in HTML structural tokens */
int learn_condition_cb;
uint32_t retransmits;
struct rspamd_hash_map_helper *skip_map;
else if ((opt = ucl_object_lookup(cur, "weight")) != NULL) {
rule->html_weight = ucl_obj_todouble(opt);
}
+
+ if ((opt = ucl_object_lookup(cur, "ignore_link_domains")) != NULL) { /* Optional flag: when the key is absent the field is not assigned here */
+ rule->html_ignore_domains = ucl_obj_toboolean(opt);
+ }
}
else {
/* Other checks are processed by lua_fuzzy; keep legacy behaviour */
rule->html_shingles = FALSE;
rule->text_hashes = TRUE;
rule->min_html_tags = 10;
+ rule->html_ignore_domains = FALSE;
return rule;
}
0,
NULL,
0);
+ /* Documented key must match the name the rule parser actually looks up ("ignore_link_domains"). */
+ /* NOTE(review): the parser reads this key from the html check object - confirm the doc path is the right place for it. */
+ rspamd_rcl_add_doc_by_path(cfg,
+ "fuzzy_check.rule",
+ "Ignore link domains in HTML structural tokens (for template-only matching)",
+ "ignore_link_domains",
+ UCL_BOOLEAN,
+ NULL,
+ 0,
+ "false",
+ 0);
rspamd_rcl_add_doc_by_path(cfg,
"fuzzy_check.rule",
"Content hashing checks configuration object (e.g. { text = { enabled = true; }, html = { enabled = true; } })",
struct rspamd_cached_shingles **html_cached_ptr;
memcpy(&key_part, rule->shingles_key->str, sizeof(key_part));
- rspamd_snprintf(html_cache_key, sizeof(html_cache_key), "%s%d_html",
- rule->algorithm_str, key_part);
+ rspamd_snprintf(html_cache_key, sizeof(html_cache_key), "%s%d_html%s",
+ rule->algorithm_str, key_part,
+ rule->html_ignore_domains ? "_nd" : ""); /* "_nd" suffix: rules that ignore link domains get a distinct cache key, so the lookup below does not return an entry stored under the plain "_html" key */
html_cached_ptr = (struct rspamd_cached_shingles **) rspamd_mempool_get_variable(
task->task_pool, html_cache_key);
html_sh = rspamd_shingles_from_html(part->html,
(const unsigned char *) rule->shingles_key->str, task->task_pool,
rspamd_shingles_default_filter, NULL,
- rule->alg);
+ rule->alg, rule->html_ignore_domains);
if (html_sh != NULL) {
/* Use structure shingles for fuzzy matching */