From: Vsevolod Stakhov Date: Wed, 8 Oct 2025 16:23:10 +0000 (+0100) Subject: [Fix] Fix frequency-based ordering in HTML domain hashing X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=c5ccb00968b86a69d4ffbb16d4ddcb2dd6cb73b3;p=thirdparty%2Frspamd.git [Fix] Fix frequency-based ordering in HTML domain hashing The hash_top_domains function was sorting domains by frequency (descending), but hash_domain_list was immediately re-sorting them alphabetically, which negated the frequency information. This resulted in incorrect hashes where domain order mattered for fuzzy matching. Added preserve_order parameter to hash_domain_list to optionally skip alphabetical re-sorting when frequency-based ordering should be maintained. --- diff --git a/src/libutil/shingles_html.cxx b/src/libutil/shingles_html.cxx index 2ab7cc7d01..7dc7ec70e4 100644 --- a/src/libutil/shingles_html.cxx +++ b/src/libutil/shingles_html.cxx @@ -198,14 +198,16 @@ html_extract_structural_tokens(html_content *hc, /* Helper: hash a sorted list of domains */ static uint64_t -hash_domain_list(std::vector<std::string_view> &domains, const unsigned char key[16]) +hash_domain_list(std::vector<std::string_view> &domains, const unsigned char key[16], bool preserve_order = false) { if (domains.empty()) { return 0; } - /* Sort domains for consistent hashing */ - std::sort(domains.begin(), domains.end()); + /* Sort domains for consistent hashing (unless order should be preserved, e.g., for frequency-sorted domains) */ + if (!preserve_order) { + std::sort(domains.begin(), domains.end()); + } rspamd_cryptobox_hash_state_t st; unsigned char digest[rspamd_cryptobox_HASHBYTES]; @@ -217,7 +219,7 @@ hash_domain_list(std::vector<std::string_view> &domains, const unsigned char key std::string_view prev; bool has_content = false; for (const auto &dom: domains) { - /* Skip empty domains and duplicates */ + /* Skip empty domains and duplicates (note: only detects consecutive duplicates if preserve_order=true) */ if (dom.empty() || (!prev.empty() && dom == prev)) { 
continue; } @@ -290,8 +292,8 @@ hash_top_domains(std::vector<std::string_view> &domains, unsigned int top_n, con top_domain_names.push_back(dom); } - /* Hash the top domains */ - return hash_domain_list(top_domain_names, key); + /* Hash the top domains, preserving frequency-based order */ + return hash_domain_list(top_domain_names, key, true); } /* Helper: hash HTML features (bucketed) */