git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Auto-mark whitelist symbols with SYMBOL_TYPE_FINE flag 5769/head
author Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 29 Nov 2025 14:24:36 +0000 (14:24 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sat, 29 Nov 2025 14:24:36 +0000 (14:24 +0000)
This change ensures that symbols with negative weight and symbols used
in whitelist composites (composites with negative score) will always
execute regardless of whether the reject threshold has been reached.

Previously, when the early-stop optimization kicked in after reaching
the reject score, whitelist symbols could be skipped, leading to
potential false positives where emails should have been whitelisted.

Changes:
- Symbols with negative weight are automatically marked as FINE during
  config validation in symcache::validate()
- New rspamd_composites_mark_whitelist_deps() function traverses all
  composites with negative score and marks their constituent symbols
  as FINE (with transitive expansion for nested composites)
- New C API rspamd_symcache_set_symbol_fine() to programmatically set
  the FINE flag with proper parent/child propagation
- FINE flag is properly synchronized between virtual symbols and their
  parent symbols

src/libserver/cfg_utils.cxx
src/libserver/composites/composites.h
src/libserver/composites/composites_internal.hxx
src/libserver/composites/composites_manager.cxx
src/libserver/rspamd_symcache.h
src/libserver/symcache/symcache_c.cxx
src/libserver/symcache/symcache_impl.cxx

index 2446db1da5350a2b316d55af0ffedc746f5a8758..d07c3c7f9d473e7b068c43e7ebd8f322cc1f5316 100644 (file)
@@ -1016,6 +1016,8 @@ rspamd_config_post_load(struct rspamd_config *cfg,
                        rspamd_composites_set_inverted_index(cfg->composites_manager,
                                                                                                 cfg->composites_inverted_index);
                        rspamd_composites_process_deps(cfg->composites_manager, cfg);
+                       /* Mark symbols used by whitelist composites (negative score) as FINE */
+                       rspamd_composites_mark_whitelist_deps(cfg->composites_manager, cfg);
                }
        }
 
index 63761eed296e9d1dba0deb0f5b6cd1e695e456ba..599590b78428b81cfecbdb5155edc2a85a2db2ab 100644 (file)
@@ -101,6 +101,15 @@ struct rspamd_composites_stats_export {
  */
 void rspamd_composites_get_stats(void *cm_ptr, struct rspamd_composites_stats_export *stats);
 
+/**
+ * Mark symbols used in whitelist composites (negative score) with SYMBOL_TYPE_FINE
+ * so they won't be skipped when reject threshold is reached. This ensures
+ * whitelist composites can still evaluate correctly.
+ * Atoms that are themselves composites are expanded transitively, so the
+ * symbols they reference are marked as well. Group matchers (g:, g+:, g-:)
+ * are not expanded into individual symbols.
+ * @param cm_ptr composites manager pointer
+ * @param cfg config structure
+ */
+void rspamd_composites_mark_whitelist_deps(void *cm_ptr, struct rspamd_config *cfg);
+
 #ifdef __cplusplus
 }
 #endif
index 4f10eca3380028d21d85227e6eca6c1c503924ab..270ec9d699f35edc34e105925847f128a61135ad 100644 (file)
@@ -144,6 +144,8 @@ public:
        void process_dependencies();
        /* Build inverted index for fast composite lookup */
        void build_inverted_index();
+       /* Mark symbols used in whitelist composites (negative score) as FINE */
+       void mark_whitelist_dependencies();
 };
 
 /**
index 5663762f1f5b7a24e92081c8dddd8111d4c24030..f951e37e0b3e07f76a3e6b6de6b04a617cdd389f 100644 (file)
@@ -24,6 +24,7 @@
 #include "libserver/cfg_file.h"
 #include "libserver/logger.h"
 #include "libserver/maps/map.h"
+#include "libserver/rspamd_symcache.h"
 #include "libutil/cxx/util.hxx"
 
 namespace rspamd::composites {
@@ -605,6 +606,112 @@ void composites_manager::build_inverted_index()
                                         (int) symbol_to_composites.size(), (int) not_only_composites.size());
 }
 
+/*
+ * Callback data for collecting atoms from whitelist composites.
+ * Carries a raw pointer to the set of symbol names being accumulated; the
+ * struct does not manage that set's lifetime.
+ */
+struct whitelist_atom_cbdata {
+       ankerl::unordered_dense::set<std::string> *fine_symbols;
+};
+
+static void
+whitelist_atom_callback(const rspamd_ftok_t *atom, gpointer ud)
+{
+       auto *cbd = reinterpret_cast<whitelist_atom_cbdata *>(ud);
+
+       if (atom->len == 0) {
+               return;
+       }
+
+       std::string_view atom_str(atom->begin, atom->len);
+
+       /* Skip operators */
+       if (atom_str[0] == '&' || atom_str[0] == '|' ||
+               atom_str[0] == '!' || atom_str[0] == '(' || atom_str[0] == ')') {
+               return;
+       }
+
+       /* Skip prefix characters (~, -, ^) */
+       size_t start = 0;
+       while (start < atom_str.size() &&
+                  (atom_str[start] == '~' || atom_str[start] == '-' || atom_str[start] == '^')) {
+               ++start;
+       }
+
+       if (start >= atom_str.size()) {
+               return;
+       }
+
+       auto remaining = atom_str.substr(start);
+
+       /* Skip group matchers (g:, g+:, g-:) - we can't determine specific symbols */
+       if (remaining.starts_with("g:") || remaining.starts_with("g+:") || remaining.starts_with("g-:")) {
+               return;
+       }
+
+       /* Extract symbol name (before '[' if present for options) */
+       auto bracket_pos = remaining.find('[');
+       std::string symbol_name;
+       if (bracket_pos != std::string_view::npos) {
+               symbol_name = std::string(remaining.substr(0, bracket_pos));
+       }
+       else {
+               symbol_name = std::string(remaining);
+       }
+
+       if (!symbol_name.empty()) {
+               cbd->fine_symbols->emplace(std::move(symbol_name));
+       }
+}
+
+void composites_manager::mark_whitelist_dependencies()
+{
+       ankerl::unordered_dense::set<std::string> fine_symbols;
+
+       msg_debug_config("analyzing whitelist composites for FINE symbol marking");
+
+       /* Step 1: Find composites with negative score and collect their atoms */
+       for (const auto &comp: all_composites) {
+               auto *sym_def = static_cast<struct rspamd_symbol *>(
+                       g_hash_table_lookup(cfg->symbols, comp->sym.c_str()));
+
+               if (sym_def && *sym_def->weight_ptr < 0) {
+                       /* This is a whitelist composite - collect all its atoms */
+                       whitelist_atom_cbdata cbd{&fine_symbols};
+                       rspamd_expression_atom_foreach(comp->expr, whitelist_atom_callback, &cbd);
+
+                       msg_debug_config("composite '%s' has negative weight (%.2f), collecting dependencies",
+                                                        comp->sym.c_str(), *sym_def->weight_ptr);
+               }
+       }
+
+       /* Step 2: Transitively expand - if an atom is also a whitelist composite, add its atoms */
+       bool changed;
+       do {
+               changed = false;
+               for (const auto &comp: all_composites) {
+                       if (fine_symbols.contains(comp->sym)) {
+                               size_t before = fine_symbols.size();
+                               whitelist_atom_cbdata cbd{&fine_symbols};
+                               rspamd_expression_atom_foreach(comp->expr, whitelist_atom_callback, &cbd);
+                               if (fine_symbols.size() > before) {
+                                       changed = true;
+                               }
+                       }
+               }
+       } while (changed);
+
+       /* Step 3: Mark all collected symbols as FINE in symcache */
+       int marked_count = 0;
+       for (const auto &sym_name: fine_symbols) {
+               if (rspamd_symcache_set_symbol_fine(cfg->cache, sym_name.c_str())) {
+                       msg_debug_config("marked symbol '%s' as FINE (whitelist composite dependency)",
+                                                        sym_name.c_str());
+                       marked_count++;
+               }
+       }
+
+       msg_info_config("marked %d symbols as FINE for whitelist composite dependencies",
+                                       marked_count);
+}
+
 }// namespace rspamd::composites
 
 void rspamd_composites_process_deps(void *cm_ptr, struct rspamd_config *cfg)
@@ -642,4 +749,10 @@ void rspamd_composites_get_stats(void *cm_ptr, struct rspamd_composites_stats_ex
        stats->time_fast_mean = cm->stats.time_fast.mean;
        stats->time_fast_stddev = cm->stats.time_fast.stddev;
        stats->time_fast_count = cm->stats.time_fast.number;
+}
+
+/* C entry point: forwards to composites_manager::mark_whitelist_dependencies() */
+void rspamd_composites_mark_whitelist_deps(void *cm_ptr, struct rspamd_config *cfg)
+{
+       /* `cfg` is not read in this function */
+       auto *const manager = COMPOSITE_MANAGER_FROM_PTR(cm_ptr);
+
+       manager->mark_whitelist_dependencies();
 }
\ No newline at end of file
index f020b6055b74db691d9a6d2f0ad6fbdc2f93b3a2..47582523f58c175895164aa73e31b7546bd7593f 100644 (file)
@@ -578,6 +578,16 @@ void rspamd_symcache_runtime_destroy(struct rspamd_task *task);
  */
 void rspamd_symcache_promote_resort(struct rspamd_symcache *cache);
 
+/**
+ * Marks a symbol with SYMBOL_TYPE_FINE flag so it won't be skipped on early stop
+ * (when reject threshold is reached). Also propagates flag to parent/children.
+ * @param cache symbols cache to look the symbol up in
+ * @param symbol symbol name
+ * @return TRUE if symbol was found (even if it was already marked), FALSE otherwise
+ */
+gboolean rspamd_symcache_set_symbol_fine(struct rspamd_symcache *cache,
+                                                                                const char *symbol);
+
 #ifdef __cplusplus
 }
 #endif
index 6221aa238c20e6d7133b7fd7e15bf45f44ffaa28..50af26340343c0ab00fb0804d5bb51041a57f582 100644 (file)
@@ -727,3 +727,36 @@ void rspamd_symcache_promote_resort(struct rspamd_symcache *cache)
 
        real_cache->promote_resort();
 }
+
+gboolean rspamd_symcache_set_symbol_fine(struct rspamd_symcache *cache,
+                                                                                const char *symbol)
+{
+       auto *real_cache = C_API_SYMCACHE(cache);
+       auto *item = real_cache->get_item_by_name_mut(symbol, false);
+
+       if (item == nullptr) {
+               return FALSE;
+       }
+
+       if (!(item->flags & SYMBOL_TYPE_FINE)) {
+               item->flags |= SYMBOL_TYPE_FINE;
+
+               /* Also mark parent if this is a virtual symbol */
+               if (item->is_virtual()) {
+                       auto *parent = const_cast<rspamd::symcache::cache_item *>(item->get_parent(*real_cache));
+                       if (parent && !(parent->flags & SYMBOL_TYPE_FINE)) {
+                               parent->flags |= SYMBOL_TYPE_FINE;
+                       }
+               }
+
+               /* And mark all virtual children */
+               const auto *children = item->get_children();
+               if (children) {
+                       for (auto *child: *children) {
+                               child->flags |= SYMBOL_TYPE_FINE;
+                       }
+               }
+       }
+
+       return TRUE;
+}
index c1ca2a6ed4f116847223e2b99a484d0830a38e66..aae779375985c1c52614edd3d95f65148f067c38 100644 (file)
@@ -925,6 +925,17 @@ auto symcache::validate(bool strict) -> bool
                        item->priority++;
                }
 
+               /*
+                * Mark symbols with negative weight as FINE, so they are not skipped
+                * when reject threshold is reached. This ensures whitelist symbols
+                * always have a chance to execute.
+                */
+               if (item->st->weight < 0 && !(item->flags & SYMBOL_TYPE_FINE)) {
+                       item->flags |= SYMBOL_TYPE_FINE;
+                       msg_debug_cache("symbol %s has negative weight (%.2f), marking as FINE",
+                                                       item->symbol.c_str(), item->st->weight);
+               }
+
                if (item->is_virtual()) {
                        if (!(item->flags & SYMBOL_TYPE_GHOST)) {
                                auto *parent = const_cast<cache_item *>(item->get_parent(*this));
@@ -945,6 +956,17 @@ auto symcache::validate(bool strict) -> bool
                                        parent->priority = MAX(p1, p2);
                                        item->priority = parent->priority;
                                }
+
+                               /*
+                                * Sync SYMBOL_TYPE_FINE between virtual symbol and parent.
+                                * If either has negative weight and is marked FINE, propagate to both.
+                                */
+                               if ((item->flags & SYMBOL_TYPE_FINE) && !(parent->flags & SYMBOL_TYPE_FINE)) {
+                                       parent->flags |= SYMBOL_TYPE_FINE;
+                               }
+                               else if ((parent->flags & SYMBOL_TYPE_FINE) && !(item->flags & SYMBOL_TYPE_FINE)) {
+                                       item->flags |= SYMBOL_TYPE_FINE;
+                               }
                        }
                }